diff --git a/CHANGELOG b/CHANGELOG index 0d2ad140..d01f674f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,15 @@ Development version (next release) -- +- Added support for half-precision floating-point (fp16) in the library +- Added half-precision routines: + * Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN + * Level-2: HGEMV/HGBMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2 + * Level-3: HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM + +Version 0.7.1 +- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs +- Fixed a bug in the xGEMM routine related to the event incorrectly set +- Made MSVC link the run-time libraries statically Version 0.7.1 - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs diff --git a/CMakeLists.txt b/CMakeLists.txt index 02ffba1d..8a02d290 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,7 +125,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) # Sets the supported routines and the used kernels. New routines and kernels should be added here. 
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) -set(SAMPLE_PROGRAMS_C sasum dgemv sgemm cache) +set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) @@ -156,6 +156,7 @@ target_link_libraries(clblast ${OPENCL_LIBRARIES}) install(TARGETS clblast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) +install(FILES include/clblast_half.h DESTINATION include) # ================================================================================================== diff --git a/README.md b/README.md index e4564c26..51c282a3 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ Use CLBlast instead of clBLAS: * When you are still running on OpenCL 1.1 hardware. * When you value an organized and modern C++ codebase. * When you target Intel CPUs and GPUs or embedded devices +* When you can benefit from the increased performance of half-precision fp16 data-types. Use CLBlast instead of cuBLAS: @@ -127,7 +128,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s cmake -DTUNERS=ON .. -Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub. +Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 2.3.1 or higher). CLTune is available from GitHub. Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. 
`xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake. @@ -177,64 +178,70 @@ These graphs can be generated automatically on your own device. First, compile C Supported routines ------------- -CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all. +CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all. The different data-types supported by the library are: -| Level-1 | S | D | C | Z | -| ---------|---|---|---|---| -| xSWAP | ✔ | ✔ | ✔ | ✔ | -| xSCAL | ✔ | ✔ | ✔ | ✔ | -| xCOPY | ✔ | ✔ | ✔ | ✔ | -| xAXPY | ✔ | ✔ | ✔ | ✔ | -| xDOT | ✔ | ✔ | - | - | -| xDOTU | - | - | ✔ | ✔ | -| xDOTC | - | - | ✔ | ✔ | -| xNRM2 | ✔ | ✔ | ✔ | ✔ | -| xASUM | ✔ | ✔ | ✔ | ✔ | -| IxAMAX | ✔ | ✔ | ✔ | ✔ | +* __S:__ Single-precision 32-bit floating-point (`float`). +* __D:__ Double-precision 64-bit floating-point (`double`). +* __C:__ Complex single-precision 2x32-bit floating-point (`std::complex<float>`). +* __Z:__ Complex double-precision 2x64-bit floating-point (`std::complex<double>`). +* __H:__ Half-precision 16-bit floating-point (`cl_half`). See section 'Half precision' for more information. 
-| Level-2 | S | D | C | Z | -| ---------|---|---|---|---| -| xGEMV | ✔ | ✔ | ✔ | ✔ | -| xGBMV | ✔ | ✔ | ✔ | ✔ | -| xHEMV | - | - | ✔ | ✔ | -| xHBMV | - | - | ✔ | ✔ | -| xHPMV | - | - | ✔ | ✔ | -| xSYMV | ✔ | ✔ | - | - | -| xSBMV | ✔ | ✔ | - | - | -| xSPMV | ✔ | ✔ | - | - | -| xTRMV | ✔ | ✔ | ✔ | ✔ | -| xTBMV | ✔ | ✔ | ✔ | ✔ | -| xTPMV | ✔ | ✔ | ✔ | ✔ | -| xGER | ✔ | ✔ | - | - | -| xGERU | - | - | ✔ | ✔ | -| xGERC | - | - | ✔ | ✔ | -| xHER | - | - | ✔ | ✔ | -| xHPR | - | - | ✔ | ✔ | -| xHER2 | - | - | ✔ | ✔ | -| xHPR2 | - | - | ✔ | ✔ | -| xSYR | ✔ | ✔ | - | - | -| xSPR | ✔ | ✔ | - | - | -| xSYR2 | ✔ | ✔ | - | - | -| xSPR2 | ✔ | ✔ | - | - | +| Level-1 | S | D | C | Z | H | +| ---------|---|---|---|---|---| +| xSWAP | ✔ | ✔ | ✔ | ✔ | ✔ | +| xSCAL | ✔ | ✔ | ✔ | ✔ | ✔ | +| xCOPY | ✔ | ✔ | ✔ | ✔ | ✔ | +| xAXPY | ✔ | ✔ | ✔ | ✔ | ✔ | +| xDOT | ✔ | ✔ | - | - | ✔ | +| xDOTU | - | - | ✔ | ✔ | - | +| xDOTC | - | - | ✔ | ✔ | - | +| xNRM2 | ✔ | ✔ | ✔ | ✔ | ✔ | +| xASUM | ✔ | ✔ | ✔ | ✔ | ✔ | +| IxAMAX | ✔ | ✔ | ✔ | ✔ | ✔ | -| Level-3 | S | D | C | Z | -| ---------|---|---|---|---| -| xGEMM | ✔ | ✔ | ✔ | ✔ | -| xSYMM | ✔ | ✔ | ✔ | ✔ | -| xHEMM | - | - | ✔ | ✔ | -| xSYRK | ✔ | ✔ | ✔ | ✔ | -| xHERK | - | - | ✔ | ✔ | -| xSYR2K | ✔ | ✔ | ✔ | ✔ | -| xHER2K | - | - | ✔ | ✔ | -| xTRMM | ✔ | ✔ | ✔ | ✔ | +| Level-2 | S | D | C | Z | H | +| ---------|---|---|---|---|---| +| xGEMV | ✔ | ✔ | ✔ | ✔ | ✔ | +| xGBMV | ✔ | ✔ | ✔ | ✔ | ✔ | +| xHEMV | - | - | ✔ | ✔ | - | +| xHBMV | - | - | ✔ | ✔ | - | +| xHPMV | - | - | ✔ | ✔ | - | +| xSYMV | ✔ | ✔ | - | - | ✔ | +| xSBMV | ✔ | ✔ | - | - | ✔ | +| xSPMV | ✔ | ✔ | - | - | ✔ | +| xTRMV | ✔ | ✔ | ✔ | ✔ | ✔ | +| xTBMV | ✔ | ✔ | ✔ | ✔ | ✔ | +| xTPMV | ✔ | ✔ | ✔ | ✔ | ✔ | +| xGER | ✔ | ✔ | - | - | ✔ | +| xGERU | - | - | ✔ | ✔ | - | +| xGERC | - | - | ✔ | ✔ | - | +| xHER | - | - | ✔ | ✔ | - | +| xHPR | - | - | ✔ | ✔ | - | +| xHER2 | - | - | ✔ | ✔ | - | +| xHPR2 | - | - | ✔ | ✔ | - | +| xSYR | ✔ | ✔ | - | - | ✔ | +| xSPR | ✔ | ✔ | - | - | ✔ | +| xSYR2 | ✔ | 
✔ | - | - | ✔ | +| xSPR2 | ✔ | ✔ | - | - | ✔ | + +| Level-3 | S | D | C | Z | H | +| ---------|---|---|---|---|---| +| xGEMM | ✔ | ✔ | ✔ | ✔ | ✔ | +| xSYMM | ✔ | ✔ | ✔ | ✔ | ✔ | +| xHEMM | - | - | ✔ | ✔ | - | +| xSYRK | ✔ | ✔ | ✔ | ✔ | ✔ | +| xHERK | - | - | ✔ | ✔ | - | +| xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ | +| xHER2K | - | - | ✔ | ✔ | - | +| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ | In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care: -| Additional | S | D | C | Z | -| -----------|---|---|---|---| -| xSUM | ✔ | ✔ | ✔ | ✔ | -| IxMAX | ✔ | ✔ | ✔ | ✔ | -| IxMIN | ✔ | ✔ | ✔ | ✔ | +| Additional | S | D | C | Z | H | +| -----------|---|---|---|---|---| +| xSUM | ✔ | ✔ | ✔ | ✔ | ✔ | +| IxMAX | ✔ | ✔ | ✔ | ✔ | ✔ | +| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ | Some BLAS routines are not supported yet by CLBlast. They are shown in the following table: @@ -250,6 +257,19 @@ Some BLAS routines are not supported yet by CLBlast. They are shown in the follo | xTRSM | | | | | +Half precision (fp16) +------------- + +The half-precision fp16 format is a 16-bit floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor of 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this are among others Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is of particular interest to the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss. + +Since there is no half-precision data-type in C or C++, OpenCL provides the `cl_half` type for the host device. Unfortunately, internally this translates to a 16-bit integer, so computations on the host using this data-type should be avoided. 
For convenience, CLBlast provides the `clblast_half.h` header (C99 and C++ compatible), defining the `half` type as a shorthand for `cl_half` and the following basic functions: + +* `half FloatToHalf(const float value)`: Converts a 32-bit floating-point value to a 16-bit floating-point value. +* `float HalfToFloat(const half value)`: Converts a 16-bit floating-point value to a 32-bit floating-point value. + +The `/samples` folder contains examples of how to use these convenience functions when calling one of the half-precision BLAS routines. + + Contributing ------------- @@ -270,6 +290,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by: * [dividiti](http://www.dividiti.com) * [SURFsara HPC center](http://www.surfsara.com) + Support us ------------- diff --git a/doc/clblast.md b/doc/clblast.md index 9c9b9a6f..8dbb97e4 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -34,6 +34,10 @@ StatusCode CLBlastZswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SWAP: @@ -82,6 +86,10 @@ StatusCode CLBlastZscal(const size_t n, const cl_double2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHscal(const size_t n, + const cl_half alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SCAL: @@ -128,6 +136,10 @@ StatusCode CLBlastZcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHcopy(const 
size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to COPY: @@ -181,6 +193,11 @@ StatusCode CLBlastZaxpy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHaxpy(const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to AXPY: @@ -225,6 +242,11 @@ StatusCode CLBlastDdot(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to DOT: @@ -371,6 +393,10 @@ StatusCode CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to NRM2: @@ -420,6 +446,10 @@ StatusCode CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const 
size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to ASUM: @@ -469,6 +499,10 @@ StatusCode CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SUM: @@ -518,6 +552,10 @@ StatusCode CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiHamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to AMAX: @@ -567,6 +605,10 @@ StatusCode CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiHmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to MAX: @@ -616,6 +658,10 @@ StatusCode CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiHmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to MIN: @@ -685,6 +731,14 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, const cl_double2 beta, cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to GEMV: @@ -761,6 +815,14 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to GBMV: @@ -1000,6 +1062,14 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SYMV: @@ -1059,6 +1129,14 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) 
+StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SBMV: @@ -1119,6 +1197,14 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SPMV: @@ -1178,6 +1264,11 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to TRMV: @@ -1235,6 +1326,11 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const 
Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to TBMV: @@ -1293,6 +1389,11 @@ StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Tran const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to TPMV: @@ -1345,6 +1446,13 @@ StatusCode CLBlastDger(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHger(const Layout layout, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to GER: @@ -1713,6 +1821,12 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to SYR: @@ -1762,6 
+1876,12 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHspr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) ``` Arguments to SPR: @@ -1813,6 +1933,13 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to SYR2: @@ -1868,6 +1995,13 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) ``` Arguments to SPR2: @@ -1941,6 +2075,14 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) +StatusCode 
CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to GEMM: @@ -2019,6 +2161,14 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to SYMM: @@ -2152,6 +2302,13 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to SYRK: @@ -2281,6 +2438,14 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsyr2k(const Layout layout, const Triangle 
triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to SYR2K: @@ -2409,6 +2574,12 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to TRMM: diff --git a/include/clblast.h b/include/clblast.h index 5df0f605..64b2610a 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -121,28 +121,28 @@ StatusCode Rotm(const size_t n, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template StatusCode Scal(const size_t n, const T alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY 
template StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template StatusCode Axpy(const size_t n, const T alpha, @@ -150,7 +150,7 @@ StatusCode Axpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Dot product of two vectors: SDOT/DDOT +// Dot product of two vectors: SDOT/DDOT/HDOT template StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, @@ -174,42 +174,42 @@ StatusCode Dotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template StatusCode Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of absolute maximum value in a vector: 
iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, @@ -220,7 +220,7 @@ StatusCode Min(const size_t n, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV template StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -231,7 +231,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, @@ -275,7 +275,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t 
y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric matrix-vector multiplication: SSYMV/DSYMV +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, @@ -286,7 +286,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template StatusCode Sbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, @@ -297,7 +297,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template StatusCode Spmv(const Layout layout, const Triangle triangle, const size_t n, @@ -308,7 +308,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, @@ -316,7 +316,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template 
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, @@ -324,7 +324,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, @@ -356,7 +356,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// General rank-1 matrix update: SGER/DGER +// General rank-1 matrix update: SGER/DGER/HGER template StatusCode Ger(const Layout layout, const size_t m, const size_t n, @@ -424,7 +424,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric rank-1 matrix update: SSYR/DSYR +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, @@ -433,7 +433,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric packed rank-1 matrix update: SSPR/DSPR +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, @@ -442,7 +442,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric rank-2 matrix 
update: SSYR2/DSYR2 +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, @@ -452,7 +452,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, @@ -466,7 +466,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -477,7 +477,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -499,7 +499,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const 
size_t n, const size_t k, @@ -519,7 +519,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -541,7 +541,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -550,7 +550,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM template StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, diff --git a/include/clblast_c.h b/include/clblast_c.h index 8b2bf73c..40248615 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -148,7 +148,7 @@ StatusCode PUBLIC_API CLBlastDrotm(const size_t n, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP StatusCode 
PUBLIC_API CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, @@ -165,8 +165,12 @@ StatusCode PUBLIC_API CLBlastZswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL StatusCode PUBLIC_API CLBlastSscal(const size_t n, const float alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -183,8 +187,12 @@ StatusCode PUBLIC_API CLBlastZscal(const size_t n, const cl_double2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHscal(const size_t n, + const cl_half alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY StatusCode PUBLIC_API CLBlastScopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, @@ -201,8 +209,12 @@ StatusCode PUBLIC_API CLBlastZcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); -// Vector-times-constant plus vector: 
SAXPY/DAXPY/CAXPY/ZAXPY +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY StatusCode PUBLIC_API CLBlastSaxpy(const size_t n, const float alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -223,8 +235,13 @@ StatusCode PUBLIC_API CLBlastZaxpy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHaxpy(const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); -// Dot product of two vectors: SDOT/DDOT +// Dot product of two vectors: SDOT/DDOT/HDOT StatusCode PUBLIC_API CLBlastSdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -235,6 +252,11 @@ StatusCode PUBLIC_API CLBlastDdot(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Dot product of two complex vectors: CDOTU/ZDOTU StatusCode PUBLIC_API CLBlastCdotu(const size_t n, @@ -260,7 +282,7 @@ StatusCode PUBLIC_API CLBlastZdotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 StatusCode PUBLIC_API CLBlastSnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -277,8 +299,12 @@ StatusCode PUBLIC_API CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM StatusCode PUBLIC_API CLBlastSasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -295,8 +321,12 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM StatusCode PUBLIC_API CLBlastSsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -313,8 +343,12 @@ StatusCode PUBLIC_API CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Index 
of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX StatusCode PUBLIC_API CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -331,8 +365,12 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiHamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX StatusCode PUBLIC_API CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -349,8 +387,12 @@ StatusCode PUBLIC_API CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiHmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN StatusCode PUBLIC_API CLBlastiSmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -367,12 +409,16 @@ StatusCode PUBLIC_API CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiHmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const float alpha, @@ -405,8 +451,16 @@ StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transp const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, @@ -439,6 +493,14 @@ StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transp const cl_double2 beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, 
cl_event* event); +StatusCode PUBLIC_API CLBlastHgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle, @@ -494,7 +556,7 @@ StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Symmetric matrix-vector multiplication: SSYMV/DSYMV +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -511,8 +573,16 @@ StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsymv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV StatusCode PUBLIC_API CLBlastSsbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, const float alpha, @@ -529,8 +599,16 @@ StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle 
triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -547,8 +625,16 @@ StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle, const double beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHspmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); -// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, @@ -569,8 +655,13 @@ StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* 
event); +StatusCode PUBLIC_API CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, @@ -591,8 +682,13 @@ StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV StatusCode PUBLIC_API CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, @@ -613,6 +709,11 @@ StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHtpmv(const Layout 
layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -680,7 +781,7 @@ StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); -// General rank-1 matrix update: SGER/DGER +// General rank-1 matrix update: SGER/DGER/HGER StatusCode PUBLIC_API CLBlastSger(const Layout layout, const size_t m, const size_t n, const float alpha, @@ -695,6 +796,13 @@ StatusCode PUBLIC_API CLBlastDger(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHger(const Layout layout, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // General rank-1 complex matrix update: CGERU/ZGERU StatusCode PUBLIC_API CLBlastCgeru(const Layout layout, @@ -788,7 +896,7 @@ StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); -// Symmetric rank-1 matrix update: SSYR/DSYR +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -801,8 +909,14 @@ 
StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsyr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); -// Symmetric packed rank-1 matrix update: SSPR/DSPR +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -815,8 +929,14 @@ StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHspr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); -// Symmetric rank-2 matrix update: SSYR2/DSYR2 +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -831,8 +951,15 @@ StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem 
y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -847,12 +974,19 @@ StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHspr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, @@ -885,8 +1019,16 @@ StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transp const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, 
const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const float alpha, @@ -919,6 +1061,14 @@ StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const T const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, @@ -938,7 +1088,7 @@ StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const T cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const float alpha, @@ -967,6 +1117,13 @@ StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const 
size_t c_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Rank-K update of a hermitian matrix: CHERK/ZHERK StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, @@ -984,7 +1141,7 @@ StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const float alpha, @@ -1017,6 +1174,14 @@ StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, @@ -1036,7 +1201,7 @@ StatusCode PUBLIC_API CLBlastZher2k(const Layout 
layout, const Triangle triangle cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const float alpha, @@ -1061,8 +1226,14 @@ StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const T const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const float alpha, @@ -1087,6 +1258,12 @@ StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const T const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const 
size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); // ================================================================================================= diff --git a/include/clblast_half.h b/include/clblast_half.h new file mode 100644 index 00000000..269a520e --- /dev/null +++ b/include/clblast_half.h @@ -0,0 +1,256 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file provides simple conversion operations between fp16 (half) and fp32 (float). These +// conversion functions are based on ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf and +// are also part of the C++ half-precision header (http://half.sourceforge.net/). +// +// This file is pure C99. +// +// ================================================================================================= + +#ifndef CLBLAST_HALF_H_ +#define CLBLAST_HALF_H_ + +// Includes the normal OpenCL C header +#if defined(__APPLE__) || defined(__MACOSX) + #include <OpenCL/opencl.h> +#else + #include <CL/opencl.h> +#endif + +// ================================================================================================= + +// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type, +// which is a typedef for unsigned short. +typedef cl_half half; + +// 32-bit union for conversions +typedef union ConversionBits_ { + unsigned int i32; + float f32; +} ConversionBits; + +// ================================================================================================= + +// Converts an IEEE-compliant single-precision value to half-precision floating-point.
This function +// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding +// mode. Declared 'static' because a plain C99 'inline' definition emits no external symbol. +static inline half FloatToHalf(const float value) { + static const unsigned short base_table[512] = { // Indexed by sign and exponent bits + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00,
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 + }; + static const unsigned char shift_table[512] = { // Mantissa right-shift, same index + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 + }; + ConversionBits bits; + bits.f32 = value; // Type-pun via the union to read the raw bits + const unsigned short halfbits = base_table[bits.i32 >> 23] + + (unsigned short)((bits.i32 & 0x7FFFFF) >> shift_table[bits.i32 >> 23]); + return halfbits; +} + +// Converts a half-precision value to IEEE-compliant single-precision floating-point +inline float HalfToFloat(const half value) { + static const unsigned int mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000,
0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 
0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 
0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 
0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 
0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 
0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 
0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 
0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 
+ 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 
0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 
0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 
0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 + }; + static const unsigned int exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 
0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 + }; + static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 + }; + ConversionBits bits; + bits.i32 = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + + exponent_table[value >> 10]; + return bits.f32; +} + +// ================================================================================================= + +// CLBLAST_HALF_H_ +#endif diff --git a/include/internal/database.h b/include/internal/database.h index ca79fdad..f93eaa22 100644 --- a/include/internal/database.h +++ b/include/internal/database.h @@ -67,15 +67,15 @@ class Database { }; // The database consists of separate database entries, stored together in a vector - static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; - static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; - static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; - static const DatabaseEntry XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; - 
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; - static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; - static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; - static const DatabaseEntry TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; - static const DatabaseEntry PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; + static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; + static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; + static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; + static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; + static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; + static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; + static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; + static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; + static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; static const std::vector database; // The constructor diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h index 59a9e03a..63f8e814 100644 --- a/include/internal/database/copy.h +++ b/include/internal/database/copy.h @@ -14,6 +14,24 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::CopyHalf = { + "Copy", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, 
"Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::CopySingle = { "Copy", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h index d2de19e4..d0a85e7c 100644 --- a/include/internal/database/pad.h +++ b/include/internal/database/pad.h @@ -14,6 +14,24 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::PadHalf = { + "Pad", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::PadSingle = { "Pad", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h index b1db1b21..0eb3b528 100644 --- a/include/internal/database/padtranspose.h +++ b/include/internal/database/padtranspose.h @@ -14,6 +14,24 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry 
Database::PadtransposeHalf = { + "Padtranspose", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::PadtransposeSingle = { "Padtranspose", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h index d87f79a6..d7bdd90a 100644 --- a/include/internal/database/transpose.h +++ b/include/internal/database/transpose.h @@ -14,6 +14,24 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::TransposeHalf = { + "Transpose", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::TransposeSingle = { "Transpose", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h index 55be0bcb..72e6a43c 100644 --- a/include/internal/database/xaxpy.h +++ b/include/internal/database/xaxpy.h @@ -14,6 +14,24 @@ namespace clblast { // 
================================================================================================= +const Database::DatabaseEntry Database::XaxpyHalf = { + "Xaxpy", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::XaxpySingle = { "Xaxpy", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/database/xdot.h b/include/internal/database/xdot.h index f9ae975b..95def654 100644 --- a/include/internal/database/xdot.h +++ b/include/internal/database/xdot.h @@ -14,6 +14,24 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::XdotHalf = { + "Xdot", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",32}, {"WGS2",32} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",32}, {"WGS2",32} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::XdotSingle = { "Xdot", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h index 9ca2bff5..647188e9 100644 --- a/include/internal/database/xgemm.h +++ b/include/internal/database/xgemm.h @@ -14,6 +14,18 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::XgemmHalf 
= { + "Xgemm", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::XgemmSingle = { "Xgemm", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/database/xgemv.h b/include/internal/database/xgemv.h index bbbe62f6..0d11f5ee 100644 --- a/include/internal/database/xgemv.h +++ b/include/internal/database/xgemv.h @@ -14,6 +14,24 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::XgemvHalf = { + "Xgemv", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/database/xger.h b/include/internal/database/xger.h index dae857cd..81b8b98a 100644 --- a/include/internal/database/xger.h +++ b/include/internal/database/xger.h @@ -14,6 +14,24 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry 
Database::XgerHalf = { + "Xger", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::XgerSingle = { "Xger", Precision::kSingle, { { // AMD GPUs diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h index bc00c8e3..03771d53 100644 --- a/include/internal/routines/level1/xaxpy.h +++ b/include/internal/routines/level1/xaxpy.h @@ -29,6 +29,7 @@ class Xaxpy: public Routine { using Routine::source_string_; using Routine::queue_; using Routine::event_; + using Routine::context_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h index 0b2a8e66..875f936e 100644 --- a/include/internal/routines/level2/xgemv.h +++ b/include/internal/routines/level2/xgemv.h @@ -29,6 +29,7 @@ class Xgemv: public Routine { using Routine::source_string_; using Routine::queue_; using Routine::event_; + using Routine::context_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h index 5ace9da6..1d5c64bd 100644 --- a/include/internal/routines/level2/xger.h +++ b/include/internal/routines/level2/xger.h @@ -29,6 +29,7 @@ class Xger: public Routine { using Routine::source_string_; using Routine::queue_; using Routine::event_; + using Routine::context_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; diff --git 
a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h index 861ba302..ebd20ee8 100644 --- a/include/internal/routines/level2/xher.h +++ b/include/internal/routines/level2/xher.h @@ -29,6 +29,7 @@ class Xher: public Routine { using Routine::source_string_; using Routine::queue_; using Routine::event_; + using Routine::context_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestMatrixA; diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h index 9a23199e..a33a71c3 100644 --- a/include/internal/routines/level2/xher2.h +++ b/include/internal/routines/level2/xher2.h @@ -29,6 +29,7 @@ class Xher2: public Routine { using Routine::source_string_; using Routine::queue_; using Routine::event_; + using Routine::context_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; diff --git a/include/internal/tuning.h b/include/internal/tuning.h index 215beb59..3eba6fdb 100644 --- a/include/internal/tuning.h +++ b/include/internal/tuning.h @@ -20,6 +20,8 @@ #include +#include "internal/utilities.h" + namespace clblast { // ================================================================================================= diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 82cd7f44..d3c8ebdb 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -22,6 +22,7 @@ #include #include "clblast.h" +#include "clblast_half.h" #include "internal/clpp11.h" namespace clblast { @@ -94,6 +95,16 @@ constexpr auto kArgNoAbbreviations = "no_abbrv"; // ================================================================================================= +// Returns a scalar with a default value +template +T GetScalar(); + +// Returns a scalar of value 1 +template +T ConstantOne(); + +// ================================================================================================= + // Structure containing all 
possible arguments for test clients, including their default values template struct Arguments { @@ -124,8 +135,8 @@ struct Arguments { size_t nrm2_offset = 0; size_t asum_offset = 0; size_t imax_offset = 0; - T alpha = T{1.0}; - T beta = T{1.0}; + T alpha = ConstantOne(); + T beta = ConstantOne(); size_t x_size = 1; size_t y_size = 1; size_t a_size = 1; @@ -202,9 +213,13 @@ void PopulateVector(std::vector &vector); // ================================================================================================= -// Returns a scalar with a default value -template -T GetScalar(); +// Conversion between half and single-precision +std::vector HalfToFloatBuffer(const std::vector& source); +void FloatToHalfBuffer(std::vector& result, const std::vector& source); + +// As above, but now for OpenCL data-types instead of std::vectors +Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw); +void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw); // ================================================================================================= diff --git a/samples/haxpy.c b/samples/haxpy.c new file mode 100644 index 00000000..3c7bb33a --- /dev/null +++ b/samples/haxpy.c @@ -0,0 +1,105 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file demonstrates the use of the HAXPY routine. It demonstrates the use of half-precision. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). 
+// +// ================================================================================================= + +#include +#include +#include + +// Includes the CLBlast library (C interface) +#include + +// Includes the float-to-half and half-to-float conversion utilities +#include + +// ================================================================================================= + +// Example use of the half-precision routine HAXPY +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Example HAXPY arguments + const size_t n = 8192; + const cl_half alpha = FloatToHalf(0.5f); + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host vectors with some example data + cl_half* host_a = (cl_half*)malloc(sizeof(cl_half)*n); + cl_half* host_b = (cl_half*)malloc(sizeof(cl_half)*n); + for (size_t i=0; i success). 
+ printf("Completed HAXPY with status %d\n", status); + + // Prints the first output value + if (status == 0) { + printf("Output value at index 0: b[0] = %.3lf\n", HalfToFloat(host_b[0])); + } + + // Clean-up + free(platforms); + free(devices); + free(host_a); + free(host_b); + clReleaseMemObject(device_a); + clReleaseMemObject(device_b); + clReleaseCommandQueue(queue); + clReleaseContext(context); + return 0; +} + +// ================================================================================================= diff --git a/scripts/database/database.py b/scripts/database/database.py index 8e8f37f8..7f7f07e4 100644 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -189,13 +189,20 @@ def GetFooter(): # The start of a new C++ precision entry def GetPrecision(family, precision): - precisionstring = "Single" - if precision == "64": + precisionstring = "" + if precision == "16": + precisionstring = "Half" + elif precision == "32": + precisionstring = "Single" + elif precision == "64": precisionstring = "Double" elif precision == "3232": precisionstring = "ComplexSingle" elif precision == "6464": precisionstring = "ComplexDouble" + else: + print("[ERROR] Unknown precision") + sys.exit() return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n" % (family.title(), precisionstring, family.title(), precisionstring)) diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py index 5a58ab53..5bff95d1 100644 --- a/scripts/generator/datatype.py +++ b/scripts/generator/datatype.py @@ -13,10 +13,13 @@ # ================================================================================================== # Short-hands for data-types +HLF = "half" FLT = "float" DBL = "double" FLT2 = "float2" DBL2 = "double2" + +HCL = "cl_half" F2CL = "cl_float2" D2CL = "cl_double2" diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 210f371f..f5fc5ecf 100644 --- a/scripts/generator/generator.py 
+++ b/scripts/generator/generator.py @@ -28,11 +28,12 @@ import os.path # Local files from routine import Routine -from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL +from datatype import DataType, HLF, FLT, DBL, FLT2, DBL2, HCL, F2CL, D2CL # ================================================================================================== # Regular data-types +H = DataType("H", "H", HLF, [HLF, HLF, HCL, HCL], HLF ) # half (16) S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32) D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64) C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232) @@ -41,6 +42,7 @@ Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6 # Special cases Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output +iH = DataType("H", "iH", HLF, [HLF, HLF, HLF, HLF], HLF ) # As H, but with integer output iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output @@ -60,62 +62,62 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # Populates a list of routines routines = [ [ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], 
["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. 
The results are stored in the asum buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. 
This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. 
The sum is stored in the dot buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []), - Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], 
["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []), - Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []), - Routine(True, True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), + 
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []), + Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], 
["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex 
conjugated matrix update", "", []), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []), - Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []), - Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []), - Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []), - Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", 
"Hermitian packed rank-1 matrix update", "", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []), ], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []), - Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []), - Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], 
["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []), - Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []), - Routine(False, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", 
"", []), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), ]] # ================================================================================================== @@ -229,21 +231,45 @@ def wrapper_clblas(routines): result = "" for routine in routines: if routine.has_tests: - result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames()) + result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNamesTested()) if routine.NoScalars(): result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" for flavour in routine.flavours: - indent = " "*(17 + routine.Length()) result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" - arguments = routine.ArgumentsWrapperCL(flavour) - if routine.scratch: - result += " auto queue = Queue(queues[0]);\n" - result += " auto context = queue.GetContext();\n" - result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n" - arguments += ["scratch_buffer()"] - result += " return clblas"+flavour.name+routine.name+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" + + # There is a version available in clBLAS + if flavour.precision_name in ["S","D","C","Z"]: + indent = " "*(17 + routine.Length()) + arguments = routine.ArgumentsWrapperCL(flavour) + if routine.scratch: + result += " auto queue = Queue(queues[0]);\n" + result += " auto context = queue.GetContext();\n" + result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n" + arguments += ["scratch_buffer()"] + result += " return 
clblas"+flavour.name+routine.name+"(" + result += (",\n"+indent).join([a for a in arguments]) + result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" + + # There is no clBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " "*(24 + routine.Length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n" + + # Call the float routine + result += " auto status = clblasX"+routine.name+"(" + result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) + result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" + result += "\n" + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n" + result += " return status;" + + # Complete result += "\n}\n" return result @@ -252,44 +278,66 @@ def wrapper_cblas(routines): result = "" for routine in routines: if routine.has_tests: - result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames()) + result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNamesTested()) for flavour in routine.flavours: - indent = " "*(10 + routine.Length()) result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" - arguments = routine.ArgumentsWrapperC(flavour) - # Double-precision scalars - for scalar in routine.scalars: - if flavour.IsComplex(scalar): - result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" + # There is a version available in CBLAS + if flavour.precision_name in ["S","D","C","Z"]: + indent = " "*(10 + routine.Length()) + arguments = routine.ArgumentsWrapperC(flavour) - # Special case for scalar outputs - assignment = "" - postfix = "" - endofline = "" - extra_argument = "" - 
for output_buffer in routine.outputs: - if output_buffer in routine.ScalarBuffersFirst(): - if flavour in [C,Z]: - postfix += "_sub" - indent += " " - extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" - elif output_buffer in routine.IndexBuffers(): - assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = " - indent += " "*len(assignment) - else: - assignment = output_buffer+"_buffer["+output_buffer+"_offset]" - if (flavour.name in ["Sc","Dz"]): - assignment = assignment+".real(" - endofline += ")" + # Complex scalars + for scalar in routine.scalars: + if flavour.IsComplex(scalar): + result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" + + # Special case for scalar outputs + assignment = "" + postfix = "" + endofline = "" + extra_argument = "" + for output_buffer in routine.outputs: + if output_buffer in routine.ScalarBuffersFirst(): + if flavour in [C,Z]: + postfix += "_sub" + indent += " " + extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" + elif output_buffer in routine.IndexBuffers(): + assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = " + indent += " "*len(assignment) else: - assignment = assignment+" = " - indent += " "*len(assignment) + assignment = output_buffer+"_buffer["+output_buffer+"_offset]" + if (flavour.name in ["Sc","Dz"]): + assignment = assignment+".real(" + endofline += ")" + else: + assignment = assignment+" = " + indent += " "*len(assignment) - result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += extra_argument+endofline+");" - result += "\n}\n" + result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" + result += (",\n"+indent).join([a for a in arguments]) + result += extra_argument+endofline+");\n" 
+ + # There is no CBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " "*(9 + routine.Length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer);\n" + + # Call the float routine + result += " cblasX"+routine.name+"(" + result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) + result += ");\n" + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis);\n" + + # Complete + result += "}\n" return result # ================================================================================================== diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index e5059c61..fe857ea8 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -99,6 +99,18 @@ class Routine(): def IndexBuffers(self): return ["imax","imin"] + # Lists of input/output buffers not index (integer) + def NonIndexInputs(self): + buffers = self.inputs[:] # make a copy + for i in self.IndexBuffers(): + if i in buffers: buffers.remove(i) + return buffers + def NonIndexOutputs(self): + buffers = self.outputs[:] # make a copy + for i in self.IndexBuffers(): + if i in buffers: buffers.remove(i) + return buffers + # List of buffers without 'inc' or 'ld' def BuffersWithoutLdInc(self): return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"] @@ -119,6 +131,12 @@ class Routine(): def ShortNames(self): return "/".join([f.name+self.name.upper() for f in self.flavours]) + # As above, but excludes some + def ShortNamesTested(self): + names = [f.name+self.name.upper() for f in self.flavours] + if "H"+self.name.upper() in names: names.remove("H"+self.name.upper()) + return "/".join(names) + # Determines which buffers go first (between alpha and beta) and which ones go after def 
BuffersFirst(self): if self.level == "2b": @@ -146,6 +164,17 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but with a '_bis' suffix for the buffer name + def BufferBis(self, name): + #if (name in self.IndexBuffers()): + # return self.Buffer(name) + if (name in self.inputs) or (name in self.outputs): + a = [name+"_buffer_bis"] + b = [name+"_offset"] + c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] + return [", ".join(a+b+c)] + return [] + # As above but with data-types def BufferDef(self, name): prefix = "const " if (name in self.inputs) else "" @@ -156,6 +185,16 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but with data-types + def BufferDefWrapperCL(self, name, flavour): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix+"Buffer<"+flavour.buffertype+">& "+name+"_buffer"] + b = ["const size_t "+name+"_offset"] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] + return [", ".join(a+b+c)] + return [] + # As above but as vectors def BufferDefVector(self, name, flavour): prefix = "const " if (name in self.inputs) else "" @@ -179,7 +218,7 @@ class Routine(): # As above but with a static cast for clBLAS wrapper def BufferWrapperCL(self, name): if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer"] + a = [name+"_buffer()"] b = [name+"_offset"] c = [] if (name in ["x","y"]): @@ -238,6 +277,12 @@ class Routine(): return [name] return [] + # As above, but converts from float to half + def ScalarHalfToFloat(self, name): + if name in self.scalars: + return ["HalfToFloat("+name+")"] + return [] + # Retrieves the use of a scalar (alpha/beta) def ScalarUse(self, name, flavour): if name in self.scalars: @@ -248,7 +293,7 @@ class Routine(): return [name] return [] - # Retrieves the use of a scalar (alpha/beta) + # As above, but for the clBLAS wrapper def 
ScalarUseWrapper(self, name, flavour): if name in self.scalars: if name == "alpha": @@ -258,7 +303,7 @@ class Routine(): return [name] return [] - # Retrieves the use of a scalar for CBLAS (alpha/beta) + # As above, but for the CBLAS wrapper def ScalarUseWrapperC(self, name, flavour): if name in self.scalars: if flavour.IsComplex(name): @@ -377,6 +422,28 @@ class Routine(): # ============================================================================================== + # Retrieves a combination of all the argument names (no types) + def Arguments(self): + return (self.Options() + self.Sizes() + + list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + + self.Scalar("alpha") + + list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + + self.Scalar("beta") + + list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + + list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) + + # As above, but with conversions from half to float + def ArgumentsHalf(self): + return (self.Options() + self.Sizes() + + list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersFirst()])) + + self.ScalarHalfToFloat("alpha") + + list(chain(*[self.BufferBis(b) for b in self.BuffersFirst()])) + + self.ScalarHalfToFloat("beta") + + list(chain(*[self.BufferBis(b) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) + # Retrieves a combination of all the argument names, with Claduc casts def ArgumentsCladuc(self, flavour, indent): return (self.Options() + self.Sizes() + @@ -388,7 +455,7 @@ class Routine(): list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - # Retrieves a combination of all the argument names, with CLBlast casts + # As above, but with CLBlast casts def ArgumentsCast(self, 
flavour, indent): return (self.OptionsCast(indent) + self.Sizes() + list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + @@ -434,12 +501,12 @@ class Routine(): # As above, but clBLAS wrapper plain datatypes def ArgumentsDefWrapperCL(self, flavour): return (self.OptionsDefWrapperCL() + self.SizesDef() + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + + list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersFirst()])) + self.ScalarDefPlain("alpha", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + + list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersFirst()])) + self.ScalarDefPlain("beta", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) # As above, but CBLAS wrapper plain datatypes diff --git a/src/clblast.cc b/src/clblast.cc index 8a9465c3..07322327 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -160,7 +160,7 @@ template StatusCode PUBLIC_API Rotm(const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -190,8 +190,12 @@ template StatusCode PUBLIC_API Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +// Vector scaling: 
SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template StatusCode Scal(const size_t n, const T alpha, @@ -221,8 +225,12 @@ template StatusCode PUBLIC_API Scal(const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -252,8 +260,12 @@ template StatusCode PUBLIC_API Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template StatusCode Axpy(const size_t n, const T alpha, @@ -289,8 +301,13 @@ template StatusCode PUBLIC_API Axpy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Dot product of two vectors: SDOT/DDOT +// Dot product of two vectors: SDOT/DDOT/HDOT template StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, @@ -316,6 +333,11 @@ template StatusCode PUBLIC_API Dot(const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dot(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + 
cl_command_queue*, cl_event*); // Dot product of two complex vectors: CDOTU/ZDOTU template @@ -371,7 +393,7 @@ template StatusCode PUBLIC_API Dotc(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, @@ -401,8 +423,12 @@ template StatusCode PUBLIC_API Nrm2(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template StatusCode Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, @@ -432,8 +458,12 @@ template StatusCode PUBLIC_API Asum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, @@ -463,8 +493,12 @@ template StatusCode PUBLIC_API Sum(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Index of absolute maximum value in a vector: 
iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, @@ -494,8 +528,12 @@ template StatusCode PUBLIC_API Amax(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, @@ -525,8 +563,12 @@ template StatusCode PUBLIC_API Max(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, @@ -556,12 +598,16 @@ template StatusCode PUBLIC_API Min(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV template StatusCode 
Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -615,8 +661,16 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, @@ -670,6 +724,14 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template @@ -788,7 +850,7 @@ template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Symmetric matrix-vector multiplication: SSYMV/DSYMV +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, @@ -826,8 +888,16 @@ template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const 
half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template StatusCode Sbmv(const Layout layout, const Triangle triangle, const size_t n, const size_t k, @@ -865,8 +935,16 @@ template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template StatusCode Spmv(const Layout layout, const Triangle triangle, const size_t n, @@ -904,8 +982,16 @@ template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, @@ -941,8 +1027,13 @@ template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); 
+template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, @@ -978,8 +1069,13 @@ template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, @@ -1015,6 +1111,11 @@ template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template @@ -1106,7 +1207,7 @@ template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// General rank-1 matrix update: SGER/DGER +// General 
rank-1 matrix update: SGER/DGER/HGER template StatusCode Ger(const Layout layout, const size_t m, const size_t n, @@ -1140,6 +1241,13 @@ template StatusCode PUBLIC_API Ger(const Layout, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // General rank-1 complex matrix update: CGERU/ZGERU template @@ -1343,7 +1451,7 @@ template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, cl_mem, const size_t, cl_command_queue*, cl_event*); -// Symmetric rank-1 matrix update: SSYR/DSYR +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, @@ -1373,8 +1481,14 @@ template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Symmetric packed rank-1 matrix update: SSPR/DSPR +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, @@ -1404,8 +1518,14 @@ template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); -// Symmetric rank-2 matrix update: SSYR2/DSYR2 +// Symmetric rank-2 matrix 
update: SSYR2/DSYR2/HSYR2 template StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, @@ -1439,8 +1559,15 @@ template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, @@ -1474,12 +1601,19 @@ template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -1533,8 +1667,16 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, cons const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const 
size_t, const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -1588,6 +1730,14 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Tri const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template @@ -1628,7 +1778,7 @@ template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Tri cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -1676,6 +1826,13 @@ template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Rank-K update of a hermitian matrix: CHERK/ZHERK template @@ -1712,7 +1869,7 @@ template StatusCode PUBLIC_API 
Herk(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -1766,6 +1923,14 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, cons const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template @@ -1806,7 +1971,7 @@ template StatusCode PUBLIC_API Her2k(const Layout, const Triangl cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, @@ -1848,8 +2013,14 @@ template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Tri const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +// Solves a triangular system of equations: 
STRSM/DTRSM/CTRSM/ZTRSM/HTRSM template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, @@ -1883,6 +2054,12 @@ template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Tri const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 1fc63de2..2aac907a 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -178,6 +178,16 @@ StatusCode CLBlastZswap(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastHswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // SCAL StatusCode CLBlastSscal(const size_t n, @@ -220,6 +230,16 @@ StatusCode CLBlastZscal(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastHscal(const size_t n, + const cl_half alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // COPY StatusCode CLBlastScopy(const size_t n, @@ -262,6 +282,16 @@ StatusCode CLBlastZcopy(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastHcopy(const size_t n, + const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // AXPY StatusCode CLBlastSaxpy(const size_t n, @@ -312,6 +342,18 @@ StatusCode CLBlastZaxpy(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastHaxpy(const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Axpy(n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // DOT StatusCode CLBlastSdot(const size_t n, @@ -338,6 +380,18 @@ StatusCode CLBlastDdot(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastHdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dot(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // DOTU StatusCode CLBlastCdotu(const size_t n, @@ -432,6 +486,16 @@ StatusCode CLBlastDznrm2(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastHnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // ASUM StatusCode CLBlastSasum(const size_t n, @@ -474,6 +538,16 @@ StatusCode 
CLBlastDzasum(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastHasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // SUM StatusCode CLBlastSsum(const size_t n, @@ -516,6 +590,16 @@ StatusCode CLBlastDzsum(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastHsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // AMAX StatusCode CLBlastiSamax(const size_t n, @@ -558,6 +642,16 @@ StatusCode CLBlastiZamax(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastiHamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // MAX StatusCode CLBlastiSmax(const size_t n, @@ -600,6 +694,16 @@ StatusCode CLBlastiZmax(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastiHmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // MIN StatusCode CLBlastiSmin(const size_t n, @@ -642,6 +746,16 @@ StatusCode CLBlastiZmin(const 
size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastiHmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -724,6 +838,25 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, queue, event); return static_cast(status); } +StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // GBMV StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, @@ -802,6 +935,25 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, queue, event); return static_cast(status); } +StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = 
clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // HEMV StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, @@ -962,6 +1114,25 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } +StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // SBMV StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, @@ -1002,6 +1173,25 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } +StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // SPMV StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, @@ -1042,6 
+1232,25 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } +StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_half beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} // TRMV StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1104,6 +1313,21 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } +StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // TBMV StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1166,6 +1390,21 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } +StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + 
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // TPMV StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1228,6 +1467,21 @@ StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } +StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // TRSV StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1448,6 +1702,22 @@ StatusCode CLBlastDger(const Layout layout, queue, event); return static_cast(status); } +StatusCode CLBlastHger(const Layout layout, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + 
return static_cast(status); +} // GERU StatusCode CLBlastCgeru(const Layout layout, @@ -1684,6 +1954,21 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } +StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} // SPR StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, @@ -1716,6 +2001,21 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } +StatusCode CLBlastHspr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} // SYR2 StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, @@ -1752,6 +2052,23 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } +StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast(layout), 
+ static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} // SPR2 StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, @@ -1788,6 +2105,23 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } +StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -1874,6 +2208,26 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const queue, event); return static_cast(status); } +StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} // SYMM StatusCode CLBlastSsymm(const Layout 
layout, const Side side, const Triangle triangle, @@ -1956,6 +2310,26 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } +StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} // HEMM StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, @@ -2072,6 +2446,24 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } +StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} // HERK StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, @@ -2192,6 +2584,26 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra queue, event); return static_cast(status); } +StatusCode CLBlastHsyr2k(const Layout layout, 
const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_half beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + queue, event); + return static_cast(status); +} // HER2K StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, @@ -2308,6 +2720,24 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } +StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} // TRSM StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -2382,6 +2812,24 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } +StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t 
m, const size_t n, + const cl_half alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} // ================================================================================================= diff --git a/src/database.cc b/src/database.cc index addd85d3..e20ae340 100644 --- a/src/database.cc +++ b/src/database.cc @@ -29,15 +29,15 @@ namespace clblast { // Initializes the database const std::vector Database::database = { - XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, - XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, - XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, - XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, - XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, - CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, - PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, - TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, - PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble + XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, + XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, + XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, + XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, + XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, + PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, + 
TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, + PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble }; // ================================================================================================= diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index b9e52e17..08c47d87 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -19,11 +19,16 @@ R"( // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this file is used outside of the CLBlast library. #ifndef PRECISION - #define PRECISION 32 // Data-types: single or double precision, complex or regular + #define PRECISION 32 // Data-types: half, single or double precision, complex or regular #endif // ================================================================================================= +// Enable support for half-precision +#if PRECISION == 16 + #pragma OPENCL EXTENSION cl_khr_fp16: enable +#endif + // Enable support for double-precision #if PRECISION == 64 || PRECISION == 6464 #if __OPENCL_VERSION__ <= CL_VERSION_1_1 @@ -31,8 +36,19 @@ R"( #endif #endif +// Half-precision +#if PRECISION == 16 + typedef half real; + typedef half2 real2; + typedef half4 real4; + typedef half8 real8; + typedef half16 real16; + #define ZERO 0 + #define ONE 1 + #define SMALLEST -1.0e14 + // Single-precision -#if PRECISION == 32 +#elif PRECISION == 32 typedef float real; typedef float2 real2; typedef float4 real4; @@ -68,7 +84,7 @@ R"( #define ONE 1.0f #define SMALLEST -1.0e37f -// Complex Double-precision +// Complex double-precision #elif PRECISION == 6464 typedef struct cdouble {double x; double y;} real; typedef struct cdouble2 {real x; real y;} real2; diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index 574beb43..e0efadc1 100644 --- a/src/kernels/level1/xaxpy.opencl +++
b/src/kernels/level1/xaxpy.opencl @@ -23,9 +23,10 @@ R"( // Full version of the kernel with offsets and strided accesses __attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xaxpy(const int n, const real alpha, +__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { + const real alpha = arg_alpha[0]; // Loops over the work that needs to be done (allows for an arbitrary number of threads) #pragma unroll @@ -40,9 +41,11 @@ __kernel void Xaxpy(const int n, const real alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. __attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XaxpyFast(const int n, const real alpha, +__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { + const real alpha = arg_alpha[0]; + #pragma unroll for (int w=0; w 'a_rotated' is 0 // --> 'do_conjugate' is 0 __attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta, +__kernel void XgemvFast(const int m, const int n, + const __constant real* restrict arg_alpha, + const __constant real* restrict arg_beta, const int a_rotated, const __global realVF* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl, const int ku) { + const real alpha = arg_alpha[0]; + const real beta = arg_beta[0]; + // Local memory for the vector X __local real xlm[WGS2]; @@ -192,13 +197,18 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b // --> 'a_rotated' is 1 // --> 'do_conjugate' is 0 
__attribute__((reqd_work_group_size(WGS3, 1, 1))) -__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta, +__kernel void XgemvFastRot(const int m, const int n, + const __constant real* restrict arg_alpha, + const __constant real* restrict arg_beta, const int a_rotated, const __global realVFR* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl, const int ku) { + const real alpha = arg_alpha[0]; + const real beta = arg_beta[0]; + // Local memory for the vector X __local real xlm[WGS3]; diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl index d377fbb0..63817afb 100644 --- a/src/kernels/level2/xger.opencl +++ b/src/kernels/level2/xger.opencl @@ -19,11 +19,13 @@ R"( // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xger(const int max1, const int max2, const real alpha, +__kernel void Xger(const int max1, const int max2, + const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_rowmajor) { + const real alpha = arg_alpha[0]; // Register storage for X and Y real xvalues[WPT]; diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index edb94ca8..fc635f2e 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -19,10 +19,12 @@ R"( // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xher(const int n, const real alpha, +__kernel void Xher(const int n, + const __constant real* restrict arg_alpha, const __global 
real* restrict xgm, const int x_offset, const int x_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { + const real alpha = arg_alpha[0]; // Register storage for X and XT real xvalues[WPT]; diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl index 4a2edce8..a66f255f 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -19,11 +19,13 @@ R"( // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xher2(const int n, const real alpha, +__kernel void Xher2(const int n, + const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { + const real alpha = arg_alpha[0]; // Register storage for X and Y real xvalues[WPT]; diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index 599e01d5..56ccdb96 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -267,10 +267,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the upper-triangular version. 
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) __kernel void XgemmUpper(const int kSizeN, const int kSizeK, - const real alpha, const real beta, + const __constant real* restrict arg_alpha, + const __constant real* restrict arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { + const real alpha = arg_alpha[0]; + const real beta = arg_beta[0]; // Skip these threads if they do not contain threads contributing to the upper-triangle if (GetGroupID1()*NWG < GetGroupID0()*MWG) { @@ -304,10 +307,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the lower-triangular version. __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) __kernel void XgemmLower(const int kSizeN, const int kSizeK, - const real alpha, const real beta, + const __constant real* restrict arg_alpha, + const __constant real* restrict arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { + const real alpha = arg_alpha[0]; + const real beta = arg_beta[0]; // Skip these threads if they do not contain threads contributing to the lower-triangle if (GetGroupID1()*NWG > GetGroupID0()*MWG) { @@ -345,10 +351,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the regular full version. 
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const real alpha, const real beta, + const __constant real* restrict arg_alpha, + const __constant real* restrict arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { + const real alpha = arg_alpha[0]; + const real beta = arg_beta[0]; // Allocates workgroup-private memory (local memory) #if SA == 1 diff --git a/src/routine.cc b/src/routine.cc index eee4c7cc..11c4281e 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -406,6 +406,7 @@ StatusCode Routine::PadCopyTransposeMatrix(EventPointer event, std::vector; template class Routine; template class Routine; template class Routine; diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc index 682e2b63..335e59bc 100644 --- a/src/routines/level1/xamax.cc +++ b/src/routines/level1/xamax.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xamax::precision_ = Precision::kHalf; template <> const Precision Xamax::precision_ = Precision::kSingle; template <> const Precision Xamax::precision_ = Precision::kDouble; template <> const Precision Xamax::precision_ = Precision::kComplexSingle; @@ -103,6 +104,7 @@ StatusCode Xamax::DoAmax(const size_t n, // ================================================================================================= // Compiles the templated class +template class Xamax; template class Xamax; template class Xamax; template class Xamax; diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc index ea33d7e1..e04f7064 100644 --- a/src/routines/level1/xasum.cc +++ b/src/routines/level1/xasum.cc @@ -20,6 +20,7 @@ namespace clblast { // 
================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xasum::precision_ = Precision::kHalf; template <> const Precision Xasum::precision_ = Precision::kSingle; template <> const Precision Xasum::precision_ = Precision::kDouble; template <> const Precision Xasum::precision_ = Precision::kComplexSingle; @@ -100,6 +101,7 @@ StatusCode Xasum::DoAsum(const size_t n, // ================================================================================================= // Compiles the templated class +template class Xasum; template class Xasum; template class Xasum; template class Xasum; diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index 96809a57..66aa2336 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xaxpy::precision_ = Precision::kHalf; template <> const Precision Xaxpy::precision_ = Precision::kSingle; template <> const Precision Xaxpy::precision_ = Precision::kDouble; template <> const Precision Xaxpy::precision_ = Precision::kComplexSingle; @@ -67,16 +68,20 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); + kernel.SetArgument(1, alpha_buffer()); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, y_buffer()); } else { 
kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); + kernel.SetArgument(1, alpha_buffer()); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); @@ -107,6 +112,7 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, // ================================================================================================= // Compiles the templated class +template class Xaxpy; template class Xaxpy; template class Xaxpy; template class Xaxpy; diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index d34482ce..ff8f5999 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xcopy::precision_ = Precision::kHalf; template <> const Precision Xcopy::precision_ = Precision::kSingle; template <> const Precision Xcopy::precision_ = Precision::kDouble; template <> const Precision Xcopy::precision_ = Precision::kComplexSingle; @@ -105,6 +106,7 @@ StatusCode Xcopy::DoCopy(const size_t n, // ================================================================================================= // Compiles the templated class +template class Xcopy; template class Xcopy; template class Xcopy; template class Xcopy; diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index b2513485..db6a369e 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xdot::precision_ = Precision::kHalf; template <> const Precision Xdot::precision_ = 
Precision::kSingle; template <> const Precision Xdot::precision_ = Precision::kDouble; template <> const Precision Xdot::precision_ = Precision::kComplexSingle; @@ -108,6 +109,7 @@ StatusCode Xdot::DoDot(const size_t n, // ================================================================================================= // Compiles the templated class +template class Xdot; template class Xdot; template class Xdot; template class Xdot; diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index 86166a0c..14f7f6aa 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xnrm2::precision_ = Precision::kHalf; template <> const Precision Xnrm2::precision_ = Precision::kSingle; template <> const Precision Xnrm2::precision_ = Precision::kDouble; template <> const Precision Xnrm2::precision_ = Precision::kComplexSingle; @@ -100,6 +101,7 @@ StatusCode Xnrm2::DoNrm2(const size_t n, // ================================================================================================= // Compiles the templated class +template class Xnrm2; template class Xnrm2; template class Xnrm2; template class Xnrm2; diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index b92e2cdf..1207acfa 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xscal::precision_ = Precision::kHalf; template <> const Precision Xscal::precision_ = Precision::kSingle; template <> const Precision Xscal::precision_ = Precision::kDouble; template 
<> const Precision Xscal::precision_ = Precision::kComplexSingle; @@ -99,6 +100,7 @@ StatusCode Xscal::DoScal(const size_t n, const T alpha, // ================================================================================================= // Compiles the templated class +template class Xscal; template class Xscal; template class Xscal; template class Xscal; diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index bfc4a739..8844abff 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xswap::precision_ = Precision::kHalf; template <> const Precision Xswap::precision_ = Precision::kSingle; template <> const Precision Xswap::precision_ = Precision::kDouble; template <> const Precision Xswap::precision_ = Precision::kComplexSingle; @@ -105,6 +106,7 @@ StatusCode Xswap::DoSwap(const size_t n, // ================================================================================================= // Compiles the templated class +template class Xswap; template class Xswap; template class Xswap; template class Xswap; diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cc index f90e26b2..7a30c34a 100644 --- a/src/routines/level2/xgbmv.cc +++ b/src/routines/level2/xgbmv.cc @@ -58,6 +58,7 @@ StatusCode Xgbmv::DoGbmv(const Layout layout, const Transpose a_transpose, // ================================================================================================= // Compiles the templated class +template class Xgbmv; template class Xgbmv; template class Xgbmv; template class Xgbmv; diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index f8985038..71839e96 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -20,6 +20,7 
@@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xgemv::precision_ = Precision::kHalf; template <> const Precision Xgemv::precision_ = Precision::kSingle; template <> const Precision Xgemv::precision_ = Precision::kDouble; template <> const Precision Xgemv::precision_ = Precision::kComplexSingle; @@ -134,6 +135,12 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, local_size = db_["WGS3"]; } + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &beta); + // Retrieves the Xgemv kernel from the compiled binary try { const auto program = GetProgramFromCache(); @@ -142,8 +149,8 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, // Sets the kernel arguments kernel.SetArgument(0, static_cast(m_real)); kernel.SetArgument(1, static_cast(n_real)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); kernel.SetArgument(4, static_cast(a_rotated)); kernel.SetArgument(5, a_buffer()); kernel.SetArgument(6, static_cast(a_offset)); @@ -173,6 +180,7 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, // ================================================================================================= // Compiles the templated class +template class Xgemv; template class Xgemv; template class Xgemv; template class Xgemv; diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index 686c7e60..d1f98990 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -20,6 +20,7 @@ namespace clblast { // 
================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xger::precision_ = Precision::kHalf; template <> const Precision Xger::precision_ = Precision::kSingle; template <> const Precision Xger::precision_ = Precision::kDouble; template <> const Precision Xger::precision_ = Precision::kComplexSingle; @@ -64,7 +65,11 @@ StatusCode Xger::DoGer(const Layout layout, status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); if (ErrorIn(status)) { return status; } - // Retrieves the Xgemv kernel from the compiled binary + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + + // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xger"); @@ -72,7 +77,7 @@ StatusCode Xger::DoGer(const Layout layout, // Sets the kernel arguments kernel.SetArgument(0, static_cast(a_one)); kernel.SetArgument(1, static_cast(a_two)); - kernel.SetArgument(2, alpha); + kernel.SetArgument(2, alpha_buffer()); kernel.SetArgument(3, x_buffer()); kernel.SetArgument(4, static_cast(x_offset)); kernel.SetArgument(5, static_cast(x_inc)); @@ -100,6 +105,7 @@ StatusCode Xger::DoGer(const Layout layout, // ================================================================================================= // Compiles the templated class +template class Xger; template class Xger; template class Xger; template class Xger; diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index a7116213..73e7a47d 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -19,6 +19,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get 
the memory-type based on a template argument +template <> const Precision Xher::precision_ = Precision::kHalf; template <> const Precision Xher::precision_ = Precision::kSingle; template <> const Precision Xher::precision_ = Precision::kDouble; template <> const Precision Xher::precision_ = Precision::kComplexSingle; @@ -43,6 +44,7 @@ template <> float2 Xher::GetAlpha(const float alpha) { return floa template <> double2 Xher::GetAlpha(const double alpha) { return double2{alpha, 0.0}; } template <> float Xher::GetAlpha(const float alpha) { return alpha; } template <> double Xher::GetAlpha(const double alpha) { return alpha; } +template <> half Xher::GetAlpha(const half alpha) { return alpha; } // ================================================================================================= @@ -63,9 +65,6 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, (triangle == Triangle::kLower && layout == Layout::kRowMajor)); const auto is_rowmajor = (layout == Layout::kRowMajor); - // Creates a matching version of alpha - const auto matching_alpha = GetAlpha(alpha); - // Tests the matrix and the vectors for validity auto status = StatusCode::kSuccess; if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); } @@ -77,14 +76,21 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // If alpha is zero an update is not required if (alpha == U{0}) { return StatusCode::kSuccess; } - // Retrieves the Xgemv kernel from the compiled binary + // Creates a matching version of alpha + const auto matching_alpha = GetAlpha(alpha); + + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &matching_alpha); + + // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xher"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); - 
kernel.SetArgument(1, matching_alpha); + kernel.SetArgument(1, alpha_buffer()); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); @@ -110,6 +116,7 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xher; template class Xher; template class Xher; template class Xher; diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index 3fd1a961..a73dde52 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -19,6 +19,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xher2::precision_ = Precision::kHalf; template <> const Precision Xher2::precision_ = Precision::kSingle; template <> const Precision Xher2::precision_ = Precision::kDouble; template <> const Precision Xher2::precision_ = Precision::kComplexSingle; @@ -66,14 +67,18 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); if (ErrorIn(status)) { return status; } - // Retrieves the Xgemv kernel from the compiled binary + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + + // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xher2"); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); + kernel.SetArgument(1, alpha_buffer()); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, 
static_cast(x_inc)); @@ -102,6 +107,7 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xher2; template class Xher2; template class Xher2; template class Xher2; diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc index bc82c88d..66ba74e8 100644 --- a/src/routines/level2/xsbmv.cc +++ b/src/routines/level2/xsbmv.cc @@ -57,6 +57,7 @@ StatusCode Xsbmv::DoSbmv(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xsbmv; template class Xsbmv; template class Xsbmv; diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc index 6e00dcfa..589a97d4 100644 --- a/src/routines/level2/xspmv.cc +++ b/src/routines/level2/xspmv.cc @@ -57,6 +57,7 @@ StatusCode Xspmv::DoSpmv(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xspmv; template class Xspmv; template class Xspmv; diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc index 55af2f29..c556b920 100644 --- a/src/routines/level2/xspr.cc +++ b/src/routines/level2/xspr.cc @@ -44,6 +44,7 @@ StatusCode Xspr::DoSpr(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xspr; template class Xspr; template class Xspr; diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc index 9a3f97ce..c4ad5dc4 100644 --- a/src/routines/level2/xspr2.cc +++ b/src/routines/level2/xspr2.cc @@ -46,6 +46,7 @@ StatusCode Xspr2::DoSpr2(const Layout layout, const Triangle triangle, // 
================================================================================================= // Compiles the templated class +template class Xspr2; template class Xspr2; template class Xspr2; diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc index a9eb284f..2a404a8a 100644 --- a/src/routines/level2/xsymv.cc +++ b/src/routines/level2/xsymv.cc @@ -57,6 +57,7 @@ StatusCode Xsymv::DoSymv(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xsymv; template class Xsymv; template class Xsymv; diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc index 4b3928e5..892517d7 100644 --- a/src/routines/level2/xsyr.cc +++ b/src/routines/level2/xsyr.cc @@ -43,6 +43,7 @@ StatusCode Xsyr::DoSyr(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xsyr; template class Xsyr; template class Xsyr; diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc index 3ae389e0..e6dfd158 100644 --- a/src/routines/level2/xsyr2.cc +++ b/src/routines/level2/xsyr2.cc @@ -45,6 +45,7 @@ StatusCode Xsyr2::DoSyr2(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xsyr2; template class Xsyr2; template class Xsyr2; diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc index 47371c87..86e28dfb 100644 --- a/src/routines/level2/xtbmv.cc +++ b/src/routines/level2/xtbmv.cc @@ -72,6 +72,7 @@ StatusCode Xtbmv::DoTbmv(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xtbmv; 
template class Xtbmv; template class Xtbmv; template class Xtbmv; diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc index c63cb9b2..72445547 100644 --- a/src/routines/level2/xtpmv.cc +++ b/src/routines/level2/xtpmv.cc @@ -72,6 +72,7 @@ StatusCode Xtpmv::DoTpmv(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xtpmv; template class Xtpmv; template class Xtpmv; template class Xtpmv; diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc index 9111d41d..df6f85a3 100644 --- a/src/routines/level2/xtrmv.cc +++ b/src/routines/level2/xtrmv.cc @@ -72,6 +72,7 @@ StatusCode Xtrmv::DoTrmv(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xtrmv; template class Xtrmv; template class Xtrmv; template class Xtrmv; diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 3699b548..ab36076c 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xgemm::precision_ = Precision::kHalf; template <> const Precision Xgemm::precision_ = Precision::kSingle; template <> const Precision Xgemm::precision_ = Precision::kDouble; template <> const Precision Xgemm::precision_ = Precision::kComplexSingle; @@ -122,6 +123,12 @@ StatusCode Xgemm::DoGemm(const Layout layout, auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = (c_no_temp) ? 
c_buffer : Buffer(context_, m_ceiled*n_ceiled); + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &beta); + // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -169,8 +176,8 @@ StatusCode Xgemm::DoGemm(const Layout layout, kernel.SetArgument(0, static_cast(m_ceiled)); kernel.SetArgument(1, static_cast(n_ceiled)); kernel.SetArgument(2, static_cast(k_ceiled)); - kernel.SetArgument(3, alpha); - kernel.SetArgument(4, beta); + kernel.SetArgument(3, alpha_buffer()); + kernel.SetArgument(4, beta_buffer()); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, b_temp()); kernel.SetArgument(7, c_temp()); @@ -207,6 +214,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // ================================================================================================= // Compiles the templated class +template class Xgemm; template class Xgemm; template class Xgemm; template class Xgemm; diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 2c2c815d..1acba517 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -112,6 +112,13 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co auto b2_temp = (b2_no_temp) ? 
b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto complex_beta = T{beta, static_cast(0.0)}; + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &complex_beta); + // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -171,11 +178,10 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments - auto complex_beta = T{beta, static_cast(0.0)}; kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, complex_beta); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); kernel.SetArgument(4, a1_temp()); kernel.SetArgument(5, b2_temp()); kernel.SetArgument(6, c_temp()); @@ -196,8 +202,10 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; auto complex_one = T{static_cast(1.0), static_cast(0.0)}; - kernel.SetArgument(2, conjugate_alpha); - kernel.SetArgument(3, complex_one); + alpha_buffer.Write(queue_, 1, &conjugate_alpha); + beta_buffer.Write(queue_, 1, &complex_one); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); kernel.SetArgument(4, b1_temp()); kernel.SetArgument(5, a2_temp()); diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 414c4760..ea1aa614 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -103,6 +103,14 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle 
triangle, cons auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto complex_alpha = T{alpha, static_cast(0.0)}; + auto complex_beta = T{beta, static_cast(0.0)}; + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &complex_alpha); + beta_buffer.Write(queue_, 1, &complex_beta); + // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -144,12 +152,10 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments - auto complex_alpha = T{alpha, static_cast(0.0)}; - auto complex_beta = T{beta, static_cast(0.0)}; kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, complex_alpha); - kernel.SetArgument(3, complex_beta); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index a39026f1..d88d4653 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -127,6 +127,7 @@ StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle // ================================================================================================= // Compiles the templated class +template class Xsymm; template class Xsymm; template class Xsymm; template class Xsymm; diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 3206c669..4f86bac5 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -20,6 +20,7 @@ namespace clblast { // 
================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyr2k::precision_ = Precision::kHalf; template <> const Precision Xsyr2k::precision_ = Precision::kSingle; template <> const Precision Xsyr2k::precision_ = Precision::kDouble; template <> const Precision Xsyr2k::precision_ = Precision::kComplexSingle; @@ -104,6 +105,12 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &beta); + // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -147,8 +154,8 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); @@ -168,7 +175,8 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Swaps the arguments for matrices A and B, and sets 'beta' to 1 auto one = static_cast(1); - kernel.SetArgument(3, one); + beta_buffer.Write(queue_, 1, &one); + kernel.SetArgument(3, beta_buffer()); kernel.SetArgument(4, b_temp()); kernel.SetArgument(5, a_temp()); @@ -196,6 +204,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle 
triangle, cons // ================================================================================================= // Compiles the templated class +template class Xsyr2k; template class Xsyr2k; template class Xsyr2k; template class Xsyr2k; diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index 741ad064..52cb58c0 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyrk::precision_ = Precision::kHalf; template <> const Precision Xsyrk::precision_ = Precision::kSingle; template <> const Precision Xsyrk::precision_ = Precision::kDouble; template <> const Precision Xsyrk::precision_ = Precision::kComplexSingle; @@ -97,6 +98,12 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const auto a_temp = (a_no_temp) ? 
a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + auto alpha_buffer = Buffer(context_, 1); + auto beta_buffer = Buffer(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + beta_buffer.Write(queue_, 1, &beta); + // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -131,8 +138,8 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha); - kernel.SetArgument(3, beta); + kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(3, beta_buffer()); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, c_temp()); @@ -169,6 +176,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // ================================================================================================= // Compiles the templated class +template class Xsyrk; template class Xsyrk; template class Xsyrk; template class Xsyrk; diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 9e3b27b4..18cbb1c0 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -130,6 +130,7 @@ StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle // ================================================================================================= // Compiles the templated class +template class Xtrmm; template class Xtrmm; template class Xtrmm; template class Xtrmm; diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc index e2837e60..09cdecf1 100644 --- a/src/tuning/copy.cc +++ b/src/tuning/copy.cc @@ -107,7 +107,7 @@ using double2 = clblast::double2; // Main function (not within the 
clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc index 72729422..075688db 100644 --- a/src/tuning/pad.cc +++ b/src/tuning/pad.cc @@ -85,17 +85,17 @@ class TunePad { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentScalar(0); } // Describes how to compute the performance metrics @@ -115,7 +115,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case 
clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc index 5edd89e0..a970f982 100644 --- a/src/tuning/padtranspose.cc +++ b/src/tuning/padtranspose.cc @@ -119,7 +119,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc index 113e0a81..d217a3df 100644 --- a/src/tuning/transpose.cc +++ b/src/tuning/transpose.cc @@ -112,7 +112,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/tuning/xaxpy.cc b/src/tuning/xaxpy.cc index 31aa6a8e..d27cb73d 100644 --- a/src/tuning/xaxpy.cc +++ b/src/tuning/xaxpy.cc @@ -89,8 +89,9 @@ class TuneXaxpy { std::vector 
&x_vec, std::vector &y_vec, std::vector &, std::vector &, std::vector &, std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(args.alpha); + tuner.AddArgumentInput(alpha_buffer); tuner.AddArgumentInput(x_vec); tuner.AddArgumentOutput(y_vec); } @@ -112,7 +113,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/tuning/xdot.cc b/src/tuning/xdot.cc index cff656c3..5f30296c 100644 --- a/src/tuning/xdot.cc +++ b/src/tuning/xdot.cc @@ -119,7 +119,7 @@ using double2 = clblast::double2; template void StartVariation(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc index 2b4ff456..d309b830 100644 --- a/src/tuning/xgemm.cc +++ b/src/tuning/xgemm.cc @@ -121,11 +121,13 @@ class TuneXgemm { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + auto beta_buffer = std::vector{args.beta}; 
tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.k)); - tuner.AddArgumentScalar(args.alpha); - tuner.AddArgumentScalar(args.beta); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(beta_buffer); tuner.AddArgumentInput(a_mat); tuner.AddArgumentInput(b_mat); tuner.AddArgumentOutput(c_mat); @@ -148,7 +150,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index 43369c3b..6587dcf4 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -96,11 +96,13 @@ class TuneXgemv { std::vector &x_vec, std::vector &y_vec, std::vector &a_mat, std::vector &, std::vector &, std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; + auto beta_buffer = std::vector{args.beta}; auto a_rotated = (V==3) ? 
1 : 0; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(args.alpha); - tuner.AddArgumentScalar(args.beta); + tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentInput(beta_buffer); tuner.AddArgumentScalar(static_cast(a_rotated)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); @@ -135,7 +137,7 @@ using double2 = clblast::double2; template void StartVariation(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/tuning/xger.cc b/src/tuning/xger.cc index 39efdb81..4be80c86 100644 --- a/src/tuning/xger.cc +++ b/src/tuning/xger.cc @@ -85,9 +85,10 @@ class TuneXger { std::vector &x_vec, std::vector &y_vec, std::vector &a_mat, std::vector &, std::vector &, std::vector &) { + auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(args.alpha); + tuner.AddArgumentInput(alpha_buffer); tuner.AddArgumentInput(x_vec); tuner.AddArgumentScalar(0); // x_offset tuner.AddArgumentScalar(1); // x_increment @@ -117,7 +118,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; case clblast::Precision::kDouble: 
clblast::Tuner, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; diff --git a/src/utilities.cc b/src/utilities.cc index 68a4f02a..7649b05c 100644 --- a/src/utilities.cc +++ b/src/utilities.cc @@ -22,6 +22,56 @@ namespace clblast { // ================================================================================================= +// Returns a scalar with a default value +template +T GetScalar() { + return static_cast(2.0); +} +template float GetScalar(); +template double GetScalar(); + +// Specialized version of the above for half-precision +template <> +half GetScalar() { + return FloatToHalf(2.0f); +} + +// Specialized versions of the above for complex data-types +template <> +float2 GetScalar() { + return {2.0f, 0.5f}; +} +template <> +double2 GetScalar() { + return {2.0, 0.5}; +} + +// Returns a scalar of value 1 +template +T ConstantOne() { + return static_cast(1.0); +} +template float ConstantOne(); +template double ConstantOne(); + +// Specialized version of the above for half-precision +template <> +half ConstantOne() { + return FloatToHalf(1.0f); +} + +// Specialized versions of the above for complex data-types +template <> +float2 ConstantOne() { + return {1.0f, 0.0f}; +} +template <> +double2 ConstantOne() { + return {1.0, 0.0}; +} + +// ================================================================================================= + // Implements the string conversion using std::to_string if possible template std::string ToString(T value) { @@ -48,6 +98,12 @@ std::string ToString(double2 value) { return real.str()+"+"+imag.str()+"i"; } +// If not possible directly: special case for half-precision +template <> +std::string ToString(half value) { + return std::to_string(HalfToFloat(value)); +} + // If not possible directly: special cases for CLBlast data-types template <> std::string ToString(Layout value) { @@ -105,6 +161,9 @@ template T ConvertArgument(const char* value) { return 
static_cast(std::stoi(value)); } +template <> half ConvertArgument(const char* value) { + return FloatToHalf(static_cast(std::stod(value))); +} template <> float ConvertArgument(const char* value) { return static_cast(std::stod(value)); } @@ -147,6 +206,7 @@ T GetArgument(const int argc, char *argv[], std::string &help, // Compiles the above function template int GetArgument(const int, char **, std::string&, const std::string&, const int); template size_t GetArgument(const int, char **, std::string&, const std::string&, const size_t); +template half GetArgument(const int, char **, std::string&, const std::string&, const half); template float GetArgument(const int, char **, std::string&, const std::string&, const float); template double GetArgument(const int, char **, std::string&, const std::string&, const double); template float2 GetArgument(const int, char **, std::string&, const std::string&, const float2); @@ -227,24 +287,49 @@ void PopulateVector(std::vector &vector) { for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } } +// Specialized versions of the above for half-precision +template <> +void PopulateVector(std::vector &vector) { + const auto lower_limit = static_cast(kTestDataLowerLimit); + const auto upper_limit = static_cast(kTestDataUpperLimit); + std::mt19937 mt(GetRandomSeed()); + std::uniform_real_distribution dist(lower_limit, upper_limit); + for (auto &element: vector) { element = FloatToHalf(dist(mt)); } +} + // ================================================================================================= -// Returns a scalar with a default value -template -T GetScalar() { - return static_cast(2.0); +// Conversion between half and single-precision +std::vector HalfToFloatBuffer(const std::vector& source) { + auto result = std::vector(source.size()); + for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); } + return result; +} +void FloatToHalfBuffer(std::vector& result, const 
std::vector& source) { + for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); } } -template float GetScalar(); -template double GetScalar(); -// Specialized versions of the above for complex data-types -template <> -float2 GetScalar() { - return {2.0f, 0.5f}; +// As above, but now for OpenCL data-types instead of std::vectors +Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw) { + const auto size = source.GetSize() / sizeof(half); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector(size); + source.Read(queue, size, source_cpu); + auto result_cpu = HalfToFloatBuffer(source_cpu); + auto result = Buffer(context, size); + result.Write(queue, size, result_cpu); + return result; } -template <> -double2 GetScalar() { - return {2.0, 0.5}; +void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw) { + const auto size = source.GetSize() / sizeof(float); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector(size); + source.Read(queue, size, source_cpu); + auto result_cpu = std::vector(size); + FloatToHalfBuffer(result_cpu, source_cpu); + result.Write(queue, size, result_cpu); } // ================================================================================================= @@ -288,6 +373,10 @@ template <> bool PrecisionSupported(const Device &device) { auto extensions = device.Capabilities(); return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true; } +template <> bool PrecisionSupported(const Device &device) { + auto extensions = device.Capabilities(); + return (extensions.find(kKhronosHalfPrecision) == std::string::npos) ? 
false : true; +} // ================================================================================================= } // namespace clblast diff --git a/test/correctness/routines/level1/xamax.cc b/test/correctness/routines/level1/xamax.cc index ade09e7a..648abaa6 100644 --- a/test/correctness/routines/level1/xamax.cc +++ b/test/correctness/routines/level1/xamax.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "iDAMAX"); clblast::RunTests, float2, float2>(argc, argv, true, "iCAMAX"); clblast::RunTests, double2, double2>(argc, argv, true, "iZAMAX"); + clblast::RunTests, half, half>(argc, argv, true, "iHAMAX"); return 0; } diff --git a/test/correctness/routines/level1/xasum.cc b/test/correctness/routines/level1/xasum.cc index 5ec20596..d3b036c7 100644 --- a/test/correctness/routines/level1/xasum.cc +++ b/test/correctness/routines/level1/xasum.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DASUM"); clblast::RunTests, float2, float2>(argc, argv, true, "ScASUM"); clblast::RunTests, double2, double2>(argc, argv, true, "DzASUM"); + clblast::RunTests, half, half>(argc, argv, true, "HASUM"); return 0; } diff --git a/test/correctness/routines/level1/xaxpy.cc b/test/correctness/routines/level1/xaxpy.cc index 746e0001..04f4c128 100644 --- a/test/correctness/routines/level1/xaxpy.cc +++ b/test/correctness/routines/level1/xaxpy.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DAXPY"); clblast::RunTests, float2, float2>(argc, argv, true, "CAXPY"); clblast::RunTests, double2, double2>(argc, argv, true, "ZAXPY"); + clblast::RunTests, half, half>(argc, argv, true, "HAXPY"); return 0; } diff --git a/test/correctness/routines/level1/xcopy.cc b/test/correctness/routines/level1/xcopy.cc index 3e16ffc6..316c6982 100644 --- a/test/correctness/routines/level1/xcopy.cc +++ 
b/test/correctness/routines/level1/xcopy.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DCOPY"); clblast::RunTests, float2, float2>(argc, argv, true, "CCOPY"); clblast::RunTests, double2, double2>(argc, argv, true, "ZCOPY"); + clblast::RunTests, half, half>(argc, argv, true, "HCOPY"); return 0; } diff --git a/test/correctness/routines/level1/xdot.cc b/test/correctness/routines/level1/xdot.cc index 5ea105e0..72dc9d5e 100644 --- a/test/correctness/routines/level1/xdot.cc +++ b/test/correctness/routines/level1/xdot.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SDOT"); clblast::RunTests, double, double>(argc, argv, true, "DDOT"); + clblast::RunTests, half, half>(argc, argv, true, "HDOT"); return 0; } diff --git a/test/correctness/routines/level1/xnrm2.cc b/test/correctness/routines/level1/xnrm2.cc index 97fb0ad6..0fe8dc33 100644 --- a/test/correctness/routines/level1/xnrm2.cc +++ b/test/correctness/routines/level1/xnrm2.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DNRM2"); clblast::RunTests, float2, float2>(argc, argv, true, "ScNRM2"); clblast::RunTests, double2, double2>(argc, argv, true, "DzNRM2"); + clblast::RunTests, half, half>(argc, argv, true, "HNRM2"); return 0; } diff --git a/test/correctness/routines/level1/xscal.cc b/test/correctness/routines/level1/xscal.cc index 4d138fad..9146e5ce 100644 --- a/test/correctness/routines/level1/xscal.cc +++ b/test/correctness/routines/level1/xscal.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DSCAL"); clblast::RunTests, float2, float2>(argc, argv, true, "CSCAL"); clblast::RunTests, double2, double2>(argc, argv, true, "ZSCAL"); + clblast::RunTests, half, half>(argc, argv, true, "HSCAL"); return 0; } diff --git 
a/test/correctness/routines/level1/xswap.cc b/test/correctness/routines/level1/xswap.cc index 38f110f7..636a5b0f 100644 --- a/test/correctness/routines/level1/xswap.cc +++ b/test/correctness/routines/level1/xswap.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DSWAP"); clblast::RunTests, float2, float2>(argc, argv, true, "CSWAP"); clblast::RunTests, double2, double2>(argc, argv, true, "ZSWAP"); + clblast::RunTests, half, half>(argc, argv, true, "HSWAP"); return 0; } diff --git a/test/correctness/routines/level2/xgbmv.cc b/test/correctness/routines/level2/xgbmv.cc index b28c5978..528a3325 100644 --- a/test/correctness/routines/level2/xgbmv.cc +++ b/test/correctness/routines/level2/xgbmv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DGBMV"); clblast::RunTests, float2, float2>(argc, argv, true, "CGBMV"); clblast::RunTests, double2, double2>(argc, argv, true, "ZGBMV"); + clblast::RunTests, half, half>(argc, argv, true, "HGBMV"); return 0; } diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc index 14eb74d1..fc1cf3eb 100644 --- a/test/correctness/routines/level2/xgemv.cc +++ b/test/correctness/routines/level2/xgemv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DGEMV"); clblast::RunTests, float2, float2>(argc, argv, true, "CGEMV"); clblast::RunTests, double2, double2>(argc, argv, true, "ZGEMV"); + clblast::RunTests, half, half>(argc, argv, true, "HGEMV"); return 0; } diff --git a/test/correctness/routines/level2/xger.cc b/test/correctness/routines/level2/xger.cc index c37a5c41..c3c33ae6 100644 --- a/test/correctness/routines/level2/xger.cc +++ b/test/correctness/routines/level2/xger.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SGER"); 
clblast::RunTests, double, double>(argc, argv, true, "DGER"); + clblast::RunTests, half, half>(argc, argv, true, "HGER"); return 0; } diff --git a/test/correctness/routines/level2/xsbmv.cc b/test/correctness/routines/level2/xsbmv.cc index 212e2c3a..c2effcc2 100644 --- a/test/correctness/routines/level2/xsbmv.cc +++ b/test/correctness/routines/level2/xsbmv.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SSBMV"); clblast::RunTests, double, double>(argc, argv, true, "DSBMV"); + clblast::RunTests, half, half>(argc, argv, true, "HSBMV"); return 0; } diff --git a/test/correctness/routines/level2/xspmv.cc b/test/correctness/routines/level2/xspmv.cc index dc833024..4142636d 100644 --- a/test/correctness/routines/level2/xspmv.cc +++ b/test/correctness/routines/level2/xspmv.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SSPMV"); clblast::RunTests, double, double>(argc, argv, true, "DSPMV"); + clblast::RunTests, half, half>(argc, argv, true, "HSPMV"); return 0; } diff --git a/test/correctness/routines/level2/xspr.cc b/test/correctness/routines/level2/xspr.cc index a0104dd4..c068b448 100644 --- a/test/correctness/routines/level2/xspr.cc +++ b/test/correctness/routines/level2/xspr.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SSPR"); clblast::RunTests, double, double>(argc, argv, true, "DSPR"); + clblast::RunTests, half, half>(argc, argv, true, "HSPR"); return 0; } diff --git a/test/correctness/routines/level2/xspr2.cc b/test/correctness/routines/level2/xspr2.cc index 5fe5827f..904870d5 100644 --- a/test/correctness/routines/level2/xspr2.cc +++ b/test/correctness/routines/level2/xspr2.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, 
float, float>(argc, argv, false, "SSPR2"); clblast::RunTests, double, double>(argc, argv, true, "DSPR2"); + clblast::RunTests, half, half>(argc, argv, true, "HSPR2"); return 0; } diff --git a/test/correctness/routines/level2/xsymv.cc b/test/correctness/routines/level2/xsymv.cc index 6224739f..eb9b6eb7 100644 --- a/test/correctness/routines/level2/xsymv.cc +++ b/test/correctness/routines/level2/xsymv.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SSYMV"); clblast::RunTests, double, double>(argc, argv, true, "DSYMV"); + clblast::RunTests, half, half>(argc, argv, true, "HSYMV"); return 0; } diff --git a/test/correctness/routines/level2/xsyr.cc b/test/correctness/routines/level2/xsyr.cc index a47b918f..eccf95e0 100644 --- a/test/correctness/routines/level2/xsyr.cc +++ b/test/correctness/routines/level2/xsyr.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SSYR"); clblast::RunTests, double, double>(argc, argv, true, "DSYR"); + clblast::RunTests, half, half>(argc, argv, true, "HSYR"); return 0; } diff --git a/test/correctness/routines/level2/xsyr2.cc b/test/correctness/routines/level2/xsyr2.cc index 1743632c..46c939d2 100644 --- a/test/correctness/routines/level2/xsyr2.cc +++ b/test/correctness/routines/level2/xsyr2.cc @@ -20,6 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SSYR2"); clblast::RunTests, double, double>(argc, argv, true, "DSYR2"); + clblast::RunTests, half, half>(argc, argv, true, "HSYR2"); return 0; } diff --git a/test/correctness/routines/level2/xtbmv.cc b/test/correctness/routines/level2/xtbmv.cc index d3bbbade..252abdc4 100644 --- a/test/correctness/routines/level2/xtbmv.cc +++ b/test/correctness/routines/level2/xtbmv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { 
clblast::RunTests, double, double>(argc, argv, true, "DTBMV"); clblast::RunTests, float2, float2>(argc, argv, true, "CTBMV"); clblast::RunTests, double2, double2>(argc, argv, true, "ZTBMV"); + clblast::RunTests, half, half>(argc, argv, true, "HTBMV"); return 0; } diff --git a/test/correctness/routines/level2/xtpmv.cc b/test/correctness/routines/level2/xtpmv.cc index 95489a65..b8776faa 100644 --- a/test/correctness/routines/level2/xtpmv.cc +++ b/test/correctness/routines/level2/xtpmv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DTPMV"); clblast::RunTests, float2, float2>(argc, argv, true, "CTPMV"); clblast::RunTests, double2, double2>(argc, argv, true, "ZTPMV"); + clblast::RunTests, half, half>(argc, argv, true, "HTPMV"); return 0; } diff --git a/test/correctness/routines/level2/xtrmv.cc b/test/correctness/routines/level2/xtrmv.cc index ca50af88..256fe900 100644 --- a/test/correctness/routines/level2/xtrmv.cc +++ b/test/correctness/routines/level2/xtrmv.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DTRMV"); clblast::RunTests, float2, float2>(argc, argv, true, "CTRMV"); clblast::RunTests, double2, double2>(argc, argv, true, "ZTRMV"); + clblast::RunTests, half, half>(argc, argv, true, "HTRMV"); return 0; } diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc index 632724ed..f8c8a891 100644 --- a/test/correctness/routines/level3/xgemm.cc +++ b/test/correctness/routines/level3/xgemm.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DGEMM"); clblast::RunTests, float2, float2>(argc, argv, true, "CGEMM"); clblast::RunTests, double2, double2>(argc, argv, true, "ZGEMM"); + clblast::RunTests, half, half>(argc, argv, true, "HGEMM"); return 0; } diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc 
index 046fca16..c29f03dd 100644 --- a/test/correctness/routines/level3/xsymm.cc +++ b/test/correctness/routines/level3/xsymm.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DSYMM"); clblast::RunTests, float2, float2>(argc, argv, true, "CSYMM"); clblast::RunTests, double2, double2>(argc, argv, true, "ZSYMM"); + clblast::RunTests, half, half>(argc, argv, true, "HSYMM"); return 0; } diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc index db2b83d9..9f9c87d8 100644 --- a/test/correctness/routines/level3/xsyr2k.cc +++ b/test/correctness/routines/level3/xsyr2k.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DSYR2K"); clblast::RunTests, float2, float2>(argc, argv, true, "CSYR2K"); clblast::RunTests, double2, double2>(argc, argv, true, "ZSYR2K"); + clblast::RunTests, half, half>(argc, argv, true, "HSYR2K"); return 0; } diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc index 3dad3535..12343074 100644 --- a/test/correctness/routines/level3/xsyrk.cc +++ b/test/correctness/routines/level3/xsyrk.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DSYRK"); clblast::RunTests, float2, float2>(argc, argv, true, "CSYRK"); clblast::RunTests, double2, double2>(argc, argv, true, "ZSYRK"); + clblast::RunTests, half, half>(argc, argv, true, "HSYRK"); return 0; } diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc index 2d843e3e..aca73f0d 100644 --- a/test/correctness/routines/level3/xtrmm.cc +++ b/test/correctness/routines/level3/xtrmm.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DTRMM"); clblast::RunTests, float2, float2>(argc, argv, true, "CTRMM"); clblast::RunTests, double2, double2>(argc, argv, 
true, "ZTRMM"); + clblast::RunTests, half, half>(argc, argv, true, "HTRMM"); return 0; } diff --git a/test/correctness/routines/level3/xtrsm.cc b/test/correctness/routines/level3/xtrsm.cc index b5f5045e..b050269a 100644 --- a/test/correctness/routines/level3/xtrsm.cc +++ b/test/correctness/routines/level3/xtrsm.cc @@ -22,6 +22,7 @@ int main(int argc, char *argv[]) { clblast::RunTests, double, double>(argc, argv, true, "DTRSM"); clblast::RunTests, float2, float2>(argc, argv, true, "CTRSM"); clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSM"); + clblast::RunTests, half, half>(argc, argv, true, "HTRSM"); return 0; } diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index e70c0361..50871402 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // The transpose-options to test with (data-type dependent) +template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes}; template <> const std::vector TestBlas::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate}; @@ -147,10 +148,8 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st if (verbose_) { if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); } else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); } - std::cout << result1[index]; - fprintf(stdout, " (reference) versus "); - std::cout << result2[index]; - fprintf(stdout, " (CLBlast)"); + fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str()); + fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str()); } } } @@ -171,6 +170,7 @@ template void TestBlas::TestInvalid(std::vector> 
&test_vector, const std::string &name) { if (!PrecisionSupported(device_)) { return; } if (!compare_clblas_) { return; } + if (std::is_same::value) { return; } TestStart("invalid buffer sizes", name); // Iterates over all the to-be-tested combinations of arguments @@ -222,6 +222,7 @@ void TestBlas::TestInvalid(std::vector> &test_vector, const st // ================================================================================================= // Compiles the templated class +template class TestBlas; template class TestBlas; template class TestBlas; template class TestBlas; diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 85ae7091..5b603585 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -351,11 +351,11 @@ bool TestSimilarity(const T val1, const T val2) { } } -// Compiles the default case for non-complex data-types +// Compiles the default case for standard data-types template bool TestSimilarity(const float, const float); template bool TestSimilarity(const double, const double); -// Specialisations for complex data-types +// Specialisations for non-standard data-types template <> bool TestSimilarity(const float2 val1, const float2 val2) { auto real = TestSimilarity(val1.real(), val2.real()); @@ -368,6 +368,10 @@ bool TestSimilarity(const double2 val1, const double2 val2) { auto imag = TestSimilarity(val1.imag(), val2.imag()); return (real && imag); } +template <> +bool TestSimilarity(const half val1, const half val2) { + return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2)); +} // ================================================================================================= @@ -389,10 +393,15 @@ template <> const std::vector GetExampleScalars(const bool full_test) { if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; } else { return {{2.42, 3.14}}; } } +template <> const std::vector GetExampleScalars(const bool full_test) { + if (full_test) { return {FloatToHalf(0.0f), FloatToHalf(1.0f), 
FloatToHalf(3.14f)}; } + else { return {FloatToHalf(3.14f)}; } +} // ================================================================================================= // Compiles the templated class +template class Tester; template class Tester; template class Tester; template class Tester; diff --git a/test/performance/client.cc b/test/performance/client.cc index 9aaf1e4e..5a7226df 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -116,6 +116,17 @@ Arguments Client::ParseArguments(int argc, char *argv[], const GetMetric // which is thus always displayed (unless silence is specified). if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); } + // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision + if (args.precision == Precision::kHalf) { + if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (!args.silent) { + fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for half-precision\n\n"); + } + } + args.compare_clblas = 0; + args.compare_cblas = 0; + } + // Returns the arguments return args; } @@ -339,6 +350,7 @@ void Client::PrintTableRow(const Arguments& args, // ================================================================================================= // Compiles the templated class +template class Client; template class Client; template class Client; template class Client; diff --git a/test/performance/routines/level1/xamax.cc b/test/performance/routines/level1/xamax.cc index 85caa483..4af1f1c0 100644 --- a/test/performance/routines/level1/xamax.cc +++ b/test/performance/routines/level1/xamax.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, 
argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xasum.cc b/test/performance/routines/level1/xasum.cc index 2680966e..8e098890 100644 --- a/test/performance/routines/level1/xasum.cc +++ b/test/performance/routines/level1/xasum.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cc index b423bc3a..b48c290d 100644 --- a/test/performance/routines/level1/xaxpy.cc +++ b/test/performance/routines/level1/xaxpy.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xcopy.cc b/test/performance/routines/level1/xcopy.cc index c04c6c1c..b7c60f0f 100644 --- a/test/performance/routines/level1/xcopy.cc +++ b/test/performance/routines/level1/xcopy.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { 
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xdot.cc b/test/performance/routines/level1/xdot.cc index f4616464..3edf2590 100644 --- a/test/performance/routines/level1/xdot.cc +++ b/test/performance/routines/level1/xdot.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xnrm2.cc b/test/performance/routines/level1/xnrm2.cc index db6ec9ad..f167df95 100644 --- a/test/performance/routines/level1/xnrm2.cc +++ b/test/performance/routines/level1/xnrm2.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xscal.cc b/test/performance/routines/level1/xscal.cc index bd38f43e..35e21ba8 100644 --- 
a/test/performance/routines/level1/xscal.cc +++ b/test/performance/routines/level1/xscal.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xswap.cc b/test/performance/routines/level1/xswap.cc index 112641d3..4791d4c3 100644 --- a/test/performance/routines/level1/xswap.cc +++ b/test/performance/routines/level1/xswap.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xgbmv.cc b/test/performance/routines/level2/xgbmv.cc index b050184d..be4056de 100644 --- a/test/performance/routines/level2/xgbmv.cc +++ b/test/performance/routines/level2/xgbmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: 
clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cc index 51ab9a10..50e6225a 100644 --- a/test/performance/routines/level2/xgemv.cc +++ b/test/performance/routines/level2/xgemv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xger.cc b/test/performance/routines/level2/xger.cc index 2d956346..b1b5a268 100644 --- a/test/performance/routines/level2/xger.cc +++ b/test/performance/routines/level2/xger.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsbmv.cc b/test/performance/routines/level2/xsbmv.cc index eabab3b7..5fb6e8c0 100644 --- a/test/performance/routines/level2/xsbmv.cc +++ b/test/performance/routines/level2/xsbmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) 
{ - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspmv.cc b/test/performance/routines/level2/xspmv.cc index 2a9ef925..e0ee2075 100644 --- a/test/performance/routines/level2/xspmv.cc +++ b/test/performance/routines/level2/xspmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspr.cc b/test/performance/routines/level2/xspr.cc index 84331d74..19651679 100644 --- a/test/performance/routines/level2/xspr.cc +++ b/test/performance/routines/level2/xspr.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xspr2.cc b/test/performance/routines/level2/xspr2.cc index c42009a1..8745c004 100644 --- a/test/performance/routines/level2/xspr2.cc +++ 
b/test/performance/routines/level2/xspr2.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsymv.cc b/test/performance/routines/level2/xsymv.cc index 3f72fe77..42de1ed5 100644 --- a/test/performance/routines/level2/xsymv.cc +++ b/test/performance/routines/level2/xsymv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsyr.cc b/test/performance/routines/level2/xsyr.cc index 6b31d3a9..310bfb5e 100644 --- a/test/performance/routines/level2/xsyr.cc +++ b/test/performance/routines/level2/xsyr.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; 
case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xsyr2.cc b/test/performance/routines/level2/xsyr2.cc index 0ad59d2d..bbeed3db 100644 --- a/test/performance/routines/level2/xsyr2.cc +++ b/test/performance/routines/level2/xsyr2.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xtbmv.cc b/test/performance/routines/level2/xtbmv.cc index a3297f34..24eec61f 100644 --- a/test/performance/routines/level2/xtbmv.cc +++ b/test/performance/routines/level2/xtbmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xtpmv.cc b/test/performance/routines/level2/xtpmv.cc index 72477f2d..2f2487f8 100644 --- a/test/performance/routines/level2/xtpmv.cc +++ b/test/performance/routines/level2/xtpmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw 
std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xtrmv.cc b/test/performance/routines/level2/xtrmv.cc index 894a7952..3f23afd1 100644 --- a/test/performance/routines/level2/xtrmv.cc +++ b/test/performance/routines/level2/xtrmv.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cc index 91897ee1..8e48dc3a 100644 --- a/test/performance/routines/level3/xgemm.cc +++ b/test/performance/routines/level3/xgemm.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cc index e0feadd1..7eac5537 100644 --- a/test/performance/routines/level3/xsymm.cc +++ b/test/performance/routines/level3/xsymm.cc @@ -19,7 +19,8 @@ using double2 = 
clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cc index 4a82ddc4..49d00f34 100644 --- a/test/performance/routines/level3/xsyr2k.cc +++ b/test/performance/routines/level3/xsyr2k.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cc index 70f61322..ad0a06b4 100644 --- a/test/performance/routines/level3/xsyrk.cc +++ b/test/performance/routines/level3/xsyrk.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git 
a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cc index 6f6041e4..92526844 100644 --- a/test/performance/routines/level3/xtrmm.cc +++ b/test/performance/routines/level3/xtrmm.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xtrsm.cc b/test/performance/routines/level3/xtrsm.cc index 76ef255a..08e4b4a9 100644 --- a/test/performance/routines/level3/xtrsm.cc +++ b/test/performance/routines/level3/xtrsm.cc @@ -19,7 +19,8 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/routines/level1/xamax.h b/test/routines/level1/xamax.h index 7b404dc3..12b031bc 100644 --- a/test/routines/level1/xamax.h +++ b/test/routines/level1/xamax.h @@ -86,8 +86,8 @@ class TestXamax { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXamax(args.n, - buffers.scalar(), args.imax_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.imax_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); 
return static_cast(status); diff --git a/test/routines/level1/xasum.h b/test/routines/level1/xasum.h index 6eae3c83..eb83817b 100644 --- a/test/routines/level1/xasum.h +++ b/test/routines/level1/xasum.h @@ -86,8 +86,8 @@ class TestXasum { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXasum(args.n, - buffers.scalar(), args.asum_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.asum_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h index 8f72f570..c241da91 100644 --- a/test/routines/level1/xaxpy.h +++ b/test/routines/level1/xaxpy.h @@ -87,8 +87,8 @@ class TestXaxpy { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXaxpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h index 0527ca6a..a1ff06ce 100644 --- a/test/routines/level1/xcopy.h +++ b/test/routines/level1/xcopy.h @@ -86,8 +86,8 @@ class TestXcopy { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXcopy(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h index d1c34c0f..0bbc93d5 100644 --- a/test/routines/level1/xdot.h +++ b/test/routines/level1/xdot.h @@ -91,9 +91,9 @@ class TestXdot { auto queue_plain = queue(); auto event = cl_event{}; 
auto status = clblasXdot(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h index a2742cb0..e1cc1854 100644 --- a/test/routines/level1/xdotc.h +++ b/test/routines/level1/xdotc.h @@ -91,9 +91,9 @@ class TestXdotc { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotc(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h index 06ce979e..558257cc 100644 --- a/test/routines/level1/xdotu.h +++ b/test/routines/level1/xdotu.h @@ -91,9 +91,9 @@ class TestXdotu { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotu(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h index d8a0de4e..19074ca2 100644 --- a/test/routines/level1/xnrm2.h +++ b/test/routines/level1/xnrm2.h @@ -86,8 +86,8 @@ class TestXnrm2 { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXnrm2(args.n, - buffers.scalar(), 
args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.nrm2_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h index 35855dbd..84d14ac7 100644 --- a/test/routines/level1/xscal.h +++ b/test/routines/level1/xscal.h @@ -82,7 +82,7 @@ class TestXscal { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXscal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h index ae69d3be..e870b602 100644 --- a/test/routines/level1/xswap.h +++ b/test/routines/level1/xswap.h @@ -86,8 +86,8 @@ class TestXswap { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXswap(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h index c88cdf2a..c777ff73 100644 --- a/test/routines/level2/xgbmv.h +++ b/test/routines/level2/xgbmv.h @@ -102,9 +102,9 @@ class TestXgbmv { auto status = clblasXgbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); 
clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h index cf63d55f..f8a7e1d0 100644 --- a/test/routines/level2/xgemv.h +++ b/test/routines/level2/xgemv.h @@ -102,9 +102,9 @@ class TestXgemv { auto status = clblasXgemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h index ae142e2e..e0d1fe49 100644 --- a/test/routines/level2/xger.h +++ b/test/routines/level2/xger.h @@ -97,9 +97,9 @@ class TestXger { auto event = cl_event{}; auto status = clblasXger(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h index b236aef6..7449146b 100644 --- a/test/routines/level2/xgerc.h +++ b/test/routines/level2/xgerc.h @@ -97,9 +97,9 @@ class TestXgerc { auto event = cl_event{}; auto status = clblasXgerc(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + 
buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h index 3d3fa439..07837657 100644 --- a/test/routines/level2/xgeru.h +++ b/test/routines/level2/xgeru.h @@ -97,9 +97,9 @@ class TestXgeru { auto event = cl_event{}; auto status = clblasXgeru(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h index 4098639a..73194975 100644 --- a/test/routines/level2/xhbmv.h +++ b/test/routines/level2/xhbmv.h @@ -96,9 +96,9 @@ class TestXhbmv { auto status = clblasXhbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h index 5652872d..aabbf14a 100644 --- a/test/routines/level2/xhemv.h +++ b/test/routines/level2/xhemv.h @@ -96,9 +96,9 @@ class TestXhemv { auto status = clblasXhemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + 
buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h index 3bbf0887..1294832c 100644 --- a/test/routines/level2/xher.h +++ b/test/routines/level2/xher.h @@ -91,8 +91,8 @@ class TestXher { auto status = clblasXher(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h index dc7fbe73..5e90174d 100644 --- a/test/routines/level2/xher2.h +++ b/test/routines/level2/xher2.h @@ -96,9 +96,9 @@ class TestXher2 { auto status = clblasXher2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h index df5a90ee..8face6b6 100644 --- a/test/routines/level2/xhpmv.h +++ b/test/routines/level2/xhpmv.h @@ -96,9 +96,9 @@ class TestXhpmv { auto status = clblasXhpmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, + 
buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h index 0db11db0..63cab31f 100644 --- a/test/routines/level2/xhpr.h +++ b/test/routines/level2/xhpr.h @@ -91,8 +91,8 @@ class TestXhpr { auto status = clblasXhpr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h index e1e5b4c5..64d205a0 100644 --- a/test/routines/level2/xhpr2.h +++ b/test/routines/level2/xhpr2.h @@ -96,9 +96,9 @@ class TestXhpr2 { auto status = clblasXhpr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h index fce88f4c..3f1446c8 100644 --- a/test/routines/level2/xsbmv.h +++ b/test/routines/level2/xsbmv.h @@ -96,9 +96,9 @@ class TestXsbmv { auto status = clblasXsbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + 
buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h index 2fdba77a..2add3cdd 100644 --- a/test/routines/level2/xspmv.h +++ b/test/routines/level2/xspmv.h @@ -96,9 +96,9 @@ class TestXspmv { auto status = clblasXspmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h index dcacc5de..ad21bdf6 100644 --- a/test/routines/level2/xspr.h +++ b/test/routines/level2/xspr.h @@ -91,8 +91,8 @@ class TestXspr { auto status = clblasXspr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h index 69fda2fb..c55e8181 100644 --- a/test/routines/level2/xspr2.h +++ b/test/routines/level2/xspr2.h @@ -96,9 +96,9 @@ class TestXspr2 { auto status = clblasXspr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); 
clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h index 16f94d6f..b6583a24 100644 --- a/test/routines/level2/xsymv.h +++ b/test/routines/level2/xsymv.h @@ -96,9 +96,9 @@ class TestXsymv { auto status = clblasXsymv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h index a66dd271..f3929588 100644 --- a/test/routines/level2/xsyr.h +++ b/test/routines/level2/xsyr.h @@ -91,8 +91,8 @@ class TestXsyr { auto status = clblasXsyr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h index a36815e5..8cdb6a14 100644 --- a/test/routines/level2/xsyr2.h +++ b/test/routines/level2/xsyr2.h @@ -96,9 +96,9 @@ class TestXsyr2 { auto status = clblasXsyr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); 
return static_cast(status); diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h index 1425b60b..9c4131ec 100644 --- a/test/routines/level2/xtbmv.h +++ b/test/routines/level2/xtbmv.h @@ -92,8 +92,8 @@ class TestXtbmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h index a834b437..58249227 100644 --- a/test/routines/level2/xtpmv.h +++ b/test/routines/level2/xtpmv.h @@ -92,8 +92,8 @@ class TestXtpmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h index cd502d5d..635a1319 100644 --- a/test/routines/level2/xtrmv.h +++ b/test/routines/level2/xtrmv.h @@ -92,8 +92,8 @@ class TestXtrmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index cd5c2acd..842dae93 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -105,9 +105,9 @@ class TestXgemm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.b_transpose), args.m, args.n, args.k, 
args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h index edc71024..106b99ff 100644 --- a/test/routines/level3/xhemm.h +++ b/test/routines/level3/xhemm.h @@ -105,9 +105,9 @@ class TestXhemm { convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h index a78e1293..e2f4448f 100644 --- a/test/routines/level3/xher2k.h +++ b/test/routines/level3/xher2k.h @@ -105,9 +105,9 @@ class TestXher2k { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h index 245293d6..43d7cfcd 100644 --- a/test/routines/level3/xherk.h +++ b/test/routines/level3/xherk.h @@ -95,8 +95,8 @@ class TestXherk { 
convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h index e638b735..c32b4cf7 100644 --- a/test/routines/level3/xsymm.h +++ b/test/routines/level3/xsymm.h @@ -105,9 +105,9 @@ class TestXsymm { convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h index abac20f4..57c3c203 100644 --- a/test/routines/level3/xsyr2k.h +++ b/test/routines/level3/xsyr2k.h @@ -103,9 +103,9 @@ class TestXsyr2k { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h index 8a5fcb5f..6c3a3786 100644 --- a/test/routines/level3/xsyrk.h +++ b/test/routines/level3/xsyrk.h @@ -95,8 +95,8 @@ class TestXsyrk { 
convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h index 7c9c21bc..3eb63030 100644 --- a/test/routines/level3/xtrmm.h +++ b/test/routines/level3/xtrmm.h @@ -97,8 +97,8 @@ class TestXtrmm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index 529acfbf..bf59aa94 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -161,6 +161,17 @@ void cblasXswap(const size_t n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXswap(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL void cblasXscal(const size_t n, @@ -193,6 +204,15 @@ void cblasXscal(const size_t n, alpha_array.data(), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } +void cblasXscal(const size_t n, + const 
half alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + cblasXscal(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); +} // Forwards the Netlib BLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY void cblasXcopy(const size_t n, @@ -223,6 +243,16 @@ void cblasXcopy(const size_t n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXcopy(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY void cblasXaxpy(const size_t n, @@ -263,6 +293,18 @@ void cblasXaxpy(const size_t n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } +void cblasXaxpy(const size_t n, + const half alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXaxpy(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SDOT/DDOT void cblasXdot(const size_t n, @@ -281,6 +323,19 @@ void cblasXdot(const size_t n, &x_buffer[x_offset], static_cast(x_inc), &y_buffer[y_offset], static_cast(y_inc)); } +void cblasXdot(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t 
x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + auto dot_buffer_bis = HalfToFloatBuffer(dot_buffer); + cblasXdot(n, + dot_buffer_bis, dot_offset, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(dot_buffer, dot_buffer_bis); +} // Forwards the Netlib BLAS calls for CDOTU/ZDOTU void cblasXdotu(const size_t n, @@ -347,6 +402,16 @@ void cblasXnrm2(const size_t n, nrm2_buffer[nrm2_offset].real(cblas_dznrm2(n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto nrm2_buffer_bis = HalfToFloatBuffer(nrm2_buffer); + cblasXnrm2(n, + nrm2_buffer_bis, nrm2_offset, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(nrm2_buffer, nrm2_buffer_bis); +} // Forwards the Netlib BLAS calls for SASUM/DASUM/ScASUM/DzASUM void cblasXasum(const size_t n, @@ -373,8 +438,18 @@ void cblasXasum(const size_t n, asum_buffer[asum_offset].real(cblas_dzasum(n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto asum_buffer_bis = HalfToFloatBuffer(asum_buffer); + cblasXasum(n, + asum_buffer_bis, asum_offset, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(asum_buffer, asum_buffer_bis); +} -// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, 
const size_t x_offset, const size_t x_inc) { @@ -399,6 +474,16 @@ void cblasXamax(const size_t n, ((int*)&imax_buffer[0])[imax_offset] = cblas_izamax(n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } +void cblasXamax(const size_t n, + std::vector& imax_buffer, const size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer); + cblasXamax(n, + imax_buffer_bis, imax_offset, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(imax_buffer, imax_buffer_bis); +} // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -469,6 +554,25 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, beta_array.data(), reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); } +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXgemv(layout, a_transpose, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, @@ -535,6 +639,25 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, beta_array.data(), reinterpret_cast(&y_buffer[y_offset]), 
static_cast(y_inc)); } +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXgbmv(layout, a_transpose, + m, n, kl, ku, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for CHEMV/ZHEMV void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -675,6 +798,25 @@ void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, beta, &y_buffer[y_offset], static_cast(y_inc)); } +void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXsymv(layout, triangle, + n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SSBMV/DSBMV void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -707,6 +849,25 @@ void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO 
triangle, beta, &y_buffer[y_offset], static_cast(y_inc)); } +void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXsbmv(layout, triangle, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for SSPMV/DSPMV void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -739,6 +900,25 @@ void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, beta, &y_buffer[y_offset], static_cast(y_inc)); } +void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + cblasXspmv(layout, triangle, + n, + HalfToFloat(alpha), + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc); + FloatToHalfBuffer(y_buffer, y_buffer_bis); +} // Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -777,6 +957,18 @@ void 
cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + cblasXtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); +} // Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -815,6 +1007,18 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + cblasXtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); +} // Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -853,6 +1057,18 @@ void cblasXtpmv(const CBLAS_ORDER layout, 
const CBLAS_UPLO triangle, const CBLAS reinterpret_cast(&ap_buffer[ap_offset]), reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + cblasXtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc); + FloatToHalfBuffer(x_buffer, x_buffer_bis); +} // Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -995,6 +1211,23 @@ void cblasXger(const CBLAS_ORDER layout, &y_buffer[y_offset], static_cast(y_inc), &a_buffer[a_offset], a_ld); } +void cblasXger(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const half alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + cblasXger(layout, + m, n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld); + FloatToHalfBuffer(a_buffer, a_buffer_bis); +} // Forwards the Netlib BLAS calls for CGERU/ZGERU void cblasXgeru(const CBLAS_ORDER layout, @@ -1187,6 +1420,20 @@ void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, &x_buffer[x_offset], static_cast(x_inc), &a_buffer[a_offset], a_ld); } +void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO 
triangle, + const size_t n, + const half alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + cblasXsyr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + a_buffer_bis, a_offset, a_ld); + FloatToHalfBuffer(a_buffer, a_buffer_bis); +} // Forwards the Netlib BLAS calls for SSPR/DSPR void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -1211,6 +1458,20 @@ void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, &x_buffer[x_offset], static_cast(x_inc), &ap_buffer[ap_offset]); } +void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer); + cblasXspr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + ap_buffer_bis, ap_offset); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis); +} // Forwards the Netlib BLAS calls for SSYR2/DSYR2 void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -1239,6 +1500,23 @@ void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, &y_buffer[y_offset], static_cast(y_inc), &a_buffer[a_offset], a_ld); } +void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + auto a_buffer_bis = 
HalfToFloatBuffer(a_buffer); + cblasXsyr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld); + FloatToHalfBuffer(a_buffer, a_buffer_bis); +} // Forwards the Netlib BLAS calls for SSPR2/DSPR2 void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, @@ -1267,6 +1545,23 @@ void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, &y_buffer[y_offset], static_cast(y_inc), &ap_buffer[ap_offset]); } +void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const half alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer); + cblasXspr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + ap_buffer_bis, ap_offset); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis); +} // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -1337,6 +1632,25 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con beta_array.data(), reinterpret_cast(&c_buffer[c_offset]), c_ld); } +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = 
HalfToFloatBuffer(b_buffer); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer); + cblasXgemm(layout, a_transpose, b_transpose, + m, n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld); + FloatToHalfBuffer(c_buffer, c_buffer_bis); +} // Forwards the Netlib BLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, @@ -1403,6 +1717,25 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL beta_array.data(), reinterpret_cast(&c_buffer[c_offset]), c_ld); } +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer); + cblasXsymm(layout, side, triangle, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld); + FloatToHalfBuffer(c_buffer, c_buffer_bis); +} // Forwards the Netlib BLAS calls for CHEMM/ZHEMM void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, @@ -1497,6 +1830,22 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS beta_array.data(), reinterpret_cast(&c_buffer[c_offset]), c_ld); } +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const half beta, + std::vector& 
c_buffer, const size_t c_offset, const size_t c_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer); + cblasXsyrk(layout, triangle, a_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld); + FloatToHalfBuffer(c_buffer, c_buffer_bis); +} // Forwards the Netlib BLAS calls for CHERK/ZHERK void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, @@ -1591,6 +1940,25 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA beta_array.data(), reinterpret_cast(&c_buffer[c_offset]), c_ld); } +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer); + cblasXsyr2k(layout, triangle, ab_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld); + FloatToHalfBuffer(c_buffer, c_buffer_bis); +} // Forwards the Netlib BLAS calls for CHER2K/ZHER2K void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, @@ -1673,6 +2041,20 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld); } +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const 
size_t m, const size_t n, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + cblasXtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld); + FloatToHalfBuffer(b_buffer, b_buffer_bis); +} // Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, @@ -1721,6 +2103,20 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL reinterpret_cast(&a_buffer[a_offset]), a_ld, reinterpret_cast(&b_buffer[b_offset]), b_ld); } +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const half alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer); + cblasXtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld); + FloatToHalfBuffer(b_buffer, b_buffer_bis); +} // ================================================================================================= } // namespace clblast diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 23c55373..5115b3d9 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -34,104 +34,104 @@ clblasSide convertToCLBLAS(const Side v) { return (v == Side::kLeft) ? 
clblasLef // Forwards the clBLAS calls for SROTG/DROTG template -clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, + Buffer& sb_buffer, const size_t sb_offset, + Buffer& sc_buffer, const size_t sc_offset, + Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> -clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, + Buffer& sb_buffer, const size_t sb_offset, + Buffer& sc_buffer, const size_t sc_offset, + Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSrotg(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, + return clblasSrotg(sa_buffer(), sa_offset, + sb_buffer(), sb_offset, + sc_buffer(), sc_offset, + ss_buffer(), ss_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> -clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, + Buffer& sb_buffer, const size_t sb_offset, + Buffer& sc_buffer, const size_t sc_offset, + Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDrotg(sa_buffer, 
sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, + return clblasDrotg(sa_buffer(), sa_offset, + sb_buffer(), sb_offset, + sc_buffer(), sc_offset, + ss_buffer(), ss_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROTMG/DROTMG template -clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, + Buffer& sd2_buffer, const size_t sd2_offset, + Buffer& sx1_buffer, const size_t sx1_offset, + const Buffer& sy1_buffer, const size_t sy1_offset, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> -clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, + Buffer& sd2_buffer, const size_t sd2_offset, + Buffer& sx1_buffer, const size_t sx1_offset, + const Buffer& sy1_buffer, const size_t sy1_offset, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSrotmg(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, + return clblasSrotmg(sd1_buffer(), sd1_offset, + sd2_buffer(), sd2_offset, + sx1_buffer(), sx1_offset, + sy1_buffer(), sy1_offset, + sparam_buffer(), sparam_offset, num_queues, queues, 
num_wait_events, wait_events, events); } template <> -clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, + Buffer& sd2_buffer, const size_t sd2_offset, + Buffer& sx1_buffer, const size_t sx1_offset, + const Buffer& sy1_buffer, const size_t sy1_offset, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDrotmg(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, + return clblasDrotmg(sd1_buffer(), sd1_offset, + sd2_buffer(), sd2_offset, + sx1_buffer(), sx1_offset, + sy1_buffer(), sy1_offset, + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROT/DROT clblasStatus clblasXrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, const float cos, const float sin, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrot(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), cos, sin, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const 
size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, const double cos, const double sin, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrot(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), cos, sin, num_queues, queues, num_wait_events, wait_events, events); @@ -140,316 +140,394 @@ clblasStatus clblasXrot(const size_t n, // Forwards the clBLAS calls for SROTM/DROTM template clblasStatus clblasXrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrotm(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - sparam_buffer, sparam_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + sparam_buffer(), sparam_offset, num_queues, queues, 
num_wait_events, wait_events, events); } template <> clblasStatus clblasXrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrotm(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - sparam_buffer, sparam_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP template clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSswap(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, 
queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDswap(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCswap(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZswap(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), 
y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXswap(const size_t n, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXswap(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL clblasStatus clblasXscal(const size_t n, const float alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSscal(n, alpha, - x_buffer, x_offset, static_cast(x_inc), + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const double alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDscal(n, alpha, - x_buffer, x_offset, static_cast(x_inc), + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const float2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, 
const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCscal(n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const double2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZscal(n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXscal(const size_t n, + const half alpha, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXscal(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY template clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const 
size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasScopy(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDcopy(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCcopy(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus 
clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZcopy(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXcopy(const size_t n, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXcopy(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY clblasStatus clblasXaxpy(const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSaxpy(n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + 
y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDaxpy(n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCaxpy(n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return 
clblasZaxpy(n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXaxpy(const size_t n, + const half alpha, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXaxpy(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SDOT/DDOT template clblasStatus clblasXdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint 
num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasSdot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDdot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXdot(const size_t n, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, 
queues[0]); + auto dot_buffer_bis = HalfToFloatBuffer(dot_buffer, queues[0]); + auto status = clblasXdot(n, + dot_buffer_bis, dot_offset, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(dot_buffer, dot_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CDOTU/ZDOTU template clblasStatus clblasXdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasCdotu(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, 
- const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasZdotu(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } @@ -457,42 +535,42 @@ clblasStatus clblasXdotu(const size_t n, // Forwards the clBLAS calls for CDOTC/ZDOTC template clblasStatus clblasXdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint 
num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasCdotc(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasZdotc(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } @@ -500,206 +578,251 @@ clblasStatus clblasXdotc(const size_t n, // Forwards the clBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 template clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event 
*events); template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasSnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasDnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = 
queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasScnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasDznrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXnrm2(const size_t n, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto nrm2_buffer_bis = HalfToFloatBuffer(nrm2_buffer, queues[0]); + auto status = clblasXnrm2(n, + nrm2_buffer_bis, nrm2_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(nrm2_buffer, nrm2_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SASUM/DASUM/ScASUM/DzASUM template clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t 
asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasSasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const 
size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasScasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDzasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXasum(const size_t n, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto asum_buffer_bis = HalfToFloatBuffer(asum_buffer, queues[0]); + auto status = clblasXasum(n, + asum_buffer_bis, asum_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(asum_buffer, asum_buffer_bis, queues[0]); + return status; +} -// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Forwards the 
clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiSamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiDamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> 
clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiCamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiZamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXamax(const size_t n, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer, queues[0]); + auto status = clblasXamax(n, + imax_buffer_bis, 
imax_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(imax_buffer, imax_buffer_bis, queues[0]); + return status; +} // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -709,185 +832,231 @@ clblasStatus clblasXamax(const size_t n, clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgemv(layout, a_transpose, m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, 
cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgemv(layout, a_transpose, m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgemv(layout, a_transpose, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgemv(layout, a_transpose, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXgemv(layout, a_transpose, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t 
x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgbmv(layout, a_transpose, m, n, kl, ku, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgbmv(layout, a_transpose, m, n, kl, ku, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float2 
alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgbmv(layout, a_transpose, m, n, kl, ku, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgbmv(layout, a_transpose, m, n, kl, ku, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), 
+ y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXgbmv(layout, a_transpose, + m, n, kl, ku, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CHEMV/ZHEMV clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChemv(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + 
a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhemv(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -895,37 +1064,37 @@ clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, 
cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChbmv(layout, triangle, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhbmv(layout, triangle, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -933,37 +1102,37 @@ clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& 
x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpmv(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpmv(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -971,129 +1140,198 @@ clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsymv(layout, triangle, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsymv(layout, triangle, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer& a_buffer, const size_t 
a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXsymv(layout, triangle, + n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSBMV/DSBMV clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsbmv(layout, triangle, n, k, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const double 
alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsbmv(layout, triangle, n, k, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXsbmv(layout, triangle, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSPMV/DSPMV clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo 
triangle, const size_t n, const float alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspmv(layout, triangle, n, alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspmv(layout, triangle, n, alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer& ap_buffer, const size_t 
ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const half beta, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXspmv(layout, triangle, + n, + HalfToFloat(alpha), + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV template clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1101,16 +1339,16 
@@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer(context, n); return clblasStrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1118,16 +1356,16 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasDtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1135,16 
+1373,16 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasCtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1152,25 +1390,42 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer(context, n); return clblasZtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = 
clblasXtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV template clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1178,16 +1433,16 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer(context, n); return clblasStbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const 
clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1195,16 +1450,16 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasDtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1212,16 +1467,16 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasCtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv(const 
clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1229,25 +1484,42 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer(context, n); return clblasZtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV template clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose 
a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1255,16 +1527,16 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer(context, n); return clblasStpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1272,16 +1544,16 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const 
clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasDtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1289,16 +1561,16 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasCtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1306,70 +1578,87 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer(context, n); return 
clblasZtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV template clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& 
a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), 
+ a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1377,60 +1666,60 @@ clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo tri template clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& 
x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), 
a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1438,60 +1727,60 @@ clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo tri template clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue 
*queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const 
clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1499,67 +1788,88 @@ clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo tri clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSger(layout, m, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDger(layout, m, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXger(const clblasOrder layout, + const size_t m, const size_t n, + const half alpha, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXger(layout, + m, n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CGERU/ZGERU clblasStatus clblasXgeru(const clblasOrder layout, const size_t m, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const 
size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgeru(layout, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgeru(const clblasOrder layout, const size_t m, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgeru(layout, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1567,33 +1877,33 @@ clblasStatus clblasXgeru(const clblasOrder layout, clblasStatus clblasXgerc(const clblasOrder layout, const size_t m, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const 
Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgerc(layout, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgerc(const clblasOrder layout, const size_t m, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgerc(layout, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1601,29 +1911,29 @@ clblasStatus clblasXgerc(const clblasOrder layout, clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& 
x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1631,29 +1941,29 @@ clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, num_queues, queues, 
num_wait_events, wait_events, events); } clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } @@ -1661,33 +1971,33 @@ clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher2(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher2(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1695,33 +2005,33 @@ clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpr2(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, const 
size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpr2(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } @@ -1729,129 +2039,207 @@ clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const 
size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXsyr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSPR/DSPR clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus 
clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto status = clblasXspr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + ap_buffer_bis, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSYR2/DSYR2 clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, 
cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto 
a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXsyr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSPR2/DSPR2 clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspr2(layout, triangle, n, alpha, - x_buffer, x_offset, 
static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const half alpha, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto status = clblasXspr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + ap_buffer_bis, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]); + return status; +} // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -1861,185 +2249,231 @@ clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t 
c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgemm(layout, a_transpose, b_transpose, m, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgemm(layout, a_transpose, b_transpose, m, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t 
c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgemm(layout, a_transpose, b_transpose, m, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgemm(layout, a_transpose, b_transpose, m, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, 
const size_t b_ld, + const half beta, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXgemm(layout, a_transpose, b_transpose, + m, n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsymm(layout, side, triangle, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, 
const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsymm(layout, side, triangle, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsymm(layout, side, triangle, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double2 alpha, - const cl_mem 
a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZsymm(layout, side, triangle, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsymm(layout, side, triangle, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CHEMM/ZHEMM clblasStatus clblasXhemm(const clblasOrder layout, 
const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChemm(layout, side, triangle, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhemm(layout, side, triangle, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, 
num_queues, queues, num_wait_events, wait_events, events); } @@ -2047,99 +2481,119 @@ clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyrk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyrk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const 
size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsyrk(layout, triangle, a_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZsyrk(layout, triangle, a_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const half beta, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = 
HalfToFloatBuffer(a_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsyrk(layout, triangle, a_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CHERK/ZHERK clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCherk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZherk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), 
c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -2147,111 +2601,134 @@ clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, co clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr2k(layout, triangle, ab_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr2k(layout, triangle, ab_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, 
b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsyr2k(layout, triangle, ab_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return 
clblasZsyr2k(layout, triangle, ab_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, + const half beta, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsyr2k(layout, triangle, ab_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; +} // Forwards the clBLAS calls for CHER2K/ZHER2K clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const 
size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher2k(layout, triangle, ab_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher2k(layout, triangle, ab_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -2259,117 +2736,153 @@ clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, c clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const 
Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrmm(layout, side, triangle, a_transpose, 
diagonal, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrmm(layout, side, triangle, a_transpose, diagonal, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto status = clblasXtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(b_buffer, b_buffer_bis, 
queues[0]); + return status; +} // Forwards the clBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float2 
alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrsm(layout, side, triangle, a_transpose, diagonal, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const half alpha, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event 
*wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]); + return status; +} // ================================================================================================= } // namespace clblast