mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-07-16 03:15:41 +02:00
Merge branch 'dvasschemacq-master' into development
This commit is contained in:
commit
ce9ba27450
|
@ -6,6 +6,7 @@ Development version (next release)
|
|||
- Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems
|
||||
- Fixed a bug related to the cache and retrieval of programs based on the OpenCL context
|
||||
- Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels
|
||||
- Fixed a bug in the OpenCL kernels: now placing __kernel before __attribute__
|
||||
- Added an option (-warm_up) to do a warm-up run before timing in the performance clients
|
||||
- Improved performance significantly of rotated GEMV computations
|
||||
- Added tuned parameters for various devices (see README)
|
||||
|
|
|
@ -286,6 +286,7 @@ The contributing authors (code, pull requests, testing) so far are:
|
|||
* [Marco Hutter](https://github.com/gpus)
|
||||
* [Hugh Perkins](https://github.com/hughperkins)
|
||||
* [Gian-Carlo Pascutto](https://github.com/gcp)
|
||||
* [Dimitri VA](https://github.com/dvasschemacq)
|
||||
|
||||
Tuning and testing on a variety of OpenCL devices was made possible by:
|
||||
|
||||
|
|
|
@ -30,10 +30,10 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// The main reduction kernel, performing the loading and the majority of the operation
|
||||
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
__kernel void Xamax(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global singlereal* maxgm, __global unsigned int* imaxgm) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
void Xamax(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global singlereal* maxgm, __global unsigned int* imaxgm) {
|
||||
__local singlereal maxlm[WGS1];
|
||||
__local unsigned int imaxlm[WGS1];
|
||||
const int lid = get_local_id(0);
|
||||
|
@ -95,10 +95,10 @@ __kernel void Xamax(const int n,
|
|||
|
||||
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
|
||||
// be launched with a single workgroup only.
|
||||
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
|
||||
const __global unsigned int* restrict imaxgm,
|
||||
__global unsigned int* imax, const int imax_offset) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
void XamaxEpilogue(const __global singlereal* restrict maxgm,
|
||||
const __global unsigned int* restrict imaxgm,
|
||||
__global unsigned int* imax, const int imax_offset) {
|
||||
__local singlereal maxlm[WGS2];
|
||||
__local unsigned int imaxlm[WGS2];
|
||||
const int lid = get_local_id(0);
|
||||
|
|
|
@ -30,10 +30,10 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// The main reduction kernel, performing the loading and the majority of the operation
|
||||
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
__kernel void Xasum(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* output) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
void Xasum(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* output) {
|
||||
__local real lm[WGS1];
|
||||
const int lid = get_local_id(0);
|
||||
const int wgid = get_group_id(0);
|
||||
|
@ -74,9 +74,9 @@ __kernel void Xasum(const int n,
|
|||
|
||||
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
|
||||
// be launched with a single workgroup only.
|
||||
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
__kernel void XasumEpilogue(const __global real* restrict input,
|
||||
__global real* asum, const int asum_offset) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
void XasumEpilogue(const __global real* restrict input,
|
||||
__global real* asum, const int asum_offset) {
|
||||
__local real lm[WGS2];
|
||||
const int lid = get_local_id(0);
|
||||
|
||||
|
|
|
@ -22,10 +22,10 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// Full version of the kernel with offsets and strided accesses
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void Xaxpy(const int n, const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void Xaxpy(const int n, const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||
|
@ -40,10 +40,10 @@ __kernel void Xaxpy(const int n, const real_arg arg_alpha,
|
|||
|
||||
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
|
||||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void XaxpyFast(const int n, const real_arg arg_alpha,
|
||||
const __global realV* restrict xgm,
|
||||
__global realV* ygm) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void XaxpyFast(const int n, const real_arg arg_alpha,
|
||||
const __global realV* restrict xgm,
|
||||
__global realV* ygm) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
#pragma unroll
|
||||
|
|
|
@ -22,10 +22,10 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// Full version of the kernel with offsets and strided accesses
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void Xcopy(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void Xcopy(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc) {
|
||||
|
||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||
#pragma unroll
|
||||
|
@ -38,10 +38,10 @@ __kernel void Xcopy(const int n,
|
|||
|
||||
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
|
||||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void XcopyFast(const int n,
|
||||
const __global realV* restrict xgm,
|
||||
__global realV* ygm) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void XcopyFast(const int n,
|
||||
const __global realV* restrict xgm,
|
||||
__global realV* ygm) {
|
||||
#pragma unroll
|
||||
for (int w=0; w<WPT; ++w) {
|
||||
const int id = w*get_global_size(0) + get_global_id(0);
|
||||
|
|
|
@ -30,11 +30,11 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// The main reduction kernel, performing the multiplication and the majority of the sum operation
|
||||
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
__kernel void Xdot(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* restrict ygm, const int y_offset, const int y_inc,
|
||||
__global real* output, const int do_conjugate) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
void Xdot(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* restrict ygm, const int y_offset, const int y_inc,
|
||||
__global real* output, const int do_conjugate) {
|
||||
__local real lm[WGS1];
|
||||
const int lid = get_local_id(0);
|
||||
const int wgid = get_group_id(0);
|
||||
|
@ -73,9 +73,9 @@ __kernel void Xdot(const int n,
|
|||
|
||||
// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
|
||||
// be launched with a single workgroup only.
|
||||
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
__kernel void XdotEpilogue(const __global real* restrict input,
|
||||
__global real* dot, const int dot_offset) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
void XdotEpilogue(const __global real* restrict input,
|
||||
__global real* dot, const int dot_offset) {
|
||||
__local real lm[WGS2];
|
||||
const int lid = get_local_id(0);
|
||||
|
||||
|
|
|
@ -30,10 +30,10 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// The main reduction kernel, performing the multiplication and the majority of the operation
|
||||
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
__kernel void Xnrm2(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* output) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
void Xnrm2(const int n,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* output) {
|
||||
__local real lm[WGS1];
|
||||
const int lid = get_local_id(0);
|
||||
const int wgid = get_group_id(0);
|
||||
|
@ -72,9 +72,9 @@ __kernel void Xnrm2(const int n,
|
|||
|
||||
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
|
||||
// be launched with a single workgroup only.
|
||||
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
__kernel void Xnrm2Epilogue(const __global real* restrict input,
|
||||
__global real* nrm2, const int nrm2_offset) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
void Xnrm2Epilogue(const __global real* restrict input,
|
||||
__global real* nrm2, const int nrm2_offset) {
|
||||
__local real lm[WGS2];
|
||||
const int lid = get_local_id(0);
|
||||
|
||||
|
|
|
@ -22,9 +22,10 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// Full version of the kernel with offsets and strided accesses
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void Xscal(const int n, const real alpha,
|
||||
__global real* xgm, const int x_offset, const int x_inc) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void Xscal(const int n, const real_arg arg_alpha,
|
||||
__global real* xgm, const int x_offset, const int x_inc) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||
#pragma unroll
|
||||
|
@ -40,9 +41,11 @@ __kernel void Xscal(const int n, const real alpha,
|
|||
|
||||
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
|
||||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void XscalFast(const int n, const real alpha,
|
||||
__global realV* xgm) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void XscalFast(const int n, const real_arg arg_alpha,
|
||||
__global realV* xgm) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
#pragma unroll
|
||||
for (int w=0; w<WPT; ++w) {
|
||||
const int id = w*get_global_size(0) + get_global_id(0);
|
||||
|
|
|
@ -22,10 +22,10 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// Full version of the kernel with offsets and strided accesses
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void Xswap(const int n,
|
||||
__global real* xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void Xswap(const int n,
|
||||
__global real* xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc) {
|
||||
|
||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||
#pragma unroll
|
||||
|
@ -40,10 +40,10 @@ __kernel void Xswap(const int n,
|
|||
|
||||
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
|
||||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void XswapFast(const int n,
|
||||
__global realV* xgm,
|
||||
__global realV* ygm) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void XswapFast(const int n,
|
||||
__global realV* xgm,
|
||||
__global realV* ygm) {
|
||||
#pragma unroll
|
||||
for (int w=0; w<WPT; ++w) {
|
||||
const int id = w*get_global_size(0) + get_global_id(0);
|
||||
|
|
|
@ -210,8 +210,8 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
|
|||
// =================================================================================================
|
||||
|
||||
// Full version of the kernel
|
||||
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
__kernel void Xgemv(const int m, const int n,
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
void Xgemv(const int m, const int n,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const int a_rotated,
|
||||
|
|
|
@ -88,16 +88,16 @@ inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, co
|
|||
// --> 'a_ld' is a multiple of VW2
|
||||
// --> 'a_rotated' is 0
|
||||
// --> 'do_conjugate' is 0
|
||||
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
__kernel void XgemvFast(const int m, const int n,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const int a_rotated,
|
||||
const __global realVF* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl_unused, const int ku_unused) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
void XgemvFast(const int m, const int n,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const int a_rotated,
|
||||
const __global realVF* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl_unused, const int ku_unused) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
|
@ -190,16 +190,16 @@ __kernel void XgemvFast(const int m, const int n,
|
|||
// --> 'a_ld' is a multiple of VW3
|
||||
// --> 'a_rotated' is 1
|
||||
// --> 'do_conjugate' is 0
|
||||
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
|
||||
__kernel void XgemvFastRot(const int m, const int n,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const int a_rotated,
|
||||
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl_unused, const int ku_unused) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
|
||||
void XgemvFastRot(const int m, const int n,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const int a_rotated,
|
||||
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl_unused, const int ku_unused) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
|
|
|
@ -18,13 +18,13 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xger(const int max1, const int max2,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* ygm, const int y_offset, const int y_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_rowmajor) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
void Xger(const int max1, const int max2,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* ygm, const int y_offset, const int y_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_rowmajor) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Register storage for X and Y
|
||||
|
|
|
@ -18,12 +18,12 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xher(const int n,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_upper, const int is_rowmajor) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
void Xher(const int n,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_upper, const int is_rowmajor) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Register storage for X and XT
|
||||
|
|
|
@ -18,13 +18,13 @@ R"(
|
|||
// =================================================================================================
|
||||
|
||||
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xher2(const int n,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* restrict ygm, const int y_offset, const int y_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_upper, const int is_rowmajor) {
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
void Xher2(const int n,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* restrict ygm, const int y_offset, const int y_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_upper, const int is_rowmajor) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Register storage for X and Y
|
||||
|
|
|
@ -20,13 +20,13 @@ R"(
|
|||
|
||||
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
|
||||
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
|
||||
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
__kernel void HermLowerToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
void HermLowerToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
@ -59,13 +59,13 @@ __kernel void HermLowerToSquared(const int src_dim,
|
|||
}
|
||||
|
||||
// Same as above, but now the matrix' data is stored in the upper-triangle
|
||||
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
__kernel void HermUpperToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
void HermUpperToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
|
|
@ -20,13 +20,13 @@ R"(
|
|||
|
||||
// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
|
||||
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
|
||||
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
__kernel void SymmLowerToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
void SymmLowerToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
@ -53,13 +53,13 @@ __kernel void SymmLowerToSquared(const int src_dim,
|
|||
}
|
||||
|
||||
// Same as above, but now the matrix' data is stored in the upper-triangle
|
||||
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
__kernel void SymmUpperToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
void SymmUpperToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
|
|
@ -20,14 +20,14 @@ R"(
|
|||
|
||||
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
|
||||
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
|
||||
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
__kernel void TriaLowerToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const int unit_diagonal) {
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
void TriaLowerToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const int unit_diagonal) {
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
@ -55,14 +55,14 @@ __kernel void TriaLowerToSquared(const int src_dim,
|
|||
}
|
||||
|
||||
// Same as above, but now the matrix' data is stored in the upper-triangle
|
||||
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
__kernel void TriaUpperToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const int unit_diagonal) {
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
void TriaUpperToSquared(const int src_dim,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_dim,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const int unit_diagonal) {
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
|
|
@ -35,11 +35,11 @@ R"(
|
|||
|
||||
// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
|
||||
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
|
||||
__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
|
||||
__kernel void CopyMatrixFast(const int ld,
|
||||
__global const realC* restrict src,
|
||||
__global realC* dest,
|
||||
const real_arg arg_alpha) {
|
||||
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
|
||||
void CopyMatrixFast(const int ld,
|
||||
__global const realC* restrict src,
|
||||
__global realC* dest,
|
||||
const real_arg arg_alpha) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
#pragma unroll
|
||||
for (int w_one=0; w_one<COPY_WPT; ++w_one) {
|
||||
|
|
|
@ -24,15 +24,15 @@ R"(
|
|||
// Copies a matrix from source to destination. The output is padded with zero values in case the
|
||||
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
|
||||
// value and offset can be different.
|
||||
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
__kernel void CopyPadMatrix(const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const real_arg arg_alpha,
|
||||
const int do_conjugate) {
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
void CopyPadMatrix(const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const real_arg arg_alpha,
|
||||
const int do_conjugate) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
|
@ -65,16 +65,16 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
|
|||
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
|
||||
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
|
||||
// be different.
|
||||
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
__kernel void CopyMatrix(const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const real_arg arg_alpha,
|
||||
const int upper, const int lower,
|
||||
const int diagonal_imag_zero) {
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
void CopyMatrix(const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const real_arg arg_alpha,
|
||||
const int upper, const int lower,
|
||||
const int diagonal_imag_zero) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
|
|
|
@ -36,11 +36,11 @@ R"(
|
|||
|
||||
// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
|
||||
// offset. A more general version is available in 'padtranspose.opencl'.
|
||||
__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
|
||||
__kernel void TransposeMatrixFast(const int ld,
|
||||
__global const realT* restrict src,
|
||||
__global realT* dest,
|
||||
const real_arg arg_alpha) {
|
||||
__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
|
||||
void TransposeMatrixFast(const int ld,
|
||||
__global const realT* restrict src,
|
||||
__global realT* dest,
|
||||
const real_arg arg_alpha) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
|
||||
|
|
|
@ -24,15 +24,15 @@ R"(
|
|||
|
||||
// Transposes a matrix from source to destination. The output is padded with zero values in case the
|
||||
// destination matrix dimensions are larger than the transposed source matrix dimensions.
|
||||
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
__kernel void TransposePadMatrix(const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const real_arg arg_alpha,
|
||||
const int do_conjugate) {
|
||||
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
void TransposePadMatrix(const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const real_arg arg_alpha,
|
||||
const int do_conjugate) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Local memory to store a tile of the matrix (for coalescing)
|
||||
|
@ -88,16 +88,16 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
|
|||
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
|
||||
// padded source matrix, but only the actual data is written back to the transposed destination
|
||||
// matrix. This kernel optionally checks for upper/lower triangular matrices.
|
||||
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
__kernel void TransposeMatrix(const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const real_arg arg_alpha,
|
||||
const int upper, const int lower,
|
||||
const int diagonal_imag_zero) {
|
||||
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
void TransposeMatrix(const int src_one, const int src_two,
|
||||
const int src_ld, const int src_offset,
|
||||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const real_arg arg_alpha,
|
||||
const int upper, const int lower,
|
||||
const int diagonal_imag_zero) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Local memory to store a tile of the matrix (for coalescing)
|
||||
|
|
|
@ -268,13 +268,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
|
||||
|
||||
// Main entry point of the kernel. This is the upper-triangular version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
void XgemmUpper(const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
|
@ -308,13 +308,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
|
|||
}
|
||||
|
||||
// Main entry point of the kernel. This is the lower-triangular version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
void XgemmLower(const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
|
@ -352,13 +352,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
|
|||
#else
|
||||
|
||||
// Main entry point of the kernel. This is the regular full version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
|
|
Loading…
Reference in a new issue