Updated according to feedback from CNugteren
parent
ff6a5689df
commit
73f49e9b3d
Binary file not shown.
Binary file not shown.
|
@ -1,5 +1,10 @@
|
|||
Development version (next version)
|
||||
- Fixes two small issues in the plotting script
|
||||
- Modifications to improve performance on Qualcomm Adreno GPUs:
|
||||
* Unique database entries for specific Adreno devices
|
||||
* Toggle OpenCL kernel compilation options for Adreno
|
||||
* New preprocessor directive RELAX_WORKGROUP_SIZE
|
||||
- Fixed a bug in handling of #undef in CLBlast loop unrolling and array-to-register mapping functions
|
||||
|
||||
Version 1.5.3
|
||||
- Fix a correctness issue with DGEMM on SM 7.5 Turing GPUs
|
||||
|
|
|
@ -101,8 +101,6 @@ Known performance related issues:
|
|||
|
||||
* Severe performance issues with Beignet v1.3.0 due to missing support for local memory. Please downgrade to v1.2.1 or upgrade to v1.3.1 or newer.
|
||||
|
||||
* Performance issues on Qualcomm Adreno GPUs.
|
||||
|
||||
Other known issues:
|
||||
|
||||
* Routines returning an integer are currently not properly tested for half-precision FP16: IHAMAX/IHAMIN/IHMAX/IHMIN
|
||||
|
|
|
@ -32,7 +32,7 @@ R"(
|
|||
// The main reduction kernel, performing the loading and the majority of the operation
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
#endif
|
||||
void Xamax(const int n,
|
||||
|
@ -102,7 +102,7 @@ void Xamax(const int n,
|
|||
// be launched with a single workgroup only.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
#endif
|
||||
void XamaxEpilogue(const __global singlereal* restrict maxgm,
|
||||
|
|
|
@ -32,7 +32,7 @@ R"(
|
|||
// The main reduction kernel, performing the loading and the majority of the operation
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
#endif
|
||||
void Xasum(const int n,
|
||||
|
@ -79,7 +79,7 @@ void Xasum(const int n,
|
|||
// be launched with a single workgroup only.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
#endif
|
||||
void XasumEpilogue(const __global real* restrict input,
|
||||
|
|
|
@ -24,7 +24,7 @@ R"(
|
|||
// Full version of the kernel with offsets and strided accesses
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void Xaxpy(const int n, const real_arg arg_alpha,
|
||||
|
@ -43,7 +43,7 @@ void Xaxpy(const int n, const real_arg arg_alpha,
|
|||
// assumes that 'n' is dividable by 'VW' and 'WPT'.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void XaxpyFaster(const int n, const real_arg arg_alpha,
|
||||
|
@ -67,7 +67,7 @@ void XaxpyFaster(const int n, const real_arg arg_alpha,
|
|||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void XaxpyFastest(const int n, const real_arg arg_alpha,
|
||||
|
@ -89,7 +89,7 @@ void XaxpyFastest(const int n, const real_arg arg_alpha,
|
|||
// Full version of the kernel with offsets and strided accesses: batched version
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
|
||||
|
|
|
@ -24,7 +24,7 @@ R"(
|
|||
// Full version of the kernel with offsets and strided accesses
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void Xcopy(const int n,
|
||||
|
@ -43,7 +43,7 @@ void Xcopy(const int n,
|
|||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void XcopyFast(const int n,
|
||||
|
|
|
@ -32,7 +32,7 @@ R"(
|
|||
// The main reduction kernel, performing the multiplication and the majority of the sum operation
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
#endif
|
||||
void Xdot(const int n,
|
||||
|
@ -78,7 +78,7 @@ void Xdot(const int n,
|
|||
// be launched with a single workgroup only.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
#endif
|
||||
void XdotEpilogue(const __global real* restrict input,
|
||||
|
|
|
@ -68,7 +68,7 @@ INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV
|
|||
// Full version of the kernel with offsets and strided accesses
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
|
||||
|
@ -96,7 +96,7 @@ void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
|
|||
// assumes that 'n' is dividable by 'VW' and 'WPT'.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
|
||||
|
@ -127,7 +127,7 @@ void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
|
|||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
|
||||
|
|
|
@ -32,7 +32,7 @@ R"(
|
|||
// The main reduction kernel, performing the multiplication and the majority of the operation
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
#endif
|
||||
void Xnrm2(const int n,
|
||||
|
@ -77,7 +77,7 @@ void Xnrm2(const int n,
|
|||
// be launched with a single workgroup only.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
#endif
|
||||
void Xnrm2Epilogue(const __global real* restrict input,
|
||||
|
|
|
@ -24,7 +24,7 @@ R"(
|
|||
// Full version of the kernel with offsets and strided accesses
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void Xscal(const int n, const real_arg arg_alpha,
|
||||
|
@ -46,7 +46,7 @@ void Xscal(const int n, const real_arg arg_alpha,
|
|||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void XscalFast(const int n, const real_arg arg_alpha,
|
||||
|
|
|
@ -24,7 +24,7 @@ R"(
|
|||
// Full version of the kernel with offsets and strided accesses
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void Xswap(const int n,
|
||||
|
@ -45,7 +45,7 @@ void Xswap(const int n,
|
|||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
#endif
|
||||
void XswapFast(const int n,
|
||||
|
|
|
@ -212,7 +212,7 @@ INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, con
|
|||
// Full version of the kernel
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
#endif
|
||||
void Xgemv(const int m, const int n,
|
||||
|
|
|
@ -90,7 +90,7 @@ INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int
|
|||
// --> 'do_conjugate' is 0
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
#endif
|
||||
void XgemvFast(const int m, const int n,
|
||||
|
@ -197,7 +197,7 @@ void XgemvFast(const int m, const int n,
|
|||
// --> 'do_conjugate' is 0
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
|
||||
#endif
|
||||
void XgemvFastRot(const int m, const int n,
|
||||
|
|
|
@ -20,7 +20,7 @@ R"(
|
|||
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
#endif
|
||||
void Xger(const int max1, const int max2,
|
||||
|
|
|
@ -20,7 +20,7 @@ R"(
|
|||
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
#endif
|
||||
void Xher(const int n,
|
||||
|
|
|
@ -20,7 +20,7 @@ R"(
|
|||
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
#endif
|
||||
void Xher2(const int n,
|
||||
|
|
|
@ -41,7 +41,7 @@ void FillVector(const int n, const int inc, const int offset,
|
|||
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
|
||||
#endif
|
||||
void trsv_forward(int n,
|
||||
|
@ -93,7 +93,7 @@ void trsv_forward(int n,
|
|||
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
|
||||
#endif
|
||||
void trsv_backward(int n,
|
||||
|
|
|
@ -23,7 +23,7 @@ R"(
|
|||
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void HermLowerToSquared(const int src_dim,
|
||||
|
@ -66,7 +66,7 @@ void HermLowerToSquared(const int src_dim,
|
|||
// Same as above, but now the matrix' data is stored in the upper-triangle
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void HermUpperToSquared(const int src_dim,
|
||||
|
|
|
@ -22,7 +22,7 @@ R"(
|
|||
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void SymmLowerToSquared(const int src_dim,
|
||||
|
@ -59,7 +59,7 @@ void SymmLowerToSquared(const int src_dim,
|
|||
// Same as above, but now the matrix' data is stored in the upper-triangle
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void SymmUpperToSquared(const int src_dim,
|
||||
|
|
|
@ -22,7 +22,7 @@ R"(
|
|||
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void TriaLowerToSquared(const int src_dim,
|
||||
|
@ -61,7 +61,7 @@ void TriaLowerToSquared(const int src_dim,
|
|||
// Same as above, but now the matrix' data is stored in the upper-triangle
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void TriaUpperToSquared(const int src_dim,
|
||||
|
|
|
@ -37,7 +37,7 @@ R"(
|
|||
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
|
||||
#endif
|
||||
void CopyMatrixFast(const int ld,
|
||||
|
|
|
@ -61,7 +61,7 @@ INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two,
|
|||
// Interface to the above function
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void CopyPadMatrix(const int src_one, const int src_two,
|
||||
|
@ -124,7 +124,7 @@ INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two,
|
|||
// Interface to the above function
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void CopyMatrix(const int src_one, const int src_two,
|
||||
|
@ -148,7 +148,7 @@ void CopyMatrix(const int src_one, const int src_two,
|
|||
// Batched version of the above
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void CopyPadMatrixBatched(const int src_one, const int src_two,
|
||||
|
@ -170,7 +170,7 @@ void CopyPadMatrixBatched(const int src_one, const int src_two,
|
|||
// Batched version of the above
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void CopyMatrixBatched(const int src_one, const int src_two,
|
||||
|
@ -195,7 +195,7 @@ void CopyMatrixBatched(const int src_one, const int src_two,
|
|||
// Strided-batched version of the above
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void CopyPadMatrixStridedBatched(const int src_one, const int src_two,
|
||||
|
@ -217,7 +217,7 @@ void CopyPadMatrixStridedBatched(const int src_one, const int src_two,
|
|||
// Strided-batched version of the above
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||
#endif
|
||||
void CopyMatrixStridedBatched(const int src_one, const int src_two,
|
||||
|
|
|
@ -84,7 +84,7 @@ R"(
|
|||
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
|
||||
#endif
|
||||
void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld,
|
||||
|
|
|
@ -38,7 +38,7 @@ R"(
|
|||
// offset. A more general version is available in 'padtranspose.opencl'.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
|
||||
#endif
|
||||
void TransposeMatrixFast(const int ld,
|
||||
|
|
|
@ -86,7 +86,7 @@ INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile,
|
|||
// Interface to the above function
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
#endif
|
||||
void TransposePadMatrix(const int src_one, const int src_two,
|
||||
|
@ -178,7 +178,7 @@ INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile,
|
|||
// Interface to the above function
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
#endif
|
||||
void TransposeMatrix(const int src_one, const int src_two,
|
||||
|
@ -203,7 +203,7 @@ void TransposeMatrix(const int src_one, const int src_two,
|
|||
// Batched version of the above
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
#endif
|
||||
void TransposePadMatrixBatched(const int src_one, const int src_two,
|
||||
|
@ -226,7 +226,7 @@ void TransposePadMatrixBatched(const int src_one, const int src_two,
|
|||
// Batched version of the above
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
#endif
|
||||
void TransposeMatrixBatched(const int src_one, const int src_two,
|
||||
|
@ -252,7 +252,7 @@ void TransposeMatrixBatched(const int src_one, const int src_two,
|
|||
// Strided-batched version of the above
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
#endif
|
||||
void TransposePadMatrixStridedBatched(const int src_one, const int src_two,
|
||||
|
@ -275,7 +275,7 @@ void TransposePadMatrixStridedBatched(const int src_one, const int src_two,
|
|||
// Strided-batched version of the above
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
|
||||
#endif
|
||||
void TransposeMatrixStridedBatched(const int src_one, const int src_two,
|
||||
|
|
|
@ -21,7 +21,7 @@ R"(
|
|||
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
#endif
|
||||
void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -68,7 +68,7 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
#endif
|
||||
void XgemmStridedBatched(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
|
|
@ -22,7 +22,7 @@ R"(
|
|||
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -47,7 +47,7 @@ void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -72,7 +72,7 @@ void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -97,7 +97,7 @@ void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -126,7 +126,7 @@ void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -149,7 +149,7 @@ void XgemmDirectStridedBatchedNN(const int kSizeM, const int kSizeN, const int k
|
|||
// Direct version of the strided-batched GEMM kernel with [A, B] = [non-transposed, transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -172,7 +172,7 @@ void XgemmDirectStridedBatchedNT(const int kSizeM, const int kSizeN, const int k
|
|||
// Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, non-transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -195,7 +195,7 @@ void XgemmDirectStridedBatchedTN(const int kSizeM, const int kSizeN, const int k
|
|||
// Direct version of the strided-batched GEMM kernel with [A, B] = [transposed, transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectStridedBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
|
|
@ -220,7 +220,7 @@ INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSize
|
|||
// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -239,7 +239,7 @@ void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -258,7 +258,7 @@ void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
@ -277,7 +277,7 @@ void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
|
|
@ -21,7 +21,7 @@ R"(
|
|||
// Main entry point of the kernel. This is the upper-triangular version.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
#endif
|
||||
void XgemmUpper(const int kSizeN, const int kSizeK,
|
||||
|
@ -61,7 +61,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
|
|||
// Main entry point of the kernel. This is the lower-triangular version.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
#endif
|
||||
void XgemmLower(const int kSizeN, const int kSizeK,
|
||||
|
@ -105,7 +105,7 @@ void XgemmLower(const int kSizeN, const int kSizeK,
|
|||
// Main entry point of the kernel. This is the regular full version.
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
#endif
|
||||
void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
|
|
|
@ -94,7 +94,7 @@ INLINE_FUNC void Xcol2im(const int input_h, const int input_w, const int channel
|
|||
// Kernel flip version of the Xcol2im kernel (for convolution)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
|
||||
#endif
|
||||
void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels,
|
||||
|
@ -119,7 +119,7 @@ void Xcol2imKernelFlip(const int input_h, const int input_w, const int channels,
|
|||
// Normal version of the Xcol2im kernel (for cross-correlation)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
|
||||
#endif
|
||||
void Xcol2imKernelNormal(const int input_h, const int input_w, const int channels,
|
||||
|
|
|
@ -76,7 +76,7 @@ INLINE_FUNC void Xim2col(const int input_h, const int input_w, const int channel
|
|||
// Kernel flip version of the Xim2col kernel (for convolution)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
|
||||
#endif
|
||||
void Xim2colKernelFlip(const int input_h, const int input_w, const int channels,
|
||||
|
@ -97,7 +97,7 @@ void Xim2colKernelFlip(const int input_h, const int input_w, const int channels,
|
|||
// Normal version of the Xim2col kernel (for cross-correlation)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
|
||||
#endif
|
||||
void Xim2colKernelNormal(const int input_h, const int input_w, const int channels,
|
||||
|
|
|
@ -25,7 +25,7 @@ R"(
|
|||
#if defined(CONVGEMM_WITH_IM2COL)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size,
|
||||
|
@ -291,7 +291,7 @@ INLINE_FUNC void Xconvgemm(const int num_patches, const int num_kernels, const i
|
|||
#if !defined(CONVGEMM_WITH_IM2COL)
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch_size,
|
||||
|
@ -316,7 +316,7 @@ void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch
|
|||
|
||||
#if RELAX_WORKGROUP_SIZE == 1
|
||||
__kernel
|
||||
#elif
|
||||
#else
|
||||
__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
|
||||
#endif
|
||||
void XconvgemmNormal(const int num_patches, const int num_kernels, const int patch_size,
|
||||
|
|
|
@ -43,7 +43,7 @@ std::shared_ptr<Program> CompileFromSource(
|
|||
|
||||
// For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
|
||||
// performance, but might result in reduced accuracy.
|
||||
if ((device.IsAMD() && device.IsGPU()) || device.IsQualcomm()) {
|
||||
if ((device.IsAMD() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
|
||||
header_string += "#define USE_CL_MAD 1\n";
|
||||
}
|
||||
|
||||
|
@ -54,7 +54,7 @@ std::shared_ptr<Program> CompileFromSource(
|
|||
|
||||
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
|
||||
// performance through better cache behaviour
|
||||
if ((device.IsARM() && device.IsGPU()) || device.IsQualcomm()) {
|
||||
if ((device.IsARM() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
|
||||
header_string += "#define GLOBAL_MEM_FENCE 1\n";
|
||||
}
|
||||
|
||||
|
|
|
@ -463,7 +463,7 @@ std::string GetDeviceArchitecture(const Device& device) {
|
|||
else if (device.HasExtension(kKhronosAttributesAMD)) {
|
||||
device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm
|
||||
}
|
||||
else if (device.IsQualcomm()) { // queries the Adreno GPU architecture version
|
||||
else if ((device.IsQualcomm() && device.IsGPU())) { // queries the Adreno GPU architecture version
|
||||
device_architecture = device.AdrenoVersion();
|
||||
}
|
||||
// Note: no else - 'device_architecture' might be the empty string
|
||||
|
|
Loading…
Reference in New Issue