290 lines
9.3 KiB
Plaintext
290 lines
9.3 KiB
Plaintext
|
|
// =================================================================================================
|
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
// width of 100 characters per line.
|
|
//
|
|
// Author(s):
|
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
//
|
|
// This file contains the common defines and type-defs for the CLBlast OpenCL kernels.
|
|
//
|
|
// =================================================================================================
|
|
|
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
|
R"(
|
|
// =================================================================================================
|
|
|
|
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
|
// this file is used outside of the CLBlast library.
|
|
#ifndef PRECISION
|
|
#define PRECISION 32 // Data-types: half, single or double precision, complex or regular
|
|
#endif
|
|
|
|
// =================================================================================================
|
|
|
|
#ifndef CUDA
|
|
// Enable support for half-precision
|
|
#if PRECISION == 16
|
|
#pragma OPENCL EXTENSION cl_khr_fp16: enable
|
|
#endif
|
|
|
|
// Enable support for double-precision
|
|
#if PRECISION == 64 || PRECISION == 6464
|
|
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
|
#endif
|
|
#endif
|
|
|
|
// Half-precision
|
|
#if PRECISION == 16
|
|
typedef half real;
|
|
typedef half2 real2;
|
|
typedef half4 real4;
|
|
typedef half8 real8;
|
|
typedef half16 real16;
|
|
#define ZERO 0
|
|
#define ONE 1
|
|
#define SMALLEST -1.0e14
|
|
|
|
// Single-precision
|
|
#elif PRECISION == 32
|
|
typedef float real;
|
|
typedef float2 real2;
|
|
typedef float4 real4;
|
|
typedef float8 real8;
|
|
typedef float16 real16;
|
|
#define ZERO 0.0f
|
|
#define ONE 1.0f
|
|
#define SMALLEST -1.0e37f
|
|
|
|
// Double-precision
|
|
#elif PRECISION == 64
|
|
typedef double real;
|
|
typedef double2 real2;
|
|
typedef double4 real4;
|
|
typedef double8 real8;
|
|
typedef double16 real16;
|
|
#define ZERO 0.0
|
|
#define ONE 1.0
|
|
#define SMALLEST -1.0e37
|
|
|
|
// Complex single-precision
|
|
#elif PRECISION == 3232
|
|
typedef float2 real;
|
|
typedef struct cfloat2 {real x; real y;} real2;
|
|
typedef struct cfloat4 {real x; real y; real z; real w;} real4;
|
|
typedef struct cfloat8 {real s0; real s1; real s2; real s3;
|
|
real s4; real s5; real s6; real s7;} real8;
|
|
typedef struct cfloat16 {real s0; real s1; real s2; real s3;
|
|
real s4; real s5; real s6; real s7;
|
|
real s8; real s9; real sA; real sB;
|
|
real sC; real sD; real sE; real sF;} real16;
|
|
#define ZERO 0.0f
|
|
#define ONE 1.0f
|
|
#define SMALLEST -1.0e37f
|
|
|
|
// Complex double-precision
|
|
#elif PRECISION == 6464
|
|
typedef double2 real;
|
|
typedef struct cdouble2 {real x; real y;} real2;
|
|
typedef struct cdouble4 {real x; real y; real z; real w;} real4;
|
|
typedef struct cdouble8 {real s0; real s1; real s2; real s3;
|
|
real s4; real s5; real s6; real s7;} real8;
|
|
typedef struct cdouble16 {real s0; real s1; real s2; real s3;
|
|
real s4; real s5; real s6; real s7;
|
|
real s8; real s9; real sA; real sB;
|
|
real sC; real sD; real sE; real sF;} real16;
|
|
#define ZERO 0.0
|
|
#define ONE 1.0
|
|
#define SMALLEST -1.0e37
|
|
#endif
|
|
|
|
// Single-element version of a complex number
|
|
#if PRECISION == 3232
|
|
typedef float singlereal;
|
|
#elif PRECISION == 6464
|
|
typedef double singlereal;
|
|
#else
|
|
typedef real singlereal;
|
|
#endif
|
|
|
|
// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no
|
|
// conversion, but half-precision is not supported as kernel argument so it is converted from float.
|
|
#if PRECISION == 16
|
|
typedef float real_arg;
|
|
#define GetRealArg(x) (half)x
|
|
#else
|
|
typedef real real_arg;
|
|
#define GetRealArg(x) x
|
|
#endif
|
|
|
|
// Pointers to local memory objects (using a define because CUDA doesn't need them)
|
|
#ifndef LOCAL_PTR
|
|
#define LOCAL_PTR __local
|
|
#endif
|
|
|
|
// =================================================================================================
|
|
|
|
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific
|
|
// devices, this is enabled (see src/routine.cpp).
|
|
#ifndef USE_CL_MAD
|
|
#define USE_CL_MAD 0
|
|
#endif
|
|
|
|
// By default the workgroup size requirement is enabled. For Qualcomm devices the workgroup size
|
|
// requirement results in worse performance and is disabled (src/utilities/compile.cpp)
|
|
#ifndef RELAX_WORKGROUP_SIZE
|
|
#define RELAX_WORKGROUP_SIZE 0
|
|
#endif
|
|
|
|
// Sets a variable to zero
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define SetToZero(a) a.x = ZERO; a.y = ZERO
|
|
#else
|
|
#define SetToZero(a) a = ZERO
|
|
#endif
|
|
|
|
// Sets a variable to zero (only the imaginary part)
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define ImagToZero(a) a.y = ZERO
|
|
#else
|
|
#define ImagToZero(a)
|
|
#endif
|
|
|
|
// Sets a variable to one
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define SetToOne(a) a.x = ONE; a.y = ZERO
|
|
#else
|
|
#define SetToOne(a) a = ONE
|
|
#endif
|
|
|
|
// Determines whether a variable is zero
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define IsZero(a) ((a.x == ZERO) && (a.y == ZERO))
|
|
#else
|
|
#define IsZero(a) (a == ZERO)
|
|
#endif
|
|
|
|
// The absolute value (component-wise)
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
|
|
#else
|
|
#define AbsoluteValue(value) value = fabs(value)
|
|
#endif
|
|
|
|
// Negation (component-wise)
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define Negate(value) value.x = -(value.x); value.y = -(value.y)
|
|
#else
|
|
#define Negate(value) value = -(value)
|
|
#endif
|
|
|
|
// Adds two complex variables
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define Add(c,a,b) c.x = a.x + b.x; c.y = a.y + b.y
|
|
#else
|
|
#define Add(c,a,b) c = a + b
|
|
#endif
|
|
|
|
// Subtracts two complex variables
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define Subtract(c,a,b) c.x = a.x - b.x; c.y = a.y - b.y
|
|
#else
|
|
#define Subtract(c,a,b) c = a - b
|
|
#endif
|
|
|
|
// Multiply two complex variables (used in the defines below)
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define MulReal(a,b) a.x*b.x - a.y*b.y
|
|
#define MulImag(a,b) a.x*b.y + a.y*b.x
|
|
#endif
|
|
|
|
// The scalar multiply function
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define Multiply(c,a,b) c.x = MulReal(a,b); c.y = MulImag(a,b)
|
|
#else
|
|
#define Multiply(c,a,b) c = a * b
|
|
#endif
|
|
|
|
// The scalar multiply-add function
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define MultiplyAdd(c,a,b) c.x += MulReal(a,b); c.y += MulImag(a,b)
|
|
#else
|
|
#if USE_CL_MAD == 1
|
|
#define MultiplyAdd(c,a,b) c = mad(a, b, c)
|
|
#else
|
|
#define MultiplyAdd(c,a,b) c += a * b
|
|
#endif
|
|
#endif
|
|
|
|
// The scalar multiply-subtract function
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define MultiplySubtract(c,a,b) c.x -= MulReal(a,b); c.y -= MulImag(a,b)
|
|
#else
|
|
#define MultiplySubtract(c,a,b) c -= a * b
|
|
#endif
|
|
|
|
// The scalar division function: full division
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define DivideFull(c,a,b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom
|
|
#else
|
|
#define DivideFull(c,a,b) c = a / b
|
|
#endif
|
|
|
|
// The scalar AXPBY function
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define AXPBY(e,a,b,c,d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
|
|
#else
|
|
#define AXPBY(e,a,b,c,d) e = a*b + c*d
|
|
#endif
|
|
|
|
// The complex conjugate operation for complex transforms
|
|
#if PRECISION == 3232 || PRECISION == 6464
|
|
#define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
|
|
#else
|
|
#define COMPLEX_CONJUGATE(value)
|
|
#endif
|
|
|
|
// =================================================================================================
|
|
|
|
// Force inlining functions or not: some compilers don't support the inline keyword
|
|
#ifdef USE_INLINE_KEYWORD
|
|
#define INLINE_FUNC inline
|
|
#else
|
|
#define INLINE_FUNC
|
|
#endif
|
|
|
|
// =================================================================================================
|
|
|
|
// Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is
|
|
// enabled (see src/routine.cc).
|
|
#ifndef USE_STAGGERED_INDICES
|
|
#define USE_STAGGERED_INDICES 0
|
|
#endif
|
|
|
|
// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from:
|
|
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
|
|
// More details: https://github.com/CNugteren/CLBlast/issues/53
|
|
#if USE_STAGGERED_INDICES == 1 && GEMMK == 0
|
|
INLINE_FUNC int GetGroupIDFlat() {
|
|
return get_group_id(0) + get_num_groups(0) * get_group_id(1);
|
|
}
|
|
INLINE_FUNC int GetGroupID1() {
|
|
return (GetGroupIDFlat()) % get_num_groups(1);
|
|
}
|
|
INLINE_FUNC int GetGroupID0() {
|
|
return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
|
|
}
|
|
#else
|
|
INLINE_FUNC int GetGroupID1() { return get_group_id(1); }
|
|
INLINE_FUNC int GetGroupID0() { return get_group_id(0); }
|
|
#endif
|
|
|
|
// =================================================================================================
|
|
|
|
// End of the C++11 raw string literal
|
|
)"
|
|
|
|
// =================================================================================================
|