Fixed several (not all) CUDA kernel compilation issues

pull/204/head
Cedric Nugteren 2017-10-14 16:01:12 +02:00
parent 74d6e0048c
commit 313fc796b2
2 changed files with 48 additions and 28 deletions


@@ -23,15 +23,18 @@ R"(
 #endif
 // =================================================================================================
+#ifndef CUDA
-// Enable support for half-precision
-#if PRECISION == 16
-  #pragma OPENCL EXTENSION cl_khr_fp16: enable
-#endif
+  // Enable support for half-precision
+  #if PRECISION == 16
+    #pragma OPENCL EXTENSION cl_khr_fp16: enable
+  #endif
-// Enable support for double-precision
-#if PRECISION == 64 || PRECISION == 6464
-  #pragma OPENCL EXTENSION cl_khr_fp64: enable
-#endif
+  // Enable support for double-precision
+  #if PRECISION == 64 || PRECISION == 6464
+    #pragma OPENCL EXTENSION cl_khr_fp64: enable
+  #endif
+#endif
 // Half-precision
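Background on the new #ifndef CUDA guard above: the #pragma OPENCL EXTENSION directives are OpenCL-specific and mean nothing to a CUDA compiler, so the guard simply hides them from the CUDA path. Under CUDA, double precision is native and half precision comes from a toolkit header instead; a minimal sketch of that CUDA-side alternative (illustrative only, not taken from CLBlast):

// Sketch (assumption): on the CUDA path no extension pragmas are needed;
// the __half type is provided by the CUDA toolkit header instead.
#ifdef CUDA
  #if PRECISION == 16
    #include <cuda_fp16.h>   // supplies __half, the CUDA counterpart of cl_khr_fp16
  #endif
#endif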
@@ -254,18 +257,18 @@ R"(
 // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
 // More details: https://github.com/CNugteren/CLBlast/issues/53
 #if USE_STAGGERED_INDICES == 1
-  INLINE_FUNC size_t GetGroupIDFlat() {
+  INLINE_FUNC int GetGroupIDFlat() {
     return get_group_id(0) + get_num_groups(0) * get_group_id(1);
   }
-  INLINE_FUNC size_t GetGroupID1() {
+  INLINE_FUNC int GetGroupID1() {
     return (GetGroupIDFlat()) % get_num_groups(1);
   }
-  INLINE_FUNC size_t GetGroupID0() {
+  INLINE_FUNC int GetGroupID0() {
     return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
   }
 #else
-  INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); }
-  INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); }
+  INLINE_FUNC int GetGroupID1() { return get_group_id(1); }
+  INLINE_FUNC int GetGroupID0() { return get_group_id(0); }
 #endif
 // =================================================================================================
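The size_t to int change above presumably keeps these helpers compatible with the int-based index wrappers of the CUDA translation header (second file below). For readers unfamiliar with the staggered-indices trick referenced in the comment, here is a minimal CUDA sketch of the same diagonal remapping; the kernel name, the TILE constant and the square-grid assumption are mine, not CLBlast's:

// Illustrative CUDA sketch of diagonal work-group reordering (not CLBlast code).
// Assumptions: an n x n row-major matrix, a TILE x TILE thread-block, and a grid
// that covers the matrix exactly; real implementations add shared-memory tiling.
#define TILE 16

extern "C" __global__ void transpose_staggered(float* out, const float* in, const int n) {
  // Flat block index, mirroring GetGroupIDFlat()
  const int flat = blockIdx.x + gridDim.x * blockIdx.y;
  // Diagonal remapping, mirroring GetGroupID1() and GetGroupID0()
  const int group1 = flat % gridDim.y;
  const int group0 = ((flat / gridDim.y) + group1) % gridDim.x;

  const int x = group0 * TILE + threadIdx.x;
  const int y = group1 * TILE + threadIdx.y;
  if (x < n && y < n) {
    out[x * n + y] = in[y * n + x];   // consecutive blocks now hit different memory partitions
  }
}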


@@ -16,32 +16,28 @@
 R"(
 // =================================================================================================
-// Replaces the OpenCL keywords with CUDA equivalent
-#define __kernel __placeholder__
-#define __global
-#define __placeholder__ extern "C" __global__
-#define __local __shared__
-#define restrict __restrict__
-#define __constant const
-#define inline __device__ inline // assumes all device functions are annotated with inline in OpenCL
-// Replaces OpenCL synchronisation with CUDA synchronisation
-#define barrier(x) __syncthreads()
 // CLBlast specific additions
 #define CUDA 1
 // Replaces the OpenCL get_xxx_ID with CUDA equivalents
-__device__ int get_local_id(int x) {
+__device__ int get_local_id(const int x) {
   if (x == 0) { return threadIdx.x; }
   if (x == 1) { return threadIdx.y; }
   return threadIdx.z;
 }
-__device__ int get_group_id(int x) {
+__device__ int get_group_id(const int x) {
   if (x == 0) { return blockIdx.x; }
-  if (x == 1) { return blockIdx.y;}
+  if (x == 1) { return blockIdx.y; }
   return blockIdx.z;
 }
-__device__ int get_global_id(int x) {
+__device__ int get_global_size(const int x) {
+  if (x == 0) { return gridDim.x; }
+  if (x == 1) { return gridDim.y; }
+  return gridDim.z;
+}
+__device__ int get_global_id(const int x) {
   if (x == 0) { return blockIdx.x*blockDim.x + threadIdx.x; }
-  if (y == 0) { return blockIdx.y*blockDim.y + threadIdx.y; }
+  if (x == 1) { return blockIdx.y*blockDim.y + threadIdx.y; }
   return blockIdx.z*blockDim.z + threadIdx.z;
 }
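To make the effect of this translation header concrete, here is what a small, hypothetical OpenCL kernel (an axpy, not one of CLBlast's kernels) effectively becomes once the keyword #defines (moved further down the file by this commit) and the get_xxx_id wrappers above are applied:

// Hypothetical OpenCL input (illustrative only):
//   __kernel void axpy(const int n, const float alpha,
//                      const __global float* restrict x, __global float* y) {
//     const int i = get_global_id(0);
//     if (i < n) { y[i] = alpha * x[i] + y[i]; }
//   }
// What the CUDA compiler sees after the #defines are substituted:
extern "C" __global__ void axpy(const int n, const float alpha,
                                const float* __restrict__ x, float* y) {
  const int i = get_global_id(0);   // now calls the __device__ wrapper defined above
  if (i < n) { y[i] = alpha * x[i] + y[i]; }
}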
@@ -52,6 +48,27 @@ typedef struct { float s0; float s1; float s2; float s3;
                  float s4; float s5; float s6; float s7;
                  float s8; float s9; float s10; float s11;
                  float s12; float s13; float s14; float s15; } float16;
+typedef struct { double s0; double s1; double s2; double s3;
+                 double s4; double s5; double s6; double s7; } double8;
+typedef struct { double s0; double s1; double s2; double s3;
+                 double s4; double s5; double s6; double s7;
+                 double s8; double s9; double s10; double s11;
+                 double s12; double s13; double s14; double s15; } double16;
+// Replaces the OpenCL keywords with CUDA equivalent
+#define __kernel __placeholder__
+#define __global
+#define __placeholder__ extern "C" __global__
+#define __local __shared__
+#define restrict __restrict__
+#define __constant const
+#define inline __device__ // assumes all device functions are annotated with inline in OpenCL
+// Kernel attributes (don't replace currently)
+#define reqd_work_group_size(x, y, z)
+// Replaces OpenCL synchronisation with CUDA synchronisation
+#define barrier(x) __syncthreads()
 // =================================================================================================
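One caveat about the struct-based vector typedefs added here: they only reproduce OpenCL's .s0 ... .sN component access, not the element-wise operators of real OpenCL vector types, so device code has to touch the components explicitly. A minimal sketch under that assumption (the helper name is mine, not CLBlast's):

// Sketch: summing a double8 component by component, since the struct typedef above
// provides no operator overloads (unlike an OpenCL double8).
__device__ double SumDouble8(const double8 v) {
  return v.s0 + v.s1 + v.s2 + v.s3 + v.s4 + v.s5 + v.s6 + v.s7;
}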