Some potential fixes for error -54 when launching TRSV and TRSM kernels

CLBlast-288-local-memory-optional-for-direct-GEMM
Cedric Nugteren 2018-05-31 20:09:49 +02:00
parent ff4d5558a6
commit e609220393
4 changed files with 16 additions and 15 deletions

View File

@ -18,7 +18,7 @@ R"(
// =================================================================================================
#if defined(ROUTINE_TRSV)
__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
__kernel
void FillVector(const int n, const int inc, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);

View File

@ -19,7 +19,7 @@ R"(
#if defined(ROUTINE_INVERT)
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -28,7 +28,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -36,7 +36,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -45,7 +45,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -53,7 +53,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -62,7 +62,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -72,7 +72,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
// =================================================================================================
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -81,7 +81,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -89,7 +89,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -98,7 +98,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -106,7 +106,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -115,7 +115,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];

View File

@ -76,7 +76,7 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
__kernel
void FillMatrix(const int m, const int n, const int ld, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);

View File

@ -113,7 +113,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
const auto npages = CeilDiv(n, current_size*2);
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
const auto local = std::vector<size_t>{local0, 4};
const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
const auto global = std::vector<size_t>{Ceil(current_size/local[1], local[0]),
Ceil(npages*(current_size/16)*local[1], local[1])};
// Part 1
auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);