Added a first version of the diagonal block invert routine in preparation for TRSM

pull/138/head
Cedric Nugteren 2017-01-15 17:30:00 +01:00
parent 4a4be0c3a5
commit 4b3ffd9989
13 changed files with 1062 additions and 1 deletion

View File

@ -159,7 +159,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
set(LEVELX_ROUTINES xomatcopy)
set(LEVELX_ROUTINES xomatcopy xinvert)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
set(PRECISIONS 32 64 3232 6464 16)

View File

@ -26,6 +26,7 @@
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
#include "database/kernels/invert.hpp"
#include "database/kernel_selection.hpp"
namespace clblast {
@ -45,6 +46,7 @@ const std::vector<const Database::DatabaseEntry*> Database::database = {
&database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
&database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
&database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
&database::InvertHalf, &database::InvertSingle, &database::InvertDouble, &database::InvertComplexSingle, &database::InvertComplexDouble,
&database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
};

View File

@ -0,0 +1,78 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// Tuning parameters for the diagonal matrix inversion kernels
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry InvertHalf = {
"Invert", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertSingle = {
"Invert", Precision::kSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertComplexSingle = {
"Invert", Precision::kComplexSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertDouble = {
"Invert", Precision::kDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertComplexDouble = {
"Invert", Precision::kComplexDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -162,6 +162,13 @@ R"(
#define AbsoluteValue(value) value = fabs(value)
#endif
// Negation (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define Negate(value) value.x = -(value.x); value.y = -(value.y)
#else
#define Negate(value) value = -(value)
#endif
// Adds two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
@ -193,6 +200,13 @@ R"(
#endif
#endif
// The scalar division function (the divisor is assumed to carry a real value in its .x component)
#if PRECISION == 3232 || PRECISION == 6464
#define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.y / b.x
#else
#define DivideReal(c, a, b) c = a / b
#endif
// The scalar AXPBY function
#if PRECISION == 3232 || PRECISION == 6464
#define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
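For the complex precisions these macros operate component-wise on float2/double2 values. A short illustrative expansion with made-up values (assuming PRECISION == 3232, so that real is float2):

real x = (float2)(3.0f, -4.0f);
real d = (float2)(2.0f, 0.0f);  // divisor carrying a real value in its .x component
Negate(x);                      // x becomes (-3.0f, 4.0f)
real r;
DivideReal(r, x, d);            // r becomes (-1.5f, 2.0f): both components divided by d.x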

View File

@ -0,0 +1,440 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains kernels to invert square diagonal blocks of a matrix. These kernels are based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
// =================================================================================================
//
// Let A be a block_size*block_size lower triangular matrix, and B its inverse.
// Then the block decomposition
//
// [ A11 0 ] * [ B11 0 ] = [ I 0 ]
// [ A21 A22 ] [ B21 B22 ] [ 0 I ]
//
// yields
//
// A11*B11 = I ==> B11 = A11^{-1},
// A22*B22 = I ==> B22 = A22^{-1},
// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11.
//
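// Worked 1 x 1 example of the above (illustrative numbers): a11 = 2, a21 = 3, a22 = 4 give
// b11 = 1/2, b22 = 1/4 and b21 = -b22*a21*b11 = -(1/4)*3*(1/2) = -3/8. Check:
// a21*b11 + a22*b21 = 3/2 - 3/2 = 0, as required.
//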
// The InvertDiagonalBlock kernel inverts A11 and A22.
// The TripleMatMul routines multiply:
// part 1: B21 = A21 * B11,
// part 2: B21 = -B22 * B21.
//
// At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each
// submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2
// at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block,
// here current_size*2=32,
// [ B11 0 ]
// which contains [ B21 B22 ].
// Outer blocks are block_size x block_size.
//
// A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on
// the right. This makes a single check easy to do.
//
// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed.
//
// We split this into part 1 & part 2 to synchronize all blocks and make sure
// that writes to B21 (or B12 in the upper-triangular case) are observed by all blocks.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
#define LOCALX 17 // 16 + 1 to avoid bank conflicts
#define LOCALY 16
// =================================================================================================
__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
void FillMatrix(const int n, const int ld, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
const int id_one = get_global_id(0);
const int id_two = get_global_id(1);
if (id_one < ld && id_two < n) {
dest[id_two*ld + id_one + offset] = value;
}
}
// =================================================================================================
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
__global real* restrict dest, const int outer_block_size,
const int unit_diagonal, const int is_upper)
{
const int thread_index = get_local_id(0);
const int block_index = get_group_id(0);
// Sets the offset for this particular block in the source and destination matrices
const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
// Go to the (block_index / num_inner_blocks) outer outer_block_size * outer_block_size block,
// then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE^2 block inside it
const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size +
                              (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE);
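// Example (illustrative): outer_block_size = 32, INTERNAL_BLOCK_SIZE = 16 and block_index = 3
// give num_inner_blocks = 2, so dest_block_offset = (3/2)*32*32 + (3%2)*(32*16 + 16) = 1552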
// Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
__local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
// Loads the source lower triangle into local memory. Any values in the upper triangle or
// outside of the matrix are set to zero
#pragma unroll
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
(thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
if (condition) {
lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
}
else {
SetToZero(lm[thread_index][j]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Inverts the diagonal
real inverted_diagonal;
SetToOne(inverted_diagonal);
if (unit_diagonal == 0) {
const real diagonal_value = lm[thread_index][thread_index];
if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
DivideReal(inverted_diagonal, inverted_diagonal, diagonal_value);
}
}
lm[thread_index][thread_index] = inverted_diagonal;
barrier(CLK_LOCAL_MEM_FENCE);
// Upper-triangular
if (is_upper) {
// Computes the elements 0:j-1 of the j-th column
for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
if (thread_index < j) {
real sum;
SetToZero(sum);
#pragma unroll
for (int k = 0; k < j; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Lower triangular
else {
// Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
if (thread_index > j) {
real sum;
SetToZero(sum);
#pragma unroll
for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
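// Worked 2 x 2 example (illustrative, INTERNAL_BLOCK_SIZE = 2): with A = [a00 0; a10 a11],
// the diagonal step leaves lm[0][0] = 1/a00 and lm[1][1] = 1/a11. For j = 0, thread 1 then
// computes sum = lm[1][1]*lm[1][0] = (1/a11)*a10 and lm[1][0] = -(1/a00)*sum = -a10/(a00*a11),
// which is exactly the lower-left entry of the closed-form inverse of A.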
// Writes the result to global memory
#pragma unroll
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
}
}
// =================================================================================================
// Triple matrix-multiplication kernel: C = A * B
inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
__global const real* agm, __global const real* bgm, __global real* cgm,
const int lda, const int ldb, const int ldc,
int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int by = get_group_id(1) / num_pages;
const int page = get_group_id(1) % num_pages;
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
const int iby = by*16;
const int id = lidx + lidy*get_local_size(0);
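// Example (illustrative): for the 4 x 4 work-group variants (size = 16), lidx and lidy are
// both in 0..3, so id runs over 0..15 and each work-group processes a 16-row strip of C.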
const int row = page*current_size*2 + current_size + ibx + id;
int col = page*current_size*2 + current_size;
// Sets the offsets for this specific thread
agm += ibx + id;
bgm += lidx + (iby + lidy)*ldb;
cgm += ibx + id + iby*ldc;
// Initializes the result registers
real cpm[16];
#pragma unroll
for (int j = 0; j < 16; ++j) {
SetToZero(cpm[j]);
}
// Computes an NT x 16 block of C; each thread computes one 1 x 16 row
for (int k = 0; k < current_size; k += 16) {
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
#pragma unroll
for (int i = 0; i < 16; i += (size/4)) { // += get_local_size(0)
#pragma unroll
for (int j = 0; j < 16; j += 4) { // += get_local_size(1)
blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Upper triangular
if (upper) {
// Performs 16 x 16 multiply-add operations
#pragma unroll
for (int i = 0; i < 16; ++i) {
if (part == 2 || col++ < n) {
#pragma unroll
for (int j = 0; j < 16; ++j) {
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
}
}
}
}
// Lower triangular
else {
if (row < n) {
// Performs 16 x 16 multiply-add operations
#pragma unroll
for (int i = 0; i < 16; ++i) {
#pragma unroll
for (int j = 0; j < 16; ++j) {
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the NT x 16 results: each thread writes one 1 x 16 row
#pragma unroll
for (int i = 0; i < 16; ++i) {
if (part == 2) { Negate(cpm[i]); }
cgm[0] = cpm[i];
cgm += ldc;
}
}
// =================================================================================================
// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
__global const real* src, const int a_offset, const int lda,
__global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
// Computes the destination block offset:
// - go to the (page / pages_per_block) outer block_size * block_size block
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
const int pages_per_block = block_size / (current_size*2);
dest += (page / pages_per_block) * block_size * block_size +
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
// Using the GEMM notation: C = A*B
__global const real* agm;
__global const real* bgm;
__global real* cgm;
if (upper) { // upper triangular: B12 = A12 * B22
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12
bgm = dest + current_size*block_size + current_size; // B22
cgm = dest + current_size*block_size; // B12
}
else { // lower triangular: B21 = A21 * B11
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21
bgm = dest; // B11
cgm = dest + current_size; // B21
}
// Runs the generic C = A * B matrix multiplication
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
// Triple matrix-multiplication kernel part 2: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
__global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
// Computes the destination block offset:
// - go to the (page / pages_per_block) outer block_size * block_size block
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
const int pages_per_block = block_size / (current_size*2);
dest += (page / pages_per_block) * block_size * block_size +
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
// Using the GEMM notation: C = A*B
__global const real* agm;
__global const real* bgm;
__global real* cgm;
if (upper) { // upper triangular: B12 = -B11 * B12
agm = dest; // B11
cgm = dest + current_size*block_size; // B12
bgm = cgm; // B12, okay to overwrite
}
else { // lower triangular: B21 = -B22 * B21
agm = dest + current_size*block_size + current_size; // B22
cgm = dest + current_size; // B21
bgm = cgm; // B21, okay to overwrite
}
// Runs the generic C = A * B matrix multiplication
const int lda = block_size;
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
// =================================================================================================
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
}
// =================================================================================================
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -33,6 +33,27 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// =================================================================================================
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const Program &program, const Database &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
auto kernel = Kernel(program, "FillMatrix");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, static_cast<int>(ld));
kernel.SetArgument(2, static_cast<int>(offset));
kernel.SetArgument(3, dest());
kernel.SetArgument(4, GetRealArg(constant_value));
auto local = std::vector<size_t>{8, 8};
auto global = std::vector<size_t>{Ceil(ld, 8), Ceil(n, 8)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
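A minimal usage sketch of this helper (illustrative only: queue, device, program, database and workspace_buffer are assumed to be in scope, with workspace_buffer a Buffer<float> of at least 16 * 64 elements):

auto fill_event = Event();
FillMatrix(queue, device, program, database, fill_event.pointer(), std::vector<Event>(),
           64, 16, 0, workspace_buffer, ConstantZero<float>());
fill_event.WaitForCompletion();  // the 16 x 64 column-major matrix is now zero-filled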
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>

View File

@ -0,0 +1,152 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common code to perform (partial) matrix inverting. This code is based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
// =================================================================================================
#include "routines/levelx/xinvert.hpp"
#include <string>
#include <vector>
#include <cassert>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/invert_diagonal_blocks.opencl"
}) {
}
// =================================================================================================
// Inverts diagonal square blocks of a matrix
template <typename T>
void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
const size_t n, const size_t block_size,
const Buffer<T> &src, const size_t offset, const size_t ld_src,
Buffer<T> &dest) {
// Makes sure all dimensions are larger than zero and the block size is smaller than n
if ((block_size == 0) || (n == 0) || (block_size > n)) {
throw BLASError(StatusCode::kInvalidDimension);
}
// Helper variables
const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
assert(internal_block_size == 16);
const auto num_blocks = CeilDiv(n, block_size);
const auto num_internal_blocks = CeilDiv(n, internal_block_size);
const auto unit_diagonal = (diag == Diagonal::kUnit);
// This routine only supports block sizes which are a multiple of the internal block size and
// block sizes up to and including 128
if ((block_size % internal_block_size != 0) || (block_size > 128)) {
throw BLASError(StatusCode::kInvalidDimension);
}
// Checks for validity of the source and destination matrices
TestMatrixA(n, n, src, offset, ld_src);
TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size);
// Determines which kernels to run based on the layout (the kernels assume column-major as
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto name_postfix = (is_upper) ? "Upper" : "Lower";
// Retrieves the program from the cache
auto event_wait_list = std::vector<Event>();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "INVERT");
// Fills the output buffer with zeros
auto fill_matrix_event = Event();
FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), event_wait_list,
num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
event_wait_list.push_back(fill_matrix_event);
// Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
auto kernel = Kernel(program, "InvertDiagonalBlock");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, src());
kernel.SetArgument(2, static_cast<int>(offset));
kernel.SetArgument(3, static_cast<int>(ld_src));
kernel.SetArgument(4, dest());
kernel.SetArgument(5, static_cast<int>(block_size));
kernel.SetArgument(6, static_cast<int>(unit_diagonal));
kernel.SetArgument(7, static_cast<int>(is_upper));
const auto local = std::vector<size_t>{internal_block_size};
const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
auto base_kernel_event = Event();
RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list);
event_wait_list.push_back(base_kernel_event);
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
// use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads;
// then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads;
// then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads;
for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) {
assert(current_size == 16 || current_size == 32 || current_size == 64);
// Emulates a 3D grid: NX * (NY * npages)
const auto npages = CeilDiv(n, current_size*2);
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
const auto local = std::vector<size_t>{local0, 4};
const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
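// Example (illustrative): for current_size = 16 this gives local = {4, 4} and
// global = {4, npages * 4}, i.e. a 1 x (1 x npages) grid of 4 x 4 work-groups,
// matching the first line of the scheme in the comment above.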
// Part 1
auto kernel1 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, src());
kernel1.SetArgument(2, static_cast<int>(offset));
kernel1.SetArgument(3, static_cast<int>(ld_src));
kernel1.SetArgument(4, dest());
kernel1.SetArgument(5, static_cast<int>(current_size));
kernel1.SetArgument(6, static_cast<int>(npages));
kernel1.SetArgument(7, static_cast<int>(block_size));
auto kernel1_event = Event();
RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list);
event_wait_list.push_back(kernel1_event);
// Part 2
const bool is_last_kernel = (current_size * 2 >= block_size);
auto kernel2 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
kernel2.SetArgument(0, static_cast<int>(n));
kernel2.SetArgument(1, dest());
kernel2.SetArgument(2, static_cast<int>(current_size));
kernel2.SetArgument(3, static_cast<int>(npages));
kernel2.SetArgument(4, static_cast<int>(block_size));
auto kernel2_event = Event();
auto event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
RunKernel(kernel2, queue_, device_, global, local, event_pointer, event_wait_list);
if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
// Exit in case we reach beyond the bounds of the input matrix
if (current_size*2 >= n) { break; }
}
}
// =================================================================================================
// Compiles the templated class
template class Xinvert<half>;
template class Xinvert<float>;
template class Xinvert<double>;
template class Xinvert<float2>;
template class Xinvert<double2>;
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,40 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common code to perform (partial) matrix inverting.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XINVERT_H_
#define CLBLAST_ROUTINES_XINVERT_H_
#include "routine.hpp"
namespace clblast {
// =================================================================================================
template <typename T>
class Xinvert: public Routine {
public:
// Constructor
Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT");
// Inverts diagonal square blocks of a matrix
void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
const size_t n, const size_t block_size,
const Buffer<T> &src, const size_t offset, const size_t ld_src,
Buffer<T> &dest);
};
// =================================================================================================
} // namespace clblast
#endif // CLBLAST_ROUTINES_XINVERT_H_

View File

@ -46,6 +46,30 @@ double2 GetScalar() {
return {2.0, 0.5};
}
// Returns a scalar of value 0
template <typename T>
T ConstantZero() {
return static_cast<T>(0.0);
}
template float ConstantZero<float>();
template double ConstantZero<double>();
// Specialized version of the above for half-precision
template <>
half ConstantZero() {
return FloatToHalf(0.0f);
}
// Specialized versions of the above for complex data-types
template <>
float2 ConstantZero() {
return {0.0f, 0.0f};
}
template <>
double2 ConstantZero() {
return {0.0, 0.0};
}
// Returns a scalar of value 1
template <typename T>
T ConstantOne() {

View File

@ -102,6 +102,10 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
template <typename T>
T GetScalar();
// Returns a scalar of value 0
template <typename T>
T ConstantZero();
// Returns a scalar of value 1
template <typename T>
T ConstantOne();

View File

@ -0,0 +1,30 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xinvert.hpp"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
auto errors = size_t{0};
errors += clblast::RunTests<clblast::TestXinvert<float>, float, float>(argc, argv, false, "SINVERT");
errors += clblast::RunTests<clblast::TestXinvert<double>, double, double>(argc, argv, true, "DINVERT");
errors += clblast::RunTests<clblast::TestXinvert<float2>, float2, float2>(argc, argv, true, "CINVERT");
errors += clblast::RunTests<clblast::TestXinvert<double2>, double2, double2>(argc, argv, true, "ZINVERT");
errors += clblast::RunTests<clblast::TestXinvert<half>, half, half>(argc, argv, true, "HINVERT");
if (errors > 0) { return 1; } else { return 0; }
}
// =================================================================================================

View File

@ -0,0 +1,37 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "test/performance/client.hpp"
#include "test/routines/levelx/xinvert.hpp"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf:
clblast::RunClient<clblast::TestXinvert<half>, half, half>(argc, argv); break;
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXinvert<float>, float, float>(argc, argv); break;
case clblast::Precision::kDouble:
clblast::RunClient<clblast::TestXinvert<double>, double, double>(argc, argv); break;
case clblast::Precision::kComplexSingle:
clblast::RunClient<clblast::TestXinvert<float2>, float2, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble:
clblast::RunClient<clblast::TestXinvert<double2>, double2, double2>(argc, argv); break;
}
return 0;
}
// =================================================================================================

View File

@ -0,0 +1,219 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xinvert routine. Examples of
// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================
#ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_
#define CLBLAST_TEST_ROUTINES_XINVERT_H_
#include <vector>
#include <string>
#include "routines/levelx/xinvert.hpp"
namespace clblast {
// =================================================================================================
template <typename T>
StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) ||
(args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor));
// Data transfer from OpenCL to std::vector
std::vector<T> a_mat_cpu(args.a_size, T{0.0});
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
// Creates the output buffer
std::vector<T> b_mat_cpu(args.b_size, T{0.0});
// Helper variables
const auto block_size = args.m;
const auto num_blocks = CeilDiv(args.n, block_size);
const auto a_ld = args.a_ld;
const auto b_ld = block_size;
if ((block_size == 0) || (args.n == 0) || (block_size > args.n)) {
return StatusCode::kInvalidDimension;
}
// Loops over the number of diagonal blocks of size args.m by args.m each
for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) {
const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset;
const auto b_offset = block_id * block_size * block_size;
// Inverts the diagonal elements of the matrix
for (auto i = size_t{0}; i < block_size; ++i) {
auto a_value = T{1.0};
if (args.diagonal == Diagonal::kNonUnit) {
if (i + block_id * block_size < args.n) {
if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; }
a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset];
}
}
b_mat_cpu[i * b_ld + i + b_offset] = a_value;
}
// Inverts the upper triangle row by row
if (is_upper) {
for (int i = static_cast<int>(block_size) - 2; i >= 0; --i) {
for (auto j = static_cast<int>(block_size) - 1; j > i; --j) {
auto sum = T{0.0};
for (auto k = i + 1; k <= j; ++k) {
auto a_value = T{0.0};
if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
a_value = a_mat_cpu[k * a_ld + i + a_offset];
}
sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
}
b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
}
}
}
// Inverts the lower triangle row by row
else {
for (auto i = size_t{1}; i < block_size; ++i) {
for (auto j = size_t{0}; j < i; ++j) {
auto sum = T{0.0};
for (auto k = j; k < i; ++k) {
auto a_value = T{0.0};
if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
a_value = a_mat_cpu[k * a_ld + i + a_offset];
}
sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
}
b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
}
}
}
}
// Data transfer back to OpenCL
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
return StatusCode::kSuccess;
}
// Half-precision version calling the above reference implementation after conversions
template <>
StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) {
auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue());
auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue());
auto dummy = clblast::Buffer<float>(0);
auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
auto args2 = Arguments<float>();
args2.a_size = args.a_size; args2.b_size = args.b_size;
args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n;
args2.a_offset = args.a_offset;
args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal;
auto status = RunReference(args2, buffers2, queue);
FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue());
return status;
}
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXinvert {
public:
// The BLAS level: 4 for the extra routines
static size_t BLASLevel() { return 4; }
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() {
return {kArgN, kArgM,
kArgLayout, kArgTriangle, kArgDiagonal,
kArgALeadDim, kArgAOffset};
}
// Describes how to obtain the sizes of the buffers
static size_t GetSizeA(const Arguments<T> &args) {
return args.n * args.a_ld + args.a_offset;
}
static size_t GetSizeB(const Arguments<T> &args) {
const auto block_size = args.m;
const auto num_blocks = CeilDiv(args.n, block_size);
return num_blocks * block_size * block_size;
}
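// Example (illustrative): n = 100 and m = block_size = 16 give num_blocks = CeilDiv(100, 16) = 7,
// so the B workspace holds 7 * 16 * 16 = 1792 elements (one dense 16 x 16 tile per diagonal block).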
// Describes how to set the sizes of all the buffers
static void SetSizes(Arguments<T> &args) {
args.a_size = GetSizeA(args);
args.b_size = GetSizeB(args);
}
// Describes what the default values of the leading dimensions of the matrices are
static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
// Describes which transpose options are relevant for this routine
using Transposes = std::vector<Transpose>;
static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
try {
auto event = cl_event{};
auto inverter = Xinvert<T>(queue, &event);
inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
args.n, args.m,
buffers.a_mat, args.a_offset, args.a_ld,
buffers.b_mat);
clWaitForEvents(1, &event);
clReleaseEvent(event);
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
// Describes how to run a naive version of the routine (for correctness/performance comparison).
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
return RunReference(args, buffers, queue);
}
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
return RunReference(args, buffers, queue);
}
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> result(args.b_size, static_cast<T>(0));
buffers.b_mat.Read(queue, args.b_size, result);
return result;
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) { return args.m; }
static size_t ResultID2(const Arguments<T> &args) { return Ceil(args.n, args.m); }
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
return id1 * Ceil(args.n, args.m) + id2;
}
// Describes how to compute performance metrics
static size_t GetFlops(const Arguments<T> &args) {
const auto block_size = args.m;
const auto num_blocks = CeilDiv(args.n, block_size);
return num_blocks * (block_size * (block_size / 2) * (block_size / 2));
}
static size_t GetBytes(const Arguments<T> &args) {
return (args.a_size + args.b_size) * sizeof(T);
}
};
// =================================================================================================
} // namespace clblast
#endif // CLBLAST_TEST_ROUTINES_XINVERT_H_