// Mirror of https://github.com/CNugteren/CLBlast.git (synced 2024-07-15)
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common code to perform (partial) matrix inverting. This code is based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
// =================================================================================================

#include "routines/levelx/xinvert.hpp"

#include <string>
#include <vector>
#include <assert.h>

namespace clblast {
|
|
// =================================================================================================
|
|
|
|
// Constructor: forwards everything to the base-class Routine constructor, registering the
// "Invert" kernel family and splicing in the OpenCL kernel sources this routine requires.
template <typename T>
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name)
    : Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {},
              {
              #include "../../kernels/level3/level3.opencl"
              #include "../../kernels/level3/invert_diagonal_blocks.opencl"
              }) {
}
|
|
|
|
// =================================================================================================
|
|
|
|
// Inverts diagonal square blocks of a matrix
|
|
template <typename T>
|
|
void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
|
|
const size_t n, const size_t block_size,
|
|
const Buffer<T> &src, const size_t offset, const size_t ld_src,
|
|
Buffer<T> &dest) {
|
|
|
|
// Makes sure all dimensions are larger than zero
|
|
if ((block_size == 0) || (n == 0)) {
|
|
throw BLASError(StatusCode::kInvalidDimension);
|
|
}
|
|
|
|
// Helper variables
|
|
const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
|
|
assert(internal_block_size == 16);
|
|
const auto num_blocks = CeilDiv(n, block_size);
|
|
const auto num_internal_blocks = CeilDiv(n, internal_block_size);
|
|
const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;
|
|
|
|
// This routine only supports block sizes which are a multiple of the internal block size and
|
|
// block sizes up to and including 128
|
|
if ((block_size % internal_block_size != 0) || (block_size > 128)) {
|
|
throw BLASError(StatusCode::kUnknownError);
|
|
}
|
|
|
|
// Checks for validity of the source and destination matrices
|
|
TestMatrixA(n, n, src, offset, ld_src);
|
|
TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size);
|
|
|
|
// Determines which kernels to run based on the layout (the kernels assume column-major as
|
|
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
|
|
const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
|
|
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
|
const auto name_postfix = (is_upper) ? "Upper" : "Lower";
|
|
|
|
// Fills the output buffer with zeros
|
|
auto event_wait_list = std::vector<Event>();
|
|
auto fill_matrix_event = Event();
|
|
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list,
|
|
block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
|
|
event_wait_list.push_back(fill_matrix_event);
|
|
|
|
// Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
|
|
auto kernel = Kernel(program_, "InvertDiagonalBlock");
|
|
kernel.SetArgument(0, static_cast<int>(n));
|
|
kernel.SetArgument(1, src());
|
|
kernel.SetArgument(2, static_cast<int>(offset));
|
|
kernel.SetArgument(3, static_cast<int>(ld_src));
|
|
kernel.SetArgument(4, dest());
|
|
kernel.SetArgument(5, static_cast<int>(block_size));
|
|
kernel.SetArgument(6, static_cast<int>(unit_diagonal));
|
|
kernel.SetArgument(7, static_cast<int>(is_upper));
|
|
const auto local = std::vector<size_t>{internal_block_size};
|
|
const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
|
|
auto base_kernel_event = Event();
|
|
auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
|
|
RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
|
|
if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
|
|
|
|
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
|
|
// use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads;
|
|
// then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads;
|
|
// then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads;
|
|
for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) {
|
|
assert(current_size == 16 || current_size == 32 || current_size == 64);
|
|
|
|
// Emulates a 3D grid: NX * (NY * npages)
|
|
const auto npages = CeilDiv(n, current_size*2);
|
|
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
|
|
const auto local = std::vector<size_t>{local0, 4};
|
|
const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
|
|
|
|
// Part 1
|
|
auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
|
|
kernel1.SetArgument(0, static_cast<int>(n));
|
|
kernel1.SetArgument(1, src());
|
|
kernel1.SetArgument(2, static_cast<int>(offset));
|
|
kernel1.SetArgument(3, static_cast<int>(ld_src));
|
|
kernel1.SetArgument(4, dest());
|
|
kernel1.SetArgument(5, static_cast<int>(current_size));
|
|
kernel1.SetArgument(6, static_cast<int>(npages));
|
|
kernel1.SetArgument(7, static_cast<int>(block_size));
|
|
auto kernel1_event = Event();
|
|
RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list);
|
|
event_wait_list.push_back(kernel1_event);
|
|
|
|
// Part 2
|
|
const bool is_last_kernel = (current_size * 2 >= block_size);
|
|
auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
|
|
kernel2.SetArgument(0, static_cast<int>(n));
|
|
kernel2.SetArgument(1, dest());
|
|
kernel2.SetArgument(2, static_cast<int>(current_size));
|
|
kernel2.SetArgument(3, static_cast<int>(npages));
|
|
kernel2.SetArgument(4, static_cast<int>(block_size));
|
|
auto kernel2_event = Event();
|
|
auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
|
|
RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list);
|
|
if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
|
|
|
|
// Exit in case we reach beyond the bounds of the input matrix
|
|
if (current_size*2 >= n) { break; }
|
|
}
|
|
}
|
|
|
|
// =================================================================================================
|
|
|
|
// Compiles the templated class: explicit instantiations for all precisions supported by CLBlast
// (half, single, double, and the single/double complex types).
template class Xinvert<half>;
template class Xinvert<float>;
template class Xinvert<double>;
template class Xinvert<float2>;
template class Xinvert<double2>;
|
|
|
|
// =================================================================================================
|
|
} // namespace clblast
|