treewide: use C++ exceptions properly

Since the codebase is designed around proper C++ idioms such as RAII, it
makes sense to only use C++ exceptions internally instead of mixing
exceptions and error codes. The exceptions are now caught at top level
to preserve compatibility with the existing error code-based API.

Note that we deliberately do not catch C++ runtime errors (such as
`std::bad_alloc`) nor logic errors (aka failed assertions) because no
actual handling can ever happen for such errors.

However, in the C interface we do catch _all_ exceptions (...) and
convert them into a wild-card error code.
pull/117/head
Ivan Shapovalov 2016-10-22 05:14:19 +03:00
parent 5d03d48f7a
commit b98af44fcf
105 changed files with 4285 additions and 3987 deletions

View File

@ -169,6 +169,7 @@ set(SOURCES
src/routines/common.cpp
src/cache.cpp
src/clblast.cpp
src/clblast_exceptions.cpp
src/clblast_c.cpp
src/routine.cpp
src/utilities.cpp

View File

@ -75,13 +75,14 @@ enum class StatusCode {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
kKernelRunError = -2047, // Problem occurred while running the kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
kDatabaseError = -2041, // Entry for the device was not found in the database
kUnknownError = -2040, // A catch-all error code representing an unspecified error
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
};
// Matrix layout and transpose types

View File

@ -76,13 +76,14 @@ typedef enum StatusCode_ {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
kKernelRunError = -2047, // Problem occurred while running the kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
kDatabaseError = -2041, // Entry for the device was not found in the database
kUnknownError = -2040, // A catch-all error code representing an unspecified error
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
} StatusCode;
// Matrix layout and transpose types

View File

@ -30,8 +30,8 @@ from generator.routine import Routine
from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU
HEADER_LINES = [96, 73, 97, 22, 29, 41]
FOOTER_LINES = [17, 75, 19, 14, 6, 6]
HEADER_LINES = [97, 73, 98, 22, 29, 41]
FOOTER_LINES = [17, 80, 19, 18, 6, 6]
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."

View File

@ -45,17 +45,19 @@ def clblast_h(routine):
def clblast_cc(routine):
"""The C++ API implementation (.cpp)"""
indent1 = " " * (20 + routine.length())
indent1 = " " * (15 + routine.length())
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
if routine.implemented:
result += routine.routine_header_cpp(12, "") + " {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " auto status = routine.SetUp();" + NL
result += " if (status != StatusCode::kSuccess) { return status; }" + NL
result += " return routine.Do" + routine.name.capitalize() + "("
result += " try {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " routine.SetUp();" + NL
result += " routine.Do" + routine.name.capitalize() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL
result += " return StatusCode::kSuccess;" + NL
result += " } catch (...) { return DispatchException(); }" + NL
else:
result += routine.routine_header_type_cpp(12) + " {" + NL
result += " return StatusCode::kNotImplemented;" + NL
@ -81,12 +83,14 @@ def clblast_c_cc(routine):
result = NL + "// " + routine.name.upper() + NL
for flavour in routine.flavours:
template = "<" + flavour.template + ">" if routine.no_scalars() else ""
indent = " " * (26 + routine.length() + len(template))
indent = " " * (45 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 20, "") + " {" + NL
result += " auto status = clblast::" + routine.name.capitalize() + template + "("
result += " try {" + NL
result += " return static_cast<StatusCode>(clblast::" + routine.name.capitalize() + template + "("
result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
result += "," + NL + indent + "queue, event);"
result += NL + " return static_cast<StatusCode>(status);" + NL + "}" + NL
result += "," + NL + indent + "queue, event));" + NL
result += " } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); }" + NL
result += "}" + NL
return result

View File

@ -22,96 +22,88 @@ namespace clblast {
// Tests matrix 'A' for validity
template <typename T>
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
} catch (...) { return StatusCode::kInvalidMatrixA; }
return StatusCode::kSuccess;
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); }
} catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); }
}
// Tests matrix 'B' for validity
template <typename T>
StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
void TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; }
} catch (...) { return StatusCode::kInvalidMatrixB; }
return StatusCode::kSuccess;
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryB); }
} catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixB, e.what()); }
}
// Tests matrix 'C' for validity
template <typename T>
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
void TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimC); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; }
} catch (...) { return StatusCode::kInvalidMatrixC; }
return StatusCode::kSuccess;
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryC); }
} catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixC, e.what()); }
}
// Tests matrix 'AP' for validity
template <typename T>
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
void TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
} catch (...) { return StatusCode::kInvalidMatrixA; }
return StatusCode::kSuccess;
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); }
} catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); }
}
// =================================================================================================
// Tests vector 'X' for validity
template <typename T>
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
void TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc) {
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementX); }
try {
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; }
} catch (...) { return StatusCode::kInvalidVectorX; }
return StatusCode::kSuccess;
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryX); }
} catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorX, e.what()); }
}
// Tests vector 'Y' for validity
template <typename T>
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
void TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc) {
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementY); }
try {
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; }
} catch (...) { return StatusCode::kInvalidVectorY; }
return StatusCode::kSuccess;
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryY); }
} catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorY, e.what()); }
}
// =================================================================================================
// Tests vector 'scalar' for validity
template <typename T>
StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
void TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (n + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
} catch (...) { return StatusCode::kInvalidVectorScalar; }
return StatusCode::kSuccess;
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); }
} catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); }
}
// Tests vector 'index' for validity
template <typename T>
StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
void TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (n + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
} catch (...) { return StatusCode::kInvalidVectorScalar; }
return StatusCode::kSuccess;
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); }
} catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); }
}
// =================================================================================================

View File

@ -57,7 +57,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
}
}
binary_cache_mutex_.unlock();
throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
@ -75,7 +75,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec
}
}
program_cache_mutex_.unlock();
throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
}
// Queries the cache to see whether or not the compiled kernel is already there
@ -109,14 +109,13 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode CacheClearAll() {
void CacheClearAll() {
binary_cache_mutex_.lock();
binary_cache_.clear();
binary_cache_mutex_.unlock();
program_cache_mutex_.lock();
program_cache_.clear();
program_cache_mutex_.unlock();
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -89,7 +89,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries
StatusCode CacheClearAll();
void CacheClearAll();
// =================================================================================================
} // namespace clblast

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,95 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Ivan Shapovalov <intelfx@intelfx.name>
//
// This file implements the exception hierarchy for CLBlast. It contains classes for exceptions
// generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS
// errors).
//
// =================================================================================================
#include "clblast_exceptions.hpp"
namespace {
// =================================================================================================
// Builds the descriptive part of an exception message: yields 'reason' unchanged when 'subreason'
// is empty, and "reason (subreason)" otherwise.
std::string MakeReason(const std::string &reason, const std::string &subreason) {
  if (subreason.empty()) { return reason; }
  return reason + " (" + subreason + ")";
}
} // anonymous namespace
namespace clblast {
// =================================================================================================
// Constructs a BLASError carrying 'status'. The exception message is "BLAS error: " followed by
// the integer value of 'status' and, when 'subreason' is non-empty, the subreason in parentheses.
// 'subreason' is additionally forwarded as the "details" argument of the ErrorCode base.
BLASError::BLASError(StatusCode status, const std::string &subreason):
ErrorCode(status,
subreason,
"BLAS error: " + MakeReason(std::to_string(static_cast<int>(status)), subreason)) {
}
// Constructs a RuntimeErrorCode carrying 'status'. The reason handed to the ErrorCode base is the
// integer value of 'status' and, when 'subreason' is non-empty, the subreason in parentheses.
// 'subreason' is additionally forwarded as the "details" argument of the ErrorCode base.
RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreason):
ErrorCode(status,
subreason,
MakeReason(std::to_string(static_cast<int>(status)), subreason)) {
}
// =================================================================================================
// Converts the exception that is currently being handled into a StatusCode. This function must be
// called from inside a catch block: it re-throws the active exception to inspect its type.
//   - BLASError:                       returns the carried status, prints nothing
//   - CLError:                         prints the message, returns the OpenCL status as StatusCode
//   - RuntimeErrorCode:                prints the message, returns the carried status
//   - other Error<std::runtime_error>: prints the message, returns StatusCode::kUnknownError
// Exceptions of any other type are not handled here and propagate to the caller.
StatusCode DispatchException()
{
  const char *message = nullptr;
  // Initialized defensively; every handler below overwrites or confirms this value
  auto status = StatusCode::kUnknownError;

  try {
    throw;
  } catch (const BLASError &e) {
    // no message is printed for invalid argument errors
    status = e.status();
  } catch (const CLError &e) {
    message = e.what();
    status = static_cast<StatusCode>(e.status());
  } catch (const RuntimeErrorCode &e) {
    message = e.what();
    status = e.status();
  } catch (const Error<std::runtime_error> &e) {
    message = e.what();
    status = StatusCode::kUnknownError;
  }

  // 'message' is still valid here: the exception object is kept alive by the caller's enclosing
  // catch block, which remains active until this function has returned.
  if (message) {
    fprintf(stderr, "CLBlast: %s\n", message);
  }
  return status;
}
// =================================================================================================
// Last-resort handler for the C interface: reports whatever exception is currently being handled
// on stderr and maps it to StatusCode::kUnexpectedError. Must be called from inside a catch block,
// since it re-throws the active exception to obtain its message.
StatusCode DispatchExceptionForC() {
  try {
    throw;
  } catch (std::exception &error) {
    fprintf(stderr, "CLBlast (unexpected): %s\n", error.what());
  } catch (...) {
    // Not derived from std::exception, so there is no message to query
    fprintf(stderr, "CLBlast (unexpected): %s\n", "unknown exception");
  }
  return StatusCode::kUnexpectedError;
}
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,50 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Ivan Shapovalov <intelfx@intelfx.name>
//
// This file implements the exception hierarchy for CLBlast. It contains classes for exceptions
// generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS
// errors).
//
// =================================================================================================
#ifndef CLBLAST_EXCEPTIONS_H_
#define CLBLAST_EXCEPTIONS_H_
#include "clblast.h"
#include "clpp11.hpp"
namespace clblast {
// =================================================================================================
// Represents a semantic error in BLAS function arguments.
// Derives, through ErrorCode and Error, from std::invalid_argument and carries a StatusCode that
// can be retrieved with status(). 'subreason' is optional extra detail and defaults to empty.
class PUBLIC_API BLASError : public ErrorCode<Error<std::invalid_argument>, StatusCode> {
public:
explicit BLASError(StatusCode status, const std::string &subreason = std::string{});
};
// =================================================================================================
// Represents a runtime error generated by internal logic.
// Derives, through ErrorCode, from RuntimeError and carries a StatusCode that can be retrieved
// with status(). 'subreason' is optional extra detail and defaults to empty.
class PUBLIC_API RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
public:
explicit RuntimeErrorCode(StatusCode status, const std::string &subreason = std::string{});
};
// =================================================================================================
// Handles (most of the) runtime exceptions and converts them to StatusCode
StatusCode DispatchException();
// Handles remaining exceptions and converts them to StatusCode::kUnexpectedError
StatusCode DispatchExceptionForC();
// =================================================================================================
} // namespace clblast
#endif // CLBLAST_EXCEPTIONS_H_

View File

@ -41,7 +41,6 @@
#include <string> // std::string
#include <vector> // std::vector
#include <memory> // std::shared_ptr
#include <stdexcept> // std::runtime_error
#include <numeric> // std::accumulate
#include <cstring> // std::strlen
@ -52,28 +51,41 @@
#include <CL/opencl.h>
#endif
// Exception classes
#include "cxpp11_common.hpp"
namespace clblast {
// =================================================================================================
// Error occurred in the C++11 OpenCL header (this file)
inline void Error(const std::string &message) {
throw std::runtime_error("Internal OpenCL error: "+message);
}
// Exception describing a failed OpenCL API call. It stores the cl_int status returned by the call
// and the name of the call ('where'); the message has the form "OpenCL error: <where>: <status>".
class CLError : public ErrorCode<DeviceError, cl_int> {
 public:
  explicit CLError(cl_int status, const std::string &where):
      ErrorCode(status,
                where,
                "OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) {
  }

  // Throws a CLError unless 'status' equals CL_SUCCESS
  static void Check(const cl_int status, const std::string &where) {
    if (status == CL_SUCCESS) { return; }
    throw CLError(status, where);
  }

  // Variant for use in destructors: a failure is reported on stderr instead of being thrown
  static void CheckDtor(const cl_int status, const std::string &where) {
    if (status == CL_SUCCESS) { return; }
    const CLError failure(status, where);
    fprintf(stderr, "CLBlast: %s (ignoring)\n", failure.what());
  }
};
// =================================================================================================
// Error occurred in OpenCL
inline void CheckError(const cl_int status) {
if (status != CL_SUCCESS) {
throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
}
}
#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))
// Error occurred in OpenCL (no-exception version for destructors)
inline void CheckErrorDtor(const cl_int status) {
if (status != CL_SUCCESS) {
auto message = "Internal OpenCL Error: "+std::to_string(status) + " (ignoring)";
fprintf(stderr, "%s\n", message.c_str());
}
}
#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))
// =================================================================================================
@ -140,10 +152,14 @@ class Platform {
explicit Platform(const size_t platform_id) {
auto num_platforms = cl_uint{0};
CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
if (num_platforms == 0) { Error("no platforms found"); }
if (num_platforms == 0) {
throw RuntimeError("Platform: no platforms found");
}
if (platform_id >= num_platforms) {
throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
}
auto platforms = std::vector<cl_platform_id>(num_platforms);
CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
platform_ = platforms[platform_id];
}
@ -183,11 +199,16 @@ class Device {
// Initialize the device. Note that this constructor can throw exceptions!
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = platform.NumDevices();
if (num_devices == 0) { Error("no devices found"); }
if (num_devices == 0) {
throw RuntimeError("Device: no devices found");
}
if (device_id >= num_devices) {
throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
}
auto devices = std::vector<cl_device_id>(num_devices);
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
devices.data(), nullptr));
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
device_ = devices[device_id];
}
@ -315,7 +336,7 @@ class Context {
auto status = CL_SUCCESS;
const cl_device_id dev = device();
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
CheckError(status);
CLError::Check(status, "clCreateContext");
}
// Accessor to the private data-member
@ -346,7 +367,7 @@ class Program {
source_ptr_(&source_[0]) {
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
CheckError(status);
CLError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
@ -361,25 +382,15 @@ class Program {
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
CheckError(status1);
CheckError(status2);
CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLError::Check(status2, "clCreateProgramWithBinary");
}
// Compiles the device program and returns whether or not there were any warnings/errors
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
void Build(const Device &device, std::vector<std::string> &options) {
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
const cl_device_id dev = device();
auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
if (status == CL_BUILD_PROGRAM_FAILURE) {
return BuildStatus::kError;
}
else if (status == CL_INVALID_BINARY) {
return BuildStatus::kInvalid;
}
else {
CheckError(status);
return BuildStatus::kSuccess;
}
CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
}
// Retrieves the warning/error message from the compiler (if any)
@ -436,15 +447,17 @@ class Queue {
{
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
CLError::Check(status, "clCreateCommandQueueWithProperties");
}
else
{
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
}
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
#endif
CheckError(status);
}
// Synchronizes the queue
@ -536,7 +549,7 @@ class Buffer {
if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
auto status = CL_SUCCESS;
*buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
CheckError(status);
CLError::Check(status, "clCreateBuffer");
}
// As above, but now with read/write access as a default
@ -557,18 +570,24 @@ class Buffer {
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
if (access_ == BufferAccess::kWriteOnly) {
throw LogicError("Buffer: reading from a write-only buffer");
}
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
@ -588,8 +607,12 @@ class Buffer {
// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
if (access_ == BufferAccess::kReadOnly) {
throw LogicError("Buffer: writing to a read-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
@ -658,7 +681,7 @@ class Kernel {
kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
CheckError(status);
CLError::Check(status, "clCreateKernel");
}
// Sets a kernel argument at the indicated position

View File

@ -0,0 +1,87 @@
#ifndef CLBLAST_CXPP11_COMMON_H_
#define CLBLAST_CXPP11_COMMON_H_
// C++
#include <cstring> // std::strchr
#include <stdexcept> // std::runtime_error
#include <string> // std::string
namespace clblast {
// =================================================================================================
// Basic exception class: represents an error that happened inside our code
// (as opposed to an error in the C++ runtime). It adds nothing to 'Base' except a distinct type,
// and inherits all of the constructors of 'Base'.
template <typename Base>
class Error : public Base {
 public:
  using Base::Base;
};

// =================================================================================================

// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
class DeviceError : public Error<std::runtime_error> {
 public:
  using Error<std::runtime_error>::Error;

  // Returns the part of 'where' that precedes the first '(' (e.g. "clFinish(queue)" becomes
  // "clFinish"), or all of 'where' when it contains no '('. A null pointer yields an empty string.
  static std::string TrimCallString(const char *where) {
    if (where == nullptr) { return std::string{}; }
    const char *paren = std::strchr(where, '(');
    return (paren != nullptr) ? std::string(where, paren) : std::string(where);
  }
};
// =================================================================================================
// Exception type for generic run-time failures (i.e. environmental problems). The message is the
// given reason prefixed with "Run-time error: ".
class RuntimeError : public Error<std::runtime_error> {
 public:
  explicit RuntimeError(const std::string &reason)
      : Error<std::runtime_error>("Run-time error: " + reason) {}
};
// =================================================================================================
// Exception type for generic logic errors (i.e. failed assertions). The message is the given
// reason prefixed with "Internal logic error: ".
class LogicError : public Error<std::logic_error> {
 public:
  explicit LogicError(const std::string &reason)
      : Error<std::logic_error>("Internal logic error: " + reason) {}
};
// =================================================================================================
// Exception mix-in that attaches a status value and a free-form "details" string to 'Base'.
// 'reason' is forwarded to the constructor of 'Base'; 'status' and 'details' are stored unchanged
// and exposed through the accessors below, so that an equivalent exception can be recreated.
template <typename Base, typename Status>
class ErrorCode : public Base {
 public:
  ErrorCode(Status status, const std::string &details, const std::string &reason)
      : Base(reason), status_{status}, details_{details} {}

  // Status value given at construction
  Status status() const { return status_; }

  // Subclass-specific details given at construction
  const std::string& details() const { return details_; }

 private:
  const Status status_;
  const std::string details_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_CXPP11_COMMON_H_
#endif

View File

@ -92,7 +92,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
}
}
if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
}
}

View File

@ -38,10 +38,10 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
// =================================================================================================
// Separate set-up function to allow for status codes to be returned
StatusCode Routine::SetUp() {
void Routine::SetUp() {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
// Sets the build options from an environmental variable (if set)
auto options = std::vector<std::string>();
@ -53,13 +53,10 @@ StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
try {
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
return StatusCode::kSuccess;
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
}
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
@ -69,14 +66,14 @@ StatusCode Routine::SetUp() {
const auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
return StatusCode::kNoHalfPrecision;
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
}
@ -120,23 +117,21 @@ StatusCode Routine::SetUp() {
#endif
// Compiles the kernel
auto program = Program(context_, source_string);
try {
auto program = Program(context_, source_string);
const auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
if (build_status == BuildStatus::kError) {
const auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
program.Build(device_, options);
} catch (const CLError &e) {
if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
program.GetBuildInfo(device_).c_str());
}
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
throw;
}
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
@ -144,9 +139,6 @@ StatusCode Routine::SetUp() {
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -39,7 +39,7 @@ class Routine {
const std::vector<const Database::DatabaseEntry*> &userDatabase = {});
// Set-up phase of the kernel
StatusCode SetUp();
void SetUp();
protected:

View File

@ -20,22 +20,26 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents) {
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents) {
if (!local.empty()) {
// Tests for validity of the local thread sizes
if (local.size() > device.MaxWorkItemDimensions()) {
return StatusCode::kInvalidLocalNumDimensions;
throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
}
const auto max_work_item_sizes = device.MaxWorkItemSizes();
for (auto i=size_t{0}; i<local.size(); ++i) {
if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
if (local[i] > max_work_item_sizes[i]) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
}
}
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
if (local_size > device.MaxWorkGroupSize()) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
}
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Tests for local memory usage
const auto local_mem_usage = kernel.LocalMemUsage(device);
if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
if (!device.IsLocalMemoryValid(local_mem_usage)) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
}
// Prints the name of the kernel to launch in case of debugging in verbose mode
#ifdef VERBOSE
@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
#endif
// Launches the kernel (any exception raised by the launch propagates to the caller)
try {
kernel.Launch(queue, global, local, event, waitForEvents);
} catch (...) { return StatusCode::kKernelLaunchError; }
kernel.Launch(queue, global, local, event, waitForEvents);
// Prints the elapsed execution time in case of debugging in verbose mode
#ifdef VERBOSE
@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -27,29 +27,29 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents = {});
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents = {});
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
// Retrieves the kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, GetRealArg(alpha));
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, static_cast<int>(src_offset));
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(11, static_cast<int>(upper));
kernel.SetArgument(12, static_cast<int>(lower));
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
}
// Launches the kernel; RunKernel reports failures by throwing. Uses global and local thread sizes
// based on parameters in the database.
if (do_transpose) {
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, GetRealArg(alpha));
const auto global = std::vector<size_t>{
dest_one / db["TRA_WPT"],
dest_two / db["TRA_WPT"]
};
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, static_cast<int>(src_offset));
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(11, static_cast<int>(upper));
kernel.SetArgument(12, static_cast<int>(lower));
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
// parameters in the database.
if (do_transpose) {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["TRA_WPT"],
dest_two / db["TRA_WPT"]
};
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
}
else {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["COPY_VW"],
dest_two / db["COPY_WPT"]
};
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["COPY_VW"],
dest_two / db["COPY_WPT"]
};
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
} catch (...) { return StatusCode::kInvalidKernel; }
}
}
// =================================================================================================

View File

@ -32,64 +32,55 @@ Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xamax<T>::DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xamax<T>::DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorIndex(1, imax_buffer, imax_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xamax: public Routine {
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine
StatusCode DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -32,61 +32,52 @@ Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, asum_buffer, asum_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xasum: public Routine {
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine
StatusCode DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -33,18 +33,16 @@ Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,45 +53,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xaxpy: public Routine {
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
StatusCode DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -33,18 +33,16 @@ Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xcopy<T>::DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xcopy<T>::DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +53,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xcopy: public Routine {
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
StatusCode DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -32,69 +32,59 @@ Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Implemented with two kernels: "Xdot" is handed x, y and a temporary buffer, and "XdotEpilogue" is
// handed that temporary buffer together with dot_buffer/dot_offset.
// Diff view: each changed statement is listed twice, first in its pre-change form (StatusCode
// return values, wrapped in try/catch) and then in its post-change form (void return, failures
// raised as exceptions).
// Arguments:
//   n                     - dimension; zero is rejected with StatusCode::kInvalidDimension
//   dot_buffer/dot_offset - checked by TestVectorScalar and passed to the epilogue kernel
//                           (presumably where the final result is written - TODO confirm)
//   x_buffer, y_buffer    - the two vectors, each with its offset and increment
//   do_conjugate          - forwarded to the "Xdot" kernel as an integer flag
template <typename T>
StatusCode Xdot<T>::DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate) {
void Xdot<T>::DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
// (post-change form: the return values are no longer inspected; the Test* helpers presumably throw
// on failure - TODO confirm against their definitions)
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, dot_buffer, dot_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, y_buffer());
kernel1.SetArgument(5, static_cast<int>(y_offset));
kernel1.SetArgument(6, static_cast<int>(y_inc));
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, y_buffer());
kernel1.SetArgument(5, static_cast<int>(y_offset));
kernel1.SetArgument(6, static_cast<int>(y_inc));
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
// (global size is WGS1*temp_size with a local size of WGS1, i.e. temp_size work-groups)
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
// (global size is WGS1*temp_size with a local size of WGS1, i.e. temp_size work-groups)
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
// The main kernel's event is handed to the epilogue launch below through the wait list
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, dot_buffer());
kernel2.SetArgument(2, static_cast<int>(dot_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, dot_buffer());
kernel2.SetArgument(2, static_cast<int>(dot_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
// Pre-change form: any exception raised inside the try block is reported as kInvalidKernel
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
// (global size equals local size: a single work-group of WGS2 threads)
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,11 +28,11 @@ class Xdot: public Routine {
// Constructor taking the queue, the event pointer and the routine name (defaults to "DOT")
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
// Diff view: the declaration is listed twice, first in its pre-change form (returns a StatusCode)
// and then in its post-change form (returns void).
// Arguments: the dimension n, the dot buffer with its offset, the x and y vectors (each with an
// offset and an increment), and the do_conjugate flag, which defaults to false.
StatusCode DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
void DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
};
// =================================================================================================

View File

@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Forwards all arguments unchanged to DoDot with do_conjugate set to true.
// Diff view: the changed lines are listed twice, first in their pre-change form (returns the
// StatusCode produced by DoDot) and then in their post-change form (void, nothing returned).
template <typename T>
StatusCode Xdotc<T>::DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
return DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
true);
void Xdotc<T>::DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
true);
}
// =================================================================================================

View File

@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> {
// Constructor taking the queue, the event pointer and the routine name (defaults to "DOTC")
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
// Diff view: the declaration is listed twice, first in its pre-change form (returns a StatusCode)
// and then in its post-change form (returns void).
// Arguments: the dimension n, the dot buffer with its offset, and the x and y vectors (each with
// an offset and an increment).
StatusCode DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Forwards all arguments unchanged to DoDot with do_conjugate set to false.
// Diff view: the changed lines are listed twice, first in their pre-change form (returns the
// StatusCode produced by DoDot) and then in their post-change form (void, nothing returned).
template <typename T>
StatusCode Xdotu<T>::DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
return DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
false);
void Xdotu<T>::DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
false);
}
// =================================================================================================

View File

@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> {
// Constructor taking the queue, the event pointer and the routine name (defaults to "DOTU")
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine
// Diff view: the declaration is listed twice, first in its pre-change form (returns a StatusCode)
// and then in its post-change form (returns void).
// Arguments: the dimension n, the dot buffer with its offset, and the x and y vectors (each with
// an offset and an increment).
StatusCode DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -35,10 +35,10 @@ class Xmax: public Xamax<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
// Diff view: the changed lines are listed twice, first in their pre-change form (returns the
// StatusCode produced by DoAmax) and then in their post-change form (void, nothing returned).
// All arguments are passed on to DoAmax unchanged and in the same order.
StatusCode DoMax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
void DoMax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -35,10 +35,10 @@ class Xmin: public Xamax<T> {
// Forwards to the regular max-absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
// Diff view: the changed lines are listed twice, first in their pre-change form (returns the
// StatusCode produced by DoAmax) and then in their post-change form (void, nothing returned).
// All arguments are passed on to DoAmax unchanged and in the same order.
StatusCode DoMin(const size_t n,
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
void DoMin(const size_t n,
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -32,61 +32,52 @@ Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Implemented with two kernels: "Xnrm2" is handed x and a temporary buffer, and "Xnrm2Epilogue" is
// handed that temporary buffer together with nrm2_buffer/nrm2_offset.
// Diff view: each changed statement is listed twice, first in its pre-change form (StatusCode
// return values, wrapped in try/catch) and then in its post-change form (void return, failures
// raised as exceptions).
// Arguments:
//   n                       - dimension; zero is rejected with StatusCode::kInvalidDimension
//   nrm2_buffer/nrm2_offset - checked by TestVectorScalar and passed to the epilogue kernel
//                             (presumably where the final result is written - TODO confirm)
//   x_buffer                - the vector, with its offset and increment
template <typename T>
StatusCode Xnrm2<T>::DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xnrm2<T>::DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
// (post-change form: the return values are no longer inspected; the Test* helpers presumably throw
// on failure - TODO confirm against their definitions)
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
// (global size is WGS1*temp_size with a local size of WGS1, i.e. temp_size work-groups)
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
// (global size is WGS1*temp_size with a local size of WGS1, i.e. temp_size work-groups)
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
// The main kernel's event is handed to the epilogue launch below through the wait list
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, nrm2_buffer());
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, nrm2_buffer());
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
// Pre-change form: any exception raised inside the try block is reported as kInvalidKernel
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
// (global size equals local size: a single work-group of WGS2 threads)
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xnrm2: public Routine {
// Constructor taking the queue, the event pointer and the routine name (defaults to "NRM2")
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine
// Diff view: the declaration is listed twice, first in its pre-change form (returns a StatusCode)
// and then in its post-change form (returns void).
// Arguments: the dimension n, the nrm2 buffer with its offset, and the x vector with its offset
// and increment.
StatusCode DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -33,15 +33,14 @@ Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Selects either the "XscalFast" or the "Xscal" kernel, sets its arguments (n, alpha and x) and
// launches it.
// Diff view: each changed statement is listed twice, first in its pre-change form (StatusCode
// return values, wrapped in try/catch) and then in its post-change form (void return, failures
// raised as exceptions). The "@ ... @" line inside the body is a hunk header; the source lines
// between the two hunks (the rest of the use_fast_kernel condition) are not shown.
// Arguments:
//   n        - dimension; zero is rejected with StatusCode::kInvalidDimension
//   alpha    - scalar handed to the kernel as argument 1
//   x_buffer - the vector, with its offset and increment
template <typename T>
StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xscal<T>::DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vector for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
// Determines whether or not the fast-version can be used
// (requires at least a zero offset and a unit increment; the remaining conditions are elided)
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -51,41 +50,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
// (the fast kernel receives no offset or increment)
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
}
// Sets the kernel arguments
// (the fast kernel receives no offset or increment)
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
// Pre-change form: any exception raised inside the try block is reported as kInvalidKernel
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
// (both paths use a local size of WGS; the global size is derived from n, WPT and - for the fast
// kernel only - VW. Ceil/CeilDiv presumably round up to a multiple / divide rounding up - TODO
// confirm against their definitions)
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,8 +28,8 @@ class Xscal: public Routine {
// Constructor taking the queue, the event pointer and the routine name (defaults to "SCAL")
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
// Diff view: the declaration is listed twice, first in its pre-change form (returns a StatusCode)
// and then in its post-change form (returns void).
// Arguments: the dimension n, the scalar alpha, and the x vector with its offset and increment.
StatusCode DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -35,10 +35,10 @@ class Xsum: public Xasum<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
// Diff view: the changed lines are listed twice, first in their pre-change form (returns the
// StatusCode produced by DoAsum) and then in their post-change form (void, nothing returned).
// All arguments are passed on to DoAsum unchanged and in the same order.
StatusCode DoSum(const size_t n,
const Buffer<T> &sum_buffer, const size_t sum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
void DoSum(const size_t n,
const Buffer<T> &sum_buffer, const size_t sum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -33,18 +33,16 @@ Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Selects either the "XswapFast" or the "Xswap" kernel, sets its arguments (n, x and y) and
// launches it.
// Diff view: each changed statement is listed twice, first in its pre-change form (StatusCode
// return values, wrapped in try/catch) and then in its post-change form (void return, failures
// raised as exceptions). The "@ ... @" line inside the body is a hunk header; the source lines
// between the two hunks (the rest of the use_fast_kernel condition) are not shown.
// Arguments:
//   n                  - dimension; zero is rejected with StatusCode::kInvalidDimension
//   x_buffer, y_buffer - the two vectors, each with its offset and increment
template <typename T>
StatusCode Xswap<T>::DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xswap<T>::DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
// (requires at least a zero x offset and a unit x increment; the remaining conditions are elided)
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +53,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
// (the fast kernel receives no offsets or increments)
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Sets the kernel arguments
// (the fast kernel receives no offsets or increments)
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
// Pre-change form: any exception raised inside the try block is reported as kInvalidKernel
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
// (both paths use a local size of WGS; the global size is derived from n, WPT and - for the fast
// kernel only - VW. Ceil/CeilDiv presumably round up to a multiple / divide rounding up - TODO
// confirm against their definitions)
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xswap: public Routine {
// Constructor taking the queue, the event pointer and the routine name (defaults to "SWAP")
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
// Templated-precision implementation of the routine
// Diff view: the declaration is listed twice, first in its pre-change form (returns a StatusCode)
// and then in its post-change form (returns void).
// Arguments: the dimension n, followed by a buffer, an offset and an increment for each of the
// vectors x and y.
StatusCode DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Banded matrix-vector routine implemented by delegating to MatVec with both fast-kernel flags
// switched off, parameter = 0, packed = false and the band counts kl_real/ku_real.
// Diff view: each changed statement is listed twice, first in its pre-change form (returns the
// StatusCode produced by MatVec) and then in its post-change form (void, nothing returned). The
// "@ ... @" line inside the body is a hunk header; the source lines between the two hunks, which
// include the definitions of kl_real and ku_real, are not shown.
template <typename T>
StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Reverses the upper and lower band count
// (the reversal is keyed on a row-major layout; the statements applying it are elided)
auto rotated = (layout == Layout::kRowMajor);
@ -46,13 +46,13 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
// The specific hermitian matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_GBMV define.
bool fast_kernels = false;
return MatVec(layout, a_transpose,
m, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
0, false, kl_real, ku_real);
MatVec(layout, a_transpose,
m, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
0, false, kl_real, ku_real);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xgbmv: public Xgemv<T> {
// Constructor taking the queue, the event pointer and the routine name (defaults to "GBMV")
Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
// Templated-precision implementation of the routine
// Diff view: the declaration is listed twice, first in its pre-change form (returns a StatusCode)
// and then in its post-change form (returns void).
// Arguments: layout and transpose option, the dimensions m and n, the band counts kl and ku, the
// scalars alpha and beta, the matrix a (buffer, offset, leading dimension) and the vectors x and y
// (each with an offset and an increment).
StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoGbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -33,41 +33,41 @@ Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Delegates to MatVec with both fast-kernel flags enabled and the extra arguments (parameter,
// packed, kl, ku) set to 0/false/0/0.
// Diff view: each changed statement is listed twice, first in its pre-change form (returns the
// StatusCode produced by MatVec) and then in its post-change form (void, nothing returned).
template <typename T>
StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Performs the matrix-vector multiplication
return MatVec(layout, a_transpose,
m, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
true, true,
0, false, 0, 0); // N/A for this routine
MatVec(layout, a_transpose,
m, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
true, true,
0, false, 0, 0); // N/A for this routine
}
// =================================================================================================
// The generic implementation, also suited for other (non general) matrix-vector multiplications
// Diff view: each changed statement is listed twice, first in its pre-change form (StatusCode
// return values, wrapped in try/catch) and then in its post-change form (void return, failures
// raised as exceptions). The "@ ... @" lines inside the body are hunk headers; the source lines
// between hunks are not shown, including the definitions of a_one, a_two, m_real, n_real,
// a_rotated, kernel_name, global_size and local_size used below.
// Arguments beyond those of DoGemv:
//   fast_kernel, fast_kernel_rot - request the fast kernel variants; fast_kernel is further
//                                  narrowed below by conditions on the actual arguments
//   parameter                    - passed to the kernel as argument 15
//   packed                       - selects TestMatrixAP instead of TestMatrixA for validation
//   kl, ku                       - passed to the kernel as arguments 16 and 17
template <typename T>
StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku) {
void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku) {
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative layout (row or column-major).
auto a_altlayout = (layout == Layout::kRowMajor);
@ -91,14 +91,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Tests the matrix and the vectors for validity
// (x is checked against n_real and y against m_real, not against the raw n and m)
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
TestVectorX(n_real, x_buffer, x_offset, x_inc);
TestVectorY(m_real, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
@ -127,39 +123,33 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Retrieves the Xgemv kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));
kernel.SetArgument(1, static_cast<int>(n_real));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, static_cast<int>(a_rotated));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, x_buffer());
kernel.SetArgument(9, static_cast<int>(x_offset));
kernel.SetArgument(10, static_cast<int>(x_inc));
kernel.SetArgument(11, y_buffer());
kernel.SetArgument(12, static_cast<int>(y_offset));
kernel.SetArgument(13, static_cast<int>(y_inc));
kernel.SetArgument(14, static_cast<int>(a_conjugate));
kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));
kernel.SetArgument(1, static_cast<int>(n_real));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, static_cast<int>(a_rotated));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, x_buffer());
kernel.SetArgument(9, static_cast<int>(x_offset));
kernel.SetArgument(10, static_cast<int>(x_inc));
kernel.SetArgument(11, y_buffer());
kernel.SetArgument(12, static_cast<int>(y_offset));
kernel.SetArgument(13, static_cast<int>(y_inc));
kernel.SetArgument(14, static_cast<int>(a_conjugate));
kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
// Pre-change form: any exception raised inside the try block is reported as kInvalidKernel
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -28,25 +28,25 @@ class Xgemv: public Routine {
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. The parameter lists are identical: the matrix buffer comes with an
// offset and a_ld, each vector buffer with an offset and an increment, plus scalars alpha and beta.
StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
// NOTE(review): the void form presumably reports failures by throwing (per the commit
// description); the definition is not visible here - confirm.
void DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
// Generic version used also for other matrix-vector multiplications
// Two revisions of the same declaration are listed (StatusCode return, then void return).
// On top of the DoGemv arguments it takes:
//   fast_kernel, fast_kernel_rot - flags for the fast kernel variants
//   parameter                    - extra value handed to the kernel (used for symm/herm)
//   packed                       - flag for packed matrix storage
//   kl, ku                       - handed to the kernel; only used for banded matrices
StatusCode MatVec(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku);
void MatVec(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku);
};
// =================================================================================================

View File

@ -33,15 +33,15 @@ Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Validates the dimensions and the A/x/y buffers, binds 13 arguments to the "Xger" kernel and
// launches it. This listing interleaves two revisions of the routine: the earlier one returns a
// StatusCode, the later one returns void and reports failures by throwing, e.g.
// BLASError(StatusCode::kInvalidDimension) when m or n is zero.
template <typename T>
StatusCode Xger<T>::DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
void Xger<T>::DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative layout (row or column-major).
const auto a_is_rowmajor = (layout == Layout::kRowMajor);
@ -49,44 +49,35 @@ StatusCode Xger<T>::DoGer(const Layout layout,
const auto a_two = (a_is_rowmajor) ? m : n;
// Tests the matrix and the vectors for validity
// Earlier revision: each check returns a status that is propagated to the caller on error.
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestVectorX(m, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Later revision: the results are not inspected here; a failed check is presumably signalled
// by an exception (per the commit description).
TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
TestVectorX(m, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
// Earlier revision: everything from here to the launch sits in a try block whose catch-all
// handler returns StatusCode::kInvalidKernel.
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xger");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xger");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));
kernel.SetArgument(6, y_buffer());
kernel.SetArgument(7, static_cast<int>(y_offset));
kernel.SetArgument(8, static_cast<int>(y_inc));
kernel.SetArgument(9, a_buffer());
kernel.SetArgument(10, static_cast<int>(a_offset));
kernel.SetArgument(11, static_cast<int>(a_ld));
kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));
kernel.SetArgument(6, y_buffer());
kernel.SetArgument(7, static_cast<int>(y_offset));
kernel.SetArgument(8, static_cast<int>(y_inc));
kernel.SetArgument(9, a_buffer());
kernel.SetArgument(10, static_cast<int>(a_offset));
kernel.SetArgument(11, static_cast<int>(a_ld));
kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
// Launches the kernel
// Global sizes: a_one and a_two go through CeilDiv with WPT and then Ceil with WGS1/WGS2;
// the local sizes are WGS1 x WGS2.
auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
// Later revision: same size computation; RunKernel's result is not inspected.
auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -28,12 +28,12 @@ class Xger: public Routine {
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. Both take the dimensions m and n, the scalar alpha, the x and y vector
// buffers (each with offset and increment) and the matrix buffer a (with offset and a_ld).
StatusCode DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
void DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================

View File

@ -28,19 +28,19 @@ Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Thin wrapper: forwards every argument unchanged to DoGer. This listing interleaves two
// revisions: the earlier one returns DoGer's StatusCode, the later one returns void and simply
// calls DoGer.
template <typename T>
StatusCode Xgerc<T>::DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
void Xgerc<T>::DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Regular Ger operation on complex data, plus conjugation in the kernel guarded by the
// ROUTINE_GERC guard.
return DoGer(layout, m, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
DoGer(layout, m, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xgerc: public Xger<T> {
Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. The parameter list mirrors DoGer, to which the definition forwards.
StatusCode DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
void DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================

View File

@ -28,18 +28,18 @@ Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Thin wrapper: forwards every argument unchanged to DoGer. This listing interleaves two
// revisions: the earlier one returns DoGer's StatusCode, the later one returns void and simply
// calls DoGer.
template <typename T>
StatusCode Xgeru<T>::DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
void Xgeru<T>::DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Regular Ger operation on complex data
return DoGer(layout, m, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
DoGer(layout, m, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xgeru: public Xger<T> {
Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. The parameter list mirrors DoGer, to which the definition forwards.
StatusCode DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
void DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Wrapper around MatVec for an n x n problem. This listing interleaves two revisions: the
// earlier one returns MatVec's StatusCode, the later one returns void and simply calls MatVec.
template <typename T>
StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
// The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HBMV define.
// Arguments handed to MatVec: Transpose::kNo, m = n, both fast-kernel flags false,
// parameter = is_upper, packed = false, kl = k, ku = 0.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, k, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, k, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xhbmv: public Xgemv<T> {
Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. The definition passes k to MatVec as its kl argument.
StatusCode DoHbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoHbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Wrapper around MatVec for an n x n problem. This listing interleaves two revisions: the
// earlier one returns MatVec's StatusCode, the later one returns void and simply calls MatVec.
template <typename T>
StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
// The specific hermitian matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HEMV define.
// Arguments handed to MatVec: Transpose::kNo, m = n, both fast-kernel flags false,
// parameter = is_upper, packed = false, kl = 0, ku = 0.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, 0, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, 0, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xhemv: public Xgemv<T> {
Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. The parameter lists are identical.
StatusCode DoHemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoHemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -41,15 +41,15 @@ template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
// The main routine
// Validates n and the A/x buffers, returns early when alpha is zero, otherwise binds 10
// arguments to the "Xher" kernel and launches it. `packed` selects TestMatrixAP instead of
// TestMatrixA for the matrix check. This listing interleaves two revisions of the routine: the
// earlier one returns a StatusCode, the later one returns void and reports failures by throwing,
// e.g. BLASError(StatusCode::kInvalidDimension) when n is zero.
template <typename T, typename U>
StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
// Makes sure the dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -57,47 +57,38 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity
// Earlier revision: each check returns a status that is propagated to the caller on error.
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
// Later revision: the results are not inspected here; a failed check is presumably signalled
// by an exception (per the commit description).
if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
TestVectorX(n, x_buffer, x_offset, x_inc);
// If alpha is zero an update is not required
if (alpha == U{0}) { return StatusCode::kSuccess; }
if (alpha == U{0}) { return; }
// Creates a matching version of alpha
const auto matching_alpha = GetAlpha(alpha);
// Retrieves the kernel from the compiled binary
// Earlier revision: everything from here to the launch sits in a try block whose catch-all
// handler returns StatusCode::kInvalidKernel.
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(matching_alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, static_cast<int>(is_upper));
kernel.SetArgument(9, static_cast<int>(is_rowmajor));
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(matching_alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, static_cast<int>(is_upper));
kernel.SetArgument(9, static_cast<int>(is_rowmajor));
// Launches the kernel
// Global sizes: n goes through CeilDiv with WPT and then Ceil with WGS1/WGS2 respectively;
// the local sizes are WGS1 x WGS2.
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
// Later revision: same size computation; RunKernel's result is not inspected.
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xher: public Routine {
T GetAlpha(const U alpha);
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. alpha has type U while the buffers hold T; `packed` defaults to false.
StatusCode DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
void DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
};
// =================================================================================================

View File

@ -32,16 +32,16 @@ Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Validates n and the A/x/y buffers, binds 13 arguments to the "Xher2" kernel and launches it.
// `packed` selects TestMatrixAP instead of TestMatrixA for the matrix check. This listing
// interleaves two revisions of the routine: the earlier one returns a StatusCode, the later one
// returns void and reports failures by throwing, e.g. BLASError(StatusCode::kInvalidDimension)
// when n is zero.
template <typename T>
StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
// Makes sure the dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -49,46 +49,36 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity
// Earlier revision: each check returns a status that is propagated to the caller on error.
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Later revision: the results are not inspected here; a failed check is presumably signalled
// by an exception (per the commit description).
if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
// Earlier revision: everything from here to the launch sits in a try block whose catch-all
// handler returns StatusCode::kInvalidKernel.
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher2");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher2");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
kernel.SetArgument(8, a_buffer());
kernel.SetArgument(9, static_cast<int>(a_offset));
kernel.SetArgument(10, static_cast<int>(a_ld));
kernel.SetArgument(11, static_cast<int>(is_upper));
kernel.SetArgument(12, static_cast<int>(is_rowmajor));
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
kernel.SetArgument(8, a_buffer());
kernel.SetArgument(9, static_cast<int>(a_offset));
kernel.SetArgument(10, static_cast<int>(a_ld));
kernel.SetArgument(11, static_cast<int>(is_upper));
kernel.SetArgument(12, static_cast<int>(is_rowmajor));
// Launches the kernel
// Global sizes: n goes through CeilDiv with WPT and then Ceil with WGS1/WGS2 respectively;
// the local sizes are WGS1 x WGS2.
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
// Later revision: same size computation; RunKernel's result is not inspected.
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -28,13 +28,13 @@ class Xher2: public Routine {
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. The parameter lists are identical; `packed` defaults to false.
StatusCode DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
void DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Wrapper around MatVec for an n x n problem on the ap buffer. This listing interleaves two
// revisions: the earlier one returns MatVec's StatusCode, the later one returns void and simply
// calls MatVec.
template <typename T>
StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
// The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HPMV define.
// Arguments handed to MatVec: Transpose::kNo, m = n, a_ld = n, both fast-kernel flags false,
// parameter = is_upper, packed = true, kl = 0, ku = 0.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
ap_buffer, ap_offset, n,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, true, 0, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
ap_buffer, ap_offset, n,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, true, 0, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xhpmv: public Xgemv<T> {
Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. The ap buffer takes only an offset (no leading-dimension argument).
StatusCode DoHpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoHpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,17 +28,17 @@ Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Thin wrapper around DoHer: forwards the arguments, passes n as a_ld and sets packed = true.
// This listing interleaves two revisions: the earlier one returns DoHer's StatusCode, the later
// one returns void and simply calls DoHer.
template <typename T, typename U>
StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xhpr functionality is implemented in the kernel using defines
return DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
}
// =================================================================================================

View File

@ -31,11 +31,11 @@ class Xhpr: public Xher<T,U> {
Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. alpha has type U while the buffers hold T.
StatusCode DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
void DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================

View File

@ -28,19 +28,19 @@ Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Thin wrapper around DoHer2: forwards the arguments, passes n as a_ld and sets packed = true.
// This listing interleaves two revisions: the earlier one returns DoHer2's StatusCode, the later
// one returns void and simply calls DoHer2.
template <typename T>
StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xhpr2 functionality is implemented in the kernel using defines
return DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xhpr2: public Xher2<T> {
Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
// Two revisions of the same declaration are listed: first the one returning a StatusCode, then
// the one returning void. The ap buffer takes only an offset (no leading-dimension argument).
StatusCode DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
void DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
// The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SBMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, k, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, k, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xsbmv: public Xgemv<T> {
Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
// Templated-precision implementation of the routine
StatusCode DoSbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoSbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
// The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SPMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
ap_buffer, ap_offset, n,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, true, 0, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
ap_buffer, ap_offset, n,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, true, 0, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xspmv: public Xgemv<T> {
Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
// Templated-precision implementation of the routine
StatusCode DoSpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoSpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,17 +28,17 @@ Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xspr functionality is implemented in the kernel using defines
return DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
}
// =================================================================================================

View File

@ -31,11 +31,11 @@ class Xspr: public Xher<T,T> {
Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
// Templated-precision implementation of the routine
StatusCode DoSpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
void DoSpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================

View File

@ -28,19 +28,19 @@ Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xspr2 functionality is implemented in the kernel using defines
return DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xspr2: public Xher2<T> {
Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
// Templated-precision implementation of the routine
StatusCode DoSpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
void DoSpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
// The specific symmetric matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SYMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, 0, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, 0, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xsymv: public Xgemv<T> {
Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
// Templated-precision implementation of the routine
StatusCode DoSymv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoSymv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,16 +28,16 @@ Xsyr<T>::Xsyr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
void Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Specific Xsyr functionality is implemented in the kernel using defines
return DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
a_buffer, a_offset, a_ld);
DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
a_buffer, a_offset, a_ld);
}
// =================================================================================================

View File

@ -31,11 +31,11 @@ class Xsyr: public Xher<T,T> {
Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
// Templated-precision implementation of the routine
StatusCode DoSyr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
void DoSyr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================

View File

@ -28,18 +28,18 @@ Xsyr2<T>::Xsyr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
void Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Specific Xsyr2 functionality is implemented in the kernel using defines
return DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xsyr2: public Xher2<T> {
Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
// Templated-precision implementation of the routine
StatusCode DoSyr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
void DoSyr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================

View File

@ -29,17 +29,15 @@ Xtbmv<T>::Xtbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
try {
x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
} catch (...) { } // Continues: error-code is returned in MatVec
x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -52,20 +50,22 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
// The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TBMV define.
auto fast_kernels = false;
auto status = MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
a_buffer, a_offset, a_ld,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, false, k, 0);
// Returns the proper error code (renames vector Y to X)
switch(status) {
case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
default: return status;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
a_buffer, a_offset, a_ld,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, false, k, 0);
} catch (BLASError &e) {
// Returns the proper error code (renames vector Y to X)
switch (e.status()) {
case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
default: throw;
}
}
}

View File

@ -35,11 +35,11 @@ class Xtbmv: public Xgemv<T> {
Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
// Templated-precision implementation of the routine
StatusCode DoTbmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoTbmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -29,17 +29,15 @@ Xtpmv<T>::Xtpmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
try {
x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
} catch (...) { } // Continues: error-code is returned in MatVec
x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -52,20 +50,22 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
// The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TPMV define.
auto fast_kernels = false;
auto status = MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
ap_buffer, ap_offset, n,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, true, 0, 0);
// Returns the proper error code (renames vector Y to X)
switch(status) {
case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
default: return status;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
ap_buffer, ap_offset, n,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, true, 0, 0);
} catch (BLASError &e) {
// Returns the proper error code (renames vector Y to X)
switch (e.status()) {
case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
default: throw;
}
}
}

View File

@ -35,11 +35,11 @@ class Xtpmv: public Xgemv<T> {
Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
// Templated-precision implementation of the routine
StatusCode DoTpmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoTpmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -29,17 +29,15 @@ Xtrmv<T>::Xtrmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Creates a copy of X: a temporary scratch buffer
auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
try {
x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
} catch (...) { } // Continues: error-code is returned in MatVec
x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -52,20 +50,22 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
// The specific triangular matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_TRMV define.
auto fast_kernels = false;
auto status = MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
a_buffer, a_offset, a_ld,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, false, 0, 0);
// Returns the proper error code (renames vector Y to X)
switch(status) {
case StatusCode::kInvalidVectorY: return StatusCode::kInvalidVectorX;
case StatusCode::kInvalidIncrementY: return StatusCode::kInvalidIncrementX;
case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
default: return status;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
a_buffer, a_offset, a_ld,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, false, 0, 0);
} catch (BLASError &e) {
// Returns the proper error code (renames vector Y to X)
switch (e.status()) {
case StatusCode::kInvalidVectorY: throw BLASError(StatusCode::kInvalidVectorX, e.details());
case StatusCode::kInvalidIncrementY: throw BLASError(StatusCode::kInvalidIncrementX, e.details());
case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
default: throw;
}
}
}

View File

@ -35,11 +35,11 @@ class Xtrmv: public Xgemv<T> {
Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
// Templated-precision implementation of the routine
StatusCode DoTrmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoTrmv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -50,17 +50,17 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xgemm<T>::DoGemm(const Layout layout,
const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
void Xgemm<T>::DoGemm(const Layout layout,
const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed. Note
@ -99,12 +99,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// matrix A cannot be less than K when rotated, or less than M when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N when rotated, or less than M when not-rotated
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
if (ErrorIn(status)) { return status; }
TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
// Selects which version of GEMM to run
const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
@ -131,7 +128,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// requirements, but several pre and post-processing kernels take care of those. However, the
// overhead of these extra kernels might not be ideal for certain devices/arguments.
template <typename T>
StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
@ -142,8 +139,6 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated) {
auto status = StatusCode::kSuccess;
// Calculates the ceiled versions of m, n, and k
const auto m_ceiled = Ceil(m, db_["MWG"]);
const auto n_ceiled = Ceil(n, db_["NWG"]);
@ -158,109 +153,95 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
a_do_transpose == false && a_conjugate == false;
auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
b_do_transpose == false && b_conjugate == false;
auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
c_do_transpose == false;
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
a_do_transpose == false && a_conjugate == false;
auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
b_do_transpose == false && b_conjugate == false;
auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
c_do_transpose == false;
// Creates the temporary matrices
const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
// Creates the temporary matrices
const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
a_one_i, a_two_i, a_one_i, 0, a_temp,
ConstantOne<T>(), program,
true, a_do_transpose, a_conjugate);
eventWaitList.push_back(eventProcessA);
}
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
a_one_i, a_two_i, a_one_i, 0, a_temp,
ConstantOne<T>(), program,
true, a_do_transpose, a_conjugate);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
// As above, but now for matrix B
if (!b_no_temp) {
auto eventProcessB = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
b_one_i, b_two_i, b_one_i, 0, b_temp,
ConstantOne<T>(), program,
true, b_do_transpose, b_conjugate);
eventWaitList.push_back(eventProcessB);
}
// As above, but now for matrix B
if (!b_no_temp) {
auto eventProcessB = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
b_one_i, b_two_i, b_one_i, 0, b_temp,
ConstantOne<T>(), program,
true, b_do_transpose, b_conjugate);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessB);
}
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
auto eventProcessC = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
c_one_i, c_two_i, c_one_i, 0, c_temp,
ConstantOne<T>(), program,
true, c_do_transpose, false);
eventWaitList.push_back(eventProcessC);
}
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
c_one_i, c_two_i, c_one_i, 0, c_temp,
ConstantOne<T>(), program,
true, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
}
// Retrieves the Xgemm kernel from the compiled binary
auto kernel = Kernel(program, "Xgemm");
// Retrieves the Xgemm kernel from the compiled binary
try {
auto kernel = Kernel(program, "Xgemm");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_ceiled));
kernel.SetArgument(1, static_cast<int>(n_ceiled));
kernel.SetArgument(2, static_cast<int>(k_ceiled));
kernel.SetArgument(3, GetRealArg(alpha));
kernel.SetArgument(4, GetRealArg(beta));
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, b_temp());
kernel.SetArgument(7, c_temp());
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_ceiled));
kernel.SetArgument(1, static_cast<int>(n_ceiled));
kernel.SetArgument(2, static_cast<int>(k_ceiled));
kernel.SetArgument(3, GetRealArg(alpha));
kernel.SetArgument(4, GetRealArg(beta));
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, b_temp());
kernel.SetArgument(7, c_temp());
// Computes the global and local thread sizes
const auto global = std::vector<size_t>{
(c_one_i * db_["MDIMC"]) / db_["MWG"],
(c_two_i * db_["NDIMC"]) / db_["NWG"]
};
const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Computes the global and local thread sizes
const auto global = std::vector<size_t>{
(c_one_i * db_["MDIMC"]) / db_["MWG"],
(c_two_i * db_["NDIMC"]) / db_["NWG"]
};
const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
auto eventKernel = Event();
auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
// Launches the kernel
auto eventKernel = Event();
auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel if needed
if (!c_no_temp) {
eventWaitList.push_back(eventKernel);
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
c_one_i, c_two_i, c_one_i, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
}
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
// Runs the post-processing kernel if needed
if (!c_no_temp) {
eventWaitList.push_back(eventKernel);
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
c_one_i, c_two_i, c_one_i, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_do_transpose, false);
}
}
@ -268,7 +249,7 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
template <typename T>
StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
@ -281,46 +262,40 @@ StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Retrieves the proper XgemmDirect kernel from the compiled binary
try {
const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
(b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
auto kernel = Kernel(program, name);
const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
(b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
auto kernel = Kernel(program, name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m));
kernel.SetArgument(1, static_cast<int>(n));
kernel.SetArgument(2, static_cast<int>(k));
kernel.SetArgument(3, GetRealArg(alpha));
kernel.SetArgument(4, GetRealArg(beta));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, b_buffer());
kernel.SetArgument(9, static_cast<int>(b_offset));
kernel.SetArgument(10, static_cast<int>(b_ld));
kernel.SetArgument(11, c_buffer());
kernel.SetArgument(12, static_cast<int>(c_offset));
kernel.SetArgument(13, static_cast<int>(c_ld));
kernel.SetArgument(14, static_cast<int>(c_do_transpose));
kernel.SetArgument(15, static_cast<int>(a_conjugate));
kernel.SetArgument(16, static_cast<int>(b_conjugate));
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m));
kernel.SetArgument(1, static_cast<int>(n));
kernel.SetArgument(2, static_cast<int>(k));
kernel.SetArgument(3, GetRealArg(alpha));
kernel.SetArgument(4, GetRealArg(beta));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, b_buffer());
kernel.SetArgument(9, static_cast<int>(b_offset));
kernel.SetArgument(10, static_cast<int>(b_ld));
kernel.SetArgument(11, c_buffer());
kernel.SetArgument(12, static_cast<int>(c_offset));
kernel.SetArgument(13, static_cast<int>(c_ld));
kernel.SetArgument(14, static_cast<int>(c_do_transpose));
kernel.SetArgument(15, static_cast<int>(a_conjugate));
kernel.SetArgument(16, static_cast<int>(b_conjugate));
// Computes the global and local thread sizes
const auto m_ceiled = Ceil(m, db_["WGD"]);
const auto n_ceiled = Ceil(n, db_["WGD"]);
const auto global = std::vector<size_t>{
(m_ceiled * db_["MDIMCD"]) / db_["WGD"],
(n_ceiled * db_["NDIMCD"]) / db_["WGD"]
};
const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
// Computes the global and local thread sizes
const auto m_ceiled = Ceil(m, db_["WGD"]);
const auto n_ceiled = Ceil(n, db_["WGD"]);
const auto global = std::vector<size_t>{
(m_ceiled * db_["MDIMCD"]) / db_["WGD"],
(n_ceiled * db_["NDIMCD"]) / db_["WGD"]
};
const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
// Launches the kernel
auto status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -28,36 +28,36 @@ class Xgemm: public Routine {
Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
// Templated-precision implementation of the routine
StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
void DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
// Indirect version of GEMM (with pre and post-processing kernels)
void GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
// Indirect version of GEMM (with pre and post-processing kernels)
StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate,
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated);
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate,
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated);
// Direct version of GEMM (no pre and post-processing kernels)
StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate);
void GemmDirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate);
};
// =================================================================================================

View File

@ -29,7 +29,7 @@ Xhemm<T>::Xhemm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@ -38,15 +38,14 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
@ -55,73 +54,68 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
// Temporary buffer for a copy of the hermitian matrix
try {
auto temp_herm = Buffer<T>(context_, k*k);
auto temp_herm = Buffer<T>(context_, k*k);
// Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
// routine afterwards
// Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
// routine afterwards
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the hermitian-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
kernel.SetArgument(1, static_cast<int>(a_ld));
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, a_buffer());
kernel.SetArgument(4, static_cast<int>(k));
kernel.SetArgument(5, static_cast<int>(k));
kernel.SetArgument(6, static_cast<int>(0));
kernel.SetArgument(7, temp_herm());
// Uses the common padding kernel's thread configuration. This is allowed, since the
// hermitian-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
auto kernelEvent = Event();
RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
// Synchronize now: 'DoGemm' does not accept a list of events to wait for
kernelEvent.WaitForCompletion();
// Runs the regular Xgemm code with either "C := AB+C" or ...
if (side == Side::kLeft) {
DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
temp_herm, 0, k,
b_buffer, b_offset, b_ld,
beta,
c_buffer, c_offset, c_ld);
}
// ... with "C := BA+C". Note that A and B are now reversed.
else {
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the hermitian-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
kernel.SetArgument(1, static_cast<int>(a_ld));
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, a_buffer());
kernel.SetArgument(4, static_cast<int>(k));
kernel.SetArgument(5, static_cast<int>(k));
kernel.SetArgument(6, static_cast<int>(0));
kernel.SetArgument(7, temp_herm());
// Uses the common padding kernel's thread configuration. This is allowed, since the
// hermitian-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
auto kernelEvent = Event();
status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
// Synchronize now: 'DoGemm' does not accept a list of events to wait for
kernelEvent.WaitForCompletion();
// Runs the regular Xgemm code with either "C := AB+C" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
temp_herm, 0, k,
b_buffer, b_offset, b_ld,
beta,
c_buffer, c_offset, c_ld);
DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
b_buffer, b_offset, b_ld,
temp_herm, 0, k,
beta,
c_buffer, c_offset, c_ld);
} catch (BLASError &e) {
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
switch(e.status()) {
case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details());
case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details());
case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
default: throw;
}
// ... with "C := BA+C". Note that A and B are now reversed.
else {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
b_buffer, b_offset, b_ld,
temp_herm, 0, k,
beta,
c_buffer, c_offset, c_ld);
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
switch(status) {
case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
}
}
// Return the status of the Xgemm routine
return status;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
}
}
// =================================================================================================

View File

@ -37,13 +37,13 @@ class Xhemm: public Xgemm<T> {
Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
// Templated-precision implementation of the routine
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
void DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================

View File

@ -39,16 +39,16 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T, typename U>
StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
@ -71,12 +71,9 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
if (ErrorIn(status)) { return status; }
TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
@ -85,145 +82,128 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == true;
auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false && ab_conjugate == true;
// Determines whether or not temporary matrices are needed
auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == true;
auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false && ab_conjugate == true;
// Creates the temporary matrices
auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Creates the temporary matrices
auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Convert the arguments to complex versions
auto complex_beta = T{beta, static_cast<U>(0.0)};
// Convert the arguments to complex versions
auto complex_beta = T{beta, static_cast<U>(0.0)};
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
// fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a1_no_temp) {
auto eventProcessA1 = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
ConstantOne<T>(), program,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessA1);
}
if (!a2_no_temp) {
auto eventProcessA2 = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
ConstantOne<T>(), program,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessA2);
}
if (!b1_no_temp) {
auto eventProcessB1 = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
ConstantOne<T>(), program,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessB1);
}
if (!b2_no_temp) {
auto eventProcessB2 = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
ConstantOne<T>(), program,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessB2);
}
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
// fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a1_no_temp) {
auto eventProcessA1 = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
ConstantOne<T>(), program,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessA1);
if (ErrorIn(status)) { return status; }
}
if (!a2_no_temp) {
auto eventProcessA2 = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
ConstantOne<T>(), program,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessA2);
if (ErrorIn(status)) { return status; }
}
if (!b1_no_temp) {
auto eventProcessB1 = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
ConstantOne<T>(), program,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessB1);
if (ErrorIn(status)) { return status; }
}
if (!b2_no_temp) {
auto eventProcessB2 = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
ConstantOne<T>(), program,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessB2);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(complex_beta));
kernel.SetArgument(4, a1_temp());
kernel.SetArgument(5, b2_temp());
kernel.SetArgument(6, c_temp());
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(complex_beta));
kernel.SetArgument(4, a1_temp());
kernel.SetArgument(5, b2_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
auto eventKernel1 = Event();
RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
eventWaitList.push_back(eventKernel1);
// Launches the kernel
auto eventKernel1 = Event();
status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel1);
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
kernel.SetArgument(2, GetRealArg(conjugate_alpha));
kernel.SetArgument(3, GetRealArg(complex_one));
kernel.SetArgument(4, b1_temp());
kernel.SetArgument(5, a2_temp());
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
kernel.SetArgument(2, GetRealArg(conjugate_alpha));
kernel.SetArgument(3, GetRealArg(complex_one));
kernel.SetArgument(4, b1_temp());
kernel.SetArgument(5, a2_temp());
// Runs the kernel again
auto eventKernel2 = Event();
RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
eventWaitList.push_back(eventKernel2);
// Runs the kernel again
auto eventKernel2 = Event();
status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel2);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_rotated, false, upper, lower, true);
}
// =================================================================================================

View File

@ -30,13 +30,13 @@ class Xher2k: public Routine {
Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
// Templated-precision implementation of the routine
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
void DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================

View File

@ -39,7 +39,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T, typename U>
StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@ -47,7 +47,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
@ -70,10 +70,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
if (ErrorIn(status)) { return status; }
TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
@ -82,106 +80,92 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && a_conjugate == false;
auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && b_conjugate == false;
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && a_conjugate == false;
auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && b_conjugate == false;
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Convert the arguments to complex versions
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
auto complex_beta = T{beta, static_cast<U>(0.0)};
// Convert the arguments to complex versions
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
auto complex_beta = T{beta, static_cast<U>(0.0)};
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped. Two copies are created.
if (!a_no_temp) {
auto eventProcessA = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
true, a_rotated, a_conjugate);
eventWaitList.push_back(eventProcessA);
}
if (!b_no_temp) {
auto eventProcessB = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
true, a_rotated, b_conjugate);
eventWaitList.push_back(eventProcessB);
}
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped. Two copies are created.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
true, a_rotated, a_conjugate);
eventWaitList.push_back(eventProcessA);
if (ErrorIn(status)) { return status; }
}
if (!b_no_temp) {
auto eventProcessB = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
true, a_rotated, b_conjugate);
eventWaitList.push_back(eventProcessB);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, GetRealArg(complex_alpha));
kernel.SetArgument(3, GetRealArg(complex_beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, GetRealArg(complex_alpha));
kernel.SetArgument(3, GetRealArg(complex_beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
auto eventKernel = Event();
RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
eventWaitList.push_back(eventKernel);
// Launches the kernel
auto eventKernel = Event();
status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_rotated, false, upper, lower, true);
}
// =================================================================================================

View File

@ -30,12 +30,12 @@ class Xherk: public Routine {
Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
// Templated-precision implementation of the routine
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const U beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
void DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const U beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================

View File

@ -29,7 +29,7 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@ -38,15 +38,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
@ -55,73 +54,68 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
// Temporary buffer for a copy of the symmetric matrix
try {
auto temp_symm = Buffer<T>(context_, k*k);
auto temp_symm = Buffer<T>(context_, k*k);
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
// routine afterwards
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
// routine afterwards
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the symmetric-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
kernel.SetArgument(1, static_cast<int>(a_ld));
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, a_buffer());
kernel.SetArgument(4, static_cast<int>(k));
kernel.SetArgument(5, static_cast<int>(k));
kernel.SetArgument(6, static_cast<int>(0));
kernel.SetArgument(7, temp_symm());
// Uses the common padding kernel's thread configuration. This is allowed, since the
// symmetric-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
auto kernelEvent = Event();
RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
// Synchronize now: 'DoGemm' does not accept a list of events to wait for
kernelEvent.WaitForCompletion();
// Runs the regular Xgemm code with either "C := AB+C" or ...
if (side == Side::kLeft) {
DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
temp_symm, 0, k,
b_buffer, b_offset, b_ld,
beta,
c_buffer, c_offset, c_ld);
}
// ... with "C := BA+C". Note that A and B are now reversed.
else {
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the symmetric-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
kernel.SetArgument(1, static_cast<int>(a_ld));
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, a_buffer());
kernel.SetArgument(4, static_cast<int>(k));
kernel.SetArgument(5, static_cast<int>(k));
kernel.SetArgument(6, static_cast<int>(0));
kernel.SetArgument(7, temp_symm());
// Uses the common padding kernel's thread configuration. This is allowed, since the
// symmetric-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
auto kernelEvent = Event();
status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
// Synchronize now: 'DoGemm' does not accept a list of events to wait for
kernelEvent.WaitForCompletion();
// Runs the regular Xgemm code with either "C := AB+C" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
temp_symm, 0, k,
b_buffer, b_offset, b_ld,
beta,
c_buffer, c_offset, c_ld);
DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
b_buffer, b_offset, b_ld,
temp_symm, 0, k,
beta,
c_buffer, c_offset, c_ld);
} catch (BLASError &e) {
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
switch(e.status()) {
case StatusCode::kInvalidMatrixA: throw BLASError(StatusCode::kInvalidMatrixB, e.details());
case StatusCode::kInvalidMatrixB: throw BLASError(StatusCode::kInvalidMatrixA, e.details());
case StatusCode::kInvalidLeadDimA: throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
case StatusCode::kInvalidLeadDimB: throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
default: throw;
}
// ... with "C := BA+C". Note that A and B are now reversed.
else {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
b_buffer, b_offset, b_ld,
temp_symm, 0, k,
beta,
c_buffer, c_offset, c_ld);
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
switch(status) {
case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
}
}
// Return the status of the Xgemm routine
return status;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
}
}
// =================================================================================================

View File

@ -39,13 +39,13 @@ class Xsymm: public Xgemm<T> {
Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
// Templated-precision implementation of the routine
StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
void DoSymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================

View File

@ -39,7 +39,7 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@ -48,7 +48,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
@ -67,12 +67,9 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
if (ErrorIn(status)) { return status; }
TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
@ -81,114 +78,99 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false;
auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false;
// Determines whether or not temporary matrices are needed
auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false;
auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false;
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
// fill them up until they reach a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
true, ab_rotated, false);
eventWaitList.push_back(eventProcessA);
}
if (!b_no_temp) {
auto eventProcessB = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
true, ab_rotated, false);
eventWaitList.push_back(eventProcessB);
}
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
// fill them up until they reach a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
if (!b_no_temp) {
auto eventProcessB = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessB);
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
true, c_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
auto eventKernel1 = Event();
RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
eventWaitList.push_back(eventKernel1);
// Launches the kernel
auto eventKernel1 = Event();
status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel1);
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
kernel.SetArgument(3, GetRealArg(one));
kernel.SetArgument(4, b_temp());
kernel.SetArgument(5, a_temp());
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
kernel.SetArgument(3, GetRealArg(one));
kernel.SetArgument(4, b_temp());
kernel.SetArgument(5, a_temp());
// Runs the kernel again
auto eventKernel2 = Event();
RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
eventWaitList.push_back(eventKernel2);
// Runs the kernel again
auto eventKernel2 = Event();
status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel2);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_rotated, false, upper, lower, false);
}
// =================================================================================================

View File

@ -30,13 +30,13 @@ class Xsyr2k: public Routine {
Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
// Templated-precision implementation of the routine
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
void DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================

View File

@ -39,7 +39,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
// The main routine
//
// NOTE(review): this listing is a rendered diff without +/- markers and with indentation removed.
// Every changed statement therefore appears twice, in no fixed order:
//  - the pre-change form belongs to the StatusCode-returning signature and is recognisable by
//    `status = ...`, `if (ErrorIn(status)) { return status; }`, `return StatusCode::...` and the
//    two nested try/catch blocks;
//  - the post-change form belongs to the void signature and contains none of those.
// Lines starting with `@ -a,b +c,d @@` are hunk headers; source lines between hunks are not shown
// (the definitions of a_one, a_two, a_rotated, c_rotated and k_ceiled fall in those gaps).
//
// Behaviour of the post-change form that is visible here:
//  - throws BLASError(StatusCode::kInvalidDimension) when n or k is zero;
//  - calls TestMatrixA / TestMatrixC without inspecting a result, so they presumably throw on
//    failure - confirm against their definitions;
//  - `triangle` selects the "XgemmUpper" or "XgemmLower" kernel and sets the upper/lower flags of
//    the final PadCopyTransposeMatrix call;
//  - the pre-processing copies and the main kernel each signal a locally created Event that is
//    appended to eventWaitList; only the final post-processing copy is given event_.
template <typename T>
StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@ -47,7 +47,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
// (the `return` line is the pre-change form; the `throw` line raises the same status code as a
// BLASError in the post-change form)
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
@ -65,10 +65,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
// (the four `status` lines are the pre-change form, which propagates each test's status code;
// the two bare calls are the post-change form)
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
if (ErrorIn(status)) { return status; }
TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
@ -77,90 +75,76 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
// Pre-change form only: this outer try is closed by the catch-all near the end of the listing
// that returns StatusCode::kTempBufferAllocFailure.
try {
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false;
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false;
// Creates the temporary matrices
// (a_temp is initialised from a_buffer itself when a_no_temp holds, otherwise from a new
// k_ceiled*n_ceiled buffer; c_temp is always a new n_ceiled*n_ceiled buffer)
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
true, a_rotated, false);
eventWaitList.push_back(eventProcessA);
}
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
true, a_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
true, c_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
// Pre-change form only: this inner try is closed by the catch-all that returns
// StatusCode::kInvalidKernel.
try {
auto kernel = Kernel(program, kernel_name);
// NOTE(review): arguments 4 and 5 both receive a_temp - presumably the kernel takes two input
// matrices and SYRK supplies A for both; confirm against the kernel source.
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, c_temp());
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
auto eventKernel = Event();
RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
eventWaitList.push_back(eventKernel);
// Launches the kernel
auto eventKernel = Event();
status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
// Pre-change form only: the two catch-alls below close the inner and outer try blocks. Each one
// maps every exception type to a single fixed status code.
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_rotated, false, upper, lower, false);
}
// =================================================================================================

View File

@ -32,12 +32,12 @@ class Xsyrk: public Routine {
Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
// Templated-precision implementation of the routine
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the declaration appears
// twice. The first form reports its outcome through the returned StatusCode.
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
// Second form: identical parameter list, but nothing is returned. Per the commit description,
// failures are expected to surface as C++ exceptions instead of status codes.
//   layout, triangle, a_transpose - enum selectors for the operation
//   n, k                          - problem dimensions
//   alpha, beta                   - scalars of the element type T
//   a_/c_buffer, _offset, _ld     - one Buffer<T> per matrix plus its offset and (presumably)
//                                   leading dimension - TODO confirm units of offset/ld
void DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================

Some files were not shown because too many files have changed in this diff. Show More