Separated the host-to-device and device-to-host memory copies from the execution of the CBLAS reference code, for fair timing and code de-duplication
parent
a98c00a267
commit
b84d2296b8
|
@ -353,6 +353,54 @@ void PopulateVector(std::vector<half> &vector, std::mt19937 &mt, std::uniform_re
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T, typename U>
|
||||
void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
|
||||
Queue &queue, const std::vector<std::string> &names) {
|
||||
for (auto &name: names) {
|
||||
if (name == kBufVecX) {buffers_host.x_vec = std::vector<T>(args.x_size, static_cast<T>(0)); buffers.x_vec.Read(queue, args.x_size, buffers_host.x_vec); }
|
||||
else if (name == kBufVecY) { buffers_host.y_vec = std::vector<T>(args.y_size, static_cast<T>(0)); buffers.y_vec.Read(queue, args.y_size, buffers_host.y_vec); }
|
||||
else if (name == kBufMatA) { buffers_host.a_mat = std::vector<T>(args.a_size, static_cast<T>(0)); buffers.a_mat.Read(queue, args.a_size, buffers_host.a_mat); }
|
||||
else if (name == kBufMatB) { buffers_host.b_mat = std::vector<T>(args.b_size, static_cast<T>(0)); buffers.b_mat.Read(queue, args.b_size, buffers_host.b_mat); }
|
||||
else if (name == kBufMatC) { buffers_host.c_mat = std::vector<T>(args.c_size, static_cast<T>(0)); buffers.c_mat.Read(queue, args.c_size, buffers_host.c_mat); }
|
||||
else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector<T>(args.ap_size, static_cast<T>(0)); buffers.ap_mat.Read(queue, args.ap_size, buffers_host.ap_mat); }
|
||||
else if (name == kBufScalar) { buffers_host.scalar = std::vector<T>(args.scalar_size, static_cast<T>(0)); buffers.scalar.Read(queue, args.scalar_size, buffers_host.scalar); }
|
||||
else { throw std::runtime_error("Invalid buffer name"); }
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
|
||||
Queue &queue, const std::vector<std::string> &names) {
|
||||
for (auto &name: names) {
|
||||
if (name == kBufVecX) { buffers.x_vec.Write(queue, args.x_size, buffers_host.x_vec); }
|
||||
else if (name == kBufVecY) { buffers.y_vec.Write(queue, args.y_size, buffers_host.y_vec); }
|
||||
else if (name == kBufMatA) { buffers.a_mat.Write(queue, args.a_size, buffers_host.a_mat); }
|
||||
else if (name == kBufMatB) { buffers.b_mat.Write(queue, args.b_size, buffers_host.b_mat); }
|
||||
else if (name == kBufMatC) { buffers.c_mat.Write(queue, args.c_size, buffers_host.c_mat); }
|
||||
else if (name == kBufMatAP) { buffers.ap_mat.Write(queue, args.ap_size, buffers_host.ap_mat); }
|
||||
else if (name == kBufScalar) { buffers.scalar.Write(queue, args.scalar_size, buffers_host.scalar); }
|
||||
else { throw std::runtime_error("Invalid buffer name"); }
|
||||
}
|
||||
}
|
||||
|
||||
// Compiles the above functions
|
||||
template void DeviceToHost(const Arguments<half>&, Buffers<half>&, BuffersHost<half>&, Queue&, const std::vector<std::string>&);
|
||||
template void DeviceToHost(const Arguments<float>&, Buffers<float>&, BuffersHost<float>&, Queue&, const std::vector<std::string>&);
|
||||
template void DeviceToHost(const Arguments<double>&, Buffers<double>&, BuffersHost<double>&, Queue&, const std::vector<std::string>&);
|
||||
template void DeviceToHost(const Arguments<float>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
|
||||
template void DeviceToHost(const Arguments<double>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
|
||||
template void DeviceToHost(const Arguments<float2>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
|
||||
template void DeviceToHost(const Arguments<double2>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
|
||||
template void HostToDevice(const Arguments<half>&, Buffers<half>&, BuffersHost<half>&, Queue&, const std::vector<std::string>&);
|
||||
template void HostToDevice(const Arguments<float>&, Buffers<float>&, BuffersHost<float>&, Queue&, const std::vector<std::string>&);
|
||||
template void HostToDevice(const Arguments<double>&, Buffers<double>&, BuffersHost<double>&, Queue&, const std::vector<std::string>&);
|
||||
template void HostToDevice(const Arguments<float>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
|
||||
template void HostToDevice(const Arguments<double>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
|
||||
template void HostToDevice(const Arguments<float2>&, Buffers<float2>&, BuffersHost<float2>&, Queue&, const std::vector<std::string>&);
|
||||
template void HostToDevice(const Arguments<double2>&, Buffers<double2>&, BuffersHost<double2>&, Queue&, const std::vector<std::string>&);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Conversion between half and single-precision
|
||||
std::vector<float> HalfToFloatBuffer(const std::vector<half>& source) {
|
||||
auto result = std::vector<float>(source.size());
|
||||
|
|
|
@ -98,6 +98,15 @@ constexpr auto kArgHelp = "h";
|
|||
constexpr auto kArgQuiet = "q";
|
||||
constexpr auto kArgNoAbbreviations = "no_abbrv";
|
||||
|
||||
// The names by which individual buffers are selected, e.g. when copying between host and device
constexpr const char* kBufVecX = "X";         // selects the 'x_vec' buffer
constexpr const char* kBufVecY = "Y";         // selects the 'y_vec' buffer
constexpr const char* kBufMatA = "A";         // selects the 'a_mat' buffer
constexpr const char* kBufMatB = "B";         // selects the 'b_mat' buffer
constexpr const char* kBufMatC = "C";         // selects the 'c_mat' buffer
constexpr const char* kBufMatAP = "AP";       // selects the 'ap_mat' buffer
constexpr const char* kBufScalar = "Scalar";  // selects the 'scalar' buffer
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Converts a regular or complex type to its base type (e.g. float2 to float)
|
||||
|
@ -202,6 +211,16 @@ struct Buffers {
|
|||
Buffer<T> ap_mat;
|
||||
Buffer<T> scalar;
|
||||
};
|
||||
// Host-side storage for the buffers: one std::vector per buffer, using the same member names as
// the device-side 'Buffers' structure so that data can be copied between the two by name.
template <typename T>
struct BuffersHost {

  // Vectors
  std::vector<T> x_vec;
  std::vector<T> y_vec;

  // Matrices
  std::vector<T> a_mat;
  std::vector<T> b_mat;
  std::vector<T> c_mat;
  std::vector<T> ap_mat;

  // Scalar buffer
  std::vector<T> scalar;
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
@ -250,6 +269,18 @@ void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Copies buffers from the OpenCL device to the host
|
||||
template <typename T, typename U>
|
||||
void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
|
||||
Queue &queue, const std::vector<std::string> &names);
|
||||
|
||||
// Copies buffers from the host to the OpenCL device
|
||||
template <typename T, typename U>
|
||||
void HostToDevice(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
|
||||
Queue &queue, const std::vector<std::string> &names);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Conversion between half and single-precision
|
||||
std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
|
||||
void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
|
||||
|
|
|
@ -67,15 +67,17 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si
|
|||
kBetaValues(GetExampleScalars<U>(full_test_)),
|
||||
prepare_data_(prepare_data),
|
||||
run_routine_(run_routine),
|
||||
run_reference1_(run_reference1),
|
||||
run_reference2_(run_reference2),
|
||||
get_result_(get_result),
|
||||
get_index_(get_index),
|
||||
get_id1_(get_id1),
|
||||
get_id2_(get_id2) {
|
||||
|
||||
// Sets the reference to test against
|
||||
if (compare_clblas_) { run_reference_ = run_reference1; }
|
||||
else if (compare_cblas_) { run_reference_ = run_reference2; }
|
||||
else { throw std::runtime_error("Invalid configuration: no reference to test against"); }
|
||||
// Sanity check
|
||||
if (!compare_clblas_ && !compare_cblas_) {
|
||||
throw std::runtime_error("Invalid configuration: no reference to test against");
|
||||
}
|
||||
|
||||
// Computes the maximum sizes. This allows for a single set of input/output buffers.
|
||||
const auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
|
||||
|
@ -184,7 +186,9 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
|||
else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); }
|
||||
std::cout << std::flush;
|
||||
}
|
||||
const auto status1 = run_reference_(args, buffers1, queue_);
|
||||
auto status1 = StatusCode::kSuccess;
|
||||
if (compare_clblas_) { status1 = run_reference1_(args, buffers1, queue_); }
|
||||
else if (compare_cblas_) { status1 = run_reference2_(args, buffers1, queue_); }
|
||||
|
||||
// Tests for equality of the two status codes
|
||||
if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; }
|
||||
|
@ -305,7 +309,9 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
|
|||
else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); }
|
||||
std::cout << std::flush;
|
||||
}
|
||||
const auto status1 = run_reference_(args, buffers1, queue_);
|
||||
auto status1 = StatusCode::kSuccess;
|
||||
if (compare_clblas_) { status1 = run_reference1_(args, buffers1, queue_); }
|
||||
else if (compare_cblas_) { status1 = run_reference2_(args, buffers1, queue_); }
|
||||
|
||||
// Tests for equality of the two status codes
|
||||
if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; }
|
||||
|
|
|
@ -109,33 +109,48 @@ class TestBlas: public Tester<T,U> {
|
|||
std::vector<T> scalar_source_;
|
||||
|
||||
// The routine-specific functions passed to the tester
|
||||
DataPrepare prepare_data_;
|
||||
Routine run_routine_;
|
||||
Routine run_reference_;
|
||||
ResultGet get_result_;
|
||||
ResultIndex get_index_;
|
||||
ResultIterator get_id1_;
|
||||
ResultIterator get_id2_;
|
||||
const DataPrepare prepare_data_;
|
||||
const Routine run_routine_;
|
||||
const Routine run_reference1_;
|
||||
const Routine run_reference2_;
|
||||
const ResultGet get_result_;
|
||||
const ResultIndex get_index_;
|
||||
const ResultIterator get_id1_;
|
||||
const ResultIterator get_id2_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Bogus reference function, in case a comparison library is not available
|
||||
template <typename T, typename U, typename BufferType>
|
||||
static StatusCode ReferenceNotAvailable(const Arguments<U> &, BufferType &, Queue &) {
|
||||
return StatusCode::kNotImplemented;
|
||||
}
|
||||
|
||||
// The interface to the correctness tester. This is a separate function in the header such that it
|
||||
// is automatically compiled for each routine, templated by the parameter "C".
|
||||
template <typename C, typename T, typename U>
|
||||
size_t RunTests(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||
auto command_line_args = RetrieveCommandLineArguments(argc, argv);
|
||||
|
||||
// Sets the reference to test against
|
||||
#if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS)
|
||||
const auto reference_routine1 = C::RunReference1; // clBLAS
|
||||
const auto reference_routine2 = C::RunReference2; // CBLAS
|
||||
#elif CLBLAST_REF_CLBLAS
|
||||
const auto reference_routine1 = C::RunReference1; // clBLAS
|
||||
const auto reference_routine2 = C::RunReference1; // not used, dummy
|
||||
#elif CLBLAST_REF_CBLAS
|
||||
const auto reference_routine1 = C::RunReference2; // not used, dummy
|
||||
const auto reference_routine2 = C::RunReference2; // CBLAS
|
||||
// Sets the clBLAS reference to test against
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
auto reference_routine1 = C::RunReference1; // clBLAS when available
|
||||
#else
|
||||
auto reference_routine1 = ReferenceNotAvailable<T,U,Buffers<T>>;
|
||||
#endif
|
||||
|
||||
// Sets the CBLAS reference to test against
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
auto reference_routine2 = [](const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) -> StatusCode {
|
||||
auto buffers_host = BuffersHost<T>();
|
||||
DeviceToHost(args, buffers, buffers_host, queue, C::BuffersIn());
|
||||
C::RunReference2(args, buffers_host, queue);
|
||||
HostToDevice(args, buffers, buffers_host, queue, C::BuffersOut());
|
||||
return StatusCode::kSuccess;
|
||||
};
|
||||
#else
|
||||
auto reference_routine2 = ReferenceNotAvailable<T,U,Buffers<T>>;
|
||||
#endif
|
||||
|
||||
// Non-BLAS routines cannot be fully tested
|
||||
|
|
|
@ -29,13 +29,17 @@ template <typename T, typename U> const int Client<T,U>::kSeed = 42; // fixed se
|
|||
// Constructor
|
||||
template <typename T, typename U>
|
||||
Client<T,U>::Client(const Routine run_routine,
|
||||
const Routine run_reference1, const Routine run_reference2,
|
||||
const Reference1 run_reference1, const Reference2 run_reference2,
|
||||
const std::vector<std::string> &options,
|
||||
const std::vector<std::string> &buffers_in,
|
||||
const std::vector<std::string> &buffers_out,
|
||||
const GetMetric get_flops, const GetMetric get_bytes):
|
||||
run_routine_(run_routine),
|
||||
run_reference1_(run_reference1),
|
||||
run_reference2_(run_reference2),
|
||||
options_(options),
|
||||
buffers_in_(buffers_in),
|
||||
buffers_out_(buffers_out),
|
||||
get_flops_(get_flops),
|
||||
get_bytes_(get_bytes) {
|
||||
}
|
||||
|
@ -222,7 +226,10 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
|||
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
|
||||
}
|
||||
if (args.compare_cblas) {
|
||||
auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS");
|
||||
auto buffers_host = BuffersHost<T>();
|
||||
DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
|
||||
auto ms_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS");
|
||||
HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
|
||||
timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas));
|
||||
}
|
||||
|
||||
|
@ -252,9 +259,10 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
|||
// timing is performed using the milliseconds chrono functions. The function returns the minimum
|
||||
// value found in the vector of timing results. The return value is in milliseconds.
|
||||
template <typename T, typename U>
|
||||
template <typename BufferType, typename RoutineType>
|
||||
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
|
||||
Buffers<T> &buffers, Queue &queue,
|
||||
Routine run_blas, const std::string &library_name) {
|
||||
BufferType &buffers, Queue &queue,
|
||||
RoutineType run_blas, const std::string &library_name) {
|
||||
auto status = StatusCode::kSuccess;
|
||||
|
||||
// Do an optional warm-up to omit compilation times and initialisations from the measurements
|
||||
|
|
|
@ -44,12 +44,15 @@ class Client {
|
|||
|
||||
// Shorthand for the routine-specific functions passed to the tester
|
||||
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||
using Reference1 = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||
using Reference2 = std::function<StatusCode(const Arguments<U>&, BuffersHost<T>&, Queue&)>;
|
||||
using SetMetric = std::function<void(Arguments<U>&)>;
|
||||
using GetMetric = std::function<size_t(const Arguments<U>&)>;
|
||||
|
||||
// The constructor
|
||||
Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2,
|
||||
Client(const Routine run_routine, const Reference1 run_reference1, const Reference2 run_reference2,
|
||||
const std::vector<std::string> &options,
|
||||
const std::vector<std::string> &buffers_in, const std::vector<std::string> &buffers_out,
|
||||
const GetMetric get_flops, const GetMetric get_bytes);
|
||||
|
||||
// Parses all command-line arguments, filling in the arguments structure. If no command-line
|
||||
|
@ -66,8 +69,9 @@ class Client {
|
|||
private:
|
||||
|
||||
// Runs a function a given number of times and returns the execution time of the shortest instance
|
||||
double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers,
|
||||
Queue &queue, Routine run_blas, const std::string &library_name);
|
||||
template <typename BufferType, typename RoutineType>
|
||||
double TimedExecution(const size_t num_runs, const Arguments<U> &args, BufferType &buffers,
|
||||
Queue &queue, RoutineType run_blas, const std::string &library_name);
|
||||
|
||||
// Prints the header of a performance-data table
|
||||
void PrintTableHeader(const Arguments<U>& args);
|
||||
|
@ -78,9 +82,11 @@ class Client {
|
|||
|
||||
// The routine-specific functions passed to the tester
|
||||
const Routine run_routine_;
|
||||
const Routine run_reference1_;
|
||||
const Routine run_reference2_;
|
||||
const Reference1 run_reference1_;
|
||||
const Reference2 run_reference2_;
|
||||
const std::vector<std::string> options_;
|
||||
const std::vector<std::string> buffers_in_;
|
||||
const std::vector<std::string> buffers_out_;
|
||||
const GetMetric get_flops_;
|
||||
const GetMetric get_bytes_;
|
||||
|
||||
|
@ -91,8 +97,8 @@ class Client {
|
|||
// =================================================================================================
|
||||
|
||||
// Bogus reference function, in case a comparison library is not available
|
||||
template <typename T, typename U>
|
||||
static StatusCode ReferenceNotAvailable(const Arguments<U> &, Buffers<T> &, Queue &) {
|
||||
template <typename T, typename U, typename BufferType>
|
||||
static StatusCode ReferenceNotAvailable(const Arguments<U> &, BufferType &, Queue &) {
|
||||
return StatusCode::kNotImplemented;
|
||||
}
|
||||
|
||||
|
@ -105,17 +111,17 @@ void RunClient(int argc, char *argv[]) {
|
|||
#ifdef CLBLAST_REF_CLBLAS
|
||||
auto reference1 = C::RunReference1; // clBLAS when available
|
||||
#else
|
||||
auto reference1 = ReferenceNotAvailable<T,U>;
|
||||
auto reference1 = ReferenceNotAvailable<T,U,Buffers<T>>;
|
||||
#endif
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
auto reference2 = C::RunReference2; // CBLAS when available
|
||||
#else
|
||||
auto reference2 = ReferenceNotAvailable<T,U>;
|
||||
auto reference2 = ReferenceNotAvailable<T,U,BuffersHost<T>>;
|
||||
#endif
|
||||
|
||||
// Creates a new client
|
||||
auto client = Client<T,U>(C::RunRoutine, reference1, reference2, C::GetOptions(),
|
||||
C::GetFlops, C::GetBytes);
|
||||
C::BuffersIn(), C::BuffersOut(), C::GetFlops, C::GetBytes);
|
||||
|
||||
// Simple command line argument parser with defaults
|
||||
auto args = client.ParseArguments(argc, argv, C::BLASLevel(),
|
||||
|
|
|
@ -43,6 +43,8 @@ class TestXamax {
|
|||
kArgXInc,
|
||||
kArgXOffset, kArgImaxOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufScalar}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -101,15 +103,10 @@ class TestXamax {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXamax(args.n,
|
||||
scalar_cpu, args.imax_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers_host.scalar, args.imax_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -43,6 +43,8 @@ class TestXasum {
|
|||
kArgXInc,
|
||||
kArgXOffset, kArgAsumOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufScalar}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -101,15 +103,10 @@ class TestXasum {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXasum(args.n,
|
||||
scalar_cpu, args.asum_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers_host.scalar, args.asum_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -44,6 +44,8 @@ class TestXaxpy {
|
|||
kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -102,15 +104,10 @@ class TestXaxpy {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXaxpy(args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -43,6 +43,8 @@ class TestXcopy {
|
|||
kArgXInc, kArgYInc,
|
||||
kArgXOffset, kArgYOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -101,15 +103,10 @@ class TestXcopy {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXcopy(args.n,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -43,6 +43,8 @@ class TestXdot {
|
|||
kArgXInc, kArgYInc,
|
||||
kArgXOffset, kArgYOffset, kArgDotOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufScalar}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -107,18 +109,11 @@ class TestXdot {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXdot(args.n,
|
||||
scalar_cpu, args.dot_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers_host.scalar, args.dot_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -43,6 +43,8 @@ class TestXdotc {
|
|||
kArgXInc, kArgYInc,
|
||||
kArgXOffset, kArgYOffset, kArgDotOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufScalar}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -107,18 +109,11 @@ class TestXdotc {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXdotc(args.n,
|
||||
scalar_cpu, args.dot_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers_host.scalar, args.dot_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -43,6 +43,8 @@ class TestXdotu {
|
|||
kArgXInc, kArgYInc,
|
||||
kArgXOffset, kArgYOffset, kArgDotOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufScalar}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufScalar}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -107,18 +109,11 @@ class TestXdotu {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXdotu(args.n,
|
||||
scalar_cpu, args.dot_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers_host.scalar, args.dot_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -43,6 +43,8 @@ class TestXnrm2 {
|
|||
kArgXInc,
|
||||
kArgXOffset, kArgNrm2Offset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufScalar}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufScalar}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -101,15 +103,10 @@ class TestXnrm2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXnrm2(args.n,
|
||||
scalar_cpu, args.nrm2_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers_host.scalar, args.nrm2_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -44,6 +44,8 @@ class TestXscal {
|
|||
kArgXOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecX}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -96,12 +98,9 @@ class TestXscal {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXscal, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using x_vec_cpu are the pre-change (removed) variant: it Read() the buffer
// into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXscal(args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
// Argument of the post-change variant (member of buffers_host replaces the *_cpu copy):
buffers_host.x_vec, args.x_offset, args.x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -43,6 +43,8 @@ class TestXswap {
|
|||
kArgXInc, kArgYInc,
|
||||
kArgXOffset, kArgYOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecX, kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -101,16 +103,10 @@ class TestXswap {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXswap, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() both results back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXswap(args.n,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXgbmv {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -118,20 +120,13 @@ class TestXgbmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXgbmv, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXgbmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.m, args.n, args.kl, args.ku, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXgemv {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -118,20 +120,13 @@ class TestXgemv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXgemv, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXgemv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXger {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatA}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -113,19 +115,12 @@ class TestXger {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXger, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXger(convertToCBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc,
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXgerc {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatA}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -113,19 +115,12 @@ class TestXgerc {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXgerc, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXgerc(convertToCBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc,
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXgeru {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatA}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -113,19 +115,12 @@ class TestXgeru {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXgeru, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXgeru(convertToCBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc,
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXhbmv {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXhbmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXhbmv, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXhbmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.kl, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXhemv {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXhemv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXhemv, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXhemv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXher {
|
|||
kArgAOffset, kArgXOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatA}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<U> &args) {
|
||||
|
@ -106,17 +108,12 @@ class TestXher {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXher, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
// Note that the arguments are typed Arguments<U> while the buffers hold elements of type T.
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) {
|
||||
cblasXher(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXher2 {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatA}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXher2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXher2, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXher2(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc,
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXhpmv {
|
|||
kArgAPOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXhpmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXhpmv, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXhpmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
ap_mat_cpu, args.ap_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.ap_mat, args.ap_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXhpr {
|
|||
kArgAPOffset, kArgXOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatAP}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<U> &args) {
|
||||
|
@ -106,17 +108,12 @@ class TestXhpr {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXhpr, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
// Note that the arguments are typed Arguments<U> while the buffers hold elements of type T.
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) {
|
||||
cblasXhpr(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
ap_mat_cpu, args.ap_offset);
|
||||
buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.ap_mat, args.ap_offset);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXhpr2 {
|
|||
kArgAPOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatAP}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXhpr2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXhpr2, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXhpr2(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
ap_mat_cpu, args.ap_offset);
|
||||
buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc,
|
||||
buffers_host.ap_mat, args.ap_offset);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXsbmv {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXsbmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXsbmv, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXsbmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.kl, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXspmv {
|
|||
kArgAPOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXspmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXspmv, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXspmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
ap_mat_cpu, args.ap_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.ap_mat, args.ap_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXspr {
|
|||
kArgAPOffset, kArgXOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatAP}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -106,17 +108,12 @@ class TestXspr {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXspr, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXspr(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
ap_mat_cpu, args.ap_offset);
|
||||
buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.ap_mat, args.ap_offset);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXspr2 {
|
|||
kArgAPOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatAP}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXspr2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXspr2, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXspr2(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
ap_mat_cpu, args.ap_offset);
|
||||
buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc,
|
||||
buffers_host.ap_mat, args.ap_offset);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXsymv {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXsymv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXsymv, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXsymv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXsyr {
|
|||
kArgAOffset, kArgXOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatA}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -106,17 +108,12 @@ class TestXsyr {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
// RunReference2 for cblasXsyr, shown as a merged diff. This first signature (Buffers<T> + Queue)
// and the statements using the *_cpu vectors are the pre-change (removed) variant: it Read()
// each buffer into a temporary host vector, ran the CBLAS routine, and Write() the result back.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
// Post-change (added) variant: receives the host-side buffers directly and leaves the Queue
// parameter unnamed, so no Read()/Write() calls remain inside the reference call.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXsyr(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
// Arguments of the post-change variant (members of buffers_host replace the *_cpu copies):
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXsyr2 {
|
|||
kArgAOffset, kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatA}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,13 @@ class TestXsyr2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXsyr2(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc,
|
||||
buffers_host.y_vec, args.y_offset, args.y_inc,
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -44,6 +44,8 @@ class TestXtbmv {
|
|||
kArgALeadDim, kArgXInc,
|
||||
kArgAOffset, kArgXOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecX}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -107,19 +109,14 @@ class TestXtbmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXtbmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.diagonal),
|
||||
args.n, args.kl,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -44,6 +44,8 @@ class TestXtpmv {
|
|||
kArgXInc,
|
||||
kArgAPOffset, kArgXOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatAP, kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecX}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -107,19 +109,14 @@ class TestXtpmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXtpmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.diagonal),
|
||||
args.n,
|
||||
ap_mat_cpu, args.ap_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers_host.ap_mat, args.ap_offset,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -44,6 +44,8 @@ class TestXtrmv {
|
|||
kArgALeadDim, kArgXInc,
|
||||
kArgAOffset, kArgXOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecX}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -107,19 +109,14 @@ class TestXtrmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXtrmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.diagonal),
|
||||
args.n,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -44,6 +44,8 @@ class TestXtrsv {
|
|||
kArgALeadDim, kArgXInc,
|
||||
kArgAOffset, kArgXOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufVecX}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecX}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
|
@ -122,19 +124,14 @@ class TestXtrsv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXtrsv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.diagonal),
|
||||
args.n,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.x_vec, args.x_offset, args.x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXgemm {
|
|||
kArgAOffset, kArgBOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -121,21 +123,14 @@ class TestXgemm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXgemm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.b_transpose),
|
||||
args.m, args.n, args.k, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers_host.c_mat, args.c_offset, args.c_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXhemm {
|
|||
kArgAOffset, kArgBOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -121,21 +123,14 @@ class TestXhemm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXhemm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers_host.c_mat, args.c_offset, args.c_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXher2k {
|
|||
kArgAOffset, kArgBOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<U> &args) {
|
||||
|
@ -121,22 +123,15 @@ class TestXher2k {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) {
|
||||
auto alpha2 = T{args.alpha, args.alpha};
|
||||
cblasXher2k(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.n, args.k, alpha2,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers_host.c_mat, args.c_offset, args.c_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXherk {
|
|||
kArgAOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<U> &args) {
|
||||
|
@ -110,18 +112,13 @@ class TestXherk {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<U> &args, BuffersHost<T> &buffers_host, Queue&) {
|
||||
cblasXherk(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.n, args.k, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld, args.beta,
|
||||
buffers_host.c_mat, args.c_offset, args.c_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXsymm {
|
|||
kArgAOffset, kArgBOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -121,21 +123,14 @@ class TestXsymm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXsymm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers_host.c_mat, args.c_offset, args.c_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXsyr2k {
|
|||
kArgAOffset, kArgBOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -119,21 +121,14 @@ class TestXsyr2k {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXsyr2k(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.n, args.k, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers_host.c_mat, args.c_offset, args.c_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXsyrk {
|
|||
kArgAOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -110,18 +112,13 @@ class TestXsyrk {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXsyrk(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.n, args.k, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld, args.beta,
|
||||
buffers_host.c_mat, args.c_offset, args.c_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXtrmm {
|
|||
kArgAOffset, kArgBOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatB}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -112,20 +114,15 @@ class TestXtrmm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXtrmm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.diagonal),
|
||||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld);
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.b_mat, args.b_offset, args.b_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -47,6 +47,8 @@ class TestXtrsm {
|
|||
kArgAOffset, kArgBOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatB}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -124,20 +126,15 @@ class TestXtrsm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
cblasXtrsm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.diagonal),
|
||||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld);
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
buffers_host.a_mat, args.a_offset, args.a_ld,
|
||||
buffers_host.b_mat, args.b_offset, args.b_ld);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXaxpyBatched {
|
|||
kArgXInc, kArgYInc,
|
||||
kArgBatchCount, kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufVecY}; }
|
||||
|
||||
// Helper for the sizes per batch
|
||||
static size_t PerBatchSizeX(const Arguments<T> &args) { return args.n * args.x_inc; }
|
||||
|
@ -123,17 +125,12 @@ class TestXaxpyBatched {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
cblasXaxpy(args.n, args.alphas[batch],
|
||||
x_vec_cpu, args.x_offsets[batch], args.x_inc,
|
||||
y_vec_cpu, args.y_offsets[batch], args.y_inc);
|
||||
buffers_host.x_vec, args.x_offsets[batch], args.x_inc,
|
||||
buffers_host.y_vec, args.y_offsets[batch], args.y_inc);
|
||||
}
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -45,6 +45,8 @@ class TestXgemmBatched {
|
|||
kArgAOffset, kArgBOffset, kArgCOffset,
|
||||
kArgBatchCount, kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
||||
// Helper for the sizes per batch
|
||||
static size_t PerBatchSizeA(const Arguments<T> &args) {
|
||||
|
@ -152,23 +154,16 @@ class TestXgemmBatched {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue &) {
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
cblasXgemm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.b_transpose),
|
||||
args.m, args.n, args.k, args.alphas[batch],
|
||||
a_mat_cpu, args.a_offsets[batch], args.a_ld,
|
||||
b_mat_cpu, args.b_offsets[batch], args.b_ld, args.betas[batch],
|
||||
c_mat_cpu, args.c_offsets[batch], args.c_ld);
|
||||
buffers_host.a_mat, args.a_offsets[batch], args.a_ld,
|
||||
buffers_host.b_mat, args.b_offsets[batch], args.b_ld, args.betas[batch],
|
||||
buffers_host.c_mat, args.c_offsets[batch], args.c_ld);
|
||||
}
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -25,17 +25,10 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
|
||||
const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) ||
|
||||
(args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor));
|
||||
|
||||
// Data transfer from OpenCL to std::vector
|
||||
std::vector<T> a_mat_cpu(args.a_size, T{0.0});
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
|
||||
// Creates the output buffer
|
||||
std::vector<T> b_mat_cpu(args.b_size, T{0.0});
|
||||
|
||||
// Helper variables
|
||||
const auto block_size = args.m;
|
||||
const auto num_blocks = CeilDiv(args.n, block_size);
|
||||
|
@ -60,11 +53,11 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu
|
|||
auto a_value = T{1.0};
|
||||
if (args.diagonal == Diagonal::kNonUnit) {
|
||||
if (i + block_id * block_size < args.n) {
|
||||
if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; }
|
||||
a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset];
|
||||
if (buffers_host.a_mat[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; }
|
||||
a_value = T{1.0} / buffers_host.a_mat[i * a_ld + i + a_offset];
|
||||
}
|
||||
}
|
||||
b_mat_cpu[i * b_ld + i + b_offset] = a_value;
|
||||
buffers_host.b_mat[i * b_ld + i + b_offset] = a_value;
|
||||
}
|
||||
|
||||
// Inverts the upper triangle row by row
|
||||
|
@ -75,11 +68,11 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu
|
|||
for (auto k = i + 1; k <= j; ++k) {
|
||||
auto a_value = T{0.0};
|
||||
if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
|
||||
a_value = a_mat_cpu[k * a_ld + i + a_offset];
|
||||
a_value = buffers_host.a_mat[k * a_ld + i + a_offset];
|
||||
}
|
||||
sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
|
||||
sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset];
|
||||
}
|
||||
b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
|
||||
buffers_host.b_mat[j * b_ld + i + b_offset] = - sum * buffers_host.b_mat[i * b_ld + i + b_offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -92,35 +85,32 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu
|
|||
for (auto k = j; k < i; ++k) {
|
||||
auto a_value = T{0.0};
|
||||
if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
|
||||
a_value = a_mat_cpu[k * a_ld + i + a_offset];
|
||||
a_value = buffers_host.a_mat[k * a_ld + i + a_offset];
|
||||
}
|
||||
sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
|
||||
sum += a_value * buffers_host.b_mat[j * b_ld + k + b_offset];
|
||||
}
|
||||
b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
|
||||
buffers_host.b_mat[j * b_ld + i + b_offset] = - sum * buffers_host.b_mat[i * b_ld + i + b_offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Data transfer back to OpenCL
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Half-precision version calling the above reference implementation after conversions
|
||||
template <>
|
||||
StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) {
|
||||
auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue());
|
||||
auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue());
|
||||
auto dummy = clblast::Buffer<float>(0);
|
||||
auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
|
||||
StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) {
|
||||
auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat);
|
||||
auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat);
|
||||
auto dummy = std::vector<float>(0);
|
||||
auto buffers2 = BuffersHost<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
|
||||
auto args2 = Arguments<float>();
|
||||
args2.a_size = args.a_size; args2.b_size = args.b_size;
|
||||
args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n;
|
||||
args2.a_offset = args.a_offset;
|
||||
args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal;
|
||||
auto status = RunReference(args2, buffers2, queue);
|
||||
FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue());
|
||||
auto status = RunReference(args2, buffers2);
|
||||
FloatToHalfBuffer(buffers_host.b_mat, b_buffer2);
|
||||
return status;
|
||||
}
|
||||
|
||||
|
@ -140,6 +130,8 @@ class TestXinvert {
|
|||
kArgLayout, kArgTriangle, kArgDiagonal,
|
||||
kArgALeadDim, kArgAOffset};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatB}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -190,11 +182,15 @@ class TestXinvert {
|
|||
// Describes how to run a naive version of the routine (for correctness/performance comparison).
|
||||
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers, queue);
|
||||
auto buffers_host = BuffersHost<T>();
|
||||
DeviceToHost(args, buffers, buffers_host, queue, BuffersIn());
|
||||
const auto status = RunReference(args, buffers_host);
|
||||
HostToDevice(args, buffers, buffers_host, queue, BuffersOut());
|
||||
return status;
|
||||
}
|
||||
|
||||
// Reference run on data that already lives on the host: forwards straight to the host-side
// reference without any device transfers. The queue parameter is unnamed because it is unused here.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) {
  return RunReference(args, buffers_host);
}
|
||||
|
||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||
|
|
|
@ -23,13 +23,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
|
||||
// Data transfer from OpenCL to std::vector
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
|
||||
|
||||
// Checking for invalid arguments
|
||||
const auto a_rotated = (args.layout == Layout::kRowMajor);
|
||||
|
@ -40,8 +34,8 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu
|
|||
if ((args.m == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; }
|
||||
if ((args.a_ld < args.m && !a_rotated) || (args.a_ld < args.n && a_rotated)) { return StatusCode::kInvalidLeadDimA; }
|
||||
if ((args.b_ld < args.m && !b_rotated) || (args.b_ld < args.n && b_rotated)) { return StatusCode::kInvalidLeadDimB; }
|
||||
if (buffers.a_mat.GetSize() < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; }
|
||||
if (buffers.b_mat.GetSize() < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; }
|
||||
if (buffers_host.a_mat.size() * sizeof(T) < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; }
|
||||
if (buffers_host.b_mat.size() * sizeof(T) < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; }
|
||||
|
||||
// Matrix copy, scaling, and/or transpose
|
||||
for (auto id1 = size_t{0}; id1 < args.m; ++id1) {
|
||||
|
@ -52,30 +46,27 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu
|
|||
const auto b_two = (b_rotated) ? id1 : id2;
|
||||
const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
|
||||
const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
|
||||
b_mat_cpu[b_index] = args.alpha * a_mat_cpu[a_index];
|
||||
buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index];
|
||||
}
|
||||
}
|
||||
|
||||
// Data transfer back to OpenCL
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Half-precision version calling the above reference implementation after conversions
|
||||
template <>
|
||||
StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) {
|
||||
auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue());
|
||||
auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue());
|
||||
auto dummy = clblast::Buffer<float>(0);
|
||||
auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
|
||||
StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) {
|
||||
auto a_buffer2 = HalfToFloatBuffer(buffers_host.a_mat);
|
||||
auto b_buffer2 = HalfToFloatBuffer(buffers_host.b_mat);
|
||||
auto dummy = std::vector<float>(0);
|
||||
auto buffers2 = BuffersHost<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
|
||||
auto args2 = Arguments<float>();
|
||||
args2.a_size = args.a_size; args2.b_size = args.b_size;
|
||||
args2.a_ld = args.a_ld; args2.b_ld = args.b_ld; args2.m = args.m; args2.n = args.n;
|
||||
args2.a_offset = args.a_offset; args2.b_offset = args.b_offset;
|
||||
args2.layout = args.layout; args2.a_transpose = args.a_transpose;
|
||||
args2.alpha = HalfToFloat(args.alpha);
|
||||
auto status = RunReference(args2, buffers2, queue);
|
||||
FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue());
|
||||
auto status = RunReference(args2, buffers2);
|
||||
FloatToHalfBuffer(buffers_host.b_mat, b_buffer2);
|
||||
return status;
|
||||
}
|
||||
|
||||
|
@ -97,6 +88,8 @@ class TestXomatcopy {
|
|||
kArgAOffset, kArgBOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatB}; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
|
@ -148,11 +141,15 @@ class TestXomatcopy {
|
|||
// Describes how to run a naive version of the routine (for correctness/performance comparison).
|
||||
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
|
||||
// Reference run on device buffers. The host-side reference only works on host vectors, so this
// wrapper performs the transfers around it:
//   1. copy the buffers named by BuffersIn() from the device into a fresh BuffersHost,
//   2. execute the host-side reference on those vectors,
//   3. copy the buffers named by BuffersOut() back to the device.
// Returns the status code reported by the host-side reference.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto buffers_host = BuffersHost<T>();
  DeviceToHost(args, buffers, buffers_host, queue, BuffersIn());
  const auto status = RunReference(args, buffers_host);
  HostToDevice(args, buffers, buffers_host, queue, BuffersOut());
  return status;
}
|
||||
|
||||
// Reference run on data that already lives on the host: forwards straight to the host-side
// reference without any device transfers. The queue parameter is unnamed because it is unused here.
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) {
  return RunReference(args, buffers_host);
}
|
||||
|
||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||
|
|
Loading…
Reference in New Issue