From fa0a9c689fc21a2a24aeadf82ae0acdf6d8bf831 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 8 Mar 2017 20:10:20 +0100 Subject: [PATCH] Make batched routines based on offsets instead of a vector of cl_mem objects - undoing many earlier changes --- doc/clblast.md | 34 ++-- include/clblast.h | 4 +- include/clblast_c.h | 20 +- scripts/generator/generator/routine.py | 31 ++- src/clblast.cpp | 36 ++-- src/clblast_c.cpp | 40 ++-- src/clpp11.hpp | 3 - src/kernels/level1/xaxpy.opencl | 22 +- src/routines/levelx/xaxpybatched.cpp | 59 +++++- src/routines/levelx/xaxpybatched.hpp | 11 +- src/utilities/utilities.hpp | 6 + test/correctness/misc/override_parameters.cpp | 2 +- test/correctness/testblas.cpp | 191 ++++++++---------- test/correctness/testblas.hpp | 2 +- test/performance/client.cpp | 53 +++-- test/performance/client.hpp | 4 +- test/routines/level1/xamax.hpp | 20 +- test/routines/level1/xasum.hpp | 20 +- test/routines/level1/xaxpy.hpp | 20 +- test/routines/level1/xcopy.hpp | 20 +- test/routines/level1/xdot.hpp | 26 +-- test/routines/level1/xdotc.hpp | 26 +-- test/routines/level1/xdotu.hpp | 26 +-- test/routines/level1/xnrm2.hpp | 20 +- test/routines/level1/xscal.hpp | 14 +- test/routines/level1/xswap.hpp | 22 +- test/routines/level2/xgbmv.hpp | 26 +-- test/routines/level2/xgemv.hpp | 26 +-- test/routines/level2/xger.hpp | 26 +-- test/routines/level2/xgerc.hpp | 26 +-- test/routines/level2/xgeru.hpp | 26 +-- test/routines/level2/xhbmv.hpp | 26 +-- test/routines/level2/xhemv.hpp | 26 +-- test/routines/level2/xher.hpp | 20 +- test/routines/level2/xher2.hpp | 26 +-- test/routines/level2/xhpmv.hpp | 26 +-- test/routines/level2/xhpr.hpp | 20 +- test/routines/level2/xhpr2.hpp | 26 +-- test/routines/level2/xsbmv.hpp | 26 +-- test/routines/level2/xspmv.hpp | 26 +-- test/routines/level2/xspr.hpp | 20 +- test/routines/level2/xspr2.hpp | 26 +-- test/routines/level2/xsymv.hpp | 26 +-- test/routines/level2/xsyr.hpp | 20 +- test/routines/level2/xsyr2.hpp | 26 +-- 
test/routines/level2/xtbmv.hpp | 20 +- test/routines/level2/xtpmv.hpp | 20 +- test/routines/level2/xtrmv.hpp | 20 +- test/routines/level2/xtrsv.hpp | 20 +- test/routines/level3/xgemm.hpp | 26 +-- test/routines/level3/xhemm.hpp | 26 +-- test/routines/level3/xher2k.hpp | 26 +-- test/routines/level3/xherk.hpp | 20 +- test/routines/level3/xsymm.hpp | 26 +-- test/routines/level3/xsyr2k.hpp | 26 +-- test/routines/level3/xsyrk.hpp | 20 +- test/routines/level3/xtrmm.hpp | 20 +- test/routines/level3/xtrsm.hpp | 20 +- test/routines/levelx/xaxpybatched.hpp | 66 +++--- test/routines/levelx/xinvert.hpp | 10 +- test/routines/levelx/xomatcopy.hpp | 14 +- 61 files changed, 810 insertions(+), 772 deletions(-) diff --git a/doc/clblast.md b/doc/clblast.md index c919169a..120c0c2c 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -2913,8 +2913,8 @@ C++ API: template StatusCode AxpyBatched(const size_t n, const T *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) ``` @@ -2923,32 +2923,32 @@ C API: ``` CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) CLBlastStatusCode 
CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) ``` @@ -2957,10 +2957,12 @@ Arguments to AXPYBATCHED: * `const size_t n`: Integer size argument. This value must be positive. * `const T *alphas`: Input scalar constants. -* `const cl_mem *x_buffers`: OpenCL buffers to store the input x vectors. -* `const size_t x_inc`: Stride/increment of the input x vectors. This value must be greater than 0. -* `cl_mem *y_buffers`: OpenCL buffers to store the output y vectors. -* `const size_t y_inc`: Stride/increment of the output y vectors. This value must be greater than 0. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t *x_offsets`: The offsets in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. 
+* `const size_t *y_offsets`: The offsets in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0. * `const size_t batch_count`: Number of batches. This value must be positive. * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. diff --git a/include/clblast.h b/include/clblast.h index f3f73893..a1f14471 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -614,8 +614,8 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, template StatusCode AxpyBatched(const size_t n, const T *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event = nullptr); diff --git a/include/clblast_c.h b/include/clblast_c.h index 5c84b5d7..4f21ba17 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1331,32 +1331,32 @@ CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n, const float *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n, const double *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem 
*y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event); diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 8807fd8e..59b2ed73 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -72,12 +72,12 @@ class Routine: for scalar in self.scalars: result.append("auto " + scalar + "s_cpp = std::vector();") for buffer_name in self.inputs + self.outputs: - result.append("auto " + buffer_name + "_buffers_cpp = std::vector>();") + result.append("auto " + buffer_name + "_offsets_cpp = std::vector();") result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {") for scalar in 
self.scalars: result.append(" " + scalar + "s_cpp.push_back(" + scalar + "s[batch]);") for buffer_name in self.inputs + self.outputs: - result.append(" " + buffer_name + "_buffers_cpp.push_back(Buffer(" + buffer_name + "_buffers[batch]));") + result.append(" " + buffer_name + "_offsets_cpp.push_back(" + buffer_name + "_offsets[batch]);") result.append("}") return result @@ -222,8 +222,8 @@ class Routine: def buffer(self, name): """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')""" if name in self.inputs or name in self.outputs: - a = [name + "_buffer" + self.b_s()] - b = [name + "_offset"] if not self.batched else [] + a = [name + "_buffer"] + b = [name + "_offset" + self.b_s()] c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -250,8 +250,8 @@ class Routine: """As above but with data-types""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: - a = [prefix + "cl_mem " + self.b_star() + name + "_buffer" + self.b_s()] - b = ["const size_t " + name + "_offset"] if not self.batched else [] + a = [prefix + "cl_mem " + name + "_buffer"] + b = ["const size_t " + self.b_star() + name + "_offset" + self.b_s()] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] return [] @@ -291,11 +291,8 @@ class Routine: """As above but with CLCudaAPI buffers""" if name in self.inputs or name in self.outputs: buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type - if self.batched: - a = [name + "_buffers_cpp"] - else: - a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"] - b = [name + "_offset"] if not self.batched else [] + a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"] + b = [name + "_offsets_cpp"] if self.batched else [name + "_offset"] c = [name + "_" + self.postfix(name)] if (name 
not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -336,8 +333,8 @@ class Routine: """As above, but only data-types""" prefix = "const " if (name in self.inputs) else "" if (name in self.inputs) or (name in self.outputs): - a = [prefix + "cl_mem" + self.b_star()] - b = ["const size_t"] if not self.batched else [] + a = [prefix + "cl_mem"] + b = ["const size_t" + self.b_star()] c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -347,12 +344,10 @@ class Routine: prefix = "const " if (name in self.inputs) else "" inout = "input" if (name in self.inputs) else "output" if (name in self.inputs) or (name in self.outputs): - math_name = name.upper() + " matrix" + self.b_s() if (name in self.buffers_matrix()) else name + " vector" + self.b_s() + math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector" inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " - a = ["`" + prefix + "cl_mem " + self.b_star() + name + "_buffer" + self.b_s() + "`: OpenCL buffer" + self.b_s() + " to store the " + inout + " " + math_name + "."] - b = [] - if not self.batched: - b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."] + a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] + b = ["`const size_t " + self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."] c = [] if name not in self.buffers_without_ld_inc(): c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " + diff --git a/src/clblast.cpp b/src/clblast.cpp index e9cac664..d3db8edf 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -2178,57 +2178,57 @@ template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, 
template StatusCode AxpyBatched(const size_t n, const T *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = XaxpyBatched(queue_cpp, event); auto alphas_cpp = std::vector(); - auto x_buffers_cpp = std::vector>(); - auto y_buffers_cpp = std::vector>(); + auto x_offsets_cpp = std::vector(); + auto y_offsets_cpp = std::vector(); for (auto batch = size_t{0}; batch < batch_count; ++batch) { alphas_cpp.push_back(alphas[batch]); - x_buffers_cpp.push_back(Buffer(x_buffers[batch])); - y_buffers_cpp.push_back(Buffer(y_buffers[batch])); + x_offsets_cpp.push_back(x_offsets[batch]); + y_offsets_cpp.push_back(y_offsets[batch]); } routine.DoAxpyBatched(n, alphas_cpp, - x_buffers_cpp, x_inc, - y_buffers_cpp, y_inc, + Buffer(x_buffer), x_offsets_cpp, x_inc, + Buffer(y_buffer), y_offsets_cpp, y_inc, batch_count); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API AxpyBatched(const size_t, const float*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched(const size_t, const double*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched(const size_t, const float2*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched(const size_t, const double2*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched(const size_t, const half*, - const cl_mem*, const size_t, - cl_mem*, const size_t, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, const size_t, cl_command_queue*, cl_event*); // ================================================================================================= diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index bd8ea51a..b09f8c54 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -3450,8 +3450,8 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran // AXPY CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, 
cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector(); @@ -3462,8 +3462,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, return static_cast( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); @@ -3471,8 +3471,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, } CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector(); @@ -3483,8 +3483,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, return static_cast( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); @@ -3492,8 +3492,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, } CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector(); @@ -3504,8 +3504,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, return static_cast( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); @@ -3513,8 +3513,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, } CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 
*alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector(); @@ -3525,8 +3525,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, return static_cast( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); @@ -3534,8 +3534,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, } CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, - const cl_mem *x_buffers, const size_t x_inc, - cl_mem *y_buffers, const size_t y_inc, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { auto alphas_cpp = std::vector(); @@ -3546,8 +3546,8 @@ CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, return static_cast( clblast::AxpyBatched(n, alphas_cpp.data(), - x_buffers, x_inc, - y_buffers, y_inc, + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, batch_count, queue, event) ); diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 41af28da..29f81cf8 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -600,9 +600,6 @@ class Buffer { // Copies from host to device: writing the device buffer a-synchronously void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { - if (access_ == BufferAccess::kReadOnly) { - throw LogicError("Buffer: writing to a read-only buffer"); - } if (GetSize() < (offset+size)*sizeof(T)) { throw LogicError("Buffer: target device buffer is too small"); } diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index ece8476e..0d730c9e 
100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -9,7 +9,7 @@ // // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't -// support vector data-types. +// support vector data-types. The general version has a batched implementation as well. // // This kernel uses the level-1 BLAS common tuning parameters. // @@ -36,8 +36,6 @@ void Xaxpy(const int n, const real_arg arg_alpha, } } -// ================================================================================================= - // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) @@ -57,6 +55,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha, // ================================================================================================= +// Full version of the kernel with offsets and strided accesses: batched version +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyBatched(const int n, const real_arg arg_alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc, + const int batch) { + const real alpha = GetRealArg(arg_alpha); + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id XaxpyBatched::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name): - Xaxpy(queue, event, name) { + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue(), {}, { + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xaxpy.opencl" + }) { } // ================================================================================================= @@ -30,19 +33,55 @@ 
XaxpyBatched::XaxpyBatched(Queue &queue, EventPointer event, const std::strin // The main routine template void XaxpyBatched::DoAxpyBatched(const size_t n, const std::vector &alphas, - const std::vector> &x_buffers, const size_t x_inc, - const std::vector> &y_buffers, const size_t y_inc, + const Buffer &x_buffer, const std::vector &x_offsets, const size_t x_inc, + const Buffer &y_buffer, const std::vector &y_offsets, const size_t y_inc, const size_t batch_count) { - if (batch_count < 1) { throw BLASError(StatusCode::kInvalidBatchCount); } - if (alphas.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); } - if (x_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); } - if (y_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); } + + // Tests for a valid batch count + if ((batch_count < 1) || (alphas.size() != batch_count) || + (x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) { + throw BLASError(StatusCode::kInvalidBatchCount); + } + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // Tests the vectors for validity + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + TestVectorX(n, x_buffer, x_offsets[batch], x_inc); + TestVectorY(n, y_buffer, y_offsets[batch], y_inc); + } + + // Upload the arguments to the device + std::vector x_offsets_int(x_offsets.begin(), x_offsets.end()); + std::vector y_offsets_int(y_offsets.begin(), y_offsets.end()); + auto x_offsets_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + auto y_offsets_device = Buffer(context_, BufferAccess::kReadOnly, batch_count); + x_offsets_device.Write(queue_, batch_count, x_offsets_int); + y_offsets_device.Write(queue_, batch_count, y_offsets_int); + + // Retrieves the Xaxpy kernel from the compiled binary + auto kernel = Kernel(program_, "XaxpyBatched"); // Naive implementation: calls regular Axpy multiple 
times for (auto batch = size_t{0}; batch < batch_count; ++batch) { - DoAxpy(n, alphas[batch], - x_buffers[batch], 0, x_inc, - y_buffers[batch], 0, y_inc); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, GetRealArg(alphas[batch])); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, static_cast(x_offsets[batch])); + kernel.SetArgument(4, static_cast(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, static_cast(y_offsets[batch])); + kernel.SetArgument(7, static_cast(y_inc)); + kernel.SetArgument(8, static_cast(batch)); + + // Launches the kernel + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector{n_ceiled/db_["WPT"]}; + auto local = std::vector{db_["WGS"]}; + RunKernel(kernel, queue_, device_, global, local, event_); } } diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp index 7fd14a74..513792ea 100644 --- a/src/routines/levelx/xaxpybatched.hpp +++ b/src/routines/levelx/xaxpybatched.hpp @@ -16,26 +16,23 @@ #include -#include "routines/level1/xaxpy.hpp" +#include "routine.hpp" namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class template -class XaxpyBatched: public Xaxpy { +class XaxpyBatched: public Routine { public: - // Uses the regular Xaxpy routine - using Xaxpy::DoAxpy; - // Constructor XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED"); // Templated-precision implementation of the routine void DoAxpyBatched(const size_t n, const std::vector &alphas, - const std::vector> &x_buffers, const size_t x_inc, - const std::vector> &y_buffers, const size_t y_inc, + const Buffer &x_buffer, const std::vector &x_offsets, const size_t x_inc, + const Buffer &y_buffer, const std::vector &y_offsets, const size_t y_inc, const size_t batch_count); }; diff --git 
a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 851fa251..d271ffee 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -157,7 +157,13 @@ struct Arguments { size_t imax_offset = 0; T alpha = ConstantOne(); T beta = ConstantOne(); + // Batch-specific arguments size_t batch_count = 1; + std::vector x_offsets = {0}; + std::vector y_offsets = {0}; + std::vector a_offsets = {0}; + std::vector b_offsets = {0}; + std::vector c_offsets = {0}; // Sizes size_t x_size = 1; size_t y_size = 1; diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp index c6c70d9f..e6eebef7 100644 --- a/test/correctness/misc/override_parameters.cpp +++ b/test/correctness/misc/override_parameters.cpp @@ -88,7 +88,7 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st device_b.Write(queue, host_b.size(), host_b); device_c.Write(queue, host_c.size(), host_c); auto dummy = Buffer(context, 1); - auto buffers = std::vector>{Buffers{dummy, dummy, device_a, device_b, device_c, dummy, dummy}}; + auto buffers = Buffers{dummy, dummy, device_a, device_b, device_c, dummy, dummy}; // Loops over the valid combinations: run before and run afterwards fprintf(stdout, "* Testing OverrideParameters for '%s'\n", routine_name.c_str()); diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index fcb2eceb..56376d0b 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -126,24 +126,21 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st ap_source_, scalar_source_); // Set-up for the CLBlast run - auto buffers2 = std::vector>(); - for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { - auto x_vec2 = Buffer(context_, args.x_size); - auto y_vec2 = Buffer(context_, args.y_size); - auto a_mat2 = Buffer(context_, args.a_size); - auto b_mat2 = Buffer(context_, args.b_size); - auto c_mat2 = Buffer(context_, args.c_size); - auto 
ap_mat2 = Buffer(context_, args.ap_size); - auto scalar2 = Buffer(context_, args.scalar_size); - x_vec2.Write(queue_, args.x_size, &x_source_[batch * args.x_size]); - y_vec2.Write(queue_, args.y_size, &y_source_[batch * args.y_size]); - a_mat2.Write(queue_, args.a_size, &a_source_[batch * args.a_size]); - b_mat2.Write(queue_, args.b_size, &b_source_[batch * args.b_size]); - c_mat2.Write(queue_, args.c_size, &c_source_[batch * args.c_size]); - ap_mat2.Write(queue_, args.ap_size, &ap_source_[batch * args.ap_size]); - scalar2.Write(queue_, args.scalar_size, &scalar_source_[batch * args.scalar_size]); - buffers2.push_back(Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}); - } + auto x_vec2 = Buffer(context_, args.x_size); + auto y_vec2 = Buffer(context_, args.y_size); + auto a_mat2 = Buffer(context_, args.a_size); + auto b_mat2 = Buffer(context_, args.b_size); + auto c_mat2 = Buffer(context_, args.c_size); + auto ap_mat2 = Buffer(context_, args.ap_size); + auto scalar2 = Buffer(context_, args.scalar_size); + x_vec2.Write(queue_, args.x_size, x_source_); + y_vec2.Write(queue_, args.y_size, y_source_); + a_mat2.Write(queue_, args.a_size, a_source_); + b_mat2.Write(queue_, args.b_size, b_source_); + c_mat2.Write(queue_, args.c_size, c_source_); + ap_mat2.Write(queue_, args.ap_size, ap_source_); + scalar2.Write(queue_, args.scalar_size, scalar_source_); + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; // Runs CLBlast if (verbose_) { @@ -163,24 +160,21 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st } // Set-up for the reference run - auto buffers1 = std::vector>(); - for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { - auto x_vec1 = Buffer(context_, args.x_size); - auto y_vec1 = Buffer(context_, args.y_size); - auto a_mat1 = Buffer(context_, args.a_size); - auto b_mat1 = Buffer(context_, args.b_size); - auto c_mat1 = Buffer(context_, args.c_size); - auto ap_mat1 = Buffer(context_, 
args.ap_size); - auto scalar1 = Buffer(context_, args.scalar_size); - x_vec1.Write(queue_, args.x_size, &x_source_[batch * args.x_size]); - y_vec1.Write(queue_, args.y_size, &y_source_[batch * args.y_size]); - a_mat1.Write(queue_, args.a_size, &a_source_[batch * args.a_size]); - b_mat1.Write(queue_, args.b_size, &b_source_[batch * args.b_size]); - c_mat1.Write(queue_, args.c_size, &c_source_[batch * args.c_size]); - ap_mat1.Write(queue_, args.ap_size, &ap_source_[batch * args.ap_size]); - scalar1.Write(queue_, args.scalar_size, &scalar_source_[batch * args.scalar_size]); - buffers1.push_back(Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}); - } + auto x_vec1 = Buffer(context_, args.x_size); + auto y_vec1 = Buffer(context_, args.y_size); + auto a_mat1 = Buffer(context_, args.a_size); + auto b_mat1 = Buffer(context_, args.b_size); + auto c_mat1 = Buffer(context_, args.c_size); + auto ap_mat1 = Buffer(context_, args.ap_size); + auto scalar1 = Buffer(context_, args.scalar_size); + x_vec1.Write(queue_, args.x_size, x_source_); + y_vec1.Write(queue_, args.y_size, y_source_); + a_mat1.Write(queue_, args.a_size, a_source_); + b_mat1.Write(queue_, args.b_size, b_source_); + c_mat1.Write(queue_, args.c_size, c_source_); + ap_mat1.Write(queue_, args.ap_size, ap_source_); + scalar1.Write(queue_, args.scalar_size, scalar_source_); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; // Runs the reference code if (verbose_) { @@ -197,47 +191,40 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st continue; } - // Error checking for each batch - auto errors = size_t{0}; + // Downloads the results + auto result1 = get_result_(args, buffers1, queue_); + auto result2 = get_result_(args, buffers2, queue_); + + // Computes the L2 error auto l2error = 0.0; - for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { - - // Downloads the results - auto result1 = get_result_(args, buffers1[batch], queue_); - auto 
result2 = get_result_(args, buffers2[batch], queue_); - - // Computes the L2 error - auto l2error_batch = 0.0; - const auto kErrorMarginL2 = getL2ErrorMargin(); - for (auto id1=size_t{0}; id1(); + for (auto id1=size_t{0}; id1(get_id1_(args) * get_id2_(args)); - l2error += l2error_batch; + } + l2error /= static_cast(get_id1_(args) * get_id2_(args)); - // Checks for differences in the output - for (auto id1=size_t{0}; id1= kErrorMarginL2) { errors++; } - if (verbose_) { - if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); } - else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); } - fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str()); - fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str()); - if (l2error_batch < kErrorMarginL2) { - fprintf(stdout, " - error suppressed by a low total L2 error\n"); - } + // Checks for differences in the output + auto errors = size_t{0}; + for (auto id1=size_t{0}; id1= kErrorMarginL2) { errors++; } + if (verbose_) { + if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); } + else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); } + fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str()); + fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str()); + if (l2error < kErrorMarginL2) { + fprintf(stdout, " - error suppressed by a low total L2 error\n"); } } } } } - l2error /= static_cast(args.batch_count); // Report the results if (verbose_ && errors > 0) { @@ -245,7 +232,7 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st } // Tests the error count (should be zero) - TestErrorCount(errors, get_id1_(args)*get_id2_(args)*args.batch_count, args); + TestErrorCount(errors, get_id1_(args)*get_id2_(args), args); } TestEnd(); } @@ -272,40 +259,36 @@ void TestBlas::TestInvalid(std::vector> &test_vector, const st // Creates the OpenCL buffers. 
Note: we are not using the C++ version since we explicitly // want to be able to create invalid buffers (no error checking here). - auto buffers1 = std::vector>(); - auto buffers2 = std::vector>(); - for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { - auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec1 = Buffer(x1); - auto y_vec1 = Buffer(y1); - auto a_mat1 = Buffer(a1); - auto b_mat1 = Buffer(b1); - auto c_mat1 = Buffer(c1); - auto ap_mat1 = Buffer(ap1); - auto scalar1 = Buffer(d1); - auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); - auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); - auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); - auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); - auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); - auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); - auto x_vec2 = Buffer(x2); - auto y_vec2 = Buffer(y2); - auto a_mat2 = Buffer(a2); - auto b_mat2 = Buffer(b2); - auto c_mat2 = 
Buffer(c2); - auto ap_mat2 = Buffer(ap2); - auto scalar2 = Buffer(d2); - buffers1.push_back(Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}); - buffers2.push_back(Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}); - } + auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); + auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); + auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); + auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); + auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); + auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); + auto x_vec1 = Buffer(x1); + auto y_vec1 = Buffer(y1); + auto a_mat1 = Buffer(a1); + auto b_mat1 = Buffer(b1); + auto c_mat1 = Buffer(c1); + auto ap_mat1 = Buffer(ap1); + auto scalar1 = Buffer(d1); + auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); + auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); + auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); + auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); + auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); + auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); + auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); + auto x_vec2 = Buffer(x2); + auto y_vec2 = Buffer(y2); + auto a_mat2 = Buffer(a2); + auto b_mat2 = Buffer(b2); + auto c_mat2 = Buffer(c2); + auto 
ap_mat2 = Buffer(ap2); + auto scalar2 = Buffer(d2); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; // Runs CLBlast if (verbose_) { diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp index e675fa9b..42e8aef7 100644 --- a/test/correctness/testblas.hpp +++ b/test/correctness/testblas.hpp @@ -79,7 +79,7 @@ class TestBlas: public Tester { std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&, std::vector&)>; - using Routine = std::function&, std::vector>&, Queue&)>; + using Routine = std::function&, Buffers&, Queue&)>; using ResultGet = std::function(const Arguments&, Buffers&, Queue&)>; using ResultIndex = std::function&, const size_t, const size_t)>; using ResultIterator = std::function&)>; diff --git a/test/performance/client.cpp b/test/performance/client.cpp index 2b4cdb9b..bd48b047 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -177,13 +177,13 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) set_sizes(args); // Populates input host matrices with random data - std::vector x_source(args.batch_count * args.x_size); - std::vector y_source(args.batch_count * args.y_size); - std::vector a_source(args.batch_count * args.a_size); - std::vector b_source(args.batch_count * args.b_size); - std::vector c_source(args.batch_count * args.c_size); - std::vector ap_source(args.batch_count * args.ap_size); - std::vector scalar_source(args.batch_count * args.scalar_size); + std::vector x_source(args.x_size); + std::vector y_source(args.y_size); + std::vector a_source(args.a_size); + std::vector b_source(args.b_size); + std::vector c_source(args.c_size); + std::vector ap_source(args.ap_size); + std::vector scalar_source(args.scalar_size); std::mt19937 mt(kSeed); std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); 
PopulateVector(x_source, mt, dist); @@ -195,24 +195,21 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) PopulateVector(scalar_source, mt, dist); // Creates the matrices on the device - auto buffers = std::vector>(); - for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { - auto x_vec = Buffer(context, args.x_size); - auto y_vec = Buffer(context, args.y_size); - auto a_mat = Buffer(context, args.a_size); - auto b_mat = Buffer(context, args.b_size); - auto c_mat = Buffer(context, args.c_size); - auto ap_mat = Buffer(context, args.ap_size); - auto scalar = Buffer(context, args.scalar_size); - x_vec.Write(queue, args.x_size, &x_source[batch * args.x_size]); - y_vec.Write(queue, args.y_size, &y_source[batch * args.y_size]); - a_mat.Write(queue, args.a_size, &a_source[batch * args.a_size]); - b_mat.Write(queue, args.b_size, &b_source[batch * args.b_size]); - c_mat.Write(queue, args.c_size, &c_source[batch * args.c_size]); - ap_mat.Write(queue, args.ap_size, &ap_source[batch * args.ap_size]); - scalar.Write(queue, args.scalar_size, &scalar_source[batch * args.scalar_size]); - buffers.push_back(Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar}); - } + auto x_vec = Buffer(context, args.x_size); + auto y_vec = Buffer(context, args.y_size); + auto a_mat = Buffer(context, args.a_size); + auto b_mat = Buffer(context, args.b_size); + auto c_mat = Buffer(context, args.c_size); + auto ap_mat = Buffer(context, args.ap_size); + auto scalar = Buffer(context, args.scalar_size); + x_vec.Write(queue, args.x_size, x_source); + y_vec.Write(queue, args.y_size, y_source); + a_mat.Write(queue, args.a_size, a_source); + b_mat.Write(queue, args.b_size, b_source); + c_mat.Write(queue, args.c_size, c_source); + ap_mat.Write(queue, args.ap_size, ap_source); + scalar.Write(queue, args.scalar_size, scalar_source); + auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar}; // Runs the routines and collects the timings auto timings = 
std::vector>(); @@ -254,7 +251,7 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) // value found in the vector of timing results. The return value is in milliseconds. template double Client::TimedExecution(const size_t num_runs, const Arguments &args, - std::vector> &buffers, Queue &queue, + Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name) { auto status = StatusCode::kSuccess; @@ -373,8 +370,8 @@ void Client::PrintTableRow(const Arguments& args, for (const auto& timing : timings) { // Computes the GFLOPS and GB/s metrics - auto flops = get_flops_(args) * args.batch_count; - auto bytes = get_bytes_(args) * args.batch_count; + auto flops = get_flops_(args); + auto bytes = get_bytes_(args); auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0; auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0; diff --git a/test/performance/client.hpp b/test/performance/client.hpp index a8e31419..4b3e17c7 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -43,7 +43,7 @@ class Client { static constexpr auto kSeed = 42; // fixed seed for reproducibility // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function&, std::vector>&, Queue&)>; + using Routine = std::function&, Buffers&, Queue&)>; using SetMetric = std::function&)>; using GetMetric = std::function&)>; @@ -66,7 +66,7 @@ class Client { private: // Runs a function a given number of times and returns the execution time of the shortest instance - double TimedExecution(const size_t num_runs, const Arguments &args, std::vector> &buffers, + double TimedExecution(const size_t num_runs, const Arguments &args, Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name); // Prints the header of a performance-data table diff --git a/test/routines/level1/xamax.hpp b/test/routines/level1/xamax.hpp index faffff33..a22f681f 100644 --- 
a/test/routines/level1/xamax.hpp +++ b/test/routines/level1/xamax.hpp @@ -74,12 +74,12 @@ class TestXamax { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Amax(args.n, - buffers[0].scalar(), args.imax_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXamax { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXamax(args.n, - buffers[0].scalar, args.imax_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, + buffers.scalar, args.imax_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -101,15 +101,15 @@ class TestXamax { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector scalar_cpu(args.scalar_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers[0].x_vec.Read(queue, 
args.x_size, x_vec_cpu); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXamax(args.n, scalar_cpu, args.imax_offset, x_vec_cpu, args.x_offset, args.x_inc); - buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xasum.hpp b/test/routines/level1/xasum.hpp index fb2c9f1a..64377189 100644 --- a/test/routines/level1/xasum.hpp +++ b/test/routines/level1/xasum.hpp @@ -74,12 +74,12 @@ class TestXasum { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Asum(args.n, - buffers[0].scalar(), args.asum_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXasum { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXasum(args.n, - buffers[0].scalar, args.asum_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, + buffers.scalar, args.asum_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -101,15 +101,15 @@ class TestXasum { // 
Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector scalar_cpu(args.scalar_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXasum(args.n, scalar_cpu, args.asum_offset, x_vec_cpu, args.x_offset, args.x_inc); - buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xaxpy.hpp b/test/routines/level1/xaxpy.hpp index 1c74f67f..eba067c0 100644 --- a/test/routines/level1/xaxpy.hpp +++ b/test/routines/level1/xaxpy.hpp @@ -75,12 +75,12 @@ class TestXaxpy { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Axpy(args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -88,12 +88,12 @@ class TestXaxpy { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments 
&args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXaxpy(args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -102,15 +102,15 @@ class TestXaxpy { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXaxpy(args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xcopy.hpp b/test/routines/level1/xcopy.hpp index 55980f30..753f0da5 100644 --- a/test/routines/level1/xcopy.hpp +++ b/test/routines/level1/xcopy.hpp @@ -74,12 +74,12 @@ class TestXcopy { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = 
Copy(args.n, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXcopy { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXcopy(args.n, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -101,15 +101,15 @@ class TestXcopy { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXcopy(args.n, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdot.hpp 
b/test/routines/level1/xdot.hpp index 1ea69c17..8127247d 100644 --- a/test/routines/level1/xdot.hpp +++ b/test/routines/level1/xdot.hpp @@ -78,13 +78,13 @@ class TestXdot { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dot(args.n, - buffers[0].scalar(), args.dot_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -92,13 +92,13 @@ class TestXdot { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdot(args.n, - buffers[0].scalar, args.dot_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -107,18 +107,18 @@ class TestXdot { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, 
Buffers &buffers, Queue &queue) { std::vector scalar_cpu(args.scalar_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXdot(args.n, scalar_cpu, args.dot_offset, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdotc.hpp b/test/routines/level1/xdotc.hpp index 00dcf7c2..96d97dc4 100644 --- a/test/routines/level1/xdotc.hpp +++ b/test/routines/level1/xdotc.hpp @@ -78,13 +78,13 @@ class TestXdotc { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotc(args.n, - buffers[0].scalar(), args.dot_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -92,13 +92,13 @@ class TestXdotc { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments 
&args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotc(args.n, - buffers[0].scalar, args.dot_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -107,18 +107,18 @@ class TestXdotc { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector scalar_cpu(args.scalar_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXdotc(args.n, scalar_cpu, args.dot_offset, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xdotu.hpp b/test/routines/level1/xdotu.hpp index 512de985..70c7fceb 100644 --- a/test/routines/level1/xdotu.hpp +++ b/test/routines/level1/xdotu.hpp @@ -78,13 +78,13 @@ class TestXdotu { std::vector&, std::vector&) {} // N/A for this routine // Describes how to 
run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotu(args.n, - buffers[0].scalar(), args.dot_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -92,13 +92,13 @@ class TestXdotu { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotu(args.n, - buffers[0].scalar, args.dot_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -107,18 +107,18 @@ class TestXdotu { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector scalar_cpu(args.scalar_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].scalar.Read(queue, 
args.scalar_size, scalar_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXdotu(args.n, scalar_cpu, args.dot_offset, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xnrm2.hpp b/test/routines/level1/xnrm2.hpp index 20f75226..ce33fe59 100644 --- a/test/routines/level1/xnrm2.hpp +++ b/test/routines/level1/xnrm2.hpp @@ -74,12 +74,12 @@ class TestXnrm2 { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Nrm2(args.n, - buffers[0].scalar(), args.nrm2_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXnrm2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXnrm2(args.n, - buffers[0].scalar, args.nrm2_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, + 
buffers.scalar, args.nrm2_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -101,15 +101,15 @@ class TestXnrm2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector scalar_cpu(args.scalar_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXnrm2(args.n, scalar_cpu, args.nrm2_offset, x_vec_cpu, args.x_offset, args.x_inc); - buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xscal.hpp b/test/routines/level1/xscal.hpp index e2600834..d89688b4 100644 --- a/test/routines/level1/xscal.hpp +++ b/test/routines/level1/xscal.hpp @@ -71,11 +71,11 @@ class TestXscal { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Scal(args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -83,11 +83,11 @@ class TestXscal { // Describes how to run the clBLAS 
routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXscal(args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -96,12 +96,12 @@ class TestXscal { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXscal(args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc); - buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level1/xswap.hpp b/test/routines/level1/xswap.hpp index b9f06eb7..49b0d3d0 100644 --- a/test/routines/level1/xswap.hpp +++ b/test/routines/level1/xswap.hpp @@ -74,12 +74,12 @@ class TestXswap { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Swap(args.n, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.x_vec(), args.x_offset, args.x_inc, + 
buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -87,12 +87,12 @@ class TestXswap { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXswap(args.n, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -101,16 +101,16 @@ class TestXswap { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXswap(args.n, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgbmv.hpp b/test/routines/level2/xgbmv.hpp index 57c16104..f371b9a7 100644 --- 
a/test/routines/level2/xgbmv.hpp +++ b/test/routines/level2/xgbmv.hpp @@ -86,14 +86,14 @@ class TestXgbmv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gbmv(args.layout, args.a_transpose, args.m, args.n, args.kl, args.ku, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -101,15 +101,15 @@ class TestXgbmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -118,20 +118,20 @@ class TestXgbmv { // Describes how to run the CPU BLAS routine (for correctness/performance 
comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXgbmv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgemv.hpp b/test/routines/level2/xgemv.hpp index 3c56c405..2442be4c 100644 --- a/test/routines/level2/xgemv.hpp +++ b/test/routines/level2/xgemv.hpp @@ -86,14 +86,14 @@ class TestXgemv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + 
buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -101,15 +101,15 @@ class TestXgemv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -118,20 +118,20 @@ class TestXgemv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXgemv(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), args.m, args.n, args.alpha, 
a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xger.hpp b/test/routines/level2/xger.hpp index f9a6fefd..3e7ccbc3 100644 --- a/test/routines/level2/xger.hpp +++ b/test/routines/level2/xger.hpp @@ -82,14 +82,14 @@ class TestXger { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Ger(args.layout, args.m, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, - buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -97,14 +97,14 @@ class TestXger { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXger(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, - buffers[0].a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + 
buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -113,19 +113,19 @@ class TestXger { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXger(convertToCBLAS(args.layout), args.m, args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgerc.hpp b/test/routines/level2/xgerc.hpp index ddc9030a..d880ae1f 100644 --- a/test/routines/level2/xgerc.hpp +++ b/test/routines/level2/xgerc.hpp @@ -82,14 +82,14 @@ class TestXgerc { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gerc(args.layout, args.m, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, - 
buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -97,14 +97,14 @@ class TestXgerc { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgerc(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, - buffers[0].a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -113,19 +113,19 @@ class TestXgerc { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); 
cblasXgerc(convertToCBLAS(args.layout), args.m, args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xgeru.hpp b/test/routines/level2/xgeru.hpp index 8d5b8589..1735e42a 100644 --- a/test/routines/level2/xgeru.hpp +++ b/test/routines/level2/xgeru.hpp @@ -82,14 +82,14 @@ class TestXgeru { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Geru(args.layout, args.m, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, - buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -97,14 +97,14 @@ class TestXgeru { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgeru(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, - buffers[0].a_mat, args.a_offset, args.a_ld, + buffers.x_vec, 
args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -113,19 +113,19 @@ class TestXgeru { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXgeru(convertToCBLAS(args.layout), args.m, args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhbmv.hpp b/test/routines/level2/xhbmv.hpp index 50130359..99538bf1 100644 --- a/test/routines/level2/xhbmv.hpp +++ b/test/routines/level2/xhbmv.hpp @@ -80,14 +80,14 @@ class TestXhbmv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hbmv(args.layout, args.triangle, args.n, args.kl, args.alpha, - 
buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXhbmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXhbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + 
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXhbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhemv.hpp b/test/routines/level2/xhemv.hpp index f69b031c..3792cb66 100644 --- a/test/routines/level2/xhemv.hpp +++ b/test/routines/level2/xhemv.hpp @@ -80,14 +80,14 @@ class TestXhemv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemv(args.layout, args.triangle, args.n, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXhemv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = 
clblasXhemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXhemv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXhemv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xher.hpp b/test/routines/level2/xher.hpp index c3d809bf..c58eb189 100644 --- a/test/routines/level2/xher.hpp +++ b/test/routines/level2/xher.hpp @@ -76,13 +76,13 @@ class TestXher { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static 
StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her(args.layout, args.triangle, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -90,14 +90,14 @@ class TestXher { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXher(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -106,17 +106,17 @@ class TestXher { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + 
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXher(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xher2.hpp b/test/routines/level2/xher2.hpp index 7ddf9ed1..8a7eb0b6 100644 --- a/test/routines/level2/xher2.hpp +++ b/test/routines/level2/xher2.hpp @@ -80,14 +80,14 @@ class TestXher2 { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her2(args.layout, args.triangle, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, - buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXher2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXher2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, - 
buffers[0].a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXher2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXher2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpmv.hpp b/test/routines/level2/xhpmv.hpp index 7fae80b8..0862b619 100644 --- a/test/routines/level2/xhpmv.hpp +++ b/test/routines/level2/xhpmv.hpp @@ -80,14 +80,14 @@ class TestXhpmv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; 
auto status = Hpmv(args.layout, args.triangle, args.n, args.alpha, - buffers[0].ap_mat(), args.ap_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXhpmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].ap_mat, args.ap_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXhpmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector ap_mat_cpu(args.ap_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, 
args.y_size, y_vec_cpu); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXhpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, ap_mat_cpu, args.ap_offset, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp index a46cb8e6..5b454174 100644 --- a/test/routines/level2/xhpr.hpp +++ b/test/routines/level2/xhpr.hpp @@ -76,13 +76,13 @@ class TestXhpr { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr(args.layout, args.triangle, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -90,14 +90,14 @@ class TestXhpr { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - 
buffers[0].ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -106,17 +106,17 @@ class TestXhpr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector ap_mat_cpu(args.ap_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXhpr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, ap_mat_cpu, args.ap_offset); - buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp index 08f12768..b770da2e 100644 --- a/test/routines/level2/xhpr2.hpp +++ b/test/routines/level2/xhpr2.hpp @@ -80,14 +80,14 @@ class TestXhpr2 { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr2(args.layout, args.triangle, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, - buffers[0].ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 
buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXhpr2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhpr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, - buffers[0].ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXhpr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector ap_mat_cpu(args.ap_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXhpr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, 
args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, ap_mat_cpu, args.ap_offset); - buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsbmv.hpp b/test/routines/level2/xsbmv.hpp index a45dbe8f..7a836170 100644 --- a/test/routines/level2/xsbmv.hpp +++ b/test/routines/level2/xsbmv.hpp @@ -80,14 +80,14 @@ class TestXsbmv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Sbmv(args.layout, args.triangle, args.n, args.kl, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXsbmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + 
buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXsbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXsbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.kl, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspmv.hpp b/test/routines/level2/xspmv.hpp index a455f652..352c8cfd 100644 --- a/test/routines/level2/xspmv.hpp +++ b/test/routines/level2/xspmv.hpp @@ -80,14 +80,14 @@ class TestXspmv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spmv(args.layout, args.triangle, args.n, 
args.alpha, - buffers[0].ap_mat(), args.ap_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXspmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].ap_mat, args.ap_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXspmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector ap_mat_cpu(args.ap_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.ap_mat.Read(queue, 
args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXspmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, ap_mat_cpu, args.ap_offset, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp index ab9ab85f..988bcdc2 100644 --- a/test/routines/level2/xspr.hpp +++ b/test/routines/level2/xspr.hpp @@ -76,13 +76,13 @@ class TestXspr { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr(args.layout, args.triangle, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -90,14 +90,14 @@ class TestXspr { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].ap_mat, args.ap_offset, + buffers.x_vec, 
args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -106,17 +106,17 @@ class TestXspr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector ap_mat_cpu(args.ap_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXspr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, ap_mat_cpu, args.ap_offset); - buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp index a73975a5..ee517bc1 100644 --- a/test/routines/level2/xspr2.hpp +++ b/test/routines/level2/xspr2.hpp @@ -80,14 +80,14 @@ class TestXspr2 { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr2(args.layout, args.triangle, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, - buffers[0].ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 
buffers.ap_mat(), args.ap_offset, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXspr2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXspr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, - buffers[0].ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXspr2 { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector ap_mat_cpu(args.ap_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXspr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, 
args.y_inc, ap_mat_cpu, args.ap_offset); - buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsymv.hpp b/test/routines/level2/xsymv.hpp index c93492ed..5eecfb74 100644 --- a/test/routines/level2/xsymv.hpp +++ b/test/routines/level2/xsymv.hpp @@ -80,14 +80,14 @@ class TestXsymv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symv(args.layout, args.triangle, args.n, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXsymv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsymv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, args.beta, - buffers[0].y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + 
buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXsymv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXsymv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc, args.beta, y_vec_cpu, args.y_offset, args.y_inc); - buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xsyr.hpp b/test/routines/level2/xsyr.hpp index ac2c5e98..ac4ee1ff 100644 --- a/test/routines/level2/xsyr.hpp +++ b/test/routines/level2/xsyr.hpp @@ -76,13 +76,13 @@ class TestXsyr { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr(args.layout, args.triangle, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - 
buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -90,14 +90,14 @@ class TestXsyr { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -106,17 +106,17 @@ class TestXsyr { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXsyr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; 
} #endif diff --git a/test/routines/level2/xsyr2.hpp b/test/routines/level2/xsyr2.hpp index 9f8d315b..43644883 100644 --- a/test/routines/level2/xsyr2.hpp +++ b/test/routines/level2/xsyr2.hpp @@ -80,14 +80,14 @@ class TestXsyr2 { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2(args.layout, args.triangle, args.n, args.alpha, - buffers[0].x_vec(), args.x_offset, args.x_inc, - buffers[0].y_vec(), args.y_offset, args.y_inc, - buffers[0].a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -95,15 +95,15 @@ class TestXsyr2 { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers[0].x_vec, args.x_offset, args.x_inc, - buffers[0].y_vec, args.y_offset, args.y_inc, - buffers[0].a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,20 +112,20 @@ class TestXsyr2 { // Describes how to run the CPU BLAS routine (for 
correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXsyr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), args.n, args.alpha, x_vec_cpu, args.x_offset, args.x_inc, y_vec_cpu, args.y_offset, args.y_inc, a_mat_cpu, args.a_offset, args.a_ld); - buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtbmv.hpp b/test/routines/level2/xtbmv.hpp index 2d964fda..ab9244af 100644 --- a/test/routines/level2/xtbmv.hpp +++ b/test/routines/level2/xtbmv.hpp @@ -75,13 +75,13 @@ class TestXtbmv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tbmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, args.kl, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { 
clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -89,7 +89,7 @@ class TestXtbmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtbmv(convertToCLBLAS(args.layout), @@ -97,8 +97,8 @@ class TestXtbmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, args.kl, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -107,11 +107,11 @@ class TestXtbmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXtbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -119,7 +119,7 @@ class TestXtbmv { args.n, args.kl, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc); - buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtpmv.hpp 
b/test/routines/level2/xtpmv.hpp index fcfd86bf..3821e1a4 100644 --- a/test/routines/level2/xtpmv.hpp +++ b/test/routines/level2/xtpmv.hpp @@ -75,13 +75,13 @@ class TestXtpmv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tpmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, - buffers[0].ap_mat(), args.ap_offset, - buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -89,7 +89,7 @@ class TestXtpmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtpmv(convertToCLBLAS(args.layout), @@ -97,8 +97,8 @@ class TestXtpmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers[0].ap_mat, args.ap_offset, - buffers[0].x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -107,11 +107,11 @@ class TestXtpmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const 
Arguments &args, Buffers &buffers, Queue &queue) { std::vector ap_mat_cpu(args.ap_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXtpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -119,7 +119,7 @@ class TestXtpmv { args.n, ap_mat_cpu, args.ap_offset, x_vec_cpu, args.x_offset, args.x_inc); - buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtrmv.hpp b/test/routines/level2/xtrmv.hpp index 4e209584..7211c757 100644 --- a/test/routines/level2/xtrmv.hpp +++ b/test/routines/level2/xtrmv.hpp @@ -75,13 +75,13 @@ class TestXtrmv { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -89,7 +89,7 @@ class TestXtrmv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments 
&args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrmv(convertToCLBLAS(args.layout), @@ -97,8 +97,8 @@ class TestXtrmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -107,11 +107,11 @@ class TestXtrmv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); cblasXtrmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -119,7 +119,7 @@ class TestXtrmv { args.n, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc); - buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp index 090684b1..78b9672f 100644 --- a/test/routines/level2/xtrsv.hpp +++ b/test/routines/level2/xtrsv.hpp @@ -90,13 +90,13 @@ class TestXtrsv { } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers 
&buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trsv(args.layout, args.triangle, args.a_transpose, args.diagonal, args.n, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -104,7 +104,7 @@ class TestXtrsv { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrsv(convertToCLBLAS(args.layout), @@ -112,8 +112,8 @@ class TestXtrsv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -122,11 +122,11 @@ class TestXtrsv { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector x_vec_cpu(args.x_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); 
cblasXtrsv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -134,7 +134,7 @@ class TestXtrsv { args.n, a_mat_cpu, args.a_offset, args.a_ld, x_vec_cpu, args.x_offset, args.x_inc); - buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index 5b220889..1b12fb1c 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -88,14 +88,14 @@ class TestXgemm { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, args.m, args.n, args.k, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat(), args.c_offset, args.c_ld, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -103,16 +103,16 @@ class TestXgemm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXgemm(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.b_transpose), args.m, args.n, 
args.k, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat, args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -121,13 +121,13 @@ class TestXgemm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector b_mat_cpu(args.b_size, static_cast(0)); std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXgemm(convertToCBLAS(args.layout), convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose), @@ -135,7 +135,7 @@ class TestXgemm { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xhemm.hpp b/test/routines/level3/xhemm.hpp index e6e8724f..76550b15 100644 --- a/test/routines/level3/xhemm.hpp +++ b/test/routines/level3/xhemm.hpp @@ -88,14 +88,14 @@ class TestXhemm { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const 
Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemm(args.layout, args.side, args.triangle, args.m, args.n, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat(), args.c_offset, args.c_ld, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -103,16 +103,16 @@ class TestXhemm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXhemm(convertToCLBLAS(args.layout), convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat, args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -121,13 +121,13 @@ class TestXhemm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector 
a_mat_cpu(args.a_size, static_cast(0)); std::vector b_mat_cpu(args.b_size, static_cast(0)); std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXhemm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), @@ -135,7 +135,7 @@ class TestXhemm { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xher2k.hpp b/test/routines/level3/xher2k.hpp index 749eca11..5ca3aac6 100644 --- a/test/routines/level3/xher2k.hpp +++ b/test/routines/level3/xher2k.hpp @@ -86,15 +86,15 @@ class TestXher2k { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; auto status = Her2k(args.layout, args.triangle, args.a_transpose, args.n, args.k, alpha2, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat(), args.c_offset, args.c_ld, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; 
@@ -102,7 +102,7 @@ class TestXher2k { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; @@ -110,9 +110,9 @@ class TestXher2k { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, alpha2, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat, args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -121,13 +121,13 @@ class TestXher2k { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector b_mat_cpu(args.b_size, static_cast(0)); std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); auto alpha2 = T{args.alpha, args.alpha}; cblasXher2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), @@ -136,7 +136,7 @@ class TestXher2k { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, 
args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xherk.hpp b/test/routines/level3/xherk.hpp index e9193847..e93d887a 100644 --- a/test/routines/level3/xherk.hpp +++ b/test/routines/level3/xherk.hpp @@ -79,13 +79,13 @@ class TestXherk { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Herk(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, args.beta, - buffers[0].c_mat(), args.c_offset, args.c_ld, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -93,15 +93,15 @@ class TestXherk { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXherk(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, args.beta, - buffers[0].c_mat, args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return 
static_cast(status); @@ -110,18 +110,18 @@ class TestXherk { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXherk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsymm.hpp b/test/routines/level3/xsymm.hpp index bcd74fda..9d127e26 100644 --- a/test/routines/level3/xsymm.hpp +++ b/test/routines/level3/xsymm.hpp @@ -88,14 +88,14 @@ class TestXsymm { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symm(args.layout, args.side, args.triangle, args.m, args.n, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat(), args.c_offset, args.c_ld, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, 
&queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -103,16 +103,16 @@ class TestXsymm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsymm(convertToCLBLAS(args.layout), convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].b_mat, args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat, args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -121,13 +121,13 @@ class TestXsymm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector b_mat_cpu(args.b_size, static_cast(0)); std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXsymm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), @@ -135,7 +135,7 @@ class 
TestXsymm { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsyr2k.hpp b/test/routines/level3/xsyr2k.hpp index c722e0cf..d1bdac56 100644 --- a/test/routines/level3/xsyr2k.hpp +++ b/test/routines/level3/xsyr2k.hpp @@ -86,14 +86,14 @@ class TestXsyr2k { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2k(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta, - buffers[0].c_mat(), args.c_offset, args.c_ld, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -101,16 +101,16 @@ class TestXsyr2k { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyr2k(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].b_mat, args.b_offset, 
args.b_ld, args.beta, - buffers[0].c_mat, args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -119,13 +119,13 @@ class TestXsyr2k { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector b_mat_cpu(args.b_size, static_cast(0)); std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); - buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXsyr2k(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), @@ -133,7 +133,7 @@ class TestXsyr2k { a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xsyrk.hpp b/test/routines/level3/xsyrk.hpp index 7d5c2039..1330924e 100644 --- a/test/routines/level3/xsyrk.hpp +++ b/test/routines/level3/xsyrk.hpp @@ -79,13 +79,13 @@ class TestXsyrk { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const 
Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syrk(args.layout, args.triangle, args.a_transpose, args.n, args.k, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, args.beta, - buffers[0].c_mat(), args.c_offset, args.c_ld, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -93,15 +93,15 @@ class TestXsyrk { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXsyrk(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, args.beta, - buffers[0].c_mat, args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -110,18 +110,18 @@ class TestXsyrk { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector c_mat_cpu(args.c_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + 
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); cblasXsyrk(convertToCBLAS(args.layout), convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose), args.n, args.k, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, args.beta, c_mat_cpu, args.c_offset, args.c_ld); - buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xtrmm.hpp b/test/routines/level3/xtrmm.hpp index 50cca6f8..7c5bd842 100644 --- a/test/routines/level3/xtrmm.hpp +++ b/test/routines/level3/xtrmm.hpp @@ -79,13 +79,13 @@ class TestXtrmm { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].b_mat(), args.b_offset, args.b_ld, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -93,7 +93,7 @@ class TestXtrmm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrmm(convertToCLBLAS(args.layout), @@ -102,8 +102,8 @@ class TestXtrmm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, - 
buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].b_mat, args.b_offset, args.b_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -112,11 +112,11 @@ class TestXtrmm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector b_mat_cpu(args.b_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); cblasXtrmm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), @@ -125,7 +125,7 @@ class TestXtrmm { args.m, args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, args.b_offset, args.b_ld); - buffers[0].b_mat.Write(queue, args.b_size, b_mat_cpu); + buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp index 91f91d0b..a70ef03f 100644 --- a/test/routines/level3/xtrsm.hpp +++ b/test/routines/level3/xtrsm.hpp @@ -91,13 +91,13 @@ class TestXtrsm { } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, args.m, args.n, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - 
buffers[0].b_mat(), args.b_offset, args.b_ld, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -105,7 +105,7 @@ class TestXtrsm { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXtrsm(convertToCLBLAS(args.layout), @@ -114,8 +114,8 @@ class TestXtrsm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].b_mat, args.b_offset, args.b_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); @@ -124,11 +124,11 @@ class TestXtrsm { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector a_mat_cpu(args.a_size, static_cast(0)); std::vector b_mat_cpu(args.b_size, static_cast(0)); - buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu); - buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); cblasXtrsm(convertToCBLAS(args.layout), convertToCBLAS(args.side), convertToCBLAS(args.triangle), @@ -137,7 +137,7 @@ class TestXtrsm { args.m, args.n, args.alpha, a_mat_cpu, args.a_offset, args.a_ld, b_mat_cpu, 
args.b_offset, args.b_ld); - buffers[0].b_mat.Write(queue, args.b_size, b_mat_cpu); + buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); return StatusCode::kSuccess; } #endif diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp index 7922359d..8f6a5985 100644 --- a/test/routines/levelx/xaxpybatched.hpp +++ b/test/routines/levelx/xaxpybatched.hpp @@ -51,18 +51,28 @@ class TestXaxpyBatched { return alpha_base + Constant(batch_id); } - // Describes how to obtain the sizes of the buffers (per item, not for the full batch) + // Helper for the sizes per batch + static size_t PerBatchSizeX(const Arguments &args) { return args.n * args.x_inc; } + static size_t PerBatchSizeY(const Arguments &args) { return args.n * args.y_inc; } + + // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { - return args.n * args.x_inc; + return PerBatchSizeX(args) * args.batch_count + args.x_offset; } static size_t GetSizeY(const Arguments &args) { - return args.n * args.y_inc; + return PerBatchSizeY(args) * args.batch_count + args.y_offset; } - // Describes how to set the sizes of all the buffers (per item, not for the full batch) + // Describes how to set the sizes of all the buffers static void SetSizes(Arguments &args) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); + args.x_offsets = std::vector(args.batch_count); + args.y_offsets = std::vector(args.batch_count); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + args.x_offsets[batch] = batch * PerBatchSizeX(args) + args.x_offset; + args.y_offsets[batch] = batch * PerBatchSizeY(args) + args.y_offset; + } } // Describes what the default values of the leading dimensions of the matrices are @@ -81,20 +91,16 @@ class TestXaxpyBatched { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + 
static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alphas = std::vector(); - auto x_buffers = std::vector(); - auto y_buffers = std::vector(); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { alphas.push_back(GetAlpha(args.alpha, batch)); - x_buffers.push_back(buffers[batch].x_vec()); - y_buffers.push_back(buffers[batch].y_vec()); } auto status = AxpyBatched(args.n, alphas.data(), - x_buffers.data(), args.x_inc, - y_buffers.data(), args.y_inc, + buffers.x_vec(), args.x_offsets.data(), args.x_inc, + buffers.y_vec(), args.y_offsets.data(), args.y_inc, args.batch_count, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } @@ -103,13 +109,13 @@ class TestXaxpyBatched { // Describes how to run the clBLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CLBLAS - static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { auto event = cl_event{}; auto status = clblasXaxpy(args.n, GetAlpha(args.alpha, batch), - buffers[batch].x_vec, 0, args.x_inc, - buffers[batch].y_vec, 0, args.y_inc, + buffers.x_vec, args.x_offsets[batch], args.x_inc, + buffers.y_vec, args.y_offsets[batch], args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); if (static_cast(status) != StatusCode::kSuccess) { @@ -122,41 +128,41 @@ class TestXaxpyBatched { // Describes how to run the CPU BLAS routine (for correctness/performance comparison) #ifdef CLBLAST_REF_CBLAS - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector 
x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { - std::vector x_vec_cpu(args.x_size, static_cast(0)); - std::vector y_vec_cpu(args.y_size, static_cast(0)); - buffers[batch].x_vec.Read(queue, args.x_size, x_vec_cpu); - buffers[batch].y_vec.Read(queue, args.y_size, y_vec_cpu); cblasXaxpy(args.n, GetAlpha(args.alpha, batch), - x_vec_cpu, 0, args.x_inc, - y_vec_cpu, 0, args.y_inc); - buffers[batch].y_vec.Write(queue, args.y_size, y_vec_cpu); + x_vec_cpu, args.x_offsets[batch], args.x_inc, + y_vec_cpu, args.y_offsets[batch], args.y_inc); } + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); return StatusCode::kSuccess; } #endif - // Describes how to download the results of the computation (per item, not for the full batch) + // Describes how to download the results of the computation static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { std::vector result(args.y_size, static_cast(0)); buffers.y_vec.Read(queue, args.y_size, result); return result; } - // Describes how to compute the indices of the result buffer (per item, not for the full batch) + // Describes how to compute the indices of the result buffer static size_t ResultID1(const Arguments &args) { return args.n; } - static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine - static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { - return id1 * args.y_inc; + static size_t ResultID2(const Arguments &args) { return args.batch_count; } + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t id2) { + return (id1 * args.y_inc) + args.y_offsets[id2]; } - // Describes how to compute performance metrics (per item, not for the full batch) + // Describes how to compute performance metrics 
static size_t GetFlops(const Arguments &args) { - return 2 * args.n; + return args.batch_count * (2 * args.n); } static size_t GetBytes(const Arguments &args) { - return (3 * args.n) * sizeof(T); + return args.batch_count * (3 * args.n) * sizeof(T); } }; diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp index 2cb1b2ce..94cd9393 100644 --- a/test/routines/levelx/xinvert.hpp +++ b/test/routines/levelx/xinvert.hpp @@ -173,14 +173,14 @@ class TestXinvert { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { try { auto event = cl_event{}; auto inverter = Xinvert(queue, &event); inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal, args.n, args.m, - buffers[0].a_mat, args.a_offset, args.a_ld, - buffers[0].b_mat); + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat); clWaitForEvents(1, &event); clReleaseEvent(event); } catch (...) { return DispatchException(); } @@ -189,11 +189,11 @@ class TestXinvert { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. 
- static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { - return RunReference(args, buffers[0], queue); + return RunReference(args, buffers, queue); } - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { - return RunReference(args, buffers[0], queue); + return RunReference(args, buffers, queue); } diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index 69f0b2b6..d1064d0c 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -133,13 +133,13 @@ class TestXomatcopy { std::vector&, std::vector&) {} // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, std::vector> &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Omatcopy(args.layout, args.a_transpose, args.m, args.n, args.alpha, - buffers[0].a_mat(), args.a_offset, args.a_ld, - buffers[0].b_mat(), args.b_offset, args.b_ld, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, &queue_plain, &event); if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } return status; @@ -147,12 +147,12 @@ class TestXomatcopy { // Describes how to run a naive version of the routine (for correctness/performance comparison). // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines. 
- static StatusCode RunReference1(const Arguments &args, std::vector> &buffers, Queue &queue) { - return RunReference(args, buffers[0], queue); + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + return RunReference(args, buffers, queue); } - static StatusCode RunReference2(const Arguments &args, std::vector> &buffers, Queue &queue) { - return RunReference(args, buffers[0], queue); + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + return RunReference(args, buffers, queue); } // Describes how to download the results of the computation (more importantly: which buffer)