Make batched routines based on offsets instead of a vector of cl_mem objects - undoing many earlier changes
parent
6aba0bbae7
commit
fa0a9c689f
|
@ -2913,8 +2913,8 @@ C++ API:
|
|||
template <typename T>
|
||||
StatusCode AxpyBatched(const size_t n,
|
||||
const T *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
@ -2923,32 +2923,32 @@ C API:
|
|||
```
|
||||
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
|
||||
const float *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
|
||||
const double *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
|
||||
const cl_float2 *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
|
||||
const cl_double2 *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
|
||||
const cl_half *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
@ -2957,10 +2957,12 @@ Arguments to AXPYBATCHED:
|
|||
|
||||
* `const size_t n`: Integer size argument. This value must be positive.
|
||||
* `const T *alphas`: Input scalar constants.
|
||||
* `const cl_mem *x_buffers`: OpenCL buffers to store the input x vectors.
|
||||
* `const size_t x_inc`: Stride/increment of the input x vectors. This value must be greater than 0.
|
||||
* `cl_mem *y_buffers`: OpenCL buffers to store the output y vectors.
|
||||
* `const size_t y_inc`: Stride/increment of the output y vectors. This value must be greater than 0.
|
||||
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
|
||||
* `const size_t *x_offsets`: The offsets in elements from the start of the input x vector.
|
||||
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
|
||||
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
|
||||
* `const size_t *y_offsets`: The offsets in elements from the start of the output y vector.
|
||||
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
|
||||
* `const size_t batch_count`: Number of batches. This value must be positive.
|
||||
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
|
||||
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
|
||||
|
|
|
@ -614,8 +614,8 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
|
|||
template <typename T>
|
||||
StatusCode AxpyBatched(const size_t n,
|
||||
const T *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
|
|
|
@ -1331,32 +1331,32 @@ CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const
|
|||
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
|
||||
CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
|
||||
const float *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n,
|
||||
const double *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n,
|
||||
const cl_float2 *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n,
|
||||
const cl_double2 *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n,
|
||||
const cl_half *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
|
|
|
@ -72,12 +72,12 @@ class Routine:
|
|||
for scalar in self.scalars:
|
||||
result.append("auto " + scalar + "s_cpp = std::vector<T>();")
|
||||
for buffer_name in self.inputs + self.outputs:
|
||||
result.append("auto " + buffer_name + "_buffers_cpp = std::vector<Buffer<T>>();")
|
||||
result.append("auto " + buffer_name + "_offsets_cpp = std::vector<size_t>();")
|
||||
result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {")
|
||||
for scalar in self.scalars:
|
||||
result.append(" " + scalar + "s_cpp.push_back(" + scalar + "s[batch]);")
|
||||
for buffer_name in self.inputs + self.outputs:
|
||||
result.append(" " + buffer_name + "_buffers_cpp.push_back(Buffer<T>(" + buffer_name + "_buffers[batch]));")
|
||||
result.append(" " + buffer_name + "_offsets_cpp.push_back(" + buffer_name + "_offsets[batch]);")
|
||||
result.append("}")
|
||||
return result
|
||||
|
||||
|
@ -222,8 +222,8 @@ class Routine:
|
|||
def buffer(self, name):
|
||||
"""Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""
|
||||
if name in self.inputs or name in self.outputs:
|
||||
a = [name + "_buffer" + self.b_s()]
|
||||
b = [name + "_offset"] if not self.batched else []
|
||||
a = [name + "_buffer"]
|
||||
b = [name + "_offset" + self.b_s()]
|
||||
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
|
||||
return [", ".join(a + b + c)]
|
||||
return []
|
||||
|
@ -250,8 +250,8 @@ class Routine:
|
|||
"""As above but with data-types"""
|
||||
prefix = "const " if name in self.inputs else ""
|
||||
if name in self.inputs or name in self.outputs:
|
||||
a = [prefix + "cl_mem " + self.b_star() + name + "_buffer" + self.b_s()]
|
||||
b = ["const size_t " + name + "_offset"] if not self.batched else []
|
||||
a = [prefix + "cl_mem " + name + "_buffer"]
|
||||
b = ["const size_t " + self.b_star() + name + "_offset" + self.b_s()]
|
||||
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
|
||||
return [", ".join(a + b + c)]
|
||||
return []
|
||||
|
@ -291,11 +291,8 @@ class Routine:
|
|||
"""As above but with CLCudaAPI buffers"""
|
||||
if name in self.inputs or name in self.outputs:
|
||||
buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type
|
||||
if self.batched:
|
||||
a = [name + "_buffers_cpp"]
|
||||
else:
|
||||
a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"]
|
||||
b = [name + "_offset"] if not self.batched else []
|
||||
a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"]
|
||||
b = [name + "_offsets_cpp"] if self.batched else [name + "_offset"]
|
||||
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
|
||||
return [", ".join(a + b + c)]
|
||||
return []
|
||||
|
@ -336,8 +333,8 @@ class Routine:
|
|||
"""As above, but only data-types"""
|
||||
prefix = "const " if (name in self.inputs) else ""
|
||||
if (name in self.inputs) or (name in self.outputs):
|
||||
a = [prefix + "cl_mem" + self.b_star()]
|
||||
b = ["const size_t"] if not self.batched else []
|
||||
a = [prefix + "cl_mem"]
|
||||
b = ["const size_t" + self.b_star()]
|
||||
c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else []
|
||||
return [", ".join(a + b + c)]
|
||||
return []
|
||||
|
@ -347,12 +344,10 @@ class Routine:
|
|||
prefix = "const " if (name in self.inputs) else ""
|
||||
inout = "input" if (name in self.inputs) else "output"
|
||||
if (name in self.inputs) or (name in self.outputs):
|
||||
math_name = name.upper() + " matrix" + self.b_s() if (name in self.buffers_matrix()) else name + " vector" + self.b_s()
|
||||
math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector"
|
||||
inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment "
|
||||
a = ["`" + prefix + "cl_mem " + self.b_star() + name + "_buffer" + self.b_s() + "`: OpenCL buffer" + self.b_s() + " to store the " + inout + " " + math_name + "."]
|
||||
b = []
|
||||
if not self.batched:
|
||||
b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."]
|
||||
a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."]
|
||||
b = ["`const size_t " + self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."]
|
||||
c = []
|
||||
if name not in self.buffers_without_ld_inc():
|
||||
c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " +
|
||||
|
|
|
@ -2178,57 +2178,57 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
|
|||
template <typename T>
|
||||
StatusCode AxpyBatched(const size_t n,
|
||||
const T *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
try {
|
||||
auto queue_cpp = Queue(*queue);
|
||||
auto routine = XaxpyBatched<T>(queue_cpp, event);
|
||||
auto alphas_cpp = std::vector<T>();
|
||||
auto x_buffers_cpp = std::vector<Buffer<T>>();
|
||||
auto y_buffers_cpp = std::vector<Buffer<T>>();
|
||||
auto x_offsets_cpp = std::vector<size_t>();
|
||||
auto y_offsets_cpp = std::vector<size_t>();
|
||||
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
|
||||
alphas_cpp.push_back(alphas[batch]);
|
||||
x_buffers_cpp.push_back(Buffer<T>(x_buffers[batch]));
|
||||
y_buffers_cpp.push_back(Buffer<T>(y_buffers[batch]));
|
||||
x_offsets_cpp.push_back(x_offsets[batch]);
|
||||
y_offsets_cpp.push_back(y_offsets[batch]);
|
||||
}
|
||||
routine.DoAxpyBatched(n,
|
||||
alphas_cpp,
|
||||
x_buffers_cpp, x_inc,
|
||||
y_buffers_cpp, y_inc,
|
||||
Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
|
||||
Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
|
||||
batch_count);
|
||||
return StatusCode::kSuccess;
|
||||
} catch (...) { return DispatchException(); }
|
||||
}
|
||||
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
|
||||
const float*,
|
||||
const cl_mem*, const size_t,
|
||||
cl_mem*, const size_t,
|
||||
const cl_mem, const size_t*, const size_t,
|
||||
cl_mem, const size_t*, const size_t,
|
||||
const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
|
||||
const double*,
|
||||
const cl_mem*, const size_t,
|
||||
cl_mem*, const size_t,
|
||||
const cl_mem, const size_t*, const size_t,
|
||||
cl_mem, const size_t*, const size_t,
|
||||
const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
|
||||
const float2*,
|
||||
const cl_mem*, const size_t,
|
||||
cl_mem*, const size_t,
|
||||
const cl_mem, const size_t*, const size_t,
|
||||
cl_mem, const size_t*, const size_t,
|
||||
const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
|
||||
const double2*,
|
||||
const cl_mem*, const size_t,
|
||||
cl_mem*, const size_t,
|
||||
const cl_mem, const size_t*, const size_t,
|
||||
cl_mem, const size_t*, const size_t,
|
||||
const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
|
||||
const half*,
|
||||
const cl_mem*, const size_t,
|
||||
cl_mem*, const size_t,
|
||||
const cl_mem, const size_t*, const size_t,
|
||||
cl_mem, const size_t*, const size_t,
|
||||
const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// =================================================================================================
|
||||
|
|
|
@ -3450,8 +3450,8 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran
|
|||
// AXPY
|
||||
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
|
||||
const float *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto alphas_cpp = std::vector<float>();
|
||||
|
@ -3462,8 +3462,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
|
|||
return static_cast<CLBlastStatusCode>(
|
||||
clblast::AxpyBatched(n,
|
||||
alphas_cpp.data(),
|
||||
x_buffers, x_inc,
|
||||
y_buffers, y_inc,
|
||||
x_buffer, x_offsets, x_inc,
|
||||
y_buffer, y_offsets, y_inc,
|
||||
batch_count,
|
||||
queue, event)
|
||||
);
|
||||
|
@ -3471,8 +3471,8 @@ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
|
|||
}
|
||||
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
|
||||
const double *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto alphas_cpp = std::vector<double>();
|
||||
|
@ -3483,8 +3483,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
|
|||
return static_cast<CLBlastStatusCode>(
|
||||
clblast::AxpyBatched(n,
|
||||
alphas_cpp.data(),
|
||||
x_buffers, x_inc,
|
||||
y_buffers, y_inc,
|
||||
x_buffer, x_offsets, x_inc,
|
||||
y_buffer, y_offsets, y_inc,
|
||||
batch_count,
|
||||
queue, event)
|
||||
);
|
||||
|
@ -3492,8 +3492,8 @@ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
|
|||
}
|
||||
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
|
||||
const cl_float2 *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto alphas_cpp = std::vector<float2>();
|
||||
|
@ -3504,8 +3504,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
|
|||
return static_cast<CLBlastStatusCode>(
|
||||
clblast::AxpyBatched(n,
|
||||
alphas_cpp.data(),
|
||||
x_buffers, x_inc,
|
||||
y_buffers, y_inc,
|
||||
x_buffer, x_offsets, x_inc,
|
||||
y_buffer, y_offsets, y_inc,
|
||||
batch_count,
|
||||
queue, event)
|
||||
);
|
||||
|
@ -3513,8 +3513,8 @@ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
|
|||
}
|
||||
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
|
||||
const cl_double2 *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto alphas_cpp = std::vector<double2>();
|
||||
|
@ -3525,8 +3525,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
|
|||
return static_cast<CLBlastStatusCode>(
|
||||
clblast::AxpyBatched(n,
|
||||
alphas_cpp.data(),
|
||||
x_buffers, x_inc,
|
||||
y_buffers, y_inc,
|
||||
x_buffer, x_offsets, x_inc,
|
||||
y_buffer, y_offsets, y_inc,
|
||||
batch_count,
|
||||
queue, event)
|
||||
);
|
||||
|
@ -3534,8 +3534,8 @@ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
|
|||
}
|
||||
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
|
||||
const cl_half *alphas,
|
||||
const cl_mem *x_buffers, const size_t x_inc,
|
||||
cl_mem *y_buffers, const size_t y_inc,
|
||||
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
|
||||
const size_t batch_count,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto alphas_cpp = std::vector<half>();
|
||||
|
@ -3546,8 +3546,8 @@ CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
|
|||
return static_cast<CLBlastStatusCode>(
|
||||
clblast::AxpyBatched(n,
|
||||
alphas_cpp.data(),
|
||||
x_buffers, x_inc,
|
||||
y_buffers, y_inc,
|
||||
x_buffer, x_offsets, x_inc,
|
||||
y_buffer, y_offsets, y_inc,
|
||||
batch_count,
|
||||
queue, event)
|
||||
);
|
||||
|
|
|
@ -600,9 +600,6 @@ class Buffer {
|
|||
|
||||
// Copies from host to device: writing the device buffer asynchronously
|
||||
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
|
||||
if (access_ == BufferAccess::kReadOnly) {
|
||||
throw LogicError("Buffer: writing to a read-only buffer");
|
||||
}
|
||||
if (GetSize() < (offset+size)*sizeof(T)) {
|
||||
throw LogicError("Buffer: target device buffer is too small");
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
//
|
||||
// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
|
||||
// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
|
||||
// support vector data-types.
|
||||
// support vector data-types. The general version has a batched implementation as well.
|
||||
//
|
||||
// This kernel uses the level-1 BLAS common tuning parameters.
|
||||
//
|
||||
|
@ -36,8 +36,6 @@ void Xaxpy(const int n, const real_arg arg_alpha,
|
|||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
|
||||
// divisible by 'VW', 'WGS' and 'WPT'.
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
|
@ -57,6 +55,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha,
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Full version of the kernel with offsets and strided accesses: batched version
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void XaxpyBatched(const int n, const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int batch) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||
#pragma unroll
|
||||
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
|
||||
real xvalue = xgm[id*x_inc + x_offset];
|
||||
MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue);
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// End of the C++11 raw string literal
|
||||
)"
|
||||
|
||||
|
|
|
@ -22,7 +22,10 @@ namespace clblast {
|
|||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name):
|
||||
Xaxpy<T>(queue, event, name) {
|
||||
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
|
||||
#include "../../kernels/level1/level1.opencl"
|
||||
#include "../../kernels/level1/xaxpy.opencl"
|
||||
}) {
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
@ -30,19 +33,55 @@ XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::strin
|
|||
// The main routine
|
||||
template <typename T>
|
||||
void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
|
||||
const std::vector<Buffer<T>> &x_buffers, const size_t x_inc,
|
||||
const std::vector<Buffer<T>> &y_buffers, const size_t y_inc,
|
||||
const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc,
|
||||
const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc,
|
||||
const size_t batch_count) {
|
||||
if (batch_count < 1) { throw BLASError(StatusCode::kInvalidBatchCount); }
|
||||
if (alphas.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
|
||||
if (x_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
|
||||
if (y_buffers.size() != batch_count) { throw BLASError(StatusCode::kInvalidBatchCount); }
|
||||
|
||||
// Tests for a valid batch count
|
||||
if ((batch_count < 1) || (alphas.size() != batch_count) ||
|
||||
(x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) {
|
||||
throw BLASError(StatusCode::kInvalidBatchCount);
|
||||
}
|
||||
|
||||
// Makes sure all dimensions are larger than zero
|
||||
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
|
||||
|
||||
// Tests the vectors for validity
|
||||
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
|
||||
TestVectorX(n, x_buffer, x_offsets[batch], x_inc);
|
||||
TestVectorY(n, y_buffer, y_offsets[batch], y_inc);
|
||||
}
|
||||
|
||||
// Upload the arguments to the device
|
||||
std::vector<int> x_offsets_int(x_offsets.begin(), x_offsets.end());
|
||||
std::vector<int> y_offsets_int(y_offsets.begin(), y_offsets.end());
|
||||
auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
|
||||
x_offsets_device.Write(queue_, batch_count, x_offsets_int);
|
||||
y_offsets_device.Write(queue_, batch_count, y_offsets_int);
|
||||
|
||||
// Retrieves the XaxpyBatched kernel from the compiled binary
|
||||
auto kernel = Kernel(program_, "XaxpyBatched");
|
||||
|
||||
// Naive implementation: calls regular Axpy multiple times
|
||||
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
|
||||
DoAxpy(n, alphas[batch],
|
||||
x_buffers[batch], 0, x_inc,
|
||||
y_buffers[batch], 0, y_inc);
|
||||
|
||||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, GetRealArg(alphas[batch]));
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, static_cast<int>(x_offsets[batch]));
|
||||
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||
kernel.SetArgument(5, y_buffer());
|
||||
kernel.SetArgument(6, static_cast<int>(y_offsets[batch]));
|
||||
kernel.SetArgument(7, static_cast<int>(y_inc));
|
||||
kernel.SetArgument(8, static_cast<int>(batch));
|
||||
|
||||
// Launches the kernel
|
||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -16,26 +16,23 @@
|
|||
|
||||
#include <vector>
|
||||
|
||||
#include "routines/level1/xaxpy.hpp"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class XaxpyBatched: public Xaxpy<T> {
|
||||
class XaxpyBatched: public Routine {
|
||||
public:
|
||||
|
||||
// Uses the regular Xaxpy routine
|
||||
using Xaxpy<T>::DoAxpy;
|
||||
|
||||
// Constructor
|
||||
XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED");
|
||||
|
||||
// Templated-precision implementation of the routine
|
||||
void DoAxpyBatched(const size_t n, const std::vector<T> &alphas,
|
||||
const std::vector<Buffer<T>> &x_buffers, const size_t x_inc,
|
||||
const std::vector<Buffer<T>> &y_buffers, const size_t y_inc,
|
||||
const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc,
|
||||
const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc,
|
||||
const size_t batch_count);
|
||||
};
|
||||
|
||||
|
|
|
@ -157,7 +157,13 @@ struct Arguments {
|
|||
size_t imax_offset = 0;
|
||||
T alpha = ConstantOne<T>();
|
||||
T beta = ConstantOne<T>();
|
||||
// Batch-specific arguments
|
||||
size_t batch_count = 1;
|
||||
std::vector<size_t> x_offsets = {0};
|
||||
std::vector<size_t> y_offsets = {0};
|
||||
std::vector<size_t> a_offsets = {0};
|
||||
std::vector<size_t> b_offsets = {0};
|
||||
std::vector<size_t> c_offsets = {0};
|
||||
// Sizes
|
||||
size_t x_size = 1;
|
||||
size_t y_size = 1;
|
||||
|
|
|
@ -88,7 +88,7 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st
|
|||
device_b.Write(queue, host_b.size(), host_b);
|
||||
device_c.Write(queue, host_c.size(), host_c);
|
||||
auto dummy = Buffer<T>(context, 1);
|
||||
auto buffers = std::vector<Buffers<T>>{Buffers<T>{dummy, dummy, device_a, device_b, device_c, dummy, dummy}};
|
||||
auto buffers = Buffers<T>{dummy, dummy, device_a, device_b, device_c, dummy, dummy};
|
||||
|
||||
// Loops over the valid combinations: run before and run afterwards
|
||||
fprintf(stdout, "* Testing OverrideParameters for '%s'\n", routine_name.c_str());
|
||||
|
|
|
@ -126,24 +126,21 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
|||
ap_source_, scalar_source_);
|
||||
|
||||
// Set-up for the CLBlast run
|
||||
auto buffers2 = std::vector<Buffers<T>>();
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
auto x_vec2 = Buffer<T>(context_, args.x_size);
|
||||
auto y_vec2 = Buffer<T>(context_, args.y_size);
|
||||
auto a_mat2 = Buffer<T>(context_, args.a_size);
|
||||
auto b_mat2 = Buffer<T>(context_, args.b_size);
|
||||
auto c_mat2 = Buffer<T>(context_, args.c_size);
|
||||
auto ap_mat2 = Buffer<T>(context_, args.ap_size);
|
||||
auto scalar2 = Buffer<T>(context_, args.scalar_size);
|
||||
x_vec2.Write(queue_, args.x_size, &x_source_[batch * args.x_size]);
|
||||
y_vec2.Write(queue_, args.y_size, &y_source_[batch * args.y_size]);
|
||||
a_mat2.Write(queue_, args.a_size, &a_source_[batch * args.a_size]);
|
||||
b_mat2.Write(queue_, args.b_size, &b_source_[batch * args.b_size]);
|
||||
c_mat2.Write(queue_, args.c_size, &c_source_[batch * args.c_size]);
|
||||
ap_mat2.Write(queue_, args.ap_size, &ap_source_[batch * args.ap_size]);
|
||||
scalar2.Write(queue_, args.scalar_size, &scalar_source_[batch * args.scalar_size]);
|
||||
buffers2.push_back(Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2});
|
||||
}
|
||||
auto x_vec2 = Buffer<T>(context_, args.x_size);
|
||||
auto y_vec2 = Buffer<T>(context_, args.y_size);
|
||||
auto a_mat2 = Buffer<T>(context_, args.a_size);
|
||||
auto b_mat2 = Buffer<T>(context_, args.b_size);
|
||||
auto c_mat2 = Buffer<T>(context_, args.c_size);
|
||||
auto ap_mat2 = Buffer<T>(context_, args.ap_size);
|
||||
auto scalar2 = Buffer<T>(context_, args.scalar_size);
|
||||
x_vec2.Write(queue_, args.x_size, x_source_);
|
||||
y_vec2.Write(queue_, args.y_size, y_source_);
|
||||
a_mat2.Write(queue_, args.a_size, a_source_);
|
||||
b_mat2.Write(queue_, args.b_size, b_source_);
|
||||
c_mat2.Write(queue_, args.c_size, c_source_);
|
||||
ap_mat2.Write(queue_, args.ap_size, ap_source_);
|
||||
scalar2.Write(queue_, args.scalar_size, scalar_source_);
|
||||
auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
|
||||
|
||||
// Runs CLBlast
|
||||
if (verbose_) {
|
||||
|
@ -163,24 +160,21 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
|||
}
|
||||
|
||||
// Set-up for the reference run
|
||||
auto buffers1 = std::vector<Buffers<T>>();
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
auto x_vec1 = Buffer<T>(context_, args.x_size);
|
||||
auto y_vec1 = Buffer<T>(context_, args.y_size);
|
||||
auto a_mat1 = Buffer<T>(context_, args.a_size);
|
||||
auto b_mat1 = Buffer<T>(context_, args.b_size);
|
||||
auto c_mat1 = Buffer<T>(context_, args.c_size);
|
||||
auto ap_mat1 = Buffer<T>(context_, args.ap_size);
|
||||
auto scalar1 = Buffer<T>(context_, args.scalar_size);
|
||||
x_vec1.Write(queue_, args.x_size, &x_source_[batch * args.x_size]);
|
||||
y_vec1.Write(queue_, args.y_size, &y_source_[batch * args.y_size]);
|
||||
a_mat1.Write(queue_, args.a_size, &a_source_[batch * args.a_size]);
|
||||
b_mat1.Write(queue_, args.b_size, &b_source_[batch * args.b_size]);
|
||||
c_mat1.Write(queue_, args.c_size, &c_source_[batch * args.c_size]);
|
||||
ap_mat1.Write(queue_, args.ap_size, &ap_source_[batch * args.ap_size]);
|
||||
scalar1.Write(queue_, args.scalar_size, &scalar_source_[batch * args.scalar_size]);
|
||||
buffers1.push_back(Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1});
|
||||
}
|
||||
auto x_vec1 = Buffer<T>(context_, args.x_size);
|
||||
auto y_vec1 = Buffer<T>(context_, args.y_size);
|
||||
auto a_mat1 = Buffer<T>(context_, args.a_size);
|
||||
auto b_mat1 = Buffer<T>(context_, args.b_size);
|
||||
auto c_mat1 = Buffer<T>(context_, args.c_size);
|
||||
auto ap_mat1 = Buffer<T>(context_, args.ap_size);
|
||||
auto scalar1 = Buffer<T>(context_, args.scalar_size);
|
||||
x_vec1.Write(queue_, args.x_size, x_source_);
|
||||
y_vec1.Write(queue_, args.y_size, y_source_);
|
||||
a_mat1.Write(queue_, args.a_size, a_source_);
|
||||
b_mat1.Write(queue_, args.b_size, b_source_);
|
||||
c_mat1.Write(queue_, args.c_size, c_source_);
|
||||
ap_mat1.Write(queue_, args.ap_size, ap_source_);
|
||||
scalar1.Write(queue_, args.scalar_size, scalar_source_);
|
||||
auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
|
||||
|
||||
// Runs the reference code
|
||||
if (verbose_) {
|
||||
|
@ -197,47 +191,40 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
|||
continue;
|
||||
}
|
||||
|
||||
// Error checking for each batch
|
||||
auto errors = size_t{0};
|
||||
// Downloads the results
|
||||
auto result1 = get_result_(args, buffers1, queue_);
|
||||
auto result2 = get_result_(args, buffers2, queue_);
|
||||
|
||||
// Computes the L2 error
|
||||
auto l2error = 0.0;
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
|
||||
// Downloads the results
|
||||
auto result1 = get_result_(args, buffers1[batch], queue_);
|
||||
auto result2 = get_result_(args, buffers2[batch], queue_);
|
||||
|
||||
// Computes the L2 error
|
||||
auto l2error_batch = 0.0;
|
||||
const auto kErrorMarginL2 = getL2ErrorMargin<T>();
|
||||
for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
|
||||
for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
|
||||
auto index = get_index_(args, id1, id2);
|
||||
l2error_batch += SquaredDifference(result1[index], result2[index]);
|
||||
}
|
||||
const auto kErrorMarginL2 = getL2ErrorMargin<T>();
|
||||
for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
|
||||
for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
|
||||
auto index = get_index_(args, id1, id2);
|
||||
l2error += SquaredDifference(result1[index], result2[index]);
|
||||
}
|
||||
l2error_batch /= static_cast<double>(get_id1_(args) * get_id2_(args));
|
||||
l2error += l2error_batch;
|
||||
}
|
||||
l2error /= static_cast<double>(get_id1_(args) * get_id2_(args));
|
||||
|
||||
// Checks for differences in the output
|
||||
for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
|
||||
for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
|
||||
auto index = get_index_(args, id1, id2);
|
||||
if (!TestSimilarity(result1[index], result2[index])) {
|
||||
if (l2error_batch >= kErrorMarginL2) { errors++; }
|
||||
if (verbose_) {
|
||||
if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); }
|
||||
else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); }
|
||||
fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str());
|
||||
fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str());
|
||||
if (l2error_batch < kErrorMarginL2) {
|
||||
fprintf(stdout, " - error suppressed by a low total L2 error\n");
|
||||
}
|
||||
// Checks for differences in the output
|
||||
auto errors = size_t{0};
|
||||
for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
|
||||
for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
|
||||
auto index = get_index_(args, id1, id2);
|
||||
if (!TestSimilarity(result1[index], result2[index])) {
|
||||
if (l2error >= kErrorMarginL2) { errors++; }
|
||||
if (verbose_) {
|
||||
if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); }
|
||||
else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); }
|
||||
fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str());
|
||||
fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str());
|
||||
if (l2error < kErrorMarginL2) {
|
||||
fprintf(stdout, " - error suppressed by a low total L2 error\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
l2error /= static_cast<double>(args.batch_count);
|
||||
|
||||
// Report the results
|
||||
if (verbose_ && errors > 0) {
|
||||
|
@ -245,7 +232,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
|||
}
|
||||
|
||||
// Tests the error count (should be zero)
|
||||
TestErrorCount(errors, get_id1_(args)*get_id2_(args)*args.batch_count, args);
|
||||
TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
|
||||
}
|
||||
TestEnd();
|
||||
}
|
||||
|
@ -272,40 +259,36 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
|
|||
|
||||
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
|
||||
// want to be able to create invalid buffers (no error checking here).
|
||||
auto buffers1 = std::vector<Buffers<T>>();
|
||||
auto buffers2 = std::vector<Buffers<T>>();
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
|
||||
auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
|
||||
auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
|
||||
auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
|
||||
auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
|
||||
auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
|
||||
auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
|
||||
auto x_vec1 = Buffer<T>(x1);
|
||||
auto y_vec1 = Buffer<T>(y1);
|
||||
auto a_mat1 = Buffer<T>(a1);
|
||||
auto b_mat1 = Buffer<T>(b1);
|
||||
auto c_mat1 = Buffer<T>(c1);
|
||||
auto ap_mat1 = Buffer<T>(ap1);
|
||||
auto scalar1 = Buffer<T>(d1);
|
||||
auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
|
||||
auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
|
||||
auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
|
||||
auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
|
||||
auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
|
||||
auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
|
||||
auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
|
||||
auto x_vec2 = Buffer<T>(x2);
|
||||
auto y_vec2 = Buffer<T>(y2);
|
||||
auto a_mat2 = Buffer<T>(a2);
|
||||
auto b_mat2 = Buffer<T>(b2);
|
||||
auto c_mat2 = Buffer<T>(c2);
|
||||
auto ap_mat2 = Buffer<T>(ap2);
|
||||
auto scalar2 = Buffer<T>(d2);
|
||||
buffers1.push_back(Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1});
|
||||
buffers2.push_back(Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2});
|
||||
}
|
||||
auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
|
||||
auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
|
||||
auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
|
||||
auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
|
||||
auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
|
||||
auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
|
||||
auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
|
||||
auto x_vec1 = Buffer<T>(x1);
|
||||
auto y_vec1 = Buffer<T>(y1);
|
||||
auto a_mat1 = Buffer<T>(a1);
|
||||
auto b_mat1 = Buffer<T>(b1);
|
||||
auto c_mat1 = Buffer<T>(c1);
|
||||
auto ap_mat1 = Buffer<T>(ap1);
|
||||
auto scalar1 = Buffer<T>(d1);
|
||||
auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
|
||||
auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
|
||||
auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
|
||||
auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
|
||||
auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
|
||||
auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr);
|
||||
auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr);
|
||||
auto x_vec2 = Buffer<T>(x2);
|
||||
auto y_vec2 = Buffer<T>(y2);
|
||||
auto a_mat2 = Buffer<T>(a2);
|
||||
auto b_mat2 = Buffer<T>(b2);
|
||||
auto c_mat2 = Buffer<T>(c2);
|
||||
auto ap_mat2 = Buffer<T>(ap2);
|
||||
auto scalar2 = Buffer<T>(d2);
|
||||
auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
|
||||
auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
|
||||
|
||||
// Runs CLBlast
|
||||
if (verbose_) {
|
||||
|
|
|
@ -79,7 +79,7 @@ class TestBlas: public Tester<T,U> {
|
|||
std::vector<T>&, std::vector<T>&,
|
||||
std::vector<T>&, std::vector<T>&, std::vector<T>&,
|
||||
std::vector<T>&, std::vector<T>&)>;
|
||||
using Routine = std::function<StatusCode(const Arguments<U>&, std::vector<Buffers<T>>&, Queue&)>;
|
||||
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
|
||||
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
|
||||
|
|
|
@ -177,13 +177,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
|||
set_sizes(args);
|
||||
|
||||
// Populates input host matrices with random data
|
||||
std::vector<T> x_source(args.batch_count * args.x_size);
|
||||
std::vector<T> y_source(args.batch_count * args.y_size);
|
||||
std::vector<T> a_source(args.batch_count * args.a_size);
|
||||
std::vector<T> b_source(args.batch_count * args.b_size);
|
||||
std::vector<T> c_source(args.batch_count * args.c_size);
|
||||
std::vector<T> ap_source(args.batch_count * args.ap_size);
|
||||
std::vector<T> scalar_source(args.batch_count * args.scalar_size);
|
||||
std::vector<T> x_source(args.x_size);
|
||||
std::vector<T> y_source(args.y_size);
|
||||
std::vector<T> a_source(args.a_size);
|
||||
std::vector<T> b_source(args.b_size);
|
||||
std::vector<T> c_source(args.c_size);
|
||||
std::vector<T> ap_source(args.ap_size);
|
||||
std::vector<T> scalar_source(args.scalar_size);
|
||||
std::mt19937 mt(kSeed);
|
||||
std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
|
||||
PopulateVector(x_source, mt, dist);
|
||||
|
@ -195,24 +195,21 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
|||
PopulateVector(scalar_source, mt, dist);
|
||||
|
||||
// Creates the matrices on the device
|
||||
auto buffers = std::vector<Buffers<T>>();
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
auto x_vec = Buffer<T>(context, args.x_size);
|
||||
auto y_vec = Buffer<T>(context, args.y_size);
|
||||
auto a_mat = Buffer<T>(context, args.a_size);
|
||||
auto b_mat = Buffer<T>(context, args.b_size);
|
||||
auto c_mat = Buffer<T>(context, args.c_size);
|
||||
auto ap_mat = Buffer<T>(context, args.ap_size);
|
||||
auto scalar = Buffer<T>(context, args.scalar_size);
|
||||
x_vec.Write(queue, args.x_size, &x_source[batch * args.x_size]);
|
||||
y_vec.Write(queue, args.y_size, &y_source[batch * args.y_size]);
|
||||
a_mat.Write(queue, args.a_size, &a_source[batch * args.a_size]);
|
||||
b_mat.Write(queue, args.b_size, &b_source[batch * args.b_size]);
|
||||
c_mat.Write(queue, args.c_size, &c_source[batch * args.c_size]);
|
||||
ap_mat.Write(queue, args.ap_size, &ap_source[batch * args.ap_size]);
|
||||
scalar.Write(queue, args.scalar_size, &scalar_source[batch * args.scalar_size]);
|
||||
buffers.push_back(Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar});
|
||||
}
|
||||
auto x_vec = Buffer<T>(context, args.x_size);
|
||||
auto y_vec = Buffer<T>(context, args.y_size);
|
||||
auto a_mat = Buffer<T>(context, args.a_size);
|
||||
auto b_mat = Buffer<T>(context, args.b_size);
|
||||
auto c_mat = Buffer<T>(context, args.c_size);
|
||||
auto ap_mat = Buffer<T>(context, args.ap_size);
|
||||
auto scalar = Buffer<T>(context, args.scalar_size);
|
||||
x_vec.Write(queue, args.x_size, x_source);
|
||||
y_vec.Write(queue, args.y_size, y_source);
|
||||
a_mat.Write(queue, args.a_size, a_source);
|
||||
b_mat.Write(queue, args.b_size, b_source);
|
||||
c_mat.Write(queue, args.c_size, c_source);
|
||||
ap_mat.Write(queue, args.ap_size, ap_source);
|
||||
scalar.Write(queue, args.scalar_size, scalar_source);
|
||||
auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar};
|
||||
|
||||
// Runs the routines and collects the timings
|
||||
auto timings = std::vector<std::pair<std::string, double>>();
|
||||
|
@ -254,7 +251,7 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
|||
// value found in the vector of timing results. The return value is in milliseconds.
|
||||
template <typename T, typename U>
|
||||
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
|
||||
std::vector<Buffers<T>> &buffers, Queue &queue,
|
||||
Buffers<T> &buffers, Queue &queue,
|
||||
Routine run_blas, const std::string &library_name) {
|
||||
auto status = StatusCode::kSuccess;
|
||||
|
||||
|
@ -373,8 +370,8 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args,
|
|||
for (const auto& timing : timings) {
|
||||
|
||||
// Computes the GFLOPS and GB/s metrics
|
||||
auto flops = get_flops_(args) * args.batch_count;
|
||||
auto bytes = get_bytes_(args) * args.batch_count;
|
||||
auto flops = get_flops_(args);
|
||||
auto bytes = get_bytes_(args);
|
||||
auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0;
|
||||
auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0;
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ class Client {
|
|||
static constexpr auto kSeed = 42; // fixed seed for reproducibility
|
||||
|
||||
// Shorthand for the routine-specific functions passed to the tester
|
||||
using Routine = std::function<StatusCode(const Arguments<U>&, std::vector<Buffers<T>>&, Queue&)>;
|
||||
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||
using SetMetric = std::function<void(Arguments<U>&)>;
|
||||
using GetMetric = std::function<size_t(const Arguments<U>&)>;
|
||||
|
||||
|
@ -66,7 +66,7 @@ class Client {
|
|||
private:
|
||||
|
||||
// Runs a function a given number of times and returns the execution time of the shortest instance
|
||||
double TimedExecution(const size_t num_runs, const Arguments<U> &args, std::vector<Buffers<T>> &buffers,
|
||||
double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers,
|
||||
Queue &queue, Routine run_blas, const std::string &library_name);
|
||||
|
||||
// Prints the header of a performance-data table
|
||||
|
|
|
@ -74,12 +74,12 @@ class TestXamax {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Amax<T>(args.n,
|
||||
buffers[0].scalar(), args.imax_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.scalar(), args.imax_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -87,12 +87,12 @@ class TestXamax {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXamax<T>(args.n,
|
||||
buffers[0].scalar, args.imax_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers.scalar, args.imax_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -101,15 +101,15 @@ class TestXamax {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXamax(args.n,
|
||||
scalar_cpu, args.imax_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -74,12 +74,12 @@ class TestXasum {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Asum<T>(args.n,
|
||||
buffers[0].scalar(), args.asum_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.scalar(), args.asum_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -87,12 +87,12 @@ class TestXasum {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXasum<T>(args.n,
|
||||
buffers[0].scalar, args.asum_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers.scalar, args.asum_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -101,15 +101,15 @@ class TestXasum {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXasum(args.n,
|
||||
scalar_cpu, args.asum_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -75,12 +75,12 @@ class TestXaxpy {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Axpy(args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -88,12 +88,12 @@ class TestXaxpy {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXaxpy(args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -102,15 +102,15 @@ class TestXaxpy {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXaxpy(args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -74,12 +74,12 @@ class TestXcopy {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Copy<T>(args.n,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -87,12 +87,12 @@ class TestXcopy {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXcopy<T>(args.n,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -101,15 +101,15 @@ class TestXcopy {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXcopy(args.n,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -78,13 +78,13 @@ class TestXdot {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Dot<T>(args.n,
|
||||
buffers[0].scalar(), args.dot_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.scalar(), args.dot_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -92,13 +92,13 @@ class TestXdot {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXdot<T>(args.n,
|
||||
buffers[0].scalar, args.dot_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.scalar, args.dot_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -107,18 +107,18 @@ class TestXdot {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXdot(args.n,
|
||||
scalar_cpu, args.dot_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -78,13 +78,13 @@ class TestXdotc {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Dotc<T>(args.n,
|
||||
buffers[0].scalar(), args.dot_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.scalar(), args.dot_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -92,13 +92,13 @@ class TestXdotc {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXdotc<T>(args.n,
|
||||
buffers[0].scalar, args.dot_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.scalar, args.dot_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -107,18 +107,18 @@ class TestXdotc {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXdotc(args.n,
|
||||
scalar_cpu, args.dot_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -78,13 +78,13 @@ class TestXdotu {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Dotu<T>(args.n,
|
||||
buffers[0].scalar(), args.dot_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.scalar(), args.dot_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -92,13 +92,13 @@ class TestXdotu {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXdotu<T>(args.n,
|
||||
buffers[0].scalar, args.dot_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.scalar, args.dot_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -107,18 +107,18 @@ class TestXdotu {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXdotu(args.n,
|
||||
scalar_cpu, args.dot_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -74,12 +74,12 @@ class TestXnrm2 {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Nrm2<T>(args.n,
|
||||
buffers[0].scalar(), args.nrm2_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.scalar(), args.nrm2_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -87,12 +87,12 @@ class TestXnrm2 {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXnrm2<T>(args.n,
|
||||
buffers[0].scalar, args.nrm2_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers.scalar, args.nrm2_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -101,15 +101,15 @@ class TestXnrm2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXnrm2(args.n,
|
||||
scalar_cpu, args.nrm2_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers[0].scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -71,11 +71,11 @@ class TestXscal {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Scal(args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -83,11 +83,11 @@ class TestXscal {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXscal(args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -96,12 +96,12 @@ class TestXscal {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXscal(args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -74,12 +74,12 @@ class TestXswap {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Swap<T>(args.n,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -87,12 +87,12 @@ class TestXswap {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXswap<T>(args.n,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -101,16 +101,16 @@ class TestXswap {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXswap(args.n,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -86,14 +86,14 @@ class TestXgbmv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Gbmv(args.layout, args.a_transpose,
|
||||
args.m, args.n, args.kl, args.ku, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -101,15 +101,15 @@ class TestXgbmv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXgbmv(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
args.m, args.n, args.kl, args.ku, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -118,20 +118,20 @@ class TestXgbmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXgbmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.m, args.n, args.kl, args.ku, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -86,14 +86,14 @@ class TestXgemv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Gemv(args.layout, args.a_transpose,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -101,15 +101,15 @@ class TestXgemv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXgemv(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -118,20 +118,20 @@ class TestXgemv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXgemv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -82,14 +82,14 @@ class TestXger {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Ger(args.layout,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -97,14 +97,14 @@ class TestXger {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXger(convertToCLBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -113,19 +113,19 @@ class TestXger {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXger(convertToCBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -82,14 +82,14 @@ class TestXgerc {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Gerc(args.layout,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -97,14 +97,14 @@ class TestXgerc {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXgerc(convertToCLBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -113,19 +113,19 @@ class TestXgerc {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXgerc(convertToCBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -82,14 +82,14 @@ class TestXgeru {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Geru(args.layout,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -97,14 +97,14 @@ class TestXgeru {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXgeru(convertToCLBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -113,19 +113,19 @@ class TestXgeru {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXgeru(convertToCBLAS(args.layout),
|
||||
args.m, args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXhbmv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Hbmv(args.layout, args.triangle,
|
||||
args.n, args.kl, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXhbmv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXhbmv(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.kl, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXhbmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXhbmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.kl, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXhemv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Hemv(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXhemv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXhemv(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXhemv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXhemv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -76,13 +76,13 @@ class TestXher {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Her(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -90,14 +90,14 @@ class TestXher {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXher(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -106,17 +106,17 @@ class TestXher {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXher(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXher2 {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Her2(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXher2 {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXher2(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXher2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXher2(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXhpmv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Hpmv(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].ap_mat(), args.ap_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.ap_mat(), args.ap_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXhpmv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXhpmv(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].ap_mat, args.ap_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.ap_mat, args.ap_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXhpmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXhpmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
ap_mat_cpu, args.ap_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -76,13 +76,13 @@ class TestXhpr {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Hpr(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].ap_mat(), args.ap_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.ap_mat(), args.ap_offset,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -90,14 +90,14 @@ class TestXhpr {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXhpr(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].ap_mat, args.ap_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.ap_mat, args.ap_offset,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -106,17 +106,17 @@ class TestXhpr {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXhpr(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
ap_mat_cpu, args.ap_offset);
|
||||
buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXhpr2 {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Hpr2(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers[0].ap_mat(), args.ap_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.ap_mat(), args.ap_offset,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXhpr2 {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXhpr2(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers[0].ap_mat, args.ap_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
buffers.ap_mat, args.ap_offset,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXhpr2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXhpr2(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
ap_mat_cpu, args.ap_offset);
|
||||
buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXsbmv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Sbmv(args.layout, args.triangle,
|
||||
args.n, args.kl, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXsbmv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsbmv(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.kl, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXsbmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXsbmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.kl, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXspmv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Spmv(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].ap_mat(), args.ap_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.ap_mat(), args.ap_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXspmv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXspmv(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].ap_mat, args.ap_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.ap_mat, args.ap_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXspmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXspmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
ap_mat_cpu, args.ap_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -76,13 +76,13 @@ class TestXspr {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Spr(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].ap_mat(), args.ap_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.ap_mat(), args.ap_offset,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -90,14 +90,14 @@ class TestXspr {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXspr(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].ap_mat, args.ap_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.ap_mat, args.ap_offset,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -106,17 +106,17 @@ class TestXspr {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXspr(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
ap_mat_cpu, args.ap_offset);
|
||||
buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXspr2 {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Spr2(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers[0].ap_mat(), args.ap_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.ap_mat(), args.ap_offset,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXspr2 {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXspr2(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers[0].ap_mat, args.ap_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
buffers.ap_mat, args.ap_offset,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXspr2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXspr2(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
ap_mat_cpu, args.ap_offset);
|
||||
buffers[0].ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXsymv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Symv(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXsymv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsymv(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc, args.beta,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXsymv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXsymv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||
y_vec_cpu, args.y_offset, args.y_inc);
|
||||
buffers[0].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -76,13 +76,13 @@ class TestXsyr {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Syr(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -90,14 +90,14 @@ class TestXsyr {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsyr(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -106,17 +106,17 @@ class TestXsyr {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXsyr(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -80,14 +80,14 @@ class TestXsyr2 {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Syr2(args.layout, args.triangle,
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec(), args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.y_vec(), args.y_offset, args.y_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -95,15 +95,15 @@ class TestXsyr2 {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsyr2(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers[0].y_vec, args.y_offset, args.y_inc,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
buffers.y_vec, args.y_offset, args.y_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,20 +112,20 @@ class TestXsyr2 {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[0].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXsyr2(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
args.n, args.alpha,
|
||||
x_vec_cpu, args.x_offset, args.x_inc,
|
||||
y_vec_cpu, args.y_offset, args.y_inc,
|
||||
a_mat_cpu, args.a_offset, args.a_ld);
|
||||
buffers[0].a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -75,13 +75,13 @@ class TestXtbmv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
||||
args.n, args.kl,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -89,7 +89,7 @@ class TestXtbmv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXtbmv<T>(convertToCLBLAS(args.layout),
|
||||
|
@ -97,8 +97,8 @@ class TestXtbmv {
|
|||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.diagonal),
|
||||
args.n, args.kl,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -107,11 +107,11 @@ class TestXtbmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXtbmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
|
@ -119,7 +119,7 @@ class TestXtbmv {
|
|||
args.n, args.kl,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -75,13 +75,13 @@ class TestXtpmv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
||||
args.n,
|
||||
buffers[0].ap_mat(), args.ap_offset,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.ap_mat(), args.ap_offset,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -89,7 +89,7 @@ class TestXtpmv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXtpmv<T>(convertToCLBLAS(args.layout),
|
||||
|
@ -97,8 +97,8 @@ class TestXtpmv {
|
|||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.diagonal),
|
||||
args.n,
|
||||
buffers[0].ap_mat, args.ap_offset,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers.ap_mat, args.ap_offset,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -107,11 +107,11 @@ class TestXtpmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXtpmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
|
@ -119,7 +119,7 @@ class TestXtpmv {
|
|||
args.n,
|
||||
ap_mat_cpu, args.ap_offset,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -75,13 +75,13 @@ class TestXtrmv {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
||||
args.n,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -89,7 +89,7 @@ class TestXtrmv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXtrmv<T>(convertToCLBLAS(args.layout),
|
||||
|
@ -97,8 +97,8 @@ class TestXtrmv {
|
|||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.diagonal),
|
||||
args.n,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -107,11 +107,11 @@ class TestXtrmv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXtrmv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
|
@ -119,7 +119,7 @@ class TestXtrmv {
|
|||
args.n,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -90,13 +90,13 @@ class TestXtrsv {
|
|||
}
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
||||
args.n,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec(), args.x_offset, args.x_inc,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -104,7 +104,7 @@ class TestXtrsv {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXtrsv<T>(convertToCLBLAS(args.layout),
|
||||
|
@ -112,8 +112,8 @@ class TestXtrsv {
|
|||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.diagonal),
|
||||
args.n,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].x_vec, args.x_offset, args.x_inc,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -122,11 +122,11 @@ class TestXtrsv {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXtrsv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
|
@ -134,7 +134,7 @@ class TestXtrsv {
|
|||
args.n,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers[0].x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -88,14 +88,14 @@ class TestXgemm {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
|
||||
args.m, args.n, args.k, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat(), args.c_offset, args.c_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -103,16 +103,16 @@ class TestXgemm {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXgemm(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.b_transpose),
|
||||
args.m, args.n, args.k, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat, args.c_offset, args.c_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat, args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -121,13 +121,13 @@ class TestXgemm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
cblasXgemm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.b_transpose),
|
||||
|
@ -135,7 +135,7 @@ class TestXgemm {
|
|||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -88,14 +88,14 @@ class TestXhemm {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Hemm(args.layout, args.side, args.triangle,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat(), args.c_offset, args.c_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -103,16 +103,16 @@ class TestXhemm {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXhemm(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.side),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat, args.c_offset, args.c_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat, args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -121,13 +121,13 @@ class TestXhemm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
cblasXhemm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
|
@ -135,7 +135,7 @@ class TestXhemm {
|
|||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -86,15 +86,15 @@ class TestXher2k {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto alpha2 = T{args.alpha, args.alpha};
|
||||
auto status = Her2k(args.layout, args.triangle, args.a_transpose,
|
||||
args.n, args.k, alpha2,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat(), args.c_offset, args.c_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -102,7 +102,7 @@ class TestXher2k {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto alpha2 = T{args.alpha, args.alpha};
|
||||
|
@ -110,9 +110,9 @@ class TestXher2k {
|
|||
convertToCLBLAS(args.triangle),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
args.n, args.k, alpha2,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat, args.c_offset, args.c_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat, args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -121,13 +121,13 @@ class TestXher2k {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
auto alpha2 = T{args.alpha, args.alpha};
|
||||
cblasXher2k(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
|
@ -136,7 +136,7 @@ class TestXher2k {
|
|||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -79,13 +79,13 @@ class TestXherk {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Herk(args.layout, args.triangle, args.a_transpose,
|
||||
args.n, args.k, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld, args.beta,
|
||||
buffers[0].c_mat(), args.c_offset, args.c_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
|
||||
buffers.c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -93,15 +93,15 @@ class TestXherk {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXherk(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
args.n, args.k, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld, args.beta,
|
||||
buffers[0].c_mat, args.c_offset, args.c_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld, args.beta,
|
||||
buffers.c_mat, args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -110,18 +110,18 @@ class TestXherk {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<U> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
cblasXherk(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.n, args.k, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -88,14 +88,14 @@ class TestXsymm {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Symm(args.layout, args.side, args.triangle,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat(), args.c_offset, args.c_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -103,16 +103,16 @@ class TestXsymm {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsymm(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.side),
|
||||
convertToCLBLAS(args.triangle),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat, args.c_offset, args.c_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat, args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -121,13 +121,13 @@ class TestXsymm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
cblasXsymm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
|
@ -135,7 +135,7 @@ class TestXsymm {
|
|||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -86,14 +86,14 @@ class TestXsyr2k {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
|
||||
args.n, args.k, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat(), args.c_offset, args.c_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -101,16 +101,16 @@ class TestXsyr2k {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsyr2k(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
args.n, args.k, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers[0].c_mat, args.c_offset, args.c_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat, args.b_offset, args.b_ld, args.beta,
|
||||
buffers.c_mat, args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -119,13 +119,13 @@ class TestXsyr2k {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
cblasXsyr2k(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
|
@ -133,7 +133,7 @@ class TestXsyr2k {
|
|||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -79,13 +79,13 @@ class TestXsyrk {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Syrk(args.layout, args.triangle, args.a_transpose,
|
||||
args.n, args.k, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld, args.beta,
|
||||
buffers[0].c_mat(), args.c_offset, args.c_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
|
||||
buffers.c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -93,15 +93,15 @@ class TestXsyrk {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsyrk(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
args.n, args.k, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld, args.beta,
|
||||
buffers[0].c_mat, args.c_offset, args.c_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld, args.beta,
|
||||
buffers.c_mat, args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -110,18 +110,18 @@ class TestXsyrk {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||
cblasXsyrk(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
args.n, args.k, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld, args.beta,
|
||||
c_mat_cpu, args.c_offset, args.c_ld);
|
||||
buffers[0].c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -79,13 +79,13 @@ class TestXtrmm {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat(), args.b_offset, args.b_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -93,7 +93,7 @@ class TestXtrmm {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXtrmm(convertToCLBLAS(args.layout),
|
||||
|
@ -102,8 +102,8 @@ class TestXtrmm {
|
|||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.diagonal),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat, args.b_offset, args.b_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat, args.b_offset, args.b_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -112,11 +112,11 @@ class TestXtrmm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
cblasXtrmm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
|
@ -125,7 +125,7 @@ class TestXtrmm {
|
|||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld);
|
||||
buffers[0].b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -91,13 +91,13 @@ class TestXtrsm {
|
|||
}
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat(), args.b_offset, args.b_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -105,7 +105,7 @@ class TestXtrsm {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXtrsm(convertToCLBLAS(args.layout),
|
||||
|
@ -114,8 +114,8 @@ class TestXtrsm {
|
|||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.diagonal),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat, args.b_offset, args.b_ld,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat, args.b_offset, args.b_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
|
@ -124,11 +124,11 @@ class TestXtrsm {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
buffers[0].a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers[0].b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
cblasXtrsm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
|
@ -137,7 +137,7 @@ class TestXtrsm {
|
|||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld);
|
||||
buffers[0].b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -51,18 +51,28 @@ class TestXaxpyBatched {
|
|||
return alpha_base + Constant<T>(batch_id);
|
||||
}
|
||||
|
||||
// Describes how to obtain the sizes of the buffers (per item, not for the full batch)
|
||||
// Helper for the sizes per batch
|
||||
static size_t PerBatchSizeX(const Arguments<T> &args) { return args.n * args.x_inc; }
|
||||
static size_t PerBatchSizeY(const Arguments<T> &args) { return args.n * args.y_inc; }
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
return args.n * args.x_inc;
|
||||
return PerBatchSizeX(args) * args.batch_count + args.x_offset;
|
||||
}
|
||||
static size_t GetSizeY(const Arguments<T> &args) {
|
||||
return args.n * args.y_inc;
|
||||
return PerBatchSizeY(args) * args.batch_count + args.y_offset;
|
||||
}
|
||||
|
||||
// Describes how to set the sizes of all the buffers (per item, not for the full batch)
|
||||
// Describes how to set the sizes of all the buffers
|
||||
static void SetSizes(Arguments<T> &args) {
|
||||
args.x_size = GetSizeX(args);
|
||||
args.y_size = GetSizeY(args);
|
||||
args.x_offsets = std::vector<size_t>(args.batch_count);
|
||||
args.y_offsets = std::vector<size_t>(args.batch_count);
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
args.x_offsets[batch] = batch * PerBatchSizeX(args) + args.x_offset;
|
||||
args.y_offsets[batch] = batch * PerBatchSizeY(args) + args.y_offset;
|
||||
}
|
||||
}
|
||||
|
||||
// Describes what the default values of the leading dimensions of the matrices are
|
||||
|
@ -81,20 +91,16 @@ class TestXaxpyBatched {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto alphas = std::vector<T>();
|
||||
auto x_buffers = std::vector<cl_mem>();
|
||||
auto y_buffers = std::vector<cl_mem>();
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
alphas.push_back(GetAlpha(args.alpha, batch));
|
||||
x_buffers.push_back(buffers[batch].x_vec());
|
||||
y_buffers.push_back(buffers[batch].y_vec());
|
||||
}
|
||||
auto status = AxpyBatched(args.n, alphas.data(),
|
||||
x_buffers.data(), args.x_inc,
|
||||
y_buffers.data(), args.y_inc,
|
||||
buffers.x_vec(), args.x_offsets.data(), args.x_inc,
|
||||
buffers.y_vec(), args.y_offsets.data(), args.y_inc,
|
||||
args.batch_count,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
|
@ -103,13 +109,13 @@ class TestXaxpyBatched {
|
|||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXaxpy(args.n, GetAlpha(args.alpha, batch),
|
||||
buffers[batch].x_vec, 0, args.x_inc,
|
||||
buffers[batch].y_vec, 0, args.y_inc,
|
||||
buffers.x_vec, args.x_offsets[batch], args.x_inc,
|
||||
buffers.y_vec, args.y_offsets[batch], args.y_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
if (static_cast<StatusCode>(status) != StatusCode::kSuccess) {
|
||||
|
@ -122,41 +128,41 @@ class TestXaxpyBatched {
|
|||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
for (auto batch = size_t{0}; batch < args.batch_count; ++batch) {
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||
buffers[batch].x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
buffers[batch].y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||
cblasXaxpy(args.n, GetAlpha(args.alpha, batch),
|
||||
x_vec_cpu, 0, args.x_inc,
|
||||
y_vec_cpu, 0, args.y_inc);
|
||||
buffers[batch].y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
x_vec_cpu, args.x_offsets[batch], args.x_inc,
|
||||
y_vec_cpu, args.y_offsets[batch], args.y_inc);
|
||||
}
|
||||
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Describes how to download the results of the computation (per item, not for the full batch)
|
||||
// Describes how to download the results of the computation
|
||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> result(args.y_size, static_cast<T>(0));
|
||||
buffers.y_vec.Read(queue, args.y_size, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Describes how to compute the indices of the result buffer (per item, not for the full batch)
|
||||
// Describes how to compute the indices of the result buffer
|
||||
static size_t ResultID1(const Arguments<T> &args) { return args.n; }
|
||||
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
|
||||
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
|
||||
return id1 * args.y_inc;
|
||||
static size_t ResultID2(const Arguments<T> &args) { return args.batch_count; }
|
||||
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
|
||||
return (id1 * args.y_inc) + args.y_offsets[id2];
|
||||
}
|
||||
|
||||
// Describes how to compute performance metrics (per item, not for the full batch)
|
||||
// Describes how to compute performance metrics
|
||||
static size_t GetFlops(const Arguments<T> &args) {
|
||||
return 2 * args.n;
|
||||
return args.batch_count * (2 * args.n);
|
||||
}
|
||||
static size_t GetBytes(const Arguments<T> &args) {
|
||||
return (3 * args.n) * sizeof(T);
|
||||
return args.batch_count * (3 * args.n) * sizeof(T);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -173,14 +173,14 @@ class TestXinvert {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
try {
|
||||
auto event = cl_event{};
|
||||
auto inverter = Xinvert<T>(queue, &event);
|
||||
inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
|
||||
args.n, args.m,
|
||||
buffers[0].a_mat, args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat);
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat);
|
||||
clWaitForEvents(1, &event);
|
||||
clReleaseEvent(event);
|
||||
} catch (...) { return DispatchException(); }
|
||||
|
@ -189,11 +189,11 @@ class TestXinvert {
|
|||
|
||||
// Describes how to run a naive version of the routine (for correctness/performance comparison).
|
||||
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers[0], queue);
|
||||
}
|
||||
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers[0], queue);
|
||||
}
|
||||
|
||||
|
|
|
@ -133,13 +133,13 @@ class TestXomatcopy {
|
|||
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Omatcopy<T>(args.layout, args.a_transpose,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers[0].a_mat(), args.a_offset, args.a_ld,
|
||||
buffers[0].b_mat(), args.b_offset, args.b_ld,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
|
@ -147,12 +147,12 @@ class TestXomatcopy {
|
|||
|
||||
// Describes how to run a naive version of the routine (for correctness/performance comparison).
|
||||
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
|
||||
static StatusCode RunReference1(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers[0], queue);
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers, queue);
|
||||
}
|
||||
|
||||
static StatusCode RunReference2(const Arguments<T> &args, std::vector<Buffers<T>> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers[0], queue);
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers, queue);
|
||||
}
|
||||
|
||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||
|
|
Loading…
Reference in New Issue