From 6307d2e5db1347112d992b2ef7a6cde9b3441389 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Thu, 17 Sep 2015 10:14:33 +0200 Subject: [PATCH 1/3] Added script to generate API interface and implementation automatically --- include/clblast.h | 39 +- include/clblast_c.h | 8 +- scripts/generator/datatype.py | 54 ++ scripts/generator/generator.py | 234 +++++ scripts/generator/routine.py | 320 +++++++ src/clblast.cc | 537 ++++++----- test/wrapper_clblas.h | 1586 ++++++++++++++++---------------- 7 files changed, 1696 insertions(+), 1082 deletions(-) create mode 100644 scripts/generator/datatype.py create mode 100644 scripts/generator/generator.py create mode 100644 scripts/generator/routine.py diff --git a/include/clblast.h b/include/clblast.h index 72825e0b..953e6953 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -90,7 +90,7 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP template StatusCode Swap(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); @@ -144,7 +144,7 @@ StatusCode Dotc(const size_t n, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV template StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -155,7 +155,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Templated-precision hermitian matrix-vector multiplication: SHEMV/DHEMV +// Hermitian matrix-vector multiplication: 
CHEMV/ZHEMV template StatusCode Hemv(const Layout layout, const Triangle triangle, const size_t n, @@ -166,7 +166,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Templated-precision symmetric matrix-vector multiplication: SSYMV/DSYMV +// Symmetric matrix-vector multiplication: SSYMV/DSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, const size_t n, @@ -181,7 +181,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -192,7 +192,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -203,7 +203,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision hermitian matrix-matrix multiplication: CHEMM/ZHEMM +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -214,7 +214,7 @@ 
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK template StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -224,7 +224,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision rank-K update of a hermitian matrix: CHERK/ZHERK +// Rank-K update of a hermitian matrix: CHERK/ZHERK template StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, @@ -234,7 +234,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K template StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -245,7 +245,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision rank-2K update of a hermitian matrix: CHER2K/ZHER2K +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, @@ -256,28 +256,15 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem 
c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM template -StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision matrix equation solver: STRSM/DTRSM/CTRSM/ZTRSM -/* -template -StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -*/ - // ================================================================================================= } // namespace clblast diff --git a/include/clblast_c.h b/include/clblast_c.h index 88754990..56507625 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -85,19 +85,19 @@ typedef enum Precision_ { kHalf = 16, kSingle = 32, kDouble = 64, // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP StatusCode CLBlastSswap(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); StatusCode CLBlastDswap(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem x_buffer, const 
size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); StatusCode CLBlastCswap(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); StatusCode CLBlastZswap(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py new file mode 100644 index 00000000..cca3534d --- /dev/null +++ b/scripts/generator/datatype.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# This file contains the 'DataType' class, used in the generator script to generate the CLBlast API +# interface and implementation. 
+# +# ================================================================================================== + +# Short-hands for data-types +FLT = "float" +DBL = "double" +FLT2 = "float2" +DBL2 = "double2" +F2CL = "cl_float2" +D2CL = "cl_double2" + +# Structure holding data-type and precision information +class DataType(): + def __init__(self, name, template, scalars, buffertype): + self.name = name + self.template = template + self.alpha_cpp = scalars[0] + self.beta_cpp = scalars[1] + self.alpha_cl = scalars[2] + self.beta_cl = scalars[3] + self.buffertype = buffertype # Only used for template types + + # Outputs the name of the data-type (alpha/beta), possibly transforming into the right type + def UseAlpha(self): + if self.alpha_cpp in [FLT2, DBL2]: + return self.alpha_cpp+"{alpha.s[0], alpha.s[1]}" + return "alpha" + def UseBeta(self): + if self.beta_cpp in [FLT2, DBL2]: + return self.beta_cpp+"{beta.s[0], beta.s[1]}" + return "beta" + + # As above, but the transformation is in the opposite direction + def UseAlphaCL(self): + if self.alpha_cpp in [FLT2, DBL2]: + return self.alpha_cl+"{{alpha.real(), alpha.imag()}}" + return "alpha" + def UseBetaCL(self): + if self.beta_cpp in [FLT2, DBL2]: + return self.beta_cl+"{{beta.real(), beta.imag()}}" + return "beta" + +# ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py new file mode 100644 index 00000000..699cd9cf --- /dev/null +++ b/scripts/generator/generator.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. 
+# +# Author(s): +# Cedric Nugteren +# +# This script automatically generates the bodies of the following files, creating the full CLBlast +# API interface and implementation (C, C++, and clBLAS wrapper): +# clblast.h +# clblast.cc +# clblast_c.h +# clblast_c.cc +# wrapper_clblas.h +# +# ================================================================================================== + +# System modules +import sys +import os.path + +# Local files +from routine import Routine +from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL + +# ================================================================================================== + +# Regular data-types +S = DataType("S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32) +D = DataType("D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64) +C = DataType("C", FLT2, [FLT2, FLT2, F2CL, F2CL], F2CL) # single-complex (3232) +Z = DataType("Z", DBL2, [DBL2, DBL2, D2CL, D2CL], D2CL) # double-complex (6464) + +# Special cases +Css = DataType("C", FLT, [FLT, FLT, FLT, FLT], FLT ) # As C, but with constants from S +Zdd = DataType("Z", DBL, [DBL, DBL, DBL, DBL], DBL ) # As Z, but with constants from D +Ccs = DataType("C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S +Zzd = DataType("Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D + +# C++ template data-types +T = DataType("typename T", "T", ["T", "T", "T", "T"], "T") # regular routine +Tc = DataType("typename T", "std::complex,T", ["T", "T", "T", "T"], "std::complex") # for herk +TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k + +# ================================================================================================== + +# Populates a list of routines +routines = [ +[ # Level 1 + Routine(True, 1, "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"), + Routine(True, 1, "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], 
False, "Vector scaling"), + Routine(True, 1, "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"), + Routine(True, 1, "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"), + Routine(True, 1, "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"), + Routine(True, 1, "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"), + Routine(True, 1, "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"), +], +[ # Level 2 + Routine(True, 2, "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "Generalized matrix-vector multiplication"), + Routine(True, 2, "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"), + Routine(True, 2, "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"), +], +[ # Level 3 + Routine(True, 3, "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Generalized matrix-matrix multiplication"), + Routine(True, 3, "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"), + Routine(True, 3, "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"), + Routine(True, 3, "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"), + Routine(True, 3, "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"), + Routine(True, 3, "syr2k", T, [S,D,C,Z], 
["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"), + Routine(True, 3, "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"), + Routine(True, 3, "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"), +]] + +# ================================================================================================== + +# Separators for the BLAS levels +separators = [""" +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// =================================================================================================""", +""" +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// =================================================================================================""", +""" +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================="""] + +# ================================================================================================== + +# The C++ API header (.h) +def clblast_h(routines): + result = "" + for routine in routines: + result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" + result += routine.RoutineHeaderCPP(12)+";\n" + return result + +# The C++ API implementation (.cc) +def clblast_cc(routines): + result = "" + for routine in routines: + indent1 = " "*(20 + routine.Length()) + result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" + if routine.implemented: + 
result += routine.RoutineHeaderCPP(12)+" {\n" + result += " auto queue_cpp = Queue(*queue);\n" + result += " auto event_cpp = Event(*event);\n" + result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n" + result += " auto status = routine.SetUp();\n" + result += " if (status != StatusCode::kSuccess) { return status; }\n" + result += " return routine.Do"+routine.name.capitalize()+"(" + result += (",\n"+indent1).join([a for a in routine.ArgumentsCladuc(routine.template, indent1)]) + result += ");\n" + else: + result += routine.RoutineHeaderTypeCPP(12)+" {\n" + result += " return StatusCode::kNotImplemented;\n" + result += "}\n" + for flavour in routine.flavours: + indent2 = " "*(23 + routine.Length() + len(flavour.template)) + result += "template StatusCode "+routine.name.capitalize()+"<"+flavour.template+">(" + result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)]) + result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n" + return result + +# ================================================================================================== + +# The C API header (.h) +def clblast_c_h(routines): + result = "" + for routine in routines: + result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" + for flavour in routine.flavours: + result += routine.RoutineHeaderC(flavour, 20)+";\n" + return result + +# The C API implementation (.cc) +def clblast_c_cc(routines): + result = "" + for routine in routines: + result += "\n// "+routine.name.upper()+"\n" + for flavour in routine.flavours: + template = "<"+flavour.template+">" if routine.NoScalars() else "" + indent = " "*(26 + routine.Length() + len(template)) + result += routine.RoutineHeaderC(flavour, 20)+" {\n" + result += " auto status = clblast::"+routine.name.capitalize()+template+"(" + result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)]) + result += ",\n"+indent+"queue, event);" + result += "\n return 
static_cast(status);\n}\n" + return result + +# ================================================================================================== + +# The wrapper to the reference clBLAS routines (for performance/correctness testing) +def wrapper_clblas(routines): + result = "" + for routine in routines: + result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames()) + if routine.NoScalars(): + result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n" + for flavour in routine.flavours: + indent = " "*(17 + routine.Length()) + result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n" + arguments = routine.ArgumentsWrapper(flavour) + if routine.scratch: + result += " auto queue = Queue(queues[0]);\n" + result += " auto context = queue.GetContext();\n" + result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n);\n" + arguments += ["scratch_buffer()"] + result += " return clblas"+flavour.name+routine.name+"(" + result += (",\n"+indent).join([a for a in arguments]) + result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" + result += "\n}\n" + return result + +# ================================================================================================== + +# Checks for the number of command-line arguments +if len(sys.argv) != 2: + print "[ERROR] Usage: generator.py " + sys.exit() + +# Parses the command-line arguments +path_clblast = sys.argv[1] +files = [ + path_clblast+"/include/clblast.h", + path_clblast+"/src/clblast.cc", + path_clblast+"/include/clblast_c.h", + path_clblast+"/src/clblast_c.cc", + path_clblast+"/test/wrapper_clblas.h", +] +header_lines = [84, 44, 80, 24, 22] +footer_lines = [6, 3, 5, 2, 6] + +# Checks whether the command-line arguments are valid; exits otherwise +for f in files: + if not os.path.isfile(f): + print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library" + sys.exit() + +#
================================================================================================== + +# Iterates over all files to output +for i in xrange(0,len(files)): + + # Stores the header and the footer of the original file + with open(files[i]) as f: + original = f.readlines() + header = original[:header_lines[i]] + footer = original[-footer_lines[i]:] + + # Re-writes the body of the file + with open(files[i], "w") as f: + body = "" + for level in [1,2,3]: + body += separators[level-1]+"\n" + if i == 0: + body += clblast_h(routines[level-1]) + if i == 1: + body += clblast_cc(routines[level-1]) + if i == 2: + body += clblast_c_h(routines[level-1]) + if i == 3: + body += clblast_c_cc(routines[level-1]) + if i == 4: + body += wrapper_clblas(routines[level-1]) + f.write("".join(header)) + f.write(body) + f.write("".join(footer)) + +# ================================================================================================== diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py new file mode 100644 index 00000000..b2c50e3d --- /dev/null +++ b/scripts/generator/routine.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# This file contains the 'Routine' class, used in the generator script to generate the CLBlast API +# interface and implementation. 
+# +# ================================================================================================== + +# Translates an option name to a CLBlast data-type +def OptionToCLBlast(x): + return { + 'layout': "Layout", + 'a_transpose': "Transpose", + 'b_transpose': "Transpose", + 'ab_transpose': "Transpose", + 'side': "Side", + 'triangle': "Triangle", + 'diagonal': "Diagonal", + }[x] + +# As above, but for clBLAS data-types +def OptionToWrapper(x): + return { + 'layout': "clblasOrder", + 'a_transpose': "clblasTranspose", + 'b_transpose': "clblasTranspose", + 'ab_transpose': "clblasTranspose", + 'side': "clblasSide", + 'triangle': "clblasUplo", + 'diagonal': "clblasDiag", + }[x] + +# ================================================================================================== + +# Class holding routine-specific information (e.g. name, which arguments, which precisions) +class Routine(): + def __init__(self, implemented, level, name, template, flavours, sizes, options, + inputs, outputs, scalars, scratch, description): + self.implemented = implemented + self.level = level + self.name = name + self.template = template + self.flavours = flavours + self.sizes = sizes + self.options = options + self.inputs = inputs + self.outputs = outputs + self.scalars = scalars + self.scratch = scratch # Scratch buffer (e.g. 
for xDOT) + self.description = description + + # Retrieves the number of characters in the routine's name + def Length(self): + return len(self.name) + + # Retrieves the postfix for a buffer + def Postfix(self, name): + return "inc" if (name in ["x","y"]) else "ld" + + # Determines whether or not this routine has scalar arguments (alpha/beta) + def NoScalars(self): + return self.scalars == [] + + # Returns the upper-case names of these routines (all flavours) + def ShortNames(self): + return "/".join([f.name+self.name.upper() for f in self.flavours]) + + # ============================================================================================== + + # Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x') + def Buffer(self, name): + if (name in self.inputs) or (name in self.outputs): + a = [name+"_buffer"] + b = [name+"_offset"] + c = [name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + return [", ".join(a+b+c)] + return [] + + # As above but with data-types + def BufferDef(self, name): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix+"cl_mem "+name+"_buffer"] + b = ["const size_t "+name+"_offset"] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + return [", ".join(a+b+c)] + return [] + + # As above but with Claduc buffers + def BufferCladuc(self, name): + if (name in self.inputs) or (name in self.outputs): + a = ["Buffer<"+self.template.buffertype+">("+name+"_buffer)"] + b = [name+"_offset"] + c = [name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + return [", ".join(a+b+c)] + return [] + + # As above but with a static cast for clBLAS wrapper + def BufferWrapper(self, name): + if (name in self.inputs) or (name in self.outputs): + a = [name+"_buffer"] + b = [name+"_offset"] + c = [] + if (name in ["x","y"]): + c = ["static_cast("+name+"_"+self.Postfix(name)+")"] + elif (name in ["a","b","c"]): + c = 
[name+"_"+self.Postfix(name)] + return [", ".join(a+b+c)] + return [] + + # As above, but only data-types + def BufferType(self, name): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix+"cl_mem"] + b = ["const size_t"] + c = ["const size_t"] if (name not in ["dot"]) else [] + return [", ".join(a+b+c)] + return [] + + # ============================================================================================== + + # Retrieves the name of a scalar (alpha/beta) + def Scalar(self, name): + if (name in self.scalars): + return [name] + return [] + + # Retrieves the use of a scalar (alpha/beta) + def ScalarUse(self, name, flavour): + if ((name == "alpha") and (name in self.scalars)): + return [flavour.UseAlpha()] + elif ((name == "beta") and (name in self.scalars)): + return [flavour.UseBeta()] + return [] + + # As above, but for the clBLAS wrapper (uses the flavour's UseAlphaCL/UseBetaCL conversions) + def ScalarUseWrapper(self, name, flavour): + if ((name == "alpha") and (name in self.scalars)): + return [flavour.UseAlphaCL()] + elif ((name == "beta") and (name in self.scalars)): + return [flavour.UseBetaCL()] + return [] + + # Retrieves the definition of a scalar (alpha/beta) + def ScalarDef(self, name, flavour): + if ((name == "alpha") and (name in self.scalars)): + return ["const "+flavour.alpha_cl+" "+name] + elif ((name == "beta") and (name in self.scalars)): + return ["const "+flavour.beta_cl+" "+name] + return [] + + # As above, but without 'cl_' prefix + def ScalarDefPlain(self, name, flavour): + if ((name == "alpha") and (name in self.scalars)): + return ["const "+flavour.alpha_cpp+" "+name] + elif ((name == "beta") and (name in self.scalars)): + return ["const "+flavour.beta_cpp+" "+name] + return [] + + # Retrieves the type of a scalar (alpha/beta) + def ScalarType(self, name, flavour): + if ((name == "alpha") and (name in self.scalars)): + return ["const "+flavour.alpha_cpp] + elif ((name == "beta") and (name in self.scalars)): +
return ["const "+flavour.beta_cpp] + return [] + + # ============================================================================================== + + # Retrieves a list of comma-separated sizes (m, n, k) + def Sizes(self): + if self.sizes: + return [", ".join([s for s in self.sizes])] + return [] + + # Retrieves the definition of the sizes (m,n,k) + def SizesDef(self): + if self.sizes: + return [", ".join(["const size_t "+s for s in self.sizes])] + return [] + + # Retrieves the types of the sizes (m,n,k) + def SizesType(self): + if self.sizes: + return [", ".join(["const size_t" for s in self.sizes])] + return [] + + # ============================================================================================== + + # Retrieves a list of options + def Options(self): + if self.options: + return [", ".join(self.options)] + return [] + + # As above, but now cast to CLBlast data-types + def OptionsCast(self, indent): + if self.options: + options = ["static_cast("+o+")" for o in self.options] + return [(",\n"+indent).join(options)] + return [] + + # Retrieves the definitions of the options (layout, transpose, side, etc.) + def OptionsDef(self): + if self.options: + definitions = ["const "+OptionToCLBlast(o)+" "+o for o in self.options] + return [", ".join(definitions)] + return [] + + # As above, but now using clBLAS data-types + def OptionsDefWrapper(self): + if self.options: + definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options] + return [", ".join(definitions)] + return [] + + # Retrieves the types of the options (layout, transpose, side, etc.)
+ def OptionsType(self): + if self.options: + definitions = ["const "+OptionToCLBlast(o) for o in self.options] + return [", ".join(definitions)] + return [] + + # ============================================================================================== + + # Retrieves a combination of all the argument names, with Claduc casts + def ArgumentsCladuc(self, flavour, indent): + return (self.Options() + self.Sizes() + self.BufferCladuc("dot") + + self.Scalar("alpha") + + self.BufferCladuc("a") + self.BufferCladuc("b") + self.BufferCladuc("x") + + self.Scalar("beta") + self.BufferCladuc("y") + self.BufferCladuc("c")) + + # Retrieves a combination of all the argument names, with CLBlast casts + def ArgumentsCast(self, flavour, indent): + return (self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") + + self.ScalarUse("alpha", flavour) + + self.Buffer("a") + self.Buffer("b") + self.Buffer("x") + + self.ScalarUse("beta", flavour) + self.Buffer("y") + self.Buffer("c")) + + # As above, but for the clBLAS wrapper + def ArgumentsWrapper(self, flavour): + return (self.Options() + self.Sizes() + self.BufferWrapper("dot") + + self.ScalarUseWrapper("alpha", flavour) + + self.BufferWrapper("a") + self.BufferWrapper("b") + self.BufferWrapper("x") + + self.ScalarUseWrapper("beta", flavour) + self.BufferWrapper("y") + self.BufferWrapper("c")) + + # Retrieves a combination of all the argument definitions + def ArgumentsDef(self, flavour): + return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") + + self.ScalarDef("alpha", flavour) + + self.BufferDef("a") + self.BufferDef("b") + self.BufferDef("x") + + self.ScalarDef("beta", flavour) + self.BufferDef("y") + self.BufferDef("c")) + + # As above, but clBLAS wrapper plain datatypes + def ArgumentsDefWrapper(self, flavour): + return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") + + self.ScalarDefPlain("alpha", flavour) + + self.BufferDef("a") + self.BufferDef("b") + self.BufferDef("x") + + 
self.ScalarDefPlain("beta", flavour) + self.BufferDef("y") + self.BufferDef("c")) + + # Retrieves a combination of all the argument types + def ArgumentsType(self, flavour): + return (self.OptionsType() + self.SizesType() + self.BufferType("dot") + + self.ScalarType("alpha", flavour) + + self.BufferType("a") + self.BufferType("b") + self.BufferType("x") + + self.ScalarType("beta", flavour) + self.BufferType("y") + self.BufferType("c")) + + + # ============================================================================================== + + # Retrieves the C++ templated definition for a routine + def RoutineHeaderCPP(self, spaces): + indent = " "*(spaces + self.Length()) + result = "template <"+self.template.name+">\n" + result += "StatusCode "+self.name.capitalize()+"(" + result += (",\n"+indent).join([a for a in self.ArgumentsDef(self.template)]) + result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" + return result + + # As above, but now without variable names + def RoutineHeaderTypeCPP(self, spaces): + indent = " "*(spaces + self.Length()) + result = "template <"+self.template.name+">\n" + result += "StatusCode "+self.name.capitalize()+"(" + result += (",\n"+indent).join([a for a in self.ArgumentsType(self.template)]) + result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" + return result + + # As above, but now for C + def RoutineHeaderC(self, flavour, spaces): + indent = " "*(spaces + self.Length()) + result = "StatusCode CLBlast"+flavour.name+self.name+"(" + result += (",\n"+indent).join([a for a in self.ArgumentsDef(flavour)]) + result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" + return result + + # As above, but now for the clBLAS wrapper + def RoutineHeaderWrapper(self, flavour, def_only, spaces): + template = "<"+flavour.template+">" if self.NoScalars() and not def_only else "" + indent = " "*(spaces + self.Length() + len(template)) + result = "" + if self.NoScalars(): + result += "template <" + if def_only: + 
result += flavour.name + result += ">\n" + result += "clblasStatus clblasX"+self.name+template+"(" + result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)]) + result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues" + result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" + return result + +# ================================================================================================== diff --git a/src/clblast.cc b/src/clblast.cc index 3303085e..0ced9ff7 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -42,11 +42,12 @@ #include "internal/routines/level3/xtrmm.h" namespace clblast { + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= -// SWAP +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP template StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -64,21 +65,21 @@ StatusCode Swap(const size_t n, template StatusCode Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Swap(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); -// SCAL +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL template StatusCode Scal(const size_t n, const T alpha, @@ 
-96,21 +97,21 @@ StatusCode Scal(const size_t n, template StatusCode Scal(const size_t, const float, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Scal(const size_t, const double, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Scal(const size_t, const float2, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Scal(const size_t, const double2, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); -// COPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY template StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -128,21 +129,21 @@ StatusCode Copy(const size_t n, template StatusCode Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Copy(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); -// AXPY +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY template StatusCode Axpy(const size_t n, const T alpha, @@ -163,24 +164,24 @@ template StatusCode Axpy(const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - 
cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Axpy(const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Axpy(const size_t, const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Axpy(const size_t, const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); -// DOT +// Dot product of two vectors: SDOT/DDOT template StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, @@ -201,14 +202,14 @@ template StatusCode Dot(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Dot(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); -// DOTU +// Dot product of two complex vectors: CDOTU/ZDOTU template StatusCode Dotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, @@ -229,14 +230,14 @@ template StatusCode Dotu(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Dotu(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); -// DOTC +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC 
template StatusCode Dotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, @@ -257,138 +258,150 @@ template StatusCode Dotc(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); template StatusCode Dotc(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, - cl_command_queue* queue, cl_event* event); + cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// GEMV +// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV template StatusCode Gemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const T alpha, + const size_t m, const size_t n, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xgemv(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoGemv(layout, a_transpose, m, n, alpha, + return routine.DoGemv(layout, a_transpose, + m, n, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, beta, + Buffer(x_buffer), x_offset, x_inc, + beta, Buffer(y_buffer), y_offset, y_inc); } template StatusCode Gemv(const Layout, const Transpose, - const 
size_t, const size_t, const float, + const size_t, const size_t, + const float, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, const double, + const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// HEMV +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV template StatusCode Hemv(const Layout layout, const Triangle triangle, - const size_t n, const T alpha, + const size_t n, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, cl_mem y_buffer, const size_t 
y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xhemv(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoHemv(layout, triangle, n, alpha, + return routine.DoHemv(layout, triangle, + n, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, beta, + Buffer(x_buffer), x_offset, x_inc, + beta, Buffer(y_buffer), y_offset, y_inc); } template StatusCode Hemv(const Layout, const Triangle, - const size_t, const float2, + const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Hemv(const Layout, const Triangle, - const size_t, const double2, + const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYMV +// Symmetric matrix-vector multiplication: SSYMV/DSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, - const size_t n, const T alpha, + const size_t n, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); auto event_cpp = 
Event(*event); auto routine = Xsymv(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoSymv(layout, triangle, n, alpha, + return routine.DoSymv(layout, triangle, + n, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, beta, + Buffer(x_buffer), x_offset, x_inc, + beta, Buffer(y_buffer), y_offset, y_inc); } template StatusCode Symv(const Layout, const Triangle, - const size_t, const float, + const size_t, + const float, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Symv(const Layout, const Triangle, - const size_t, const double, + const size_t, + const double, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); @@ -396,307 +409,343 @@ template StatusCode Symv(const Layout, const Triangle, // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// GEMM +// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, const T alpha, + const size_t m, const size_t n, const size_t k, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 
cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xgemm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha, + return routine.DoGemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, const float, + const size_t, const size_t, const size_t, + const float, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, const double, + const size_t, const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, const float2, + const size_t, const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, const double2, + 
const size_t, const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYMM +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, const T alpha, + const size_t m, const size_t n, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsymm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoSymm(layout, side, triangle, m, n, alpha, + return routine.DoSymm(layout, side, triangle, + m, n, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, const float, + const size_t, const size_t, + const float, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, 
const size_t, const double, + const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// HEMM +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, const T alpha, + const size_t m, const size_t n, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xhemm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; 
} - - // Runs the routine - return routine.DoHemm(layout, side, triangle, m, n, alpha, + return routine.DoHemm(layout, side, triangle, + m, n, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Hemm(const Layout, const Side, const Triangle, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Hemm(const Layout, const Side, const Triangle, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYRK +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK template StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsyrk(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return 
routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, beta, + return routine.DoSyrk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer(a_buffer), a_offset, a_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float, - const cl_mem, const size_t, const size_t, const float, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double, - const cl_mem, const size_t, const size_t, const double, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float2, - const cl_mem, const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double2, - const cl_mem, const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// HERK +// Rank-K update of a hermitian matrix: CHERK/ZHERK template StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, const T alpha, - const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, const T beta, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xherk,T>(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha, - Buffer>(a_buffer), a_offset, a_ld, beta, + return routine.DoHerk(layout, triangle, a_transpose, + n, k, + alpha, + Buffer>(a_buffer), a_offset, a_ld, + beta, Buffer>(c_buffer), c_offset, c_ld); } template StatusCode Herk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float, - const cl_mem, const size_t, const size_t, const float, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Herk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double, - const cl_mem, const size_t, const size_t, const double, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYR2K +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K template StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, const T alpha, + const size_t n, const size_t k, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t 
b_offset, const size_t b_ld, const T beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsyr2k(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha, + return routine.DoSyr2k(layout, triangle, ab_transpose, + n, k, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float, + const size_t, const size_t, + const float, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double, + const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode Syr2k(const Layout, 
const Triangle, const Transpose, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// SYR2K +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, const T alpha, + const size_t n, const size_t k, + const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xher2k(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha, + return routine.DoHer2k(layout, triangle, ab_transpose, + n, k, + alpha, Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(b_buffer), b_offset, b_ld, + beta, Buffer(c_buffer), c_offset, c_ld); } template StatusCode Her2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const float2, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const float, cl_mem, const size_t, const size_t, cl_command_queue*, 
cl_event*); template StatusCode Her2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, const double2, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// ================================================================================================= - -// TRMM +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM template -StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, @@ -705,90 +754,38 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xtrmm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, + return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld); } -template StatusCode Trmm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const float, +template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template 
StatusCode Trmm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const double, +template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Trmm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const float2, +template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode Trmm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const double2, +template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // ================================================================================================= - -// TRSM -/* -template -StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); - auto routine = Xtrsm(queue_cpp, event_cpp); - - // Compiles the routine's device kernels - auto status = routine.SetUp(); - if (status != StatusCode::kSuccess) { return status; } - - // Runs the routine - return routine.DoTrsm(layout, side, 
triangle, a_transpose, diagonal, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld); -} -template StatusCode Trsm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const float2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, - const Transpose, const Diagonal, - const size_t, const size_t, const double2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -*/ -// ================================================================================================= } // namespace clblast diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index fcf1a918..85729470 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -20,216 +20,222 @@ #include "internal/utilities.h" namespace clblast { + // ================================================================================================= // BLAS level-1 (vector-vector) routines +// ================================================================================================= -// Calls {clblasSswap, clblasDswap, clblasCswap, clblasZswap} with the arguments forwarded. 
-template clblasStatus clblasXswap( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); -template <> clblasStatus clblasXswap( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSswap(n, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP +template +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSswap(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXswap( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDswap(n, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); 
+template <> +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDswap(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXswap( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasCswap(n, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +template <> +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCswap(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXswap( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasZswap(n, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +template <> +clblasStatus clblasXswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, 
const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZswap(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -// Calls {clblasSscal, clblasDscal, clblasCscal, clblasZscal} with the arguments forwarded. -clblasStatus clblasXscal( - size_t n, float alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSscal(n, alpha, - x_vec, x_offset, static_cast(x_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL +clblasStatus clblasXscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSscal(n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXscal( - size_t n, double alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDscal(n, alpha, - x_vec, x_offset, static_cast(x_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDscal(n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXscal( - size_t 
n, float2 alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCscal(n, cl_alpha, - x_vec, x_offset, static_cast(x_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXscal(const size_t n, + const float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCscal(n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXscal( - size_t n, double2 alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZscal(n, cl_alpha, - x_vec, x_offset, static_cast(x_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXscal(const size_t n, + const double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZscal(n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); } -// Calls {clblasScopy, clblasDcopy, clblasCcopy, clblasZcopy} with the arguments forwarded. 
-template clblasStatus clblasXcopy( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); -template <> clblasStatus clblasXcopy( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasScopy(n, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY +template +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasScopy(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXcopy( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDcopy(n, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, 
events); +template <> +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDcopy(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXcopy( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasCcopy(n, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +template <> +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCcopy(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXcopy( - size_t n, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasZcopy(n, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +template <> +clblasStatus clblasXcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, 
const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZcopy(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -// Calls {clblasSaxpy, clblasDaxpy, clblasCaxpy, clblasZaxpy} with the arguments forwarded. -clblasStatus clblasXaxpy( - size_t n, float alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSaxpy(n, alpha, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY +clblasStatus clblasXaxpy(const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSaxpy(n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXaxpy( - size_t n, double alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDaxpy(n, alpha, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXaxpy(const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDaxpy(n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXaxpy( - size_t n, float2 alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCaxpy(n, cl_alpha, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXaxpy(const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCaxpy(n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXaxpy( - size_t n, double2 alpha, - const cl_mem x_vec, size_t x_offset, size_t x_inc, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZaxpy(n, cl_alpha, - x_vec, x_offset, static_cast(x_inc), - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXaxpy(const size_t 
n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZaxpy(n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SDOT/DDOT -template clblasStatus clblasXdot( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); -template <> clblasStatus clblasXdot( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { +template +clblasStatus clblasXdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = 
Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); @@ -240,13 +246,13 @@ template <> clblasStatus clblasXdot( scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXdot( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { +template <> +clblasStatus clblasXdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); @@ -259,20 +265,20 @@ template <> clblasStatus clblasXdot( } // Forwards the clBLAS calls for CDOTU/ZDOTU -template clblasStatus clblasXdotu( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); -template <> clblasStatus clblasXdotu( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { +template +clblasStatus clblasXdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); @@ -283,13 +289,13 @@ template <> clblasStatus clblasXdotu( scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXdotu( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { +template <> +clblasStatus clblasXdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); @@ -302,20 +308,20 @@ template <> clblasStatus clblasXdotu( } // Forwards the clBLAS calls for CDOTC/ZDOTC -template clblasStatus clblasXdotc( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); -template <> clblasStatus clblasXdotc( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { +template +clblasStatus clblasXdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); @@ -326,13 +332,13 @@ template <> clblasStatus clblasXdotc( scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } -template <> clblasStatus clblasXdotc( - const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { +template <> +clblasStatus clblasXdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); @@ -346,600 +352,616 @@ template <> clblasStatus clblasXdotc( // ================================================================================================= // BLAS level-2 (matrix-vector) routines +// ================================================================================================= -// Calls {clblasSgemv, clblasDgemv, clblasCgemv, clblasZgemv} with the arguments forwarded. -clblasStatus clblasXgemv( - clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, float beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSgemv(layout, a_transpose, m, n, alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSgemv(layout, a_transpose, + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, 
static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXgemv( - clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, double beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDgemv(layout, a_transpose, m, n, alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDgemv(layout, a_transpose, + m, n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXgemv( - clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, float2 beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), 
beta.imag()}}; - return clblasCgemv(layout, a_transpose, m, n, cl_alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), cl_beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgemv(layout, a_transpose, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXgemv( - clblasOrder layout, clblasTranspose a_transpose, size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, double2 beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZgemv(layout, a_transpose, m, n, cl_alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), cl_beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const 
size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgemv(layout, a_transpose, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -// Calls {clblasChemv, clblasZhemv} with the arguments forwarded. -clblasStatus clblasXhemv( - clblasOrder layout, clblasUplo triangle, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, float2 beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasChemv(layout, triangle, n, cl_alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), cl_beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for CHEMV/ZHEMV +clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChemv(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + 
a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXhemv( - clblasOrder layout, clblasUplo triangle, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, double2 beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZhemv(layout, triangle, n, cl_alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), cl_beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhemv(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -// Calls {clblasSsymv, clblasDsymv} with the arguments forwarded. 
-clblasStatus clblasXsymv( - clblasOrder layout, clblasUplo triangle, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, float beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSsymv(layout, triangle, n, alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SSYMV/DSYMV +clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsymv(layout, triangle, + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsymv( - clblasOrder layout, clblasUplo triangle, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem x_vec, size_t x_offset, size_t x_inc, double beta, - const cl_mem y_vec, size_t y_offset, size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDsymv(layout, triangle, n, alpha, - a_mat, a_offset, a_ld, - x_vec, x_offset, static_cast(x_inc), beta, - y_vec, y_offset, static_cast(y_inc), - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsymv(const clblasOrder 
layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsymv(layout, triangle, + n, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); } // ================================================================================================= // BLAS level-3 (matrix-matrix) routines +// ================================================================================================= -// This calls {clblasSgemm, clblasDgemm, clblasCgemm, clblasZgemm} with the arguments forwarded. -clblasStatus clblasXgemm( - clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose, - size_t m, size_t n, size_t k, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSgemm(layout, a_transpose, b_transpose, - m, n, k, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, 
const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXgemm( - clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose, - size_t m, size_t n, size_t k, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDgemm(layout, a_transpose, b_transpose, - m, n, k, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXgemm( - clblasOrder layout, clblasTranspose a_transpose, clblasTranspose 
b_transpose, - size_t m, size_t n, size_t k, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasCgemm(layout, a_transpose, b_transpose, - m, n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgemm(layout, a_transpose, b_transpose, + m, n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXgemm( - clblasOrder layout, clblasTranspose a_transpose, clblasTranspose b_transpose, - size_t m, size_t n, size_t k, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = 
cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZgemm(layout, a_transpose, b_transpose, - m, n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, + const size_t m, const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgemm(layout, a_transpose, b_transpose, + m, n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded. 
-clblasStatus clblasXsymm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSsymm(layout, side, triangle, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsymm(layout, side, triangle, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsymm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDsymm(layout, side, triangle, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, 
events); +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsymm(layout, side, triangle, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsymm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasCsymm(layout, side, triangle, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCsymm(layout, side, 
triangle, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsymm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZsymm(layout, side, triangle, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZsymm(layout, side, triangle, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasChemm, clblasZhemm} with the arguments forwarded. 
-clblasStatus clblasXhemm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasChemm(layout, side, triangle, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for CHEMM/ZHEMM +clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChemm(layout, side, triangle, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXhemm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto 
cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZhemm(layout, side, triangle, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhemm(layout, side, triangle, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasSsyrk, clblasDsyrk, clblasCsyrk, clblasZsyrk} with the arguments forwarded. 
-clblasStatus clblasXsyrk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSsyrk(layout, triangle, a_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyrk(layout, triangle, a_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsyrk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDsyrk(layout, triangle, a_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem 
a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyrk(layout, triangle, a_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsyrk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasCsyrk(layout, triangle, a_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCsyrk(layout, triangle, a_transpose, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsyrk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, double2 alpha, - const cl_mem a_mat, 
size_t a_offset, size_t a_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZsyrk(layout, triangle, a_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZsyrk(layout, triangle, a_transpose, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasCherk, clblasZherk} with the arguments forwarded. 
-clblasStatus clblasXherk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasCherk(layout, triangle, a_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for CHERK/ZHERK +clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCherk(layout, triangle, a_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXherk( - clblasOrder layout, clblasUplo triangle, clblasTranspose a_transpose, - size_t n, size_t k, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasZherk(layout, triangle, a_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, 
const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZherk(layout, triangle, a_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasSsyr2k, clblasDsyr2k, clblasCsyr2k, clblasZsyr2k} with the arguments forwarded. -clblasStatus clblasXsyr2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSsyr2k(layout, triangle, ab_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus 
clblasXsyr2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDsyr2k(layout, triangle, ab_transpose, - n, k, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsyr2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; - return clblasCsyr2k(layout, triangle, ab_transpose, - n, k, cl_alpha, - a_mat, 
a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCsyr2k(layout, triangle, ab_transpose, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_float2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXsyr2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; - return clblasZsyr2k(layout, triangle, ab_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, cl_beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, 
const size_t b_ld, + const double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZsyr2k(layout, triangle, ab_transpose, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + cl_double2{{beta.real(), beta.imag()}}, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasCher2k, clblasZher2k} with the arguments forwarded. -clblasStatus clblasXher2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCher2k(layout, triangle, ab_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for CHER2K/ZHER2K +clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCher2k(layout, triangle, ab_transpose, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + 
b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXher2k( - clblasOrder layout, clblasUplo triangle, clblasTranspose ab_transpose, - size_t n, size_t k, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, - const cl_mem c_mat, size_t c_offset, size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZher2k(layout, triangle, ab_transpose, - n, k, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, beta, - c_mat, c_offset, c_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZher2k(layout, triangle, ab_transpose, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + beta, + c_buffer, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasStrmm, clblasDtrmm, clblasCtrmm, clblasZtrmm} with the arguments forwarded. 
-clblasStatus clblasXtrmm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasStrmm(layout, side, triangle, a_transpose, diagonal, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); +// Forwards the clBLAS calls for STRMM/DTRMM/CTRMM/ZTRMM +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXtrmm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDtrmm(layout, side, triangle, a_transpose, diagonal, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide 
side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus clblasXtrmm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCtrmm(layout, side, triangle, a_transpose, diagonal, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); } -clblasStatus 
clblasXtrmm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZtrmm(layout, side, triangle, a_transpose, diagonal, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} - -// This calls {clblasStrsm, clblasDtrsm, clblasCtrsm, clblasZtrsm} with the arguments forwarded. -clblasStatus clblasXtrsm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, float alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasStrsm(layout, side, triangle, a_transpose, diagonal, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrsm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, double alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrsm( - clblasOrder layout, 
clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, float2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; - return clblasCtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); -} -clblasStatus clblasXtrsm( - clblasOrder layout, clblasSide side, clblasUplo triangle, - clblasTranspose a_transpose, clblasDiag diagonal, - size_t m, size_t n, double2 alpha, - const cl_mem a_mat, size_t a_offset, size_t a_ld, - const cl_mem b_mat, size_t b_offset, size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; - return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, - m, n, cl_alpha, - a_mat, a_offset, a_ld, - b_mat, b_offset, b_ld, - num_queues, queues, num_wait_events, wait_events, events); +clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); } // 
================================================================================================= From 6105ad6f5b40b319477be7b51b8631e510d58672 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Thu, 17 Sep 2015 17:05:45 +0200 Subject: [PATCH 2/3] Added interface of all level 2 routines --- include/clblast.h | 222 ++++++- include/clblast_c.h | 436 ++++++++++++- scripts/generator/generator.py | 68 +- scripts/generator/routine.py | 100 ++- src/clblast.cc | 654 +++++++++++++++++++- src/clblast_c.cc | 1058 ++++++++++++++++++++++++++++++++ test/wrapper_clblas.h | 1056 +++++++++++++++++++++++++++++++ 7 files changed, 3533 insertions(+), 61 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index 953e6953..70a3b5bc 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -144,7 +144,7 @@ StatusCode Dotc(const size_t n, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV template StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ -155,6 +155,17 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template StatusCode Hemv(const Layout 
layout, const Triangle triangle, @@ -166,6 +177,28 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Symmetric matrix-vector multiplication: SSYMV/DSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, @@ -177,11 +210,187 @@ StatusCode Symv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + 
const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose 
a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// General rank-1 matrix update: SGER/DGER +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, 
const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-1 matrix update: SSYR/DSYR +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +template +StatusCode Syr2(const Layout 
layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -265,6 +474,15 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= } // namespace clblast 
diff --git a/include/clblast_c.h b/include/clblast_c.h index 56507625..fac39a58 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -199,7 +199,7 @@ StatusCode CLBlastZdotc(const size_t n, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const float alpha, @@ -233,6 +233,40 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Hermitian matrix-vector multiplication: CHEMV/ZHEMV StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, const size_t n, @@ -251,6 +285,42 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, 
const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + // Symmetric matrix-vector multiplication: SSYMV/DSYMV StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, const size_t n, @@ -269,11 +339,347 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + 
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +StatusCode CLBlastStbmv(const 
Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode 
CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a banded triangular system of equations: 
STBSV/DTBSV/CTBSV/ZTBSV +StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + 
cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// General rank-1 matrix update: SGER/DGER +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex matrix update: CGERU/ZGERU +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, 
const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-1 matrix update: CHER/ZHER +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t 
x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-1 matrix update: SSYR/DSYR +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + 
const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDspr2(const Layout layout, const 
Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, @@ -483,6 +889,32 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const 
size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // CLBLAST_CLBLAST_C_H_ diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 699cd9cf..9c9675b8 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -48,29 +48,55 @@ TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for # Populates a list of routines routines = [ -[ # Level 1 - Routine(True, 1, "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"), - Routine(True, 1, "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"), - Routine(True, 1, "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"), - Routine(True, 1, "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"), - Routine(True, 1, "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"), - Routine(True, 1, "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"), - Routine(True, 1, "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"), +[ # Level 1: vector-vector + #Routine(False, "1", "rotg", T, [S,D], [], [], [], [], ["a","b","c","s"], False, "Generate plane rotation"), + #Routine(False, "1", 
"rot", T, [S,D], ["n"], [], [], ["x","y"], ["c","s"], False, "Apply plane rotation"), + Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"), + Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"), + Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"), + Routine(True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"), + Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"), + Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"), + Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"), ], -[ # Level 2 - Routine(True, 2, "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "Generalized matrix-vector multiplication"), - Routine(True, 2, "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"), - Routine(True, 2, "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"), +[ # Level 2: matrix-vector + Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"), + Routine(False, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"), + Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"), + Routine(False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], 
["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"), + Routine(False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"), + Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"), + Routine(False, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"), + Routine(False, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"), + Routine(False, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"), + Routine(False, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"), + Routine(False, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"), + Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"), + Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"), + Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"), + # Level 2: matrix update + Routine(False, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"), + Routine(False, "2b", "geru", T, [C,Z], ["m","n"], 
["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"), + Routine(False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"), + Routine(False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"), + Routine(False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"), + Routine(False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"), + Routine(False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"), + Routine(False, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"), + Routine(False, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"), + Routine(False, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"), + Routine(False, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"), ], -[ # Level 3 - Routine(True, 3, "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Generalized matrix-matrix multiplication"), - Routine(True, 3, "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"), - Routine(True, 3, "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"), - Routine(True, 3, "syrk", T, [S,D,C,Z], ["n","k"], 
["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"), - Routine(True, 3, "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"), - Routine(True, 3, "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"), - Routine(True, 3, "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"), - Routine(True, 3, "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"), +[ # Level 3: matrix-matrix + Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"), + Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"), + Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"), + Routine(True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"), + Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"), + Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"), + Routine(True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], 
["alpha","beta"], False, "Rank-2K update of a hermitian matrix"), + Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"), + Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Solves a triangular system of equations"), ]] # ================================================================================================== diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index b2c50e3d..df4dd019 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -12,6 +12,9 @@ # # ================================================================================================== +# System modules +from itertools import chain + # Translates an option name to a CLBlast data-type def OptionToCLBlast(x): return { @@ -36,6 +39,9 @@ def OptionToWrapper(x): 'diagonal': "clblasDiag", }[x] +# Buffers without 'ld' or 'inc' parameter +NO_LD_INC = ["dot","ap"] + # ================================================================================================== # Class holding routine-specific information (e.g. name, which arguments, which precisions) @@ -71,6 +77,16 @@ class Routine(): def ShortNames(self): return "/".join([f.name+self.name.upper() for f in self.flavours]) + # Determines which buffers go first (between alpha and beta) and which ones go after + def BuffersFirst(self): + if self.level == "2b": + return ["x","y"] + return ["ap","a","b","x"] + def BuffersSecond(self): + if self.level == "2b": + return ["ap","a","b","c"] + return ["y","c"] + # ============================================================================================== # Retrieves a variable name for a specific input/output vector/matrix (e.g. 
'x') @@ -78,7 +94,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [name+"_buffer"] b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] return [", ".join(a+b+c)] return [] @@ -88,7 +104,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [prefix+"cl_mem "+name+"_buffer"] b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] return [", ".join(a+b+c)] return [] @@ -97,7 +113,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = ["Buffer<"+self.template.buffertype+">("+name+"_buffer)"] b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in ["dot"]) else [] + c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] return [", ".join(a+b+c)] return [] @@ -120,7 +136,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [prefix+"cl_mem"] b = ["const size_t"] - c = ["const size_t"] if (name not in ["dot"]) else [] + c = ["const size_t"] if (name not in NO_LD_INC) else [] return [", ".join(a+b+c)] return [] @@ -134,41 +150,45 @@ class Routine(): # Retrieves the use of a scalar (alpha/beta) def ScalarUse(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return [flavour.UseAlpha()] - elif ((name == "beta") and (name in self.scalars)): - return [flavour.UseBeta()] + if name in self.scalars: + if name == "alpha": + return [flavour.UseAlpha()] + elif name == "beta": + return [flavour.UseBeta()] + return [name] return [] # Retrieves the use of a scalar (alpha/beta) def ScalarUseWrapper(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return [flavour.UseAlphaCL()] - elif ((name == "beta") and (name in self.scalars)): - return 
[flavour.UseBetaCL()] + if name in self.scalars: + if name == "alpha": + return [flavour.UseAlphaCL()] + elif name == "beta": + return [flavour.UseBetaCL()] + return [name] return [] # Retrieves the definition of a scalar (alpha/beta) def ScalarDef(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return ["const "+flavour.alpha_cl+" "+name] - elif ((name == "beta") and (name in self.scalars)): + if name in self.scalars: + if name == "alpha": + return ["const "+flavour.alpha_cl+" "+name] return ["const "+flavour.beta_cl+" "+name] return [] # As above, but without 'cl_' prefix def ScalarDefPlain(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return ["const "+flavour.alpha_cpp+" "+name] - elif ((name == "beta") and (name in self.scalars)): + if name in self.scalars: + if name == "alpha": + return ["const "+flavour.alpha_cpp+" "+name] return ["const "+flavour.beta_cpp+" "+name] return [] # Retrieves the type of a scalar (alpha/beta) def ScalarType(self, name, flavour): - if ((name == "alpha") and (name in self.scalars)): - return ["const "+flavour.alpha_cpp] - elif ((name == "beta") and (name in self.scalars)): + if name in self.scalars: + if name == "alpha": + return ["const "+flavour.alpha_cpp] return ["const "+flavour.beta_cpp] return [] @@ -234,43 +254,55 @@ class Routine(): def ArgumentsCladuc(self, flavour, indent): return (self.Options() + self.Sizes() + self.BufferCladuc("dot") + self.Scalar("alpha") + - self.BufferCladuc("a") + self.BufferCladuc("b") + self.BufferCladuc("x") + - self.Scalar("beta") + self.BufferCladuc("y") + self.BufferCladuc("c")) + list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) + + self.Scalar("beta") + + list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) + + list(chain(*[self.Scalar(s) for s in ["d1","d2","a","b","c","s"]]))) # Retrieves a combination of all the argument names, with CLBlast casts def ArgumentsCast(self, flavour, indent): return 
(self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") + self.ScalarUse("alpha", flavour) + - self.Buffer("a") + self.Buffer("b") + self.Buffer("x") + - self.ScalarUse("beta", flavour) + self.Buffer("y") + self.Buffer("c")) + list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + + self.ScalarUse("beta", flavour) + + list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarUse(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # As above, but for the clBLAS wrapper def ArgumentsWrapper(self, flavour): return (self.Options() + self.Sizes() + self.BufferWrapper("dot") + self.ScalarUseWrapper("alpha", flavour) + - self.BufferWrapper("a") + self.BufferWrapper("b") + self.BufferWrapper("x") + - self.ScalarUseWrapper("beta", flavour) + self.BufferWrapper("y") + self.BufferWrapper("c")) + list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) + + self.ScalarUseWrapper("beta", flavour) + + list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarUseWrapper(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # Retrieves a combination of all the argument definitions def ArgumentsDef(self, flavour): return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") + self.ScalarDef("alpha", flavour) + - self.BufferDef("a") + self.BufferDef("b") + self.BufferDef("x") + - self.ScalarDef("beta", flavour) + self.BufferDef("y") + self.BufferDef("c")) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + + self.ScalarDef("beta", flavour) + + list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarDef(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # As above, but clBLAS wrapper plain datatypes def ArgumentsDefWrapper(self, flavour): return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") + self.ScalarDefPlain("alpha", flavour) + - self.BufferDef("a") + self.BufferDef("b") + self.BufferDef("x") + - 
self.ScalarDefPlain("beta", flavour) + self.BufferDef("y") + self.BufferDef("c")) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + + self.ScalarDefPlain("beta", flavour) + + list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarDefPlain(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # Retrieves a combination of all the argument types def ArgumentsType(self, flavour): return (self.OptionsType() + self.SizesType() + self.BufferType("dot") + self.ScalarType("alpha", flavour) + - self.BufferType("a") + self.BufferType("b") + self.BufferType("x") + - self.ScalarType("beta", flavour) + self.BufferType("y") + self.BufferType("c")) + list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) + + self.ScalarType("beta", flavour) + + list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) + + list(chain(*[self.ScalarType(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) # ============================================================================================== @@ -290,7 +322,7 @@ class Routine(): result = "template <"+self.template.name+">\n" result += "StatusCode "+self.name.capitalize()+"(" result += (",\n"+indent).join([a for a in self.ArgumentsType(self.template)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" + result += ",\n"+indent+"cl_command_queue*, cl_event*)" return result # As above, but now for C diff --git a/src/clblast.cc b/src/clblast.cc index 0ced9ff7..a0dd8c70 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -269,7 +269,7 @@ template StatusCode Dotc(const size_t, // BLAS level-2 (matrix-vector) routines // ================================================================================================= -// Generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV template StatusCode Gemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, @@ 
-325,6 +325,51 @@ template StatusCode Gemv(const Layout, const Transpose, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV +template +StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template StatusCode Hemv(const Layout layout, const Triangle triangle, @@ -365,6 +410,64 @@ template StatusCode Hemv(const Layout, const Triangle, cl_mem, const 
size_t, const size_t, cl_command_queue*, cl_event*); +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hpmv(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpmv(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // Symmetric matrix-vector multiplication: SSYMV/DSYMV template StatusCode Symv(const Layout layout, const Triangle triangle, @@ -405,11 +508,523 @@ template StatusCode 
Symv(const Layout, const Triangle, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV +template +StatusCode Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV +template +StatusCode Spmv(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const T, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Spmv(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spmv(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV +template +StatusCode Trmv(const Layout, const Triangle, 
const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV +template +StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + 
const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV +template +StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, 
const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const 
size_t, + cl_command_queue*, cl_event*); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General rank-1 matrix update: SGER/DGER +template +StatusCode Ger(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Ger(const Layout, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Ger(const Layout, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, 
cl_event*); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Geru(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Geru(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Gerc(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Gerc(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode 
Her(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Her(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hpr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Her2(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Her2(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout, const Triangle, + const size_t, + 
const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Hpr2(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Hpr2(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric rank-1 matrix update: SSYR/DSYR +template +StatusCode Syr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Syr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR +template +StatusCode Spr(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Spr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, 
cl_event*); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2 +template +StatusCode Syr2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Syr2(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +template +StatusCode Spr2(const Layout, const Triangle, + const size_t, + const T, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Spr2(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Spr2(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= -// Generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM +// General matrix-matrix multiplication: 
SGEMM/DGEMM/CGEMM/ZGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, @@ -787,5 +1402,40 @@ template StatusCode Trmm(const Layout, const Side, const Triangle, cons cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template +StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const T, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= } // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc index eccf517f..fcec0951 100644 --- a/src/clblast_c.cc +++ 
b/src/clblast_c.cc @@ -363,6 +363,84 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, return static_cast(status); } +// GBMV +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + 
static_cast(a_transpose), + m, n, kl, ku, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + // HEMV StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, const size_t n, @@ -403,6 +481,86 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, return static_cast(status); } +// HBMV +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + 
const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// HPMV +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + float2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + double2{beta.s[0], beta.s[1]}, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + // SYMV StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, const size_t n, @@ -443,6 +601,832 @@ 
StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, return static_cast(status); } +// SBMV +StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// SPMV +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDspmv(const Layout 
layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + beta, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast(status); +} + +// TRMV +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + 
auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TBMV +StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtbmv(const 
Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TPMV +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = 
clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TRSV +StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const 
size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TBSV +StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + 
static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// TPSV +StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal 
diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer, ap_offset, + x_buffer, x_offset, x_inc, + queue, 
event); + return static_cast(status); +} + +// GER +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// GERU +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Geru(static_cast(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const 
size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Geru(static_cast(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// GERC +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gerc(static_cast(layout), + m, n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Gerc(static_cast(layout), + m, n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// HER +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZher(const 
Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// HPR +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} + +// HER2 +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} 
+StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// HPR2 +StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + float2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + double2{alpha.s[0], alpha.s[1]}, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} + +// SYR +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem 
a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// SPR +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} + +// SYR2 +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast(status); +} + +// SPR2 +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return 
static_cast(status); +} + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= @@ -963,4 +1947,78 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri return static_cast(status); } +// TRSM +StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t 
b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + float2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} +StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + double2{alpha.s[0], alpha.s[1]}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + queue, event); + return static_cast(status); +} + // ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 85729470..10c7dd47 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -428,6 +428,80 @@ clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_trans num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const 
cl_event *wait_events, cl_event *events) { + return clblasSgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgbmv(layout, a_transpose, + m, n, kl, ku, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, + const 
size_t m, const size_t n, const size_t kl, const size_t ku, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgbmv(layout, a_transpose, + m, n, kl, ku, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + // Forwards the clBLAS calls for CHEMV/ZHEMV clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, @@ -466,6 +540,82 @@ clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for CHBMV/ZHBMV +clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChbmv(layout, triangle, + n, k, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const 
double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhbmv(layout, triangle, + n, k, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPMV/ZHPMV +clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpmv(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + cl_float2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhpmv(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + 
ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + cl_double2{{beta.real(), beta.imag()}}, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + // Forwards the clBLAS calls for SSYMV/DSYMV clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, @@ -504,6 +654,854 @@ clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SSBMV/DSBMV +clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsbmv(layout, triangle, + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsbmv(layout, triangle, + n, k, + alpha, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// 
Forwards the clBLAS calls for SSPMV/DSPMV +clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspmv(layout, triangle, + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspmv(layout, triangle, + n, + alpha, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + beta, + y_buffer, y_offset, static_cast(y_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV. The scratch buffer holds n*x_inc + x_offset elements: clBLAS requires at least 1 + (n-1)*|incx|, so n alone is too small for a strided x. +template +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose 
a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasStrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasCtrmv(layout, triangle, a_transpose, 
diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasZtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV. The scratch buffer holds n*x_inc + x_offset elements: clBLAS requires at least 1 + (n-1)*|incx|, so n alone is too small for a strided x. +template +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return 
clblasStbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasCtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasZtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV. The scratch buffer holds n*x_inc + x_offset elements: clBLAS requires at least 1 + (n-1)*|incx|, so n alone is too small for a strided x. +template +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasStpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, 
const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasCtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasZtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the 
clBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV +template +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtrsv(layout, triangle, a_transpose, diagonal, + n, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STBSV/DTBSV/CTBSV/ZTBSV +template +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, 
a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtbsv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer, a_offset, a_ld, + x_buffer, x_offset, static_cast(x_inc), + 
num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for STPSV/DTPSV/CTPSV/ZTPSV +template +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtpsv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer, ap_offset, + x_buffer, x_offset, static_cast(x_inc), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SGER/DGER +clblasStatus clblasXger(const clblasOrder layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSger(layout, + m, n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXger(const clblasOrder layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + 
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDger(layout, + m, n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CGERU/ZGERU +clblasStatus clblasXgeru(const clblasOrder layout, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgeru(layout, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgeru(const clblasOrder layout, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgeru(layout, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CGERC/ZGERC +clblasStatus clblasXgerc(const clblasOrder layout, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t 
y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCgerc(layout, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXgerc(const clblasOrder layout, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZgerc(layout, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHER/ZHER +clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCher(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint 
num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZher(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPR/ZHPR +clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZhpr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHER2/ZHER2 +clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCher2(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + 
x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZher2(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for CHPR2/ZHPR2 +clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasChpr2(layout, triangle, + n, + cl_float2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event 
*wait_events, cl_event *events) { + return clblasZhpr2(layout, triangle, + n, + cl_double2{{alpha.real(), alpha.imag()}}, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYR/DSYR +clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSPR/DSPR +clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + ap_buffer, ap_offset, + num_queues, queues, 
num_wait_events, wait_events, events); +} +clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspr(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SSYR2/DSYR2 +clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + a_buffer, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for 
SSPR2/DSPR2 +clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSspr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDspr2(layout, triangle, + n, + alpha, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + ap_buffer, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= @@ -964,6 +1962,64 @@ clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float alpha, + const 
cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasStrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, + const size_t m, const size_t n, + const float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasCtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_float2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag 
diagonal, + const size_t m, const size_t n, + const double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + cl_double2{{alpha.real(), alpha.imag()}}, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= } // namespace clblast From 4796c9bcbd84a9e8be1e2864ba47e0d6bf3e6632 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Fri, 18 Sep 2015 10:19:03 +0200 Subject: [PATCH 3/3] Added generated main functions for correctness/performance tests for level 2 routines --- scripts/generator/datatype.py | 9 +- scripts/generator/generator.py | 99 ++++++++++++++++++++-- test/correctness/routines/level1/xaxpy.cc | 4 - test/correctness/routines/level1/xcopy.cc | 4 - test/correctness/routines/level1/xdot.cc | 4 - test/correctness/routines/level1/xdotc.cc | 6 +- test/correctness/routines/level1/xdotu.cc | 6 +- test/correctness/routines/level1/xscal.cc | 4 - test/correctness/routines/level1/xswap.cc | 4 - test/correctness/routines/level2/xgbmv.cc | 28 ++++++ test/correctness/routines/level2/xgemv.cc | 4 - test/correctness/routines/level2/xger.cc | 26 ++++++ test/correctness/routines/level2/xgerc.cc | 26 ++++++ test/correctness/routines/level2/xgeru.cc | 26 ++++++ test/correctness/routines/level2/xhbmv.cc | 26 ++++++ test/correctness/routines/level2/xhemv.cc | 4 - test/correctness/routines/level2/xher.cc | 26 ++++++ test/correctness/routines/level2/xher2.cc | 26 ++++++ test/correctness/routines/level2/xhpmv.cc | 26 ++++++ test/correctness/routines/level2/xhpr.cc | 26 ++++++ test/correctness/routines/level2/xhpr2.cc | 26 ++++++ 
test/correctness/routines/level2/xsbmv.cc | 26 ++++++ test/correctness/routines/level2/xspmv.cc | 26 ++++++ test/correctness/routines/level2/xspr.cc | 26 ++++++ test/correctness/routines/level2/xspr2.cc | 26 ++++++ test/correctness/routines/level2/xsymv.cc | 6 +- test/correctness/routines/level2/xsyr.cc | 26 ++++++ test/correctness/routines/level2/xsyr2.cc | 26 ++++++ test/correctness/routines/level2/xtbmv.cc | 28 ++++++ test/correctness/routines/level2/xtbsv.cc | 28 ++++++ test/correctness/routines/level2/xtpmv.cc | 28 ++++++ test/correctness/routines/level2/xtpsv.cc | 28 ++++++ test/correctness/routines/level2/xtrmv.cc | 28 ++++++ test/correctness/routines/level2/xtrsv.cc | 28 ++++++ test/correctness/routines/level3/xgemm.cc | 4 - test/correctness/routines/level3/xhemm.cc | 6 +- test/correctness/routines/level3/xher2k.cc | 4 - test/correctness/routines/level3/xherk.cc | 4 - test/correctness/routines/level3/xsymm.cc | 4 - test/correctness/routines/level3/xsyr2k.cc | 4 - test/correctness/routines/level3/xsyrk.cc | 4 - test/correctness/routines/level3/xtrmm.cc | 4 - test/correctness/routines/level3/xtrsm.cc | 28 ++++++ test/performance/routines/level1/xaxpy.cc | 7 +- test/performance/routines/level1/xcopy.cc | 7 +- test/performance/routines/level1/xdot.cc | 13 +-- test/performance/routines/level1/xdotc.cc | 13 +-- test/performance/routines/level1/xdotu.cc | 13 +-- test/performance/routines/level1/xscal.cc | 7 +- test/performance/routines/level1/xswap.cc | 7 +- test/performance/routines/level2/xgbmv.cc | 35 ++++++++ test/performance/routines/level2/xgemv.cc | 7 +- test/performance/routines/level2/xger.cc | 33 ++++++++ test/performance/routines/level2/xgerc.cc | 33 ++++++++ test/performance/routines/level2/xgeru.cc | 33 ++++++++ test/performance/routines/level2/xhbmv.cc | 33 ++++++++ test/performance/routines/level2/xhemv.cc | 13 +-- test/performance/routines/level2/xher.cc | 33 ++++++++ test/performance/routines/level2/xher2.cc | 33 ++++++++ 
test/performance/routines/level2/xhpmv.cc | 33 ++++++++ test/performance/routines/level2/xhpr.cc | 33 ++++++++ test/performance/routines/level2/xhpr2.cc | 33 ++++++++ test/performance/routines/level2/xsbmv.cc | 33 ++++++++ test/performance/routines/level2/xspmv.cc | 33 ++++++++ test/performance/routines/level2/xspr.cc | 33 ++++++++ test/performance/routines/level2/xspr2.cc | 33 ++++++++ test/performance/routines/level2/xsymv.cc | 15 ++-- test/performance/routines/level2/xsyr.cc | 33 ++++++++ test/performance/routines/level2/xsyr2.cc | 33 ++++++++ test/performance/routines/level2/xtbmv.cc | 35 ++++++++ test/performance/routines/level2/xtbsv.cc | 35 ++++++++ test/performance/routines/level2/xtpmv.cc | 35 ++++++++ test/performance/routines/level2/xtpsv.cc | 35 ++++++++ test/performance/routines/level2/xtrmv.cc | 35 ++++++++ test/performance/routines/level2/xtrsv.cc | 35 ++++++++ test/performance/routines/level3/xgemm.cc | 7 +- test/performance/routines/level3/xhemm.cc | 13 +-- test/performance/routines/level3/xher2k.cc | 13 +-- test/performance/routines/level3/xherk.cc | 13 +-- test/performance/routines/level3/xsymm.cc | 7 +- test/performance/routines/level3/xsyr2k.cc | 7 +- test/performance/routines/level3/xsyrk.cc | 7 +- test/performance/routines/level3/xtrmm.cc | 7 +- test/performance/routines/level3/xtrsm.cc | 35 ++++++++ 84 files changed, 1531 insertions(+), 222 deletions(-) create mode 100644 test/correctness/routines/level2/xgbmv.cc create mode 100644 test/correctness/routines/level2/xger.cc create mode 100644 test/correctness/routines/level2/xgerc.cc create mode 100644 test/correctness/routines/level2/xgeru.cc create mode 100644 test/correctness/routines/level2/xhbmv.cc create mode 100644 test/correctness/routines/level2/xher.cc create mode 100644 test/correctness/routines/level2/xher2.cc create mode 100644 test/correctness/routines/level2/xhpmv.cc create mode 100644 test/correctness/routines/level2/xhpr.cc create mode 100644 
test/correctness/routines/level2/xhpr2.cc create mode 100644 test/correctness/routines/level2/xsbmv.cc create mode 100644 test/correctness/routines/level2/xspmv.cc create mode 100644 test/correctness/routines/level2/xspr.cc create mode 100644 test/correctness/routines/level2/xspr2.cc create mode 100644 test/correctness/routines/level2/xsyr.cc create mode 100644 test/correctness/routines/level2/xsyr2.cc create mode 100644 test/correctness/routines/level2/xtbmv.cc create mode 100644 test/correctness/routines/level2/xtbsv.cc create mode 100644 test/correctness/routines/level2/xtpmv.cc create mode 100644 test/correctness/routines/level2/xtpsv.cc create mode 100644 test/correctness/routines/level2/xtrmv.cc create mode 100644 test/correctness/routines/level2/xtrsv.cc create mode 100644 test/correctness/routines/level3/xtrsm.cc create mode 100644 test/performance/routines/level2/xgbmv.cc create mode 100644 test/performance/routines/level2/xger.cc create mode 100644 test/performance/routines/level2/xgerc.cc create mode 100644 test/performance/routines/level2/xgeru.cc create mode 100644 test/performance/routines/level2/xhbmv.cc create mode 100644 test/performance/routines/level2/xher.cc create mode 100644 test/performance/routines/level2/xher2.cc create mode 100644 test/performance/routines/level2/xhpmv.cc create mode 100644 test/performance/routines/level2/xhpr.cc create mode 100644 test/performance/routines/level2/xhpr2.cc create mode 100644 test/performance/routines/level2/xsbmv.cc create mode 100644 test/performance/routines/level2/xspmv.cc create mode 100644 test/performance/routines/level2/xspr.cc create mode 100644 test/performance/routines/level2/xspr2.cc create mode 100644 test/performance/routines/level2/xsyr.cc create mode 100644 test/performance/routines/level2/xsyr2.cc create mode 100644 test/performance/routines/level2/xtbmv.cc create mode 100644 test/performance/routines/level2/xtbsv.cc create mode 100644 test/performance/routines/level2/xtpmv.cc create mode 
100644 test/performance/routines/level2/xtpsv.cc create mode 100644 test/performance/routines/level2/xtrmv.cc create mode 100644 test/performance/routines/level2/xtrsv.cc create mode 100644 test/performance/routines/level3/xtrsm.cc diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py index cca3534d..0aa27197 100644 --- a/scripts/generator/datatype.py +++ b/scripts/generator/datatype.py @@ -29,7 +29,7 @@ class DataType(): self.beta_cpp = scalars[1] self.alpha_cl = scalars[2] self.beta_cl = scalars[3] - self.buffertype = buffertype # Only used for template types + self.buffertype = buffertype # Outputs the name of the data-type (alpha/beta), possibly transforming into the right type def UseAlpha(self): @@ -51,4 +51,11 @@ class DataType(): return self.beta_cl+"{{beta.real(), beta.imag()}}" return "beta" + # Returns the template as used in the correctness/performance tests + def TestTemplate(self): + if self.buffertype != self.beta_cpp: + return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp + return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp + + # ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 9c9675b8..677c8afc 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -14,6 +14,9 @@ # clblast_c.h # clblast_c.cc # wrapper_clblas.h +# It also generates the main functions for the correctness and performance tests as found in +# test/correctness/routines/levelX/xYYYY.cc +# test/performance/routines/levelX/xYYYY.cc # # ================================================================================================== @@ -30,12 +33,12 @@ from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL # Regular data-types S = DataType("S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32) D = DataType("D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double 
(64) -C = DataType("C", FLT2, [FLT2, FLT2, F2CL, F2CL], F2CL) # single-complex (3232) -Z = DataType("Z", DBL2, [DBL2, DBL2, D2CL, D2CL], D2CL) # double-complex (6464) +C = DataType("C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232) +Z = DataType("Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464) # Special cases -Css = DataType("C", FLT, [FLT, FLT, FLT, FLT], FLT ) # As C, but with constants from S -Zdd = DataType("Z", DBL, [DBL, DBL, DBL, DBL], DBL ) # As Z, but with constants from D +Css = DataType("C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S +Zdd = DataType("Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D Ccs = DataType("C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S Zzd = DataType("Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D @@ -115,6 +118,22 @@ separators = [""" // BLAS level-3 (matrix-matrix) routines // ================================================================================================="""] +# Main header/footer for source files +header = """ +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= +""" +footer = """ +// ================================================================================================= +""" + # ================================================================================================== # The C++ API header (.h) @@ -235,8 +254,8 @@ for i in xrange(0,len(files)): # Stores the header and the footer of the original file with open(files[i]) as f: original = f.readlines() - header = original[:header_lines[i]] - footer = original[-footer_lines[i]:] + file_header = original[:header_lines[i]] + file_footer = original[-footer_lines[i]:] # Re-writes the body of the file with open(files[i], "w") as f: @@ -253,8 +272,72 @@ for i in xrange(0,len(files)): body += clblast_c_cc(routines[level-1]) if i == 4: body += wrapper_clblas(routines[level-1]) - f.write("".join(header)) + f.write("".join(file_header)) f.write(body) - f.write("".join(footer)) + f.write("".join(file_footer)) + +# ================================================================================================== + +# Outputs all the correctness-test implementations +for level in [1,2,3]: + for routine in routines[level-1]: + filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc" + with open(filename, "w") as f: + body = "" + body += "#include \"correctness/testblas.h\"\n" + body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n" + body += "// Shortcuts to the clblast namespace\n" + body += "using float2 = clblast::float2;\n" + body += "using double2 = clblast::double2;\n\n" + body += "// Main function (not within the clblast namespace)\n" + body += "int main(int argc, char *argv[]) {\n" + not_first = "false" + for flavour in routine.flavours: + body += " clblast::RunTests // -// This file implements the tests for the Xaxpy routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level1/xaxpy.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level1/xcopy.cc b/test/correctness/routines/level1/xcopy.cc index 8a06a722..3e16ffc6 100644 --- a/test/correctness/routines/level1/xcopy.cc +++ b/test/correctness/routines/level1/xcopy.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xcopy routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level1/xcopy.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level1/xdot.cc b/test/correctness/routines/level1/xdot.cc index e1b70cb2..5ea105e0 100644 --- a/test/correctness/routines/level1/xdot.cc +++ b/test/correctness/routines/level1/xdot.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xdot routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level1/xdot.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level1/xdotc.cc b/test/correctness/routines/level1/xdotc.cc index 15a2f88b..76aaa0ec 100644 --- a/test/correctness/routines/level1/xdotc.cc +++ b/test/correctness/routines/level1/xdotc.cc @@ -7,22 +7,18 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xdotc routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level1/xdotc.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - clblast::RunTests, float2, float2>(argc, argv, true, "CDOTC"); + clblast::RunTests, float2, float2>(argc, argv, false, "CDOTC"); clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTC"); return 0; } diff --git a/test/correctness/routines/level1/xdotu.cc b/test/correctness/routines/level1/xdotu.cc index c8af0388..aecde4f5 100644 --- a/test/correctness/routines/level1/xdotu.cc +++ b/test/correctness/routines/level1/xdotu.cc @@ -7,22 +7,18 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xdotu routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level1/xdotu.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - clblast::RunTests, float2, float2>(argc, argv, true, "CDOTU"); + clblast::RunTests, float2, float2>(argc, argv, false, "CDOTU"); clblast::RunTests, double2, double2>(argc, argv, true, "ZDOTU"); return 0; } diff --git a/test/correctness/routines/level1/xscal.cc b/test/correctness/routines/level1/xscal.cc index ceb1b7cd..4d138fad 100644 --- a/test/correctness/routines/level1/xscal.cc +++ b/test/correctness/routines/level1/xscal.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xscal routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level1/xscal.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level1/xswap.cc b/test/correctness/routines/level1/xswap.cc index 140ccf24..38f110f7 100644 --- a/test/correctness/routines/level1/xswap.cc +++ b/test/correctness/routines/level1/xswap.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xswap routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level1/xswap.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level2/xgbmv.cc b/test/correctness/routines/level2/xgbmv.cc new file mode 100644 index 00000000..b28c5978 --- /dev/null +++ b/test/correctness/routines/level2/xgbmv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xgbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SGBMV"); + clblast::RunTests, double, double>(argc, argv, true, "DGBMV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CGBMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZGBMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc index f7229735..14eb74d1 100644 --- a/test/correctness/routines/level2/xgemv.cc +++ b/test/correctness/routines/level2/xgemv.cc @@ -7,15 +7,11 @@ // 
Author(s): // Cedric Nugteren // -// This file implements the tests for the Xgemv routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level2/xgemv.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level2/xger.cc b/test/correctness/routines/level2/xger.cc new file mode 100644 index 00000000..c37a5c41 --- /dev/null +++ b/test/correctness/routines/level2/xger.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xger.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SGER"); + clblast::RunTests, double, double>(argc, argv, true, "DGER"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgerc.cc b/test/correctness/routines/level2/xgerc.cc new file mode 100644 index 00000000..8fd31142 --- /dev/null +++ b/test/correctness/routines/level2/xgerc.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xgerc.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CGERC"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZGERC"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xgeru.cc b/test/correctness/routines/level2/xgeru.cc new file mode 100644 index 00000000..ee92416b --- /dev/null +++ b/test/correctness/routines/level2/xgeru.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xgeru.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CGERU"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZGERU"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhbmv.cc b/test/correctness/routines/level2/xhbmv.cc new file mode 100644 index 00000000..4cd137a7 --- /dev/null +++ b/test/correctness/routines/level2/xhbmv.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xhbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CHBMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZHBMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhemv.cc b/test/correctness/routines/level2/xhemv.cc index 183aebc2..20c5370c 100644 --- a/test/correctness/routines/level2/xhemv.cc +++ b/test/correctness/routines/level2/xhemv.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xhemv routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level2/xhemv.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level2/xher.cc b/test/correctness/routines/level2/xher.cc new file mode 100644 index 00000000..5b9b48be --- /dev/null +++ b/test/correctness/routines/level2/xher.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xher.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float>(argc, argv, false, "CHER"); + clblast::RunTests, double2, double>(argc, argv, true, "ZHER"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xher2.cc b/test/correctness/routines/level2/xher2.cc new file mode 100644 index 00000000..093b3959 --- /dev/null +++ b/test/correctness/routines/level2/xher2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xher2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CHER2"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZHER2"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpmv.cc b/test/correctness/routines/level2/xhpmv.cc new file mode 100644 index 00000000..cbf41443 --- /dev/null +++ b/test/correctness/routines/level2/xhpmv.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xhpmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CHPMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZHPMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpr.cc b/test/correctness/routines/level2/xhpr.cc new file mode 100644 index 00000000..a720aaef --- /dev/null +++ b/test/correctness/routines/level2/xhpr.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xhpr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float>(argc, argv, false, "CHPR"); + clblast::RunTests, double2, double>(argc, argv, true, "ZHPR"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xhpr2.cc b/test/correctness/routines/level2/xhpr2.cc new file mode 100644 index 00000000..0fed97e1 --- /dev/null +++ b/test/correctness/routines/level2/xhpr2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xhpr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float2, float2>(argc, argv, false, "CHPR2"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZHPR2"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsbmv.cc b/test/correctness/routines/level2/xsbmv.cc new file mode 100644 index 00000000..212e2c3a --- /dev/null +++ b/test/correctness/routines/level2/xsbmv.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xsbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSBMV"); + clblast::RunTests, double, double>(argc, argv, true, "DSBMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspmv.cc b/test/correctness/routines/level2/xspmv.cc new file mode 100644 index 00000000..dc833024 --- /dev/null +++ b/test/correctness/routines/level2/xspmv.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xspmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSPMV"); + clblast::RunTests, double, double>(argc, argv, true, "DSPMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspr.cc b/test/correctness/routines/level2/xspr.cc new file mode 100644 index 00000000..a0104dd4 --- /dev/null +++ b/test/correctness/routines/level2/xspr.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xspr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSPR"); + clblast::RunTests, double, double>(argc, argv, true, "DSPR"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xspr2.cc b/test/correctness/routines/level2/xspr2.cc new file mode 100644 index 00000000..5fe5827f --- /dev/null +++ b/test/correctness/routines/level2/xspr2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xspr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSPR2"); + clblast::RunTests, double, double>(argc, argv, true, "DSPR2"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsymv.cc b/test/correctness/routines/level2/xsymv.cc index a479b999..6224739f 100644 --- a/test/correctness/routines/level2/xsymv.cc +++ b/test/correctness/routines/level2/xsymv.cc @@ -7,14 +7,14 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xsymv routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level2/xsymv.h" -// ================================================================================================= +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { diff --git a/test/correctness/routines/level2/xsyr.cc b/test/correctness/routines/level2/xsyr.cc new file mode 100644 index 00000000..a47b918f --- /dev/null +++ b/test/correctness/routines/level2/xsyr.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xsyr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSYR"); + clblast::RunTests, double, double>(argc, argv, true, "DSYR"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xsyr2.cc b/test/correctness/routines/level2/xsyr2.cc new file mode 100644 index 00000000..1743632c --- /dev/null +++ b/test/correctness/routines/level2/xsyr2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xsyr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SSYR2"); + clblast::RunTests, double, double>(argc, argv, true, "DSYR2"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtbmv.cc b/test/correctness/routines/level2/xtbmv.cc new file mode 100644 index 00000000..d3bbbade --- /dev/null +++ b/test/correctness/routines/level2/xtbmv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STBMV"); + clblast::RunTests, double, double>(argc, argv, true, "DTBMV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTBMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTBMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtbsv.cc b/test/correctness/routines/level2/xtbsv.cc new file mode 100644 index 00000000..c8a8a583 --- /dev/null +++ b/test/correctness/routines/level2/xtbsv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtbsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STBSV"); + clblast::RunTests, double, double>(argc, argv, true, "DTBSV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTBSV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTBSV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtpmv.cc b/test/correctness/routines/level2/xtpmv.cc new file mode 100644 index 00000000..95489a65 --- /dev/null +++ b/test/correctness/routines/level2/xtpmv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtpmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STPMV"); + clblast::RunTests, double, double>(argc, argv, true, "DTPMV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTPMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTPMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtpsv.cc b/test/correctness/routines/level2/xtpsv.cc new file mode 100644 index 00000000..97d27271 --- /dev/null +++ b/test/correctness/routines/level2/xtpsv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtpsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STPSV"); + clblast::RunTests, double, double>(argc, argv, true, "DTPSV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTPSV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTPSV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtrmv.cc b/test/correctness/routines/level2/xtrmv.cc new file mode 100644 index 00000000..ca50af88 --- /dev/null +++ b/test/correctness/routines/level2/xtrmv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtrmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STRMV"); + clblast::RunTests, double, double>(argc, argv, true, "DTRMV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTRMV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTRMV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level2/xtrsv.cc b/test/correctness/routines/level2/xtrsv.cc new file mode 100644 index 00000000..bfca0f20 --- /dev/null +++ b/test/correctness/routines/level2/xtrsv.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level2/xtrsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STRSV"); + clblast::RunTests, double, double>(argc, argv, true, "DTRSV"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTRSV"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSV"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc index 90302095..632724ed 100644 --- a/test/correctness/routines/level3/xgemm.cc +++ b/test/correctness/routines/level3/xgemm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xgemm routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xgemm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xhemm.cc b/test/correctness/routines/level3/xhemm.cc index 60555604..74e22080 100644 --- a/test/correctness/routines/level3/xhemm.cc +++ b/test/correctness/routines/level3/xhemm.cc @@ -7,22 +7,18 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xhemm routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xhemm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - clblast::RunTests, float2, float2>(argc, argv, true, "CHEMM"); + clblast::RunTests, float2, float2>(argc, argv, false, "CHEMM"); clblast::RunTests, double2, double2>(argc, argv, true, "ZHEMM"); return 0; } diff --git a/test/correctness/routines/level3/xher2k.cc b/test/correctness/routines/level3/xher2k.cc index dd03fcd7..6377572a 100644 --- a/test/correctness/routines/level3/xher2k.cc +++ b/test/correctness/routines/level3/xher2k.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xher2k routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xher2k.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xherk.cc b/test/correctness/routines/level3/xherk.cc index 32b8aa2c..32a39bfc 100644 --- a/test/correctness/routines/level3/xherk.cc +++ b/test/correctness/routines/level3/xherk.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xherk routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xherk.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc index 94968e31..046fca16 100644 --- a/test/correctness/routines/level3/xsymm.cc +++ b/test/correctness/routines/level3/xsymm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xsymm routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xsymm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc index 3b8e601b..db2b83d9 100644 --- a/test/correctness/routines/level3/xsyr2k.cc +++ b/test/correctness/routines/level3/xsyr2k.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xsyr2k routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xsyr2k.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc index f299342a..3dad3535 100644 --- a/test/correctness/routines/level3/xsyrk.cc +++ b/test/correctness/routines/level3/xsyrk.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xsyrk routine. -// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xsyrk.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc index 6efde5f8..2d843e3e 100644 --- a/test/correctness/routines/level3/xtrmm.cc +++ b/test/correctness/routines/level3/xtrmm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the tests for the Xtrmm routine. 
-// // ================================================================================================= #include "correctness/testblas.h" #include "routines/level3/xtrmm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; diff --git a/test/correctness/routines/level3/xtrsm.cc b/test/correctness/routines/level3/xtrsm.cc new file mode 100644 index 00000000..b5f5045e --- /dev/null +++ b/test/correctness/routines/level3/xtrsm.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level3/xtrsm.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "STRSM"); + clblast::RunTests, double, double>(argc, argv, true, "DTRSM"); + clblast::RunTests, float2, float2>(argc, argv, true, "CTRSM"); + clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSM"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cc index fe90c697..7ab15f28 100644 --- a/test/performance/routines/level1/xaxpy.cc +++ b/test/performance/routines/level1/xaxpy.cc @@ -7,15 +7,11 @@ // 
Author(s): // Cedric Nugteren // -// This file implements the Xaxpy command-line interface performance tester. -// // ================================================================================================= #include "performance/client.h" #include "routines/level1/xaxpy.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xcopy.cc b/test/performance/routines/level1/xcopy.cc index 70b6b348..6277e8fb 100644 --- a/test/performance/routines/level1/xcopy.cc +++ b/test/performance/routines/level1/xcopy.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xcopy command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level1/xcopy.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xdot.cc b/test/performance/routines/level1/xdot.cc index c82547da..5aa76762 100644 --- a/test/performance/routines/level1/xdot.cc +++ b/test/performance/routines/level1/xdot.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xdot command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level1/xdot.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,16 +19,13 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); } return 0; } diff --git a/test/performance/routines/level1/xdotc.cc b/test/performance/routines/level1/xdotc.cc index 327975d8..81511085 100644 --- a/test/performance/routines/level1/xdotc.cc +++ b/test/performance/routines/level1/xdotc.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xdotc command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level1/xdotc.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float2>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level1/xdotu.cc b/test/performance/routines/level1/xdotu.cc index 622ffb8e..888eede3 100644 --- a/test/performance/routines/level1/xdotu.cc +++ b/test/performance/routines/level1/xdotu.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xdotu command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level1/xdotu.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float2>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level1/xscal.cc b/test/performance/routines/level1/xscal.cc index 3963ba3a..be49c066 100644 --- a/test/performance/routines/level1/xscal.cc +++ b/test/performance/routines/level1/xscal.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xscal command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level1/xscal.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level1/xswap.cc b/test/performance/routines/level1/xswap.cc index 94f271ee..52fdc580 100644 --- a/test/performance/routines/level1/xswap.cc +++ b/test/performance/routines/level1/xswap.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xswap command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level1/xswap.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xgbmv.cc b/test/performance/routines/level2/xgbmv.cc new file mode 100644 index 00000000..629e2182 --- /dev/null +++ b/test/performance/routines/level2/xgbmv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xgbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cc index 376c6c33..2a1983de 100644 --- a/test/performance/routines/level2/xgemv.cc +++ b/test/performance/routines/level2/xgemv.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xgemv command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level2/xgemv.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level2/xger.cc b/test/performance/routines/level2/xger.cc new file mode 100644 index 00000000..5fb0d91d --- /dev/null +++ b/test/performance/routines/level2/xger.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xger.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgerc.cc b/test/performance/routines/level2/xgerc.cc new file mode 100644 index 00000000..fd511e42 --- /dev/null +++ b/test/performance/routines/level2/xgerc.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xgerc.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xgeru.cc b/test/performance/routines/level2/xgeru.cc new file mode 100644 index 00000000..689ab2b1 --- /dev/null +++ b/test/performance/routines/level2/xgeru.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xgeru.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhbmv.cc b/test/performance/routines/level2/xhbmv.cc new file mode 100644 index 00000000..dabe6ec8 --- /dev/null +++ b/test/performance/routines/level2/xhbmv.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xhbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhemv.cc b/test/performance/routines/level2/xhemv.cc index dd70528e..77447d76 100644 --- a/test/performance/routines/level2/xhemv.cc +++ b/test/performance/routines/level2/xhemv.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xhemv command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level2/xhemv.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float2>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level2/xher.cc b/test/performance/routines/level2/xher.cc new file mode 100644 index 00000000..4ef87e45 --- /dev/null +++ b/test/performance/routines/level2/xher.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xher.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xher2.cc b/test/performance/routines/level2/xher2.cc new file mode 100644 index 00000000..2d7e17ab --- /dev/null +++ b/test/performance/routines/level2/xher2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xher2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpmv.cc b/test/performance/routines/level2/xhpmv.cc new file mode 100644 index 00000000..b9dd3f82 --- /dev/null +++ b/test/performance/routines/level2/xhpmv.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xhpmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpr.cc b/test/performance/routines/level2/xhpr.cc new file mode 100644 index 00000000..f596682c --- /dev/null +++ b/test/performance/routines/level2/xhpr.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xhpr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xhpr2.cc b/test/performance/routines/level2/xhpr2.cc new file mode 100644 index 00000000..1c493226 --- /dev/null +++ b/test/performance/routines/level2/xhpr2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xhpr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsbmv.cc b/test/performance/routines/level2/xsbmv.cc new file mode 100644 index 00000000..febc6bfd --- /dev/null +++ b/test/performance/routines/level2/xsbmv.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xsbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspmv.cc b/test/performance/routines/level2/xspmv.cc new file mode 100644 index 00000000..97c6b032 --- /dev/null +++ b/test/performance/routines/level2/xspmv.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xspmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspr.cc b/test/performance/routines/level2/xspr.cc new file mode 100644 index 00000000..cc18d9b6 --- /dev/null +++ b/test/performance/routines/level2/xspr.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xspr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xspr2.cc b/test/performance/routines/level2/xspr2.cc new file mode 100644 index 00000000..768452be --- /dev/null +++ b/test/performance/routines/level2/xspr2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xspr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsymv.cc b/test/performance/routines/level2/xsymv.cc index 30e953a5..6748026f 100644 --- a/test/performance/routines/level2/xsymv.cc +++ b/test/performance/routines/level2/xsymv.cc @@ -7,28 +7,25 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xsymv command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level2/xsymv.h" -// ================================================================================================= +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); } return 0; } diff --git a/test/performance/routines/level2/xsyr.cc b/test/performance/routines/level2/xsyr.cc new file mode 100644 index 00000000..84510e5d --- /dev/null +++ b/test/performance/routines/level2/xsyr.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xsyr.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xsyr2.cc b/test/performance/routines/level2/xsyr2.cc new file mode 100644 index 00000000..b8c177d8 --- /dev/null +++ b/test/performance/routines/level2/xsyr2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xsyr2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtbmv.cc b/test/performance/routines/level2/xtbmv.cc new file mode 100644 index 00000000..1663dca0 --- /dev/null +++ b/test/performance/routines/level2/xtbmv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtbmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtbsv.cc b/test/performance/routines/level2/xtbsv.cc new file mode 100644 index 00000000..e0cb9f2e --- /dev/null +++ b/test/performance/routines/level2/xtbsv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtbsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtpmv.cc b/test/performance/routines/level2/xtpmv.cc new file mode 100644 index 00000000..407fdc8c --- /dev/null +++ b/test/performance/routines/level2/xtpmv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtpmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtpsv.cc b/test/performance/routines/level2/xtpsv.cc new file mode 100644 index 00000000..e402dc60 --- /dev/null +++ b/test/performance/routines/level2/xtpsv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtpsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtrmv.cc b/test/performance/routines/level2/xtrmv.cc new file mode 100644 index 00000000..c5563240 --- /dev/null +++ b/test/performance/routines/level2/xtrmv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtrmv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level2/xtrsv.cc b/test/performance/routines/level2/xtrsv.cc new file mode 100644 index 00000000..136e2108 --- /dev/null +++ b/test/performance/routines/level2/xtrsv.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level2/xtrsv.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cc index c45c238f..2082ceac 100644 --- a/test/performance/routines/level3/xgemm.cc +++ b/test/performance/routines/level3/xgemm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xgemm command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xgemm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xhemm.cc b/test/performance/routines/level3/xhemm.cc index d215653b..cc68e937 100644 --- a/test/performance/routines/level3/xhemm.cc +++ b/test/performance/routines/level3/xhemm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xhemm command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xhemm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float2>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level3/xher2k.cc b/test/performance/routines/level3/xher2k.cc index 2e1f248a..70d76bed 100644 --- a/test/performance/routines/level3/xher2k.cc +++ b/test/performance/routines/level3/xher2k.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xher2k command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xher2k.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level3/xherk.cc b/test/performance/routines/level3/xherk.cc index 4386f78c..b3b5dddf 100644 --- a/test/performance/routines/level3/xherk.cc +++ b/test/performance/routines/level3/xherk.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xherk command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xherk.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,12 +19,9 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: - throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kDouble: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kComplexSingle: clblast::RunClient, float2, float>(argc, argv); break; case clblast::Precision::kComplexDouble: diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cc index bd014cee..f2292273 100644 --- a/test/performance/routines/level3/xsymm.cc +++ b/test/performance/routines/level3/xsymm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xsymm command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xsymm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cc index 1261be88..0c8f8f7c 100644 --- a/test/performance/routines/level3/xsyr2k.cc +++ b/test/performance/routines/level3/xsyr2k.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xsyr2k command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xsyr2k.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cc index 5799130f..ccd4511a 100644 --- a/test/performance/routines/level3/xsyrk.cc +++ b/test/performance/routines/level3/xsyrk.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xsyrk command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xsyrk.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cc index c30866e9..8278d077 100644 --- a/test/performance/routines/level3/xtrmm.cc +++ b/test/performance/routines/level3/xtrmm.cc @@ -7,15 +7,11 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xtrmm command-line interface performance tester. 
-// // ================================================================================================= #include "performance/client.h" #include "routines/level3/xtrmm.h" -// ================================================================================================= - // Shortcuts to the clblast namespace using float2 = clblast::float2; using double2 = clblast::double2; @@ -23,8 +19,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: - throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/level3/xtrsm.cc b/test/performance/routines/level3/xtrsm.cc new file mode 100644 index 00000000..45f71c5e --- /dev/null +++ b/test/performance/routines/level3/xtrsm.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+//
+// Author(s):
+//   Cedric Nugteren
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/level3/xtrsm.h"
+
+// Shortcuts to the clblast namespace (aliases used by the complex-precision cases in main)
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): dispatches on GetPrecision(argc, argv)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kSingle: // NOTE(review): RunClient template args look cut - confirm
+      clblast::RunClient, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================