From 681a465b355d0fed7a82644f4472a5afa024721e Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 18 Dec 2016 12:30:16 +0100
Subject: [PATCH 01/24] Prepared for the addition of the TRSM triangular solver
 kernel

---
 CMakeLists.txt                 |   2 +-
 doc/clblast.md                 |  71 +++++++++++++++
 scripts/generator/generator.py |   4 +-
 src/clblast.cpp                |  24 +++--
 src/routines/level3/xtrsm.cpp  |  72 +++++++++++++++
 src/routines/level3/xtrsm.hpp  |  52 +++++++++++
 test/routines/level3/xtrsm.hpp | 159 +++++++++++++++++++++++++++++++++
 7 files changed, 374 insertions(+), 10 deletions(-)
 create mode 100644 src/routines/level3/xtrsm.cpp
 create mode 100644 src/routines/level3/xtrsm.hpp
 create mode 100644 test/routines/level3/xtrsm.hpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index edb03dbf..f6fa4045 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -158,7 +158,7 @@ endif()
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
-set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
+set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
 set(LEVELX_ROUTINES xomatcopy)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464 16)
diff --git a/doc/clblast.md b/doc/clblast.md
index 37b99f3d..d7be0005 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -2708,6 +2708,77 @@ Requirements for TRMM:
 
 
 
+xTRSM: Solves a triangular system of equations
+-------------
+
+Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to TRSM:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
+* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
 xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function)
 -------------
 
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 35d902b7..d71e392d 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -41,7 +41,7 @@ FILES = [
     "/include/clblast_netlib_c.h",
     "/src/clblast_netlib_c.cpp",
 ]
-HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32]
+HEADER_LINES = [117, 74, 118, 22, 29, 41, 65, 32]
 FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
 
 # Different possibilities for requirements
@@ -154,7 +154,7 @@ ROUTINES = [
   Routine(True,  True,  "3", "syr2k", T,  [S,D,C,Z,H],     ["n","k"],            ["layout","triangle","ab_transpose"],                  ["a","b"],  ["c"],                        [ankab,bnkab,cn],["alpha","beta"], "",    "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
   Routine(True,  True,  "3", "her2k", TU, [Ccs,Zzd],       ["n","k"],            ["layout","triangle","ab_transpose"],                  ["a","b"],  ["c"],                        [ankab,bnkab,cn],["alpha","beta"], "",    "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
   Routine(True,  True,  "3", "trmm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        [amns,bmn],      ["alpha"],        "",    "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
-  Routine(False, True,  "3", "trsm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        [amns,bmn],      ["alpha"],        "",    "Solves a triangular system of equations", "", []),
+  Routine(True,  True,  "3", "trsm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        [amns,bmn],      ["alpha"],        "",    "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []),
 ],
 [  # Level X: extra routines (not part of BLAS)
   Routine(True,  True,  "x", "omatcopy", T, [S,D,C,Z,H],   ["m","n"],            ["layout","a_transpose"],                              ["a"],      ["b"],                        [amn,bnma],      ["alpha"],        "",    "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
diff --git a/src/clblast.cpp b/src/clblast.cpp
index 4bb4e0b3..68671e50 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -66,6 +66,7 @@
 #include "routines/level3/xsyr2k.hpp"
 #include "routines/level3/xher2k.hpp"
 #include "routines/level3/xtrmm.hpp"
+#include "routines/level3/xtrsm.hpp"
 
 // Level-x includes (non-BLAS)
 #include "routines/levelx/xomatcopy.hpp"
@@ -2067,13 +2068,22 @@ template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triang
 
 // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
 template <typename T>
-StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
-                const size_t, const size_t,
-                const T,
-                const cl_mem, const size_t, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                cl_command_queue* queue, cl_event* event) {
+  try {
+    auto queue_cpp = Queue(*queue);
+    auto routine = Xtrsm<T>(queue_cpp, event);
+    routine.DoTrsm(layout, side, triangle, a_transpose, diagonal,
+                   m, n,
+                   alpha,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(b_buffer), b_offset, b_ld);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
 }
 template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
                                            const size_t, const size_t,
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
new file mode 100644
index 00000000..0ac1a58e
--- /dev/null
+++ b/src/routines/level3/xtrsm.cpp
@@ -0,0 +1,72 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrsm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/level3/xtrsm.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name):
+    Xgemm<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle triangle,
+                      const Transpose a_transpose, const Diagonal diagonal,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
+
+  // Computes the k dimension. This is based on whether or not matrix is A (on the left)
+  // or B (on the right) in the Xgemm routine.
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the triangular A matrix
+  TestMatrixA(k, k, a_buffer, a_offset, a_ld);
+
+  // Checks for validity of the input/output B matrix
+  const auto b_one = (layout == Layout::kRowMajor) ? n : m;
+  const auto b_two = (layout == Layout::kRowMajor) ? m : n;
+  TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
+
+  // Creates a copy of B to avoid overwriting input in GEMM while computing output
+  const auto b_size = (b_ld * (b_two - 1) + b_one + b_offset);
+  auto b_buffer_copy = Buffer<T>(context_, b_size);
+  b_buffer.CopyTo(queue_, b_size, b_buffer_copy);
+
+  // TODO: Implement TRSM computation
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrsm<half>;
+template class Xtrsm<float>;
+template class Xtrsm<double>;
+template class Xtrsm<float2>;
+template class Xtrsm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp
new file mode 100644
index 00000000..288e9d11
--- /dev/null
+++ b/src/routines/level3/xtrsm.hpp
@@ -0,0 +1,52 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrsm routine. The implementation is based on ??? (TODO).
+// Therefore, this class inherits from the Xgemm class.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRSM_H_
+#define CLBLAST_ROUTINES_XTRSM_H_
+
+#include "routines/level3/xgemm.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtrsm: public Xgemm<T> {
+ public:
+
+  // Uses methods and variables the Xgemm routine
+  using Xgemm<T>::routine_name_;
+  using Xgemm<T>::queue_;
+  using Xgemm<T>::context_;
+  using Xgemm<T>::device_;
+  using Xgemm<T>::db_;
+  using Xgemm<T>::DoGemm;
+
+  // Constructor
+  Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM");
+
+  // Templated-precision implementation of the routine
+  void DoTrsm(const Layout layout, const Side side, const Triangle triangle,
+              const Transpose a_transpose, const Diagonal diagonal,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRSM_H_
+#endif
diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp
new file mode 100644
index 00000000..2480a817
--- /dev/null
+++ b/test/routines/level3/xtrsm.hpp
@@ -0,0 +1,159 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtrsm routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRSM_H_
+#define CLBLAST_TEST_ROUTINES_XTRSM_H_
+
+#include <vector>
+#include <string>
+
+#ifdef CLBLAST_REF_CLBLAS
+  #include "test/wrapper_clblas.hpp"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+  #include "test/wrapper_cblas.hpp"
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtrsm {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 3; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgALeadDim, kArgBLeadDim,
+            kArgAOffset, kArgBOffset,
+            kArgAlpha};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    return k * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    auto b_rotated = (args.layout == Layout::kRowMajor);
+    auto b_two = (b_rotated) ? args.m : args.n;
+    return b_two * args.b_ld + args.b_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
+                       args.m, args.n, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld,
+                       &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CLBLAS
+    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      auto queue_plain = queue();
+      auto event = cl_event{};
+      auto status = clblasXtrsm(convertToCLBLAS(args.layout),
+                                convertToCLBLAS(args.side),
+                                convertToCLBLAS(args.triangle),
+                                convertToCLBLAS(args.a_transpose),
+                                convertToCLBLAS(args.diagonal),
+                                args.m, args.n, args.alpha,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.b_mat, args.b_offset, args.b_ld,
+                                1, &queue_plain, 0, nullptr, &event);
+      clWaitForEvents(1, &event);
+      return static_cast<StatusCode>(status);
+    }
+  #endif
+
+  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CBLAS
+    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+      std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+      cblasXtrsm(convertToCBLAS(args.layout),
+                 convertToCBLAS(args.side),
+                 convertToCBLAS(args.triangle),
+                 convertToCBLAS(args.a_transpose),
+                 convertToCBLAS(args.diagonal),
+                 args.m, args.n, args.alpha,
+                 a_mat_cpu, args.a_offset, args.a_ld,
+                 b_mat_cpu, args.b_offset, args.b_ld);
+      buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.b_size, static_cast<T>(0));
+    buffers.b_mat.Read(queue, args.b_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.b_ld + id2 + args.b_offset:
+           id2*args.b_ld + id1 + args.b_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    return args.m * args.n * k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    return (k*k + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRSM_H_
+#endif

From 4a4be0c3a5e603a9c0c4c7aebcb660b5e62b99ad Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 15 Jan 2017 17:17:40 +0100
Subject: [PATCH 02/24] Prints additional information in verbose/debug mode

---
 src/cache.cpp                 | 8 ++++----
 test/correctness/testblas.cpp | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/cache.cpp b/src/cache.cpp
index 6786eaa2..a2e51792 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -24,7 +24,7 @@ namespace clblast {
 void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
                         const Precision &precision, const std::string &routine_name) {
   #ifdef VERBOSE
-    printf("[DEBUG] Storing binary in cache\n");
+    printf("[DEBUG] Storing binary '%s' in cache\n", routine_name.c_str());
   #endif
   binary_cache_mutex_.lock();
   binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
@@ -35,7 +35,7 @@ void StoreBinaryToCache(const std::string &binary, const std::string &device_nam
 void StoreProgramToCache(const Program &program, const Context &context,
                          const Precision &precision, const std::string &routine_name) {
   #ifdef VERBOSE
-    printf("[DEBUG] Storing program in cache\n");
+    printf("[DEBUG] Storing program '%s' in cache\n", routine_name.c_str());
   #endif
   program_cache_mutex_.lock();
   program_cache_.push_back(ProgramCache{program, context(), precision, routine_name});
@@ -47,7 +47,7 @@ void StoreProgramToCache(const Program &program, const Context &context,
 const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
                                       const std::string &routine_name) {
   #ifdef VERBOSE
-    printf("[DEBUG] Retrieving binary from cache\n");
+    printf("[DEBUG] Retrieving binary '%s' from cache\n", routine_name.c_str());
   #endif
   binary_cache_mutex_.lock();
   for (auto &cached_binary: binary_cache_) {
@@ -65,7 +65,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
 const Program& GetProgramFromCache(const Context &context, const Precision &precision,
                                    const std::string &routine_name) {
   #ifdef VERBOSE
-    printf("[DEBUG] Retrieving program from cache\n");
+    printf("[DEBUG] Retrieving program '%s' from cache\n", routine_name.c_str());
   #endif
   program_cache_mutex_.lock();
   for (auto &cached_program: program_cache_) {
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 5fddb37b..7c64855a 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -138,7 +138,10 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
 
     // Don't continue with CBLAS if there are incorrect parameters
     if (compare_cblas_ && status2 != StatusCode::kSuccess) {
-      if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; }
+      if (verbose_) {
+        fprintf(stdout, " -> %d -> ", static_cast<int>(status2));
+        std::cout << std::flush;
+      }
       TestErrorCodes(status2, status2, args);
       continue;
     }

From 4b3ffd998904f5c848edc5917308f5942fa71da3 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 15 Jan 2017 17:30:00 +0100
Subject: [PATCH 03/24] Added a first version of the diagonal block invert
 routine in preparation of TRSM

---
 CMakeLists.txt                                |   2 +-
 src/database/database.cpp                     |   2 +
 src/database/kernels/invert.hpp               |  78 ++++
 src/kernels/common.opencl                     |  14 +
 .../level3/invert_diagonal_blocks.opencl      | 440 ++++++++++++++++++
 src/routines/common.hpp                       |  21 +
 src/routines/levelx/xinvert.cpp               | 152 ++++++
 src/routines/levelx/xinvert.hpp               |  40 ++
 src/utilities/utilities.cpp                   |  24 +
 src/utilities/utilities.hpp                   |   4 +
 test/correctness/routines/levelx/xinvert.cpp  |  30 ++
 test/performance/routines/levelx/xinvert.cpp  |  37 ++
 test/routines/levelx/xinvert.hpp              | 219 +++++++++
 13 files changed, 1062 insertions(+), 1 deletion(-)
 create mode 100644 src/database/kernels/invert.hpp
 create mode 100644 src/kernels/level3/invert_diagonal_blocks.opencl
 create mode 100644 src/routines/levelx/xinvert.cpp
 create mode 100644 src/routines/levelx/xinvert.hpp
 create mode 100644 test/correctness/routines/levelx/xinvert.cpp
 create mode 100644 test/performance/routines/levelx/xinvert.cpp
 create mode 100644 test/routines/levelx/xinvert.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6fa4045..a9cabac7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -159,7 +159,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
-set(LEVELX_ROUTINES xomatcopy)
+set(LEVELX_ROUTINES xomatcopy xinvert)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464 16)
 
diff --git a/src/database/database.cpp b/src/database/database.cpp
index cf548d46..bcbd9955 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -26,6 +26,7 @@
 #include "database/kernels/pad.hpp"
 #include "database/kernels/transpose.hpp"
 #include "database/kernels/padtranspose.hpp"
+#include "database/kernels/invert.hpp"
 #include "database/kernel_selection.hpp"
 
 namespace clblast {
@@ -45,6 +46,7 @@ const std::vector<const Database::DatabaseEntry*> Database::database = {
   &database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
   &database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
   &database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
+  &database::InvertHalf, &database::InvertSingle, &database::InvertDouble, &database::InvertComplexSingle, &database::InvertComplexDouble,
   &database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
 };
 
diff --git a/src/database/kernels/invert.hpp b/src/database/kernels/invert.hpp
new file mode 100644
index 00000000..2717f182
--- /dev/null
+++ b/src/database/kernels/invert.hpp
@@ -0,0 +1,78 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// Tuning parameters for the diagonal matrix inversion kernels
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+// =================================================================================================
+
+const Database::DatabaseEntry InvertHalf = {
+  "Invert", Precision::kHalf, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"INTERNAL_BLOCK_SIZE",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry InvertSingle = {
+  "Invert", Precision::kSingle, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"INTERNAL_BLOCK_SIZE",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry InvertComplexSingle = {
+  "Invert", Precision::kComplexSingle, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"INTERNAL_BLOCK_SIZE",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry InvertDouble = {
+  "Invert", Precision::kDouble, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"INTERNAL_BLOCK_SIZE",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry InvertComplexDouble = {
+  "Invert", Precision::kComplexDouble, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"INTERNAL_BLOCK_SIZE",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace database
+} // namespace clblast
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index b0817242..bfa1042d 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -162,6 +162,13 @@ R"(
   #define AbsoluteValue(value) value = fabs(value)
 #endif
 
+// Negation (component-wise)
+#if PRECISION == 3232 || PRECISION == 6464
+  #define Negate(value) value.x = -(value.x); value.y = -(value.y)
+#else
+  #define Negate(value) value = -(value)
+#endif
+
 // Adds two complex variables
 #if PRECISION == 3232 || PRECISION == 6464
   #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
@@ -193,6 +200,13 @@ R"(
   #endif
 #endif
 
+// The scalar division function
+#if PRECISION == 3232 || PRECISION == 6464
+  #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.x
+#else
+  #define DivideReal(c, a, b) c = a / b
+#endif
+
 // The scalar AXPBY function
 #if PRECISION == 3232 || PRECISION == 6464
   #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
new file mode 100644
index 00000000..9231d725
--- /dev/null
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -0,0 +1,440 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to invert squared diagonal blocks of a matrix. These kernels are based
+// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
+// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
+// and Jack Dongarra.
+//
+// =================================================================================================
+//
+//  Let A be an block_size*block_size lower triangular matrix, and B its inverse.
+//  Then the block decomposition
+//  
+//      [ A11   0  ] * [ B11   0  ] = [ I 0 ]
+//      [ A21  A22 ]   [ B21  B22 ]   [ 0 I ]
+//  
+//  yields
+//  
+//      A11*B11 = I            ==>  B11 =  A11^{-1},
+//      A22*B22 = I            ==>  B22 =  A22^{-1},
+//      A21*B11 + A22*B21 = 0  ==>  B21 = -A22^{-1}*A21*B11 = -B22*A21*B11.
+//  
+//  The InvertDiagonalBlock kernel inverts A11 and A22.
+//  The TripleMatMul routines multiply:
+//  part 1:  B21 =  A21 * B11,
+//  part 2:  B21 = -B22 * B21.
+//  
+//  At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each
+//  submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2
+//  at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block,
+//  here current_size*2=32,
+//                 [ B11   0  ]
+//  which contains [ B21  B22 ].
+//  Outer blocks are block_size x block_size.
+//  
+//  A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on
+//  the right. This makes a single check easy to do.
+//  
+//  B is stored in workspace that is a full multiple of block_size x block_size; no checks needed.
+//  
+//  We split this into part1 & part2 to synchronize all blocks and make sure
+//  that writes to B12 are observed by all blocks.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_INVERT)
+
+#define LOCALX 17 // 16 + 1 to avoid bank conflicts
+#define LOCALY 16
+
+// =================================================================================================
+
+__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
+void FillMatrix(const int n, const int ld, const int offset,
+                __global real* restrict dest, const real_arg arg_value) {
+  const real value = GetRealArg(arg_value);
+  const int id_one = get_global_id(0);
+  const int id_two = get_global_id(1);
+  if (id_one < ld && id_two < n) {
+    dest[id_two*ld + id_one + offset] = value;
+  }
+}
+
+// =================================================================================================
+
+// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
+__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
+void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
+                         __global real* restrict dest, const int outer_block_size,
+                         const int unit_diagonal, const int is_upper)
+{
+  const int thread_index = get_local_id(0);
+  const int block_index = get_group_id(0);
+
+  // Sets the offset for this particular block in the source and destination matrices
+  const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
+  const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
+  const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block,
+                                (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
+
+  // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
+  __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
+
+  // Loads the source lower triangle into local memory. Any values in the upper triangle or
+  // outside of the matrix are set to zero
+  #pragma unroll
+  for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+    const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
+                                        (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+    if (condition) {
+      lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
+    }
+    else {
+      SetToZero(lm[thread_index][j]);
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  
+  // Inverts the diagonal
+  real inverted_diagonal;
+  SetToOne(inverted_diagonal);
+  if (unit_diagonal == 0) {
+    const real diagonal_value = lm[thread_index][thread_index];
+    if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
+      DivideReal(inverted_diagonal, inverted_diagonal, diagonal_value);
+    }
+  }
+  lm[thread_index][thread_index] = inverted_diagonal;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Upper-triangular
+  if (is_upper) {
+
+    // Computes the elements 0:j-1 of the j-th column
+    for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
+      if (thread_index < j) {
+        real sum;
+        SetToZero(sum);
+        #pragma unroll
+        for (int k = 0; k < j; ++k) {
+          MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
+        }
+        real diagonal_value = lm[j][j];
+        Negate(diagonal_value);
+        Multiply(lm[thread_index][j], diagonal_value, sum);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  }
+
+  // Lower triangular
+  else {
+
+    // Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
+    for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
+      if (thread_index > j) {
+        real sum;
+        SetToZero(sum);
+        #pragma unroll
+        for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
+          MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
+        }
+        Multiply(lm[thread_index][j], -lm[j][j], sum);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  }
+  
+  // Writes the result to global memory
+  #pragma unroll
+  for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
+    dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
+  }
+}
+
+// =================================================================================================
+
+// Triple matrix-multiplication kernel: C = A * B
+inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
+                         __global const real* agm, __global const real* bgm, __global real* cgm,
+                         const int lda, const int ldb, const int ldc,
+                         int current_size, int num_pages, const int block_size) {
+
+  // Emulates a 3D grid: NX * (NY * num_pages)
+  const int by   = get_group_id(1) / num_pages;
+  const int page = get_group_id(1) % num_pages;
+  const int lidx = get_local_id(0);
+  const int lidy = get_local_id(1);
+  const int ibx  = get_group_id(0) * (get_local_size(0)*get_local_size(1));
+  const int iby  = by*16;
+  const int id   = lidx + lidy*get_local_size(0);
+  const int row  = page*current_size*2 + current_size + ibx + id;
+  int col        = page*current_size*2 + current_size;
+
+  // Sets the offsets for this specific thread
+  agm += ibx + id;
+  bgm += lidx + (iby + lidy)*ldb;
+  cgm += ibx + id + iby*ldc;
+
+  // Initializes the result registers
+  real cpm[16];
+  #pragma unroll
+  for (int j = 0; j < 16; ++j) {
+    SetToZero(cpm[j]);
+  }
+
+  // Computes NT x 16 block of C, each thread computes one 1 x 16 row
+  for (int k = 0; k < current_size; k += 16) {
+
+    // Loads a 16 x 16 block of B into local memory using NX x 4 threads
+    #pragma unroll
+    for( int i=0; i < 16; i += (size/4) ) {  // += get_local_size(0)
+      #pragma unroll
+      for( int j=0; j < 16; j += 4 ) {  // += get_local_size(1)
+        blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
+      }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Upper triangular
+    if (upper) {
+
+      // Performs 16 x 16 multiply-add operations
+      #pragma unroll
+      for (int i = 0; i < 16; ++i) {
+        if (part == 2 || col++ < n) {
+          #pragma unroll
+          for (int j = 0; j < 16; ++j) {
+            MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
+          }
+        }
+      }
+    }
+
+    // Lower triangular
+    else {
+      if (row < n) {
+
+        // Performs 16 x 16 multiply-add operations
+        #pragma unroll
+        for (int i = 0; i < 16; ++i) {
+          #pragma unroll
+          for (int j = 0; j < 16; ++j) {
+            MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
+          }
+        }
+      }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  // Stores NT x 16 results: each thread writes one 16 x 1 row
+  #pragma unroll
+  for (int i = 0; i < 16; ++i) {
+    if (part == 2) { Negate(cpm[i]); }
+    cgm[0] = cpm[i];
+    cgm += ldc;
+  }
+}
+
+// =================================================================================================
+
+// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
+inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
+                              __global const real* src, const int a_offset, const int lda,
+                              __global real* dest, int current_size, int num_pages, const int block_size) {
+
+  // Emulates a 3D grid: NX * (NY * num_pages)
+  const int page = get_group_id(1) % num_pages;
+
+  // Computes the destination block offset:
+  // - go to the (page / pages_per_block) outer block_size * block_size block
+  // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
+  const int pages_per_block = block_size / (current_size*2);
+  dest += (page / pages_per_block) * block_size * block_size +
+          (page % pages_per_block) * (current_size*2*block_size + current_size*2);
+
+  // Using the GEMM notation: C = A*B
+  __global const real* agm;
+  __global const real* bgm;
+  __global real* cgm;
+  if (upper) { // upper triangular: B12 = A12 * B22
+    agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda;  // A12
+    bgm = dest + current_size*block_size + current_size;                                      // B22
+    cgm = dest + current_size*block_size;                                                     // B12
+  }
+  else { // lower triangular: B21 = A21 * B11
+    agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size;  // A21
+    bgm = dest;                                                                           // B11
+    cgm = dest + current_size;                                                            // B21
+  }
+
+  // Runs the generic C = A * B matrix multiplication
+  const int ldb = block_size;
+  const int ldc = block_size;
+  TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
+}
+
+// Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
+inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
+                              __global real* dest, int current_size, int num_pages, const int block_size) {
+
+  // Emulates a 3D grid: NX * (NY * num_pages)
+  const int page = get_group_id(1) % num_pages;
+
+  // Computes the destination block offset:
+  // - go to the (page / pages_per_block) outer block_size * block_size block
+  // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
+  const int pages_per_block = block_size / (current_size*2);
+  dest += (page / pages_per_block) * block_size * block_size +
+          (page % pages_per_block) * (current_size*2*block_size + current_size*2);
+
+  // Using the GEMM notation: C = A*B
+  __global const real* agm;
+  __global const real* bgm;
+  __global real* cgm;
+  if (upper) { // upper triangular: B12 = -B11 * B12
+    agm = dest;                            // B11
+    cgm = dest + current_size*block_size;  // B12
+    bgm = cgm;                             // B12, okay to overwrite
+  }
+
+  else { // lower triangular: B21 = -B22 * B21
+    agm = dest + current_size*block_size + current_size;  // B22
+    cgm = dest + current_size;                            // B21
+    bgm = cgm;                                            // B21, okay to overwrite
+  }
+
+  // Runs the generic C = A * B matrix multiplication
+  const int lda = block_size;
+  const int ldb = block_size;
+  const int ldc = block_size;
+  TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
+}
+
+// =================================================================================================
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B21 = A21 * B11
+__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
+                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B21 = -B22 * B21
+__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// =================================================================================================
+
+// B12 =  A12 * B22
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B12 =  A12 * B22
+__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+// B12 =  A12 * B22
+__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
+                              __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
+}
+
+// B12 = -B11 * B12
+__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
+{
+  __local real lm[LOCALY * LOCALX];
+  TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 53ca6355..47f98de0 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -33,6 +33,27 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
 
 // =================================================================================================
 
+// Sets all elements of a matrix to a constant value
+template <typename T>
+void FillMatrix(Queue &queue, const Device &device,
+                const Program &program, const Database &,
+                EventPointer event, const std::vector<Event> &waitForEvents,
+                const size_t n, const size_t ld, const size_t offset,
+                const Buffer<T> &dest,
+                const T constant_value) {
+  auto kernel = Kernel(program, "FillMatrix");
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, static_cast<int>(ld));
+  kernel.SetArgument(2, static_cast<int>(offset));
+  kernel.SetArgument(3, dest());
+  kernel.SetArgument(4, GetRealArg(constant_value));
+  auto local = std::vector<size_t>{8, 8};
+  auto global = std::vector<size_t>{Ceil(ld, 8), Ceil(n, 8)};
+  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+}
+
+// =================================================================================================
+
 // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
 // to write to symmetric and triangular matrices through optional arguments.
 template <typename T>
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
new file mode 100644
index 00000000..5ffba958
--- /dev/null
+++ b/src/routines/levelx/xinvert.cpp
@@ -0,0 +1,152 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the common code to perform (partial) matrix inverting. This code is based
+// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
+// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
+// and Jack Dongarra.
+//
+// =================================================================================================
+
+#include "routines/levelx/xinvert.hpp"
+
+#include <string>
+#include <vector>
+#include <assert.h>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
+    Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
+    #include "../../kernels/level3/invert_diagonal_blocks.opencl"
+    }) {
+}
+
+// =================================================================================================
+
+// Inverts diagonal square blocks of a matrix
+template <typename T>
+void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
+                                            const size_t n, const size_t block_size,
+                                            const Buffer<T> &src, const size_t offset, const size_t ld_src,
+                                            Buffer<T> &dest) {
+
+  // Makes sure all dimensions are larger than zero and the block size is smaller than n
+  if ((block_size == 0) || (n == 0) || (block_size > n)) {
+    throw BLASError(StatusCode::kInvalidDimension);
+  }
+
+  // Helper variables
+  const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
+  assert(internal_block_size == 16);
+  const auto num_blocks = CeilDiv(n, block_size);
+  const auto num_internal_blocks = CeilDiv(n, internal_block_size);
+  const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;
+
+  // This routine only supports block sizes which are a multiple of the internal block size and
+  // block sizes up to and including 128
+  if ((block_size % internal_block_size != 0) || (block_size > 128)) {
+    throw BLASError(StatusCode::kInvalidDimension);
+  }
+
+  // Checks for validity of the source and destination matrices
+  TestMatrixA(n, n, src, offset, ld_src);
+  TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size);
+
+  // Determines which kernels to run based on the layout (the kernels assume column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
+  const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  const auto name_postfix = (is_upper) ? "Upper" : "Lower";
+
+  // Retrieves the program from the cache
+  auto event_wait_list = std::vector<Event>();
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "INVERT");
+
+  // Fills the output buffer with zeros
+  auto fill_matrix_event = Event();
+  FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), event_wait_list,
+             num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
+  event_wait_list.push_back(fill_matrix_event);
+
+  // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
+  auto kernel = Kernel(program, "InvertDiagonalBlock");
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, src());
+  kernel.SetArgument(2, static_cast<int>(offset));
+  kernel.SetArgument(3, static_cast<int>(ld_src));
+  kernel.SetArgument(4, dest());
+  kernel.SetArgument(5, static_cast<int>(block_size));
+  kernel.SetArgument(6, static_cast<int>(unit_diagonal));
+  kernel.SetArgument(7, static_cast<int>(is_upper));
+  const auto local = std::vector<size_t>{internal_block_size};
+  const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
+  auto base_kernel_event = Event();
+  RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list);
+  event_wait_list.push_back(base_kernel_event);
+
+  // Builds up block_size x block_size blocks. For example, internal_block_size=16:
+  // use   16 x 16  blocks to build  32 x 32  blocks,  1 x (1 x npages) grid,  4 x 4 threads;
+  // then  32 x 32  blocks to build  64 x 64  blocks,  1 x (2 x npages) grid,  8 x 4 threads;
+  // then  64 x 64  blocks to build 128 x 128 blocks,  1 x (4 x npages) grid, 16 x 4 threads;
+  for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) {
+    assert(current_size == 16 || current_size == 32 || current_size == 64);
+
+    // Emulates a 3D grid: NX * (NY * npages)
+    const auto npages = CeilDiv(n, current_size*2);
+    const auto local0 = (current_size <= 32) ? current_size/4 : 16;
+    const auto local = std::vector<size_t>{local0, 4};
+    const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
+
+    // Part 1
+    auto kernel1 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
+    kernel1.SetArgument(0, static_cast<int>(n));
+    kernel1.SetArgument(1, src());
+    kernel1.SetArgument(2, static_cast<int>(offset));
+    kernel1.SetArgument(3, static_cast<int>(ld_src));
+    kernel1.SetArgument(4, dest());
+    kernel1.SetArgument(5, static_cast<int>(current_size));
+    kernel1.SetArgument(6, static_cast<int>(npages));
+    kernel1.SetArgument(7, static_cast<int>(block_size));
+    auto kernel1_event = Event();
+    RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list);
+    event_wait_list.push_back(kernel1_event);
+
+    // Part 2
+    const bool is_last_kernel = (current_size * 2 >= block_size);
+    auto kernel2 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
+    kernel2.SetArgument(0, static_cast<int>(n));
+    kernel2.SetArgument(1, dest());
+    kernel2.SetArgument(2, static_cast<int>(current_size));
+    kernel2.SetArgument(3, static_cast<int>(npages));
+    kernel2.SetArgument(4, static_cast<int>(block_size));
+    auto kernel2_event = Event();
+    auto eventPointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
+    RunKernel(kernel2, queue_, device_, global, local, eventPointer, event_wait_list);
+    if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
+
+    // Exit in case we reach beyond the bounds of the input matrix
+    if (current_size*2 >= n) { break; }
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xinvert<half>;
+template class Xinvert<float>;
+template class Xinvert<double>;
+template class Xinvert<float2>;
+template class Xinvert<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/levelx/xinvert.hpp b/src/routines/levelx/xinvert.hpp
new file mode 100644
index 00000000..fa0a80e7
--- /dev/null
+++ b/src/routines/levelx/xinvert.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the common code to perform (partial) matrix inverting.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XINVERT_H_
+#define CLBLAST_ROUTINES_XINVERT_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+template <typename T>
+class Xinvert: public Routine {
+ public:
+
+  // Constructor
+  Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT");
+
+  // Inverts diagonal square blocks of a matrix
+  void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
+                                  const size_t n, const size_t block_size,
+                                  const Buffer<T> &src, const size_t offset, const size_t ld_src,
+                                  Buffer<T> &dest);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XINVERT_H_
+#endif
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index 5e445bb9..b8e011c5 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -46,6 +46,30 @@ double2 GetScalar() {
   return {2.0, 0.5};
 }
 
+// Returns a scalar of value 0
+template <typename T>
+T ConstantZero() {
+  return static_cast<T>(0.0);
+}
+template float ConstantZero<float>();
+template double ConstantZero<double>();
+
+// Specialized version of the above for half-precision
+template <>
+half ConstantZero() {
+  return FloatToHalf(0.0f);
+}
+
+// Specialized versions of the above for complex data-types
+template <>
+float2 ConstantZero() {
+  return {0.0f, 0.0f};
+}
+template <>
+double2 ConstantZero() {
+  return {0.0, 0.0};
+}
+
 // Returns a scalar of value 1
 template <typename T>
 T ConstantOne() {
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 20587bd4..26dee23b 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -102,6 +102,10 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
 template <typename T>
 T GetScalar();
 
+// Returns a scalar of value 0
+template <typename T>
+T ConstantZero();
+
 // Returns a scalar of value 1
 template <typename T>
 T ConstantOne();
diff --git a/test/correctness/routines/levelx/xinvert.cpp b/test/correctness/routines/levelx/xinvert.cpp
new file mode 100644
index 00000000..0ccc0829
--- /dev/null
+++ b/test/correctness/routines/levelx/xinvert.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/levelx/xinvert.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  auto errors = size_t{0};
+  errors += clblast::RunTests<clblast::TestXinvert<float>, float, float>(argc, argv, false, "SINVERT");
+  errors += clblast::RunTests<clblast::TestXinvert<double>, double, double>(argc, argv, true, "DINVERT");
+  errors += clblast::RunTests<clblast::TestXinvert<float2>, float2, float2>(argc, argv, true, "CINVERT");
+  errors += clblast::RunTests<clblast::TestXinvert<double2>, double2, double2>(argc, argv, true, "ZINVERT");
+  errors += clblast::RunTests<clblast::TestXinvert<half>, half, half>(argc, argv, true, "HINVERT");
+  if (errors > 0) { return 1; } else { return 0; }
+}
+
+// =================================================================================================
diff --git a/test/performance/routines/levelx/xinvert.cpp b/test/performance/routines/levelx/xinvert.cpp
new file mode 100644
index 00000000..87f36b1e
--- /dev/null
+++ b/test/performance/routines/levelx/xinvert.cpp
@@ -0,0 +1,37 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/performance/client.hpp"
+#include "test/routines/levelx/xinvert.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
+  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXinvert<half>, half, half>(argc, argv); break;
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXinvert<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXinvert<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXinvert<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXinvert<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp
new file mode 100644
index 00000000..4408e8d5
--- /dev/null
+++ b/test/routines/levelx/xinvert.hpp
@@ -0,0 +1,219 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xinvert routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_
+#define CLBLAST_TEST_ROUTINES_XINVERT_H_
+
+#include <vector>
+#include <string>
+
+#include "routines/levelx/xinvert.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+template <typename T>
+StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) ||
+                         (args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor));
+
+  // Data transfer from OpenCL to std::vector
+  std::vector<T> a_mat_cpu(args.a_size, T{0.0});
+  buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+
+  // Creates the output buffer
+  std::vector<T> b_mat_cpu(args.b_size, T{0.0});
+
+  // Helper variables
+  const auto block_size = args.m;
+  const auto num_blocks = CeilDiv(args.n, block_size);
+  const auto a_ld = args.a_ld;
+  const auto b_ld = block_size;
+  if ((block_size == 0) || (args.n == 0) || (block_size > args.n)) {
+    return StatusCode::kInvalidDimension;
+  }
+
+  // Loops over the amount of diagonal blocks of size args.m by args.m each
+  for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) {
+    const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset;
+    const auto b_offset = block_id * block_size * block_size;
+
+    // Inverts the diagonal elements of the matrix
+    for (auto i = size_t{0}; i < block_size; ++i) {
+      auto a_value = T{1.0};
+      if (args.diagonal == Diagonal::kNonUnit) {
+        if (i + block_id * block_size < args.n) {
+          if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; }
+          a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset];
+        }
+      }
+      b_mat_cpu[i * b_ld + i + b_offset] = a_value;
+    }
+
+    // Inverts the upper triangle row by row
+    if (is_upper) {
+      for (int i = static_cast<int>(block_size) - 2; i >= 0; --i) {
+        for (auto j = static_cast<int>(block_size) - 1; j > i; --j) {
+          auto sum = T{0.0};
+          for (auto k = i + 1; k <= j; ++k) {
+            auto a_value = T{0.0};
+            if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
+              a_value = a_mat_cpu[k * a_ld + i + a_offset];
+            }
+            sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
+          }
+          b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
+        }
+      }
+    }
+
+    // Inverts the lower triangle row by row
+    else {
+      for (auto i = size_t{1}; i < block_size; ++i) {
+        for (auto j = size_t{0}; j < i; ++j) {
+          auto sum = T{0.0};
+          for (auto k = j; k < i; ++k) {
+            auto a_value = T{0.0};
+            if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
+              a_value = a_mat_cpu[k * a_ld + i + a_offset];
+            }
+            sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
+          }
+          b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
+        }
+      }
+    }
+  }
+
+  // Data transfer back to OpenCL
+  buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+  return StatusCode::kSuccess;
+}
+
+// Half-precision version calling the above reference implementation after conversions
+template <>
+StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) {
+  auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue());
+  auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue());
+  auto dummy = clblast::Buffer<float>(0);
+  auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
+  auto args2 = Arguments<float>();
+  args2.a_size = args.a_size; args2.b_size = args.b_size;
+  args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n;
+  args2.a_offset = args.a_offset;
+  args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal;
+  auto status = RunReference(args2, buffers2, queue);
+  FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue());
+  return status;
+}
+
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXinvert {
+ public:
+
+  // The BLAS level: 4 for the extra routines
+  static size_t BLASLevel() { return 4; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgM,
+            kArgLayout, kArgTriangle, kArgDiagonal,
+            kArgALeadDim, kArgAOffset};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    const auto block_size = args.m;
+    const auto num_blocks = CeilDiv(args.n, block_size);
+    return num_blocks * block_size * block_size;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which omatcopyose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    try {
+      auto event = cl_event{};
+      auto inverter = Xinvert<T>(queue, &event);
+      inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
+                                          args.n, args.m,
+                                          buffers.a_mat, args.a_offset, args.a_ld,
+                                          buffers.b_mat);
+      clWaitForEvents(1, &event);
+      clReleaseEvent(event);
+    } catch (...) { return DispatchException(); }
+    return StatusCode::kSuccess;
+  }
+
+  // Describes how to run a naive version of the routine (for correctness/performance comparison).
+  // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
+  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    return RunReference(args, buffers, queue);
+  }
+
+  static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    return RunReference(args, buffers, queue);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.b_size, static_cast<T>(0));
+    buffers.b_mat.Read(queue, args.b_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return Ceil(args.n, args.m); }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return id1 * Ceil(args.n, args.m) + id2;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    const auto block_size = args.m;
+    const auto num_blocks = CeilDiv(args.n, block_size);
+    return num_blocks * (block_size * (block_size / 2) * (block_size / 2));
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.a_size * args.b_size) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XINVERT_H_
+#endif

From df9a77d74d87fb8832264e9e9a37336001873151 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 18 Jan 2017 21:29:59 +0100
Subject: [PATCH 04/24] Added first version of the TRSM routine based on the
 diagonal invert kernel

---
 .../level3/invert_diagonal_blocks.opencl      |  13 --
 src/kernels/level3/level3.opencl              |  16 ++
 src/routines/level3/xtrsm.cpp                 | 145 +++++++++++++++++-
 src/routines/levelx/xinvert.cpp               |  10 +-
 src/utilities/utilities.cpp                   |  24 +++
 src/utilities/utilities.hpp                   |   4 +
 test/routines/level3/xtrsm.hpp                |   6 +-
 7 files changed, 191 insertions(+), 27 deletions(-)

diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index 9231d725..e94b4d30 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -61,19 +61,6 @@ R"(
 
 // =================================================================================================
 
-__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
-void FillMatrix(const int n, const int ld, const int offset,
-                __global real* restrict dest, const real_arg arg_value) {
-  const real value = GetRealArg(arg_value);
-  const int id_one = get_global_id(0);
-  const int id_two = get_global_id(1);
-  if (id_one < ld && id_two < n) {
-    dest[id_two*ld + id_one + offset] = value;
-  }
-}
-
-// =================================================================================================
-
 // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
 __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
 void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl
index bf14ab12..0f5a8607 100644
--- a/src/kernels/level3/level3.opencl
+++ b/src/kernels/level3/level3.opencl
@@ -73,6 +73,22 @@ R"(
   #define PADTRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
 #endif
 
+// =================================================================================================
+#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
+
+__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
+void FillMatrix(const int n, const int ld, const int offset,
+                __global real* restrict dest, const real_arg arg_value) {
+  const real value = GetRealArg(arg_value);
+  const int id_one = get_global_id(0);
+  const int id_two = get_global_id(1);
+  if (id_one < ld && id_two < n) {
+    dest[id_two*ld + id_one + offset] = value;
+  }
+}
+
+#endif
+
 // =================================================================================================
 
 // End of the C++11 raw string literal
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 0ac1a58e..8061b508 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -7,11 +7,15 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xtrsm class (see the header for information about the class).
+// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based
+// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
+// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
+// and Jack Dongarra.
 //
 // =================================================================================================
 
 #include "routines/level3/xtrsm.hpp"
+#include "routines/levelx/xinvert.hpp"
 
 #include <string>
 #include <vector>
@@ -25,6 +29,7 @@ Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name):
     Xgemm<T>(queue, event, name) {
 }
 
+
 // =================================================================================================
 
 // The main routine
@@ -36,27 +41,153 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
                       const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                       const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
 
+  // Settings
+  constexpr auto block_size = size_t{32}; // tuneable
+
   // Makes sure all dimensions are larger than zero
   if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
 
   // Computes the k dimension. This is based on whether or not matrix is A (on the left)
   // or B (on the right) in the Xgemm routine.
-  auto k = (side == Side::kLeft) ? m : n;
+  const auto k = (side == Side::kLeft) ? m : n;
 
   // Checks for validity of the triangular A matrix
   TestMatrixA(k, k, a_buffer, a_offset, a_ld);
 
-  // Checks for validity of the input/output B matrix
+  // Determines which kernels to run based on the layout (the kernels assume column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
+  const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Checks for validity of the input B matrix
   const auto b_one = (layout == Layout::kRowMajor) ? n : m;
   const auto b_two = (layout == Layout::kRowMajor) ? m : n;
   TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
 
   // Creates a copy of B to avoid overwriting input in GEMM while computing output
-  const auto b_size = (b_ld * (b_two - 1) + b_one + b_offset);
-  auto b_buffer_copy = Buffer<T>(context_, b_size);
-  b_buffer.CopyTo(queue_, b_size, b_buffer_copy);
+  const auto b_size = b_ld * (b_two - 1) + b_one + b_offset;
+  const auto x_one = b_one;
+  const auto x_size = b_size;
+  const auto x_ld = b_ld;
+  const auto x_offset = b_offset;
+  auto x_buffer = Buffer<T>(context_, x_size);
+  b_buffer.CopyTo(queue_, x_size, x_buffer);
 
-  // TODO: Implement TRSM computation
+  // Temporary buffer for the inverse of the A matrix
+  const auto a_inv_size = Ceil(k, block_size) * block_size;
+  auto a_inv_buffer = Buffer<T>(context_, a_inv_size);
+
+  // Fills the output buffer with zeros
+  auto eventWaitList = std::vector<Event>();
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSM");
+  auto fill_matrix_event = Event();
+  FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), eventWaitList,
+             x_one, x_ld, x_offset, x_buffer, ConstantZero<T>());
+  fill_matrix_event.WaitForCompletion();
+
+  // Inverts the diagonal blocks
+  auto diagonal_invert_event = Event();
+  auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer());
+  inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal,
+                                      k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer);
+  diagonal_invert_event.WaitForCompletion();
+
+  // Lower of upper triangular
+  const bool condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) ||
+                          (triangle == Triangle::kLower && a_transpose == Transpose::kNo));
+
+  // Left side
+  if (side == Side::kLeft) {
+
+    // True when (lower triangular) or (upper triangular and transposed)
+    if (condition) {
+      for (auto i = size_t{0}; i < m; i += block_size) {
+        const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
+        const auto current_block_size = std::min(m - i, block_size);
+        DoGemm(layout, a_transpose, Transpose::kNo,
+               current_block_size, n, current_block_size, gemm_alpha,
+               a_inv_buffer, i * block_size, block_size,
+               b_buffer, i, b_ld, ConstantZero<T>(),
+               x_buffer, i, x_ld);
+        if (i + block_size >= m) { break; }
+        const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld;
+        DoGemm(layout, a_transpose, Transpose::kNo,
+               m - i - block_size, n, block_size, ConstantNegOne<T>(),
+               a_buffer, this_a_offset, a_ld,
+               x_buffer, i, x_ld, ConstantOne<T>(),
+               b_buffer, i + block_size, b_ld);
+      }
+    }
+
+    // True when (upper triangular) or (lower triangular and transposed)
+    else {
+      const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size);
+      const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size);
+      for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
+        const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
+        DoGemm(layout, a_transpose, Transpose::kNo,
+               block_size, n, block_size, gemm_alpha,
+               a_inv_buffer, i * block_size, block_size,
+               b_buffer, i, b_ld, ConstantZero<T>(),
+               x_buffer, i, x_ld);
+        if (i - static_cast<int>(block_size) < 0) { break; }
+        const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i;
+        DoGemm(layout, a_transpose, Transpose::kNo,
+               i, n, block_size, ConstantNegOne<T>(),
+               a_buffer, this_a_offset, a_ld,
+               x_buffer, i, x_ld, ConstantOne<T>(),
+               b_buffer, 0, b_ld);
+      }
+    }
+  }
+
+  // Right side
+  else {
+
+    // True when (lower triangular) or (upper triangular and transposed)
+    if (condition) {
+      const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size);
+      const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size);
+      for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
+        const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
+        DoGemm(layout, Transpose::kNo, a_transpose,
+               m, block_size, block_size, gemm_alpha,
+               b_buffer, i * b_ld, b_ld,
+               a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
+               x_buffer, i * x_ld, x_ld);
+        if (i - static_cast<int>(block_size) < 0) { break; }
+        const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
+        DoGemm(layout, Transpose::kNo, a_transpose,
+               m, i, block_size, ConstantNegOne<T>(),
+               x_buffer, i * x_ld, x_ld,
+               a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
+               b_buffer, 0, b_ld);
+      }
+    }
+
+    // True when (upper triangular) or (lower triangular and transposed)
+    else {
+      for (auto i = size_t{0}; i < n; i += block_size) {
+        const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
+        const auto current_block_size = std::min(n - i, block_size);
+        DoGemm(layout, Transpose::kNo, a_transpose,
+               m, current_block_size, current_block_size, gemm_alpha,
+               b_buffer, i * b_ld, b_ld,
+               a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
+               x_buffer, i * x_ld, x_ld);
+        if (i + block_size >= n) { break; }
+        const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld;
+        DoGemm(layout, Transpose::kNo, a_transpose,
+               m, n - i - block_size, block_size, ConstantNegOne<T>(),
+               x_buffer, i * x_ld, x_ld,
+               a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
+               b_buffer, (i + block_size) * b_ld, b_ld);
+      }
+    }
+  }
+
+  // Retrieves the results
+  x_buffer.CopyTo(queue_, b_size, b_buffer);
 }
 
 // =================================================================================================
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 5ffba958..ffee9b7c 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -27,6 +27,7 @@ namespace clblast {
 template <typename T>
 Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
+    #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/invert_diagonal_blocks.opencl"
     }) {
 }
@@ -91,8 +92,9 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
   const auto local = std::vector<size_t>{internal_block_size};
   const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
   auto base_kernel_event = Event();
-  RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list);
-  event_wait_list.push_back(base_kernel_event);
+  auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
+  RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
+  if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
 
   // Builds up block_size x block_size blocks. For example, internal_block_size=16:
   // use   16 x 16  blocks to build  32 x 32  blocks,  1 x (1 x npages) grid,  4 x 4 threads;
@@ -130,8 +132,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
     kernel2.SetArgument(3, static_cast<int>(npages));
     kernel2.SetArgument(4, static_cast<int>(block_size));
     auto kernel2_event = Event();
-    auto eventPointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
-    RunKernel(kernel2, queue_, device_, global, local, eventPointer, event_wait_list);
+    auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
+    RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list);
     if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
 
     // Exit in case we reach beyond the bounds of the input matrix
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index b8e011c5..663fdffa 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -94,6 +94,30 @@ double2 ConstantOne() {
   return {1.0, 0.0};
 }
 
+// Returns a scalar of value -1
+template <typename T>
+T ConstantNegOne() {
+  return static_cast<T>(-1.0);
+}
+template float ConstantNegOne<float>();
+template double ConstantNegOne<double>();
+
+// Specialized version of the above for half-precision
+template <>
+half ConstantNegOne() {
+  return FloatToHalf(-1.0f);
+}
+
+// Specialized versions of the above for complex data-types
+template <>
+float2 ConstantNegOne() {
+  return {-1.0f, 0.0f};
+}
+template <>
+double2 ConstantNegOne() {
+  return {-1.0, 0.0};
+}
+
 // =================================================================================================
 
 // Implements the string conversion using std::to_string if possible
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 26dee23b..3e408bb7 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -110,6 +110,10 @@ T ConstantZero();
 template <typename T>
 T ConstantOne();
 
+// Returns a scalar of value -1
+template <typename T>
+T ConstantNegOne();
+
 // =================================================================================================
 
 // Structure containing all possible arguments for test clients, including their default values
diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp
index 2480a817..e59c96ff 100644
--- a/test/routines/level3/xtrsm.hpp
+++ b/test/routines/level3/xtrsm.hpp
@@ -48,12 +48,12 @@ class TestXtrsm {
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeA(const Arguments<T> &args) {
-    auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    const auto k = (args.side == Side::kLeft) ? args.m : args.n;
     return k * args.a_ld + args.a_offset;
   }
   static size_t GetSizeB(const Arguments<T> &args) {
-    auto b_rotated = (args.layout == Layout::kRowMajor);
-    auto b_two = (b_rotated) ? args.m : args.n;
+    const auto b_rotated = (args.layout == Layout::kRowMajor);
+    const auto b_two = (b_rotated) ? args.m : args.n;
     return b_two * args.b_ld + args.b_offset;
   }
 

From a2c0a9c5514e7cb9dbf9674843ba806b459d3544 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Fri, 20 Jan 2017 11:13:44 +0100
Subject: [PATCH 05/24] Set number of decimals for floating-point printing for
 error reporting

---
 src/utilities/utilities.cpp | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index 663fdffa..fd198c0d 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -127,23 +127,27 @@ std::string ToString(T value) {
 }
 template std::string ToString<int>(int value);
 template std::string ToString<size_t>(size_t value);
-template std::string ToString<float>(float value);
-template std::string ToString<double>(double value);
+template <>
+std::string ToString(float value) {
+  std::ostringstream result;
+  result << std::fixed << std::setprecision(2) << value;
+  return result.str();
+}
+template <>
+std::string ToString(double value) {
+  std::ostringstream result;
+  result << std::fixed << std::setprecision(2) << value;
+  return result.str();
+}
 
 // If not possible directly: special cases for complex data-types
 template <>
 std::string ToString(float2 value) {
-  std::ostringstream real, imag;
-  real << std::setprecision(2) << value.real();
-  imag << std::setprecision(2) << value.imag();
-  return real.str()+"+"+imag.str()+"i";
+  return ToString(value.real())+"+"+ToString(value.imag())+"i";
 }
 template <>
 std::string ToString(double2 value) {
-  std::ostringstream real, imag;
-  real << std::setprecision(2) << value.real();
-  imag << std::setprecision(2) << value.imag();
-  return real.str()+"+"+imag.str()+"i";
+  return ToString(value.real())+"+"+ToString(value.imag())+"i";
 }
 
 // If not possible directly: special case for half-precision

From a5fd2323b6d9ce793f12618951012fcfec257b95 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Fri, 20 Jan 2017 11:30:32 +0100
Subject: [PATCH 06/24] Added prototype for the TRSV routine

---
 CMakeLists.txt                 |   2 +-
 doc/clblast.md                 |  57 +++++++++++++
 scripts/generator/generator.py |   4 +-
 src/clblast.cpp                |  21 +++--
 src/routines/level2/xtrsv.cpp  |  66 ++++++++++++++
 src/routines/level2/xtrsv.hpp  |  47 ++++++++++
 test/routines/level2/xtrsv.hpp | 151 +++++++++++++++++++++++++++++++++
 7 files changed, 339 insertions(+), 9 deletions(-)
 create mode 100644 src/routines/level2/xtrsv.cpp
 create mode 100644 src/routines/level2/xtrsv.hpp
 create mode 100644 test/routines/level2/xtrsv.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9cabac7..41982b21 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -156,7 +156,7 @@ if(NETLIB)
   set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
 endif()
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
-set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
+set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
 set(LEVELX_ROUTINES xomatcopy xinvert)
diff --git a/doc/clblast.md b/doc/clblast.md
index d7be0005..d90cb61b 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -1445,6 +1445,63 @@ Arguments to TPMV:
 
 
 
+xTRSV: Solves a triangular system of equations
+-------------
+
+
+
+C++ API:
+```
+template <typename T>
+StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to TRSV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
+* `const size_t x_offset`: The offset in elements from the start of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
 xGER: General rank-1 matrix update
 -------------
 
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index d71e392d..1bd0b58e 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -41,7 +41,7 @@ FILES = [
     "/include/clblast_netlib_c.h",
     "/src/clblast_netlib_c.cpp",
 ]
-HEADER_LINES = [117, 74, 118, 22, 29, 41, 65, 32]
+HEADER_LINES = [117, 75, 118, 22, 29, 41, 65, 32]
 FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
 
 # Different possibilities for requirements
@@ -129,7 +129,7 @@ ROUTINES = [
   Routine(True,  True,  "2a", "trmv",  T,  [S,D,C,Z,H],    ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "n",   "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
   Routine(True,  True,  "2a", "tbmv",  T,  [S,D,C,Z,H],    ["n","k"],           ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "n",   "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
   Routine(True,  True,  "2a", "tpmv",  T,  [S,D,C,Z,H],    ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["ap"],     ["x"],                        [apn,xn],      [],               "n",   "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
-  Routine(False, True,  "2a", "trsv",  T,  [S,D,C,Z],      ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "",    "Solves a triangular system of equations", "", []),
+  Routine(True,  True,  "2a", "trsv",  T,  [S,D,C,Z],      ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "",    "Solves a triangular system of equations", "", []),
   Routine(False, True,  "2a", "tbsv",  T,  [S,D,C,Z],      ["n","k"],           ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "",    "Solves a banded triangular system of equations", "", [ald_k_one]),
   Routine(False, True,  "2a", "tpsv",  T,  [S,D,C,Z],      ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["ap"],     ["x"],                        [apn,xn],      [],               "",    "Solves a packed triangular system of equations", "", []),
   # Level 2: matrix update
diff --git a/src/clblast.cpp b/src/clblast.cpp
index 68671e50..ef1cedf9 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -45,6 +45,7 @@
 #include "routines/level2/xtrmv.hpp"
 #include "routines/level2/xtbmv.hpp"
 #include "routines/level2/xtpmv.hpp"
+#include "routines/level2/xtrsv.hpp"
 #include "routines/level2/xger.hpp"
 #include "routines/level2/xgeru.hpp"
 #include "routines/level2/xgerc.hpp"
@@ -1146,12 +1147,20 @@ template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Tr
 
 // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
 template <typename T>
-StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal,
-                const size_t,
-                const cl_mem, const size_t, const size_t,
-                cl_mem, const size_t, const size_t,
-                cl_command_queue*, cl_event*) {
-  return StatusCode::kNotImplemented;
+StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event) {
+  try {
+    auto queue_cpp = Queue(*queue);
+    auto routine = Xtrsv<T>(queue_cpp, event);
+    routine.DoTrsv(layout, triangle, a_transpose, diagonal,
+                   n,
+                   Buffer<T>(a_buffer), a_offset, a_ld,
+                   Buffer<T>(x_buffer), x_offset, x_inc);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
 }
 template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
                                            const size_t,
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
new file mode 100644
index 00000000..d5d5a7ca
--- /dev/null
+++ b/src/routines/level2/xtrsv.cpp
@@ -0,0 +1,66 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrsv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/level2/xtrsv.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
+    Xgemv<T>(queue, event, name) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
+                      const Transpose a_transpose, const Diagonal diagonal,
+                      const size_t n,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  // Makes sure all dimensions are larger than zero
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
+
+  // Tests the matrix and vector
+  TestMatrixA(n, n, a_buffer, a_offset, a_ld);
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+
+  // Creates a copy of X: a temporary scratch buffer
+  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
+  x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+
+  // The data is either in the upper or lower triangle
+  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // TODO: Implement the routine
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrsv<half>;
+template class Xtrsv<float>;
+template class Xtrsv<double>;
+template class Xtrsv<float2>;
+template class Xtrsv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp
new file mode 100644
index 00000000..4a73b5eb
--- /dev/null
+++ b/src/routines/level2/xtrsv.hpp
@@ -0,0 +1,47 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrsv routine.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XTRSV_H_
+#define CLBLAST_ROUTINES_XTRSV_H_
+
+#include "routines/level2/xgemv.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xtrsv: public Xgemv<T> {
+ public:
+
+  // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
+  using Xgemv<T>::MatVec;
+
+  // Constructor
+  Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV");
+
+  // Templated-precision implementation of the routine
+  void DoTrsv(const Layout layout, const Triangle triangle,
+              const Transpose a_transpose, const Diagonal diagonal,
+              const size_t n,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XTRSV_H_
+#endif
diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp
new file mode 100644
index 00000000..67094b3d
--- /dev/null
+++ b/test/routines/level2/xtrsv.hpp
@@ -0,0 +1,151 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xtrsv routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRSV_H_
+#define CLBLAST_TEST_ROUTINES_XTRSV_H_
+
+#include <vector>
+#include <string>
+
+#ifdef CLBLAST_REF_CLBLAS
+  #include "test/wrapper_clblas.hpp"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+  #include "test/wrapper_cblas.hpp"
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtrsv {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 2; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN,
+            kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgALeadDim, kArgXInc,
+            kArgAOffset, kArgXOffset};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &args) {
+    return args.n * args.x_inc + args.x_offset;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.x_size = GetSizeX(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
+                          args.n,
+                          buffers.a_mat(), args.a_offset, args.a_ld,
+                          buffers.x_vec(), args.x_offset, args.x_inc,
+                          &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CLBLAS
+    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      auto queue_plain = queue();
+      auto event = cl_event{};
+      auto status = clblasXtrsv<T>(convertToCLBLAS(args.layout),
+                                   convertToCLBLAS(args.triangle),
+                                   convertToCLBLAS(args.a_transpose),
+                                   convertToCLBLAS(args.diagonal),
+                                   args.n,
+                                   buffers.a_mat, args.a_offset, args.a_ld,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   1, &queue_plain, 0, nullptr, &event);
+      clWaitForEvents(1, &event);
+      return static_cast<StatusCode>(status);
+    }
+  #endif
+
+  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CBLAS
+    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+      std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+      cblasXtrsv(convertToCBLAS(args.layout),
+                 convertToCBLAS(args.triangle),
+                 convertToCBLAS(args.a_transpose),
+                 convertToCBLAS(args.diagonal),
+                 args.n,
+                 a_mat_cpu, args.a_offset, args.a_ld,
+                 x_vec_cpu, args.x_offset, args.x_inc);
+      buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.x_size, static_cast<T>(0));
+    buffers.x_vec.Read(queue, args.x_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) {
+    return args.n;
+  }
+  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
+    return id1*args.x_inc + args.x_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return 2 * args.n * args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRSV_H_
+#endif

From 7c73ceb095f22099067b8a880961cdcbee4257ea Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 29 Jan 2017 17:02:00 +0100
Subject: [PATCH 07/24] Added first (incomplete) version of TRSV routine

---
 src/kernels/level2/xtrsv.opencl |  94 +++++++++++++++++++++++
 src/routines/common.hpp         |  19 +++++
 src/routines/level2/xgemv.cpp   |   1 +
 src/routines/level2/xtrsv.cpp   | 131 +++++++++++++++++++++++++++++---
 src/routines/level2/xtrsv.hpp   |  17 ++++-
 5 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 src/kernels/level2/xtrsv.opencl

diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
new file mode 100644
index 00000000..00f29e47
--- /dev/null
+++ b/src/kernels/level2/xtrsv.opencl
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to perform forward or backward substition, as used in the TRSV routine
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_TRSV)
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
+void FillVector(const int n, const int inc, const int offset,
+                __global real* restrict dest, const real_arg arg_value) {
+  const real value = GetRealArg(arg_value);
+  const int tid = get_global_id(0);
+  if (tid < n) {
+    dest[tid*inc + offset] = value;
+  }
+}
+
+// =================================================================================================
+
+// TODO: Put variable in database
+#define TRSV_BLOCK_SIZE 256
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void trsv_forward(int n,
+                  const __global float *A, const int a_offset, int lda,
+                  __global float *b, const int b_offset, int b_inc,
+                  __global float *x, const int x_offset, int x_inc,
+                  const int is_transposed, const int is_unit_diagonal) {
+  __local float sx[TRSV_BLOCK_SIZE];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for (int i = 0; i < n; ++i) {
+    real sum = b[i*b_inc + b_offset];
+    for (int j = 0; j < i; ++j) {
+      real a_value;
+      if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
+      else { a_value = A[j + i*lda + a_offset]; }
+      sum -= a_value * sx[j];
+    }
+    sum -= x[i*x_inc + x_offset];
+    if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+    sx[i] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  for (int i = 0; i < n; ++i) {
+    x[i*x_inc + x_offset] = sx[i];
+  }
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void trsv_backward(int n,
+                   const __global float *A, const int a_offset, int lda,
+                   __global float *b, const int b_offset, int b_inc,
+                   __global float *x, const int x_offset, int x_inc,
+                   const int is_trans, const int is_unit_diagonal) {
+  __local float sx[TRSV_BLOCK_SIZE];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for (int i = n - 1; i >= 0; --i) {
+    real sum = b[i*b_inc + b_offset];
+    for (int j = i + 1; j < n; ++j) {
+      real a_value;
+      if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
+      else { a_value = A[j + i*lda + a_offset]; }
+      sum -= a_value * sx[j];
+    }
+    sum -= x[i*x_inc + x_offset];
+    if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+    sx[i] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  for (int i = 0; i < n; ++i) {
+    x[i*x_inc + x_offset] = sx[i];
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 47f98de0..8046c0be 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -52,6 +52,25 @@ void FillMatrix(Queue &queue, const Device &device,
   RunKernel(kernel, queue, device, global, local, event, waitForEvents);
 }
 
+// Sets all elements of a vector to a constant value
+template <typename T>
+void FillVector(Queue &queue, const Device &device,
+                const Program &program, const Database &,
+                EventPointer event, const std::vector<Event> &waitForEvents,
+                const size_t n, const size_t inc, const size_t offset,
+                const Buffer<T> &dest,
+                const T constant_value) {
+  auto kernel = Kernel(program, "FillVector");
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, static_cast<int>(inc));
+  kernel.SetArgument(2, static_cast<int>(offset));
+  kernel.SetArgument(3, dest());
+  kernel.SetArgument(4, GetRealArg(constant_value));
+  auto local = std::vector<size_t>{64};
+  auto global = std::vector<size_t>{Ceil(n, 64)};
+  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+}
+
 // =================================================================================================
 
 // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 7b4c2e8f..26eed5e0 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -25,6 +25,7 @@ Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
     #include "../../kernels/level2/xgemv.opencl"
     #include "../../kernels/level2/xgemv_fast.opencl"
+    #include "../../kernels/level2/xtrsv.opencl"
     }) {
 }
 
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index d5d5a7ca..93f4174f 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -25,6 +25,59 @@ Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
     Xgemv<T>(queue, event, name) {
 }
 
+// TODO: Temporary variable, put in database instead
+constexpr size_t TRSV_BLOCK_SIZE = 9;
+
+// =================================================================================================
+
+template <typename T>
+void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  if (n > TRSV_BLOCK_SIZE) { throw BLASError(StatusCode::kUnexpectedError); };
+
+  // Retrieves the program from the cache
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
+
+  // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
+  const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
+  const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
+                              (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
+
+  // The data is either in the upper or lower triangle
+  auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
+                   (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
+
+  // Retrieves the kernel from the compiled binary
+  const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
+  auto kernel = Kernel(program, kernel_name);
+
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, a_buffer());
+  kernel.SetArgument(2, static_cast<int>(a_offset));
+  kernel.SetArgument(3, static_cast<int>(a_ld));
+  kernel.SetArgument(4, b_buffer());
+  kernel.SetArgument(5, static_cast<int>(b_offset));
+  kernel.SetArgument(6, static_cast<int>(b_inc));
+  kernel.SetArgument(7, x_buffer());
+  kernel.SetArgument(8, static_cast<int>(x_offset));
+  kernel.SetArgument(9, static_cast<int>(x_inc));
+  kernel.SetArgument(10, static_cast<int>(is_transposed));
+  kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
+
+  // Launches the kernel
+  const auto local = std::vector<size_t>{1};
+  const auto global = std::vector<size_t>{1};
+  auto event = Event();
+  RunKernel(kernel, queue_, device_, global, local, event.pointer());
+  event.WaitForCompletion();
+}
+
 // =================================================================================================
 
 // The main routine
@@ -33,24 +86,84 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
                       const Transpose a_transpose, const Diagonal diagonal,
                       const size_t n,
                       const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) {
 
   // Makes sure all dimensions are larger than zero
   if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
 
   // Tests the matrix and vector
   TestMatrixA(n, n, a_buffer, a_offset, a_ld);
-  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorX(n, b_buffer, b_offset, b_inc);
 
-  // Creates a copy of X: a temporary scratch buffer
-  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
-  x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  // Retrieves the program from the cache
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
 
-  // The data is either in the upper or lower triangle
-  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  // Creates a copy of B to avoid overwriting input while computing output
+  // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels
+  const auto x_offset = b_offset;
+  const auto x_inc = b_inc;
+  const auto x_size = n*x_inc + x_offset;
+  auto x_buffer = Buffer<T>(context_, x_size);
+  b_buffer.CopyTo(queue_, x_size, x_buffer);
 
-  // TODO: Implement the routine
+  // Fills the output buffer with zeros
+  auto eventWaitList = std::vector<Event>();
+  auto fill_vector_event = Event();
+  FillVector(queue_, device_, program, db_, fill_vector_event.pointer(), eventWaitList,
+             n, x_inc, x_offset, x_buffer, ConstantZero<T>());
+  fill_vector_event.WaitForCompletion();
+
+  // TODO: Not working for row-major at the moment
+  const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kRowMajor) ||
+                              (a_transpose != Transpose::kNo && layout != Layout::kRowMajor));
+
+  // Loops over the blocks
+  auto col = n; // the initial column position
+  for (auto i = size_t{0}; i < n; i += TRSV_BLOCK_SIZE) {
+    const auto block_size = std::min(TRSV_BLOCK_SIZE, n - i);
+
+    if (!is_transposed) {
+
+      // Sets the offsets for upper or lower triangular
+      col = (triangle == Triangle::kUpper) ? col - block_size : i;
+      const auto extra_offset_a = (triangle == Triangle::kUpper) ? col + (col+block_size)*a_ld : col;
+      const auto extra_offset_x = (triangle == Triangle::kUpper) ? (col+block_size)*x_inc : 0;
+      const auto extra_offset_b = col*x_inc;
+
+      // Runs the GEMV routine to compute x' = A * x
+      if (i > 0) {
+        DoGemv(layout, a_transpose, block_size, i, ConstantOne<T>(),
+               a_buffer, a_offset + extra_offset_a, a_ld,
+               x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+               x_buffer, x_offset + extra_offset_b, x_inc );
+      }
+    }
+    else {
+
+      // Sets the offsets for upper or lower triangular
+      col = (triangle == Triangle::kLower) ? col - block_size : i;
+      const auto extra_offset_a = (triangle == Triangle::kLower) ? col+block_size + col*a_ld : col*a_ld;
+      const auto extra_offset_x = (triangle == Triangle::kLower) ? (col+block_size)*x_inc : 0;
+      const auto extra_offset_b = col*x_inc;
+
+      // Runs the GEMV routine to compute x' = A * x
+      if (i > 0) {
+        DoGemv(layout, a_transpose, i, block_size, ConstantOne<T>(),
+               a_buffer, a_offset + extra_offset_a, a_ld,
+               x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+               x_buffer, x_offset + extra_offset_b, x_inc );
+      }
+    }
+
+    // Runs the triangular substitution for the block size
+    Substitution(layout, triangle, a_transpose, diagonal, block_size,
+                 a_buffer, a_offset + col + col*a_ld, a_ld,
+                 b_buffer, b_offset + col*b_inc, b_inc,
+                 x_buffer, x_offset + col*x_inc, x_inc);
+  }
+
+  // Retrieves the results
+  x_buffer.CopyTo(queue_, x_size, b_buffer);
 }
 
 // =================================================================================================
diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp
index 4a73b5eb..dc3f32f0 100644
--- a/src/routines/level2/xtrsv.hpp
+++ b/src/routines/level2/xtrsv.hpp
@@ -7,7 +7,9 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xtrsv routine.
+// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular
+// forward and backward substitutions on the diagonal parts of the matrix in combination with larger
+// GEMV computation on the remainder of the matrix.
 //
 // =================================================================================================
 
@@ -25,9 +27,12 @@ class Xtrsv: public Xgemv<T> {
  public:
 
   // Uses the generic matrix-vector routine
+  using Xgemv<T>::routine_name_;
   using Xgemv<T>::queue_;
   using Xgemv<T>::context_;
-  using Xgemv<T>::MatVec;
+  using Xgemv<T>::device_;
+  using Xgemv<T>::db_;
+  using Xgemv<T>::DoGemv;
 
   // Constructor
   Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV");
@@ -38,6 +43,14 @@ class Xtrsv: public Xgemv<T> {
               const size_t n,
               const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
               const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+
+  // Performs forward or backward substitution on a small triangular matrix
+  void Substitution(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
+                    const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
 };
 
 // =================================================================================================

From a6ba6470aa45dff3c224da9644b98d49b0cce199 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 4 Feb 2017 14:25:27 +0100
Subject: [PATCH 08/24] Added row-major support for TRSV

---
 src/routines/level2/xtrsv.cpp | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index 93f4174f..4725b1c1 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -49,8 +49,8 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
                               (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
 
   // The data is either in the upper or lower triangle
-  auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
-                   (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
+  const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
+                         (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
 
   // Retrieves the kernel from the compiled binary
   const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
@@ -113,22 +113,28 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
              n, x_inc, x_offset, x_buffer, ConstantZero<T>());
   fill_vector_event.WaitForCompletion();
 
-  // TODO: Not working for row-major at the moment
-  const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kRowMajor) ||
-                              (a_transpose != Transpose::kNo && layout != Layout::kRowMajor));
+  // The data is either in the upper or lower triangle
+  const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
+                         (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
 
   // Loops over the blocks
   auto col = n; // the initial column position
   for (auto i = size_t{0}; i < n; i += TRSV_BLOCK_SIZE) {
     const auto block_size = std::min(TRSV_BLOCK_SIZE, n - i);
 
-    if (!is_transposed) {
+    // Sets the next column position
+    col = (is_upper) ? col - block_size : i;
+
+    // Sets the offsets for upper or lower triangular
+    const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0;
+    const auto extra_offset_b = col*x_inc;
+
+    if (a_transpose == Transpose::kNo) {
 
       // Sets the offsets for upper or lower triangular
-      col = (triangle == Triangle::kUpper) ? col - block_size : i;
-      const auto extra_offset_a = (triangle == Triangle::kUpper) ? col + (col+block_size)*a_ld : col;
-      const auto extra_offset_x = (triangle == Triangle::kUpper) ? (col+block_size)*x_inc : 0;
-      const auto extra_offset_b = col*x_inc;
+      const auto extra_offset_a = (layout == Layout::kColMajor) ?
+                                  ((triangle == Triangle::kUpper) ? col + (col+block_size)*a_ld : col) :
+                                  ((triangle == Triangle::kUpper) ? col+block_size + (col)*a_ld : col*a_ld);
 
       // Runs the GEMV routine to compute x' = A * x
       if (i > 0) {
@@ -141,10 +147,9 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
     else {
 
       // Sets the offsets for upper or lower triangular
-      col = (triangle == Triangle::kLower) ? col - block_size : i;
-      const auto extra_offset_a = (triangle == Triangle::kLower) ? col+block_size + col*a_ld : col*a_ld;
-      const auto extra_offset_x = (triangle == Triangle::kLower) ? (col+block_size)*x_inc : 0;
-      const auto extra_offset_b = col*x_inc;
+      const auto extra_offset_a = (layout == Layout::kColMajor) ?
+                                  ((triangle == Triangle::kLower) ? col+block_size + col*a_ld : col*a_ld) :
+                                  ((triangle == Triangle::kLower) ? col + (col+block_size)*a_ld : col);
 
       // Runs the GEMV routine to compute x' = A * x
       if (i > 0) {

From fec8c1a8069a2307b8d3aba118ebb61b94871996 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 4 Feb 2017 16:04:19 +0100
Subject: [PATCH 09/24] Completed a first STRSV implementation

---
 src/database/database.cpp       |  2 +
 src/database/kernels/xtrsv.hpp  | 78 +++++++++++++++++++++++++++++
 src/kernels/level2/xtrsv.opencl | 88 ++++++++++++++++++++-------------
 src/routines/level2/xgemv.cpp   |  2 +-
 src/routines/level2/xtrsv.cpp   | 55 +++++++--------------
 5 files changed, 153 insertions(+), 72 deletions(-)
 create mode 100644 src/database/kernels/xtrsv.hpp

diff --git a/src/database/database.cpp b/src/database/database.cpp
index bcbd9955..c000b0b7 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -20,6 +20,7 @@
 #include "database/kernels/xgemv_fast.hpp"
 #include "database/kernels/xgemv_fast_rot.hpp"
 #include "database/kernels/xger.hpp"
+#include "database/kernels/xtrsv.hpp"
 #include "database/kernels/xgemm.hpp"
 #include "database/kernels/xgemm_direct.hpp"
 #include "database/kernels/copy.hpp"
@@ -40,6 +41,7 @@ const std::vector<const Database::DatabaseEntry*> Database::database = {
   &database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
   &database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
   &database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
+  &database::XtrsvHalf, &database::XtrsvSingle, &database::XtrsvDouble, &database::XtrsvComplexSingle, &database::XtrsvComplexDouble,
   &database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
   &database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
   &database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
diff --git a/src/database/kernels/xtrsv.hpp b/src/database/kernels/xtrsv.hpp
new file mode 100644
index 00000000..0741569e
--- /dev/null
+++ b/src/database/kernels/xtrsv.hpp
@@ -0,0 +1,78 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file populates the database with best-found tuning parameters for the 'Xtrsv' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+// =================================================================================================
+
+const Database::DatabaseEntry XtrsvHalf = {
+  "Xtrsv", Precision::kHalf, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"TRSV_BLOCK_SIZE",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XtrsvSingle = {
+  "Xtrsv", Precision::kSingle, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"TRSV_BLOCK_SIZE",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XtrsvComplexSingle = {
+  "Xtrsv", Precision::kComplexSingle, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"TRSV_BLOCK_SIZE",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XtrsvDouble = {
+  "Xtrsv", Precision::kDouble, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"TRSV_BLOCK_SIZE",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XtrsvComplexDouble = {
+  "Xtrsv", Precision::kComplexDouble, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"TRSV_BLOCK_SIZE",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace database
+} // namespace clblast
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
index 00f29e47..01bd6ba5 100644
--- a/src/kernels/level2/xtrsv.opencl
+++ b/src/kernels/level2/xtrsv.opencl
@@ -30,58 +30,78 @@ void FillVector(const int n, const int inc, const int offset,
 
 // =================================================================================================
 
-// TODO: Put variable in database
-#define TRSV_BLOCK_SIZE 256
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
 
-__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+#ifndef TRSV_BLOCK_SIZE
+  #define TRSV_BLOCK_SIZE 32    // The block size for forward or backward substition
+#endif
+
+// =================================================================================================
+
+__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
 void trsv_forward(int n,
-                  const __global float *A, const int a_offset, int lda,
-                  __global float *b, const int b_offset, int b_inc,
-                  __global float *x, const int x_offset, int x_inc,
+                  const __global real *A, const int a_offset, int lda,
+                  __global real *b, const int b_offset, int b_inc,
+                  __global real *x, const int x_offset, int x_inc,
                   const int is_transposed, const int is_unit_diagonal) {
-  __local float sx[TRSV_BLOCK_SIZE];
+  __local real sx[TRSV_BLOCK_SIZE];
+  const int tid = get_local_id(0);
+  if (tid < n) {
+    sx[tid] = b[tid*b_inc + b_offset];
+  }
   barrier(CLK_LOCAL_MEM_FENCE);
   for (int i = 0; i < n; ++i) {
-    real sum = b[i*b_inc + b_offset];
-    for (int j = 0; j < i; ++j) {
-      real a_value;
-      if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
-      else { a_value = A[j + i*lda + a_offset]; }
-      sum -= a_value * sx[j];
+    if (tid == 0) {
+      real sum = sx[i];
+      for (int j = 0; j < i; ++j) {
+        real a_value;
+        if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
+        else { a_value = A[j + i*lda + a_offset]; }
+        sum -= a_value * sx[j];
+      }
+      sum -= x[i*x_inc + x_offset];
+      if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+      sx[i] = sum;
     }
-    sum -= x[i*x_inc + x_offset];
-    if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
-    sx[i] = sum;
     barrier(CLK_LOCAL_MEM_FENCE);
   }
-  for (int i = 0; i < n; ++i) {
-    x[i*x_inc + x_offset] = sx[i];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (tid < n) {
+    x[tid*x_inc + x_offset] = sx[tid];
   }
 }
 
-__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
 void trsv_backward(int n,
-                   const __global float *A, const int a_offset, int lda,
-                   __global float *b, const int b_offset, int b_inc,
-                   __global float *x, const int x_offset, int x_inc,
+                   const __global real *A, const int a_offset, int lda,
+                   __global real *b, const int b_offset, int b_inc,
+                   __global real *x, const int x_offset, int x_inc,
                    const int is_trans, const int is_unit_diagonal) {
-  __local float sx[TRSV_BLOCK_SIZE];
+  __local real sx[TRSV_BLOCK_SIZE];
+  const int tid = get_local_id(0);
+  if (tid < n) {
+    sx[tid] = b[tid*b_inc + b_offset];
+  }
   barrier(CLK_LOCAL_MEM_FENCE);
   for (int i = n - 1; i >= 0; --i) {
-    real sum = b[i*b_inc + b_offset];
-    for (int j = i + 1; j < n; ++j) {
-      real a_value;
-      if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
-      else { a_value = A[j + i*lda + a_offset]; }
-      sum -= a_value * sx[j];
+    if (tid == 0) {
+      real sum = sx[i];
+      for (int j = i + 1; j < n; ++j) {
+        real a_value;
+        if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
+        else { a_value = A[j + i*lda + a_offset]; }
+        sum -= a_value * sx[j];
+      }
+      sum -= x[i*x_inc + x_offset];
+      if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+      sx[i] = sum;
     }
-    sum -= x[i*x_inc + x_offset];
-    if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
-    sx[i] = sum;
     barrier(CLK_LOCAL_MEM_FENCE);
   }
-  for (int i = 0; i < n; ++i) {
-    x[i*x_inc + x_offset] = sx[i];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (tid < n) {
+    x[tid*x_inc + x_offset] = sx[tid];
   }
 }
 
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 26eed5e0..52e66de6 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -22,7 +22,7 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
+    Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue<T>(), {}, {
     #include "../../kernels/level2/xgemv.opencl"
     #include "../../kernels/level2/xgemv_fast.opencl"
     #include "../../kernels/level2/xtrsv.opencl"
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index 4725b1c1..6e651e33 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -25,9 +25,6 @@ Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
     Xgemv<T>(queue, event, name) {
 }
 
-// TODO: Temporary variable, put in database instead
-constexpr size_t TRSV_BLOCK_SIZE = 9;
-
 // =================================================================================================
 
 template <typename T>
@@ -38,7 +35,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
                             const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
                             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
 
-  if (n > TRSV_BLOCK_SIZE) { throw BLASError(StatusCode::kUnexpectedError); };
+  if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); };
 
   // Retrieves the program from the cache
   const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
@@ -71,7 +68,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
   kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
 
   // Launches the kernel
-  const auto local = std::vector<size_t>{1};
+  const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
   const auto global = std::vector<size_t>{1};
   auto event = Event();
   RunKernel(kernel, queue_, device_, global, local, event.pointer());
@@ -113,51 +110,35 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
              n, x_inc, x_offset, x_buffer, ConstantZero<T>());
   fill_vector_event.WaitForCompletion();
 
-  // The data is either in the upper or lower triangle
+  // Derives properties based on the arguments
   const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
                          (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
+  const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
+                              (layout != Layout::kColMajor && a_transpose != Transpose::kNo));
 
   // Loops over the blocks
   auto col = n; // the initial column position
-  for (auto i = size_t{0}; i < n; i += TRSV_BLOCK_SIZE) {
-    const auto block_size = std::min(TRSV_BLOCK_SIZE, n - i);
+  for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) {
+    const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i);
 
     // Sets the next column position
     col = (is_upper) ? col - block_size : i;
 
     // Sets the offsets for upper or lower triangular
+    const auto extra_offset_a = (is_transposed) ?
+                                (is_upper ? col + (col+block_size)*a_ld : col) :
+                                (is_upper ? col+block_size + col*a_ld : col*a_ld);
     const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0;
     const auto extra_offset_b = col*x_inc;
 
-    if (a_transpose == Transpose::kNo) {
-
-      // Sets the offsets for upper or lower triangular
-      const auto extra_offset_a = (layout == Layout::kColMajor) ?
-                                  ((triangle == Triangle::kUpper) ? col + (col+block_size)*a_ld : col) :
-                                  ((triangle == Triangle::kUpper) ? col+block_size + (col)*a_ld : col*a_ld);
-
-      // Runs the GEMV routine to compute x' = A * x
-      if (i > 0) {
-        DoGemv(layout, a_transpose, block_size, i, ConstantOne<T>(),
-               a_buffer, a_offset + extra_offset_a, a_ld,
-               x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
-               x_buffer, x_offset + extra_offset_b, x_inc );
-      }
-    }
-    else {
-
-      // Sets the offsets for upper or lower triangular
-      const auto extra_offset_a = (layout == Layout::kColMajor) ?
-                                  ((triangle == Triangle::kLower) ? col+block_size + col*a_ld : col*a_ld) :
-                                  ((triangle == Triangle::kLower) ? col + (col+block_size)*a_ld : col);
-
-      // Runs the GEMV routine to compute x' = A * x
-      if (i > 0) {
-        DoGemv(layout, a_transpose, i, block_size, ConstantOne<T>(),
-               a_buffer, a_offset + extra_offset_a, a_ld,
-               x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
-               x_buffer, x_offset + extra_offset_b, x_inc );
-      }
+    // Runs the GEMV routine to compute x' = A * x
+    if (i > 0) {
+      const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
+      const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
+      DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
+             a_buffer, a_offset + extra_offset_a, a_ld,
+             x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+             x_buffer, x_offset + extra_offset_b, x_inc );
     }
 
     // Runs the triangular substitution for the block size

From c209dd7af90d604c8210cc5680b6c7a50b2b995f Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 4 Feb 2017 22:48:06 +0100
Subject: [PATCH 10/24] Improved substition kernels a bit; added complex
 support

---
 src/kernels/common.opencl       | 14 +++++
 src/kernels/level2/xtrsv.opencl | 98 ++++++++++++++++++++-------------
 src/routines/level2/xtrsv.cpp   |  2 +
 3 files changed, 76 insertions(+), 38 deletions(-)

diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index bfa1042d..8e59a5fe 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -176,6 +176,13 @@ R"(
   #define Add(c, a, b) c = a + b
 #endif
 
+// Subtracts two complex variables
+#if PRECISION == 3232 || PRECISION == 6464
+  #define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y
+#else
+  #define Subtract(c, a, b) c = a - b
+#endif
+
 // Multiply two complex variables (used in the defines below)
 #if PRECISION == 3232 || PRECISION == 6464
   #define MulReal(a, b) a.x*b.x - a.y*b.y
@@ -200,6 +207,13 @@ R"(
   #endif
 #endif
 
+// The scalar multiply-subtract function
+#if PRECISION == 3232 || PRECISION == 6464
+  #define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b)
+#else
+  #define MultiplySubtract(c, a, b) c -= a * b
+#endif
+
 // The scalar division function
 #if PRECISION == 3232 || PRECISION == 6464
   #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.x
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
index 01bd6ba5..fd5de200 100644
--- a/src/kernels/level2/xtrsv.opencl
+++ b/src/kernels/level2/xtrsv.opencl
@@ -41,67 +41,89 @@ void FillVector(const int n, const int inc, const int offset,
 
 __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
 void trsv_forward(int n,
-                  const __global real *A, const int a_offset, int lda,
+                  const __global real *A, const int a_offset, int a_ld,
                   __global real *b, const int b_offset, int b_inc,
                   __global real *x, const int x_offset, int x_inc,
-                  const int is_transposed, const int is_unit_diagonal) {
-  __local real sx[TRSV_BLOCK_SIZE];
+                  const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
+  __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
+  __local real xlm[TRSV_BLOCK_SIZE];
   const int tid = get_local_id(0);
+
+  // Pre-loads the data into local memory
   if (tid < n) {
-    sx[tid] = b[tid*b_inc + b_offset];
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-  for (int i = 0; i < n; ++i) {
-    if (tid == 0) {
-      real sum = sx[i];
-      for (int j = 0; j < i; ++j) {
-        real a_value;
-        if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
-        else { a_value = A[j + i*lda + a_offset]; }
-        sum -= a_value * sx[j];
+    Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
+    if (is_transposed == 0) {
+      for (int i = 0; i < n; ++i) {
+        alm[i][tid] = A[i + tid*a_ld + a_offset];
+        if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); }
+      }
+    }
+    else {
+      for (int i = 0; i < n; ++i) {
+        alm[i][tid] = A[tid + i*a_ld + a_offset];
       }
-      sum -= x[i*x_inc + x_offset];
-      if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
-      sx[i] = sum;
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Computes the result (single-threaded for now)
+  if (tid == 0) {
+    for (int i = 0; i < n; ++i) {
+      for (int j = 0; j < i; ++j) {
+        MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
+      }
+      if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); }
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Stores the results
   if (tid < n) {
-    x[tid*x_inc + x_offset] = sx[tid];
+    x[tid*x_inc + x_offset] = xlm[tid];
   }
 }
 
 __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
 void trsv_backward(int n,
-                   const __global real *A, const int a_offset, int lda,
+                   const __global real *A, const int a_offset, int a_ld,
                    __global real *b, const int b_offset, int b_inc,
                    __global real *x, const int x_offset, int x_inc,
-                   const int is_trans, const int is_unit_diagonal) {
-  __local real sx[TRSV_BLOCK_SIZE];
+                   const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
+  __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
+  __local real xlm[TRSV_BLOCK_SIZE];
   const int tid = get_local_id(0);
+
+  // Pre-loads the data into local memory
   if (tid < n) {
-    sx[tid] = b[tid*b_inc + b_offset];
-  }
-  barrier(CLK_LOCAL_MEM_FENCE);
-  for (int i = n - 1; i >= 0; --i) {
-    if (tid == 0) {
-      real sum = sx[i];
-      for (int j = i + 1; j < n; ++j) {
-        real a_value;
-        if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
-        else { a_value = A[j + i*lda + a_offset]; }
-        sum -= a_value * sx[j];
+    Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
+    if (is_transposed == 0) {
+      for (int i = 0; i < n; ++i) {
+        alm[i][tid] = A[i + tid*a_ld + a_offset];
+        if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); }
+      }
+    }
+    else {
+      for (int i = 0; i < n; ++i) {
+        alm[i][tid] = A[tid + i*a_ld + a_offset];
       }
-      sum -= x[i*x_inc + x_offset];
-      if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
-      sx[i] = sum;
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Computes the result (single-threaded for now)
+  if (tid == 0) {
+    for (int i = n - 1; i >= 0; --i) {
+      for (int j = i + 1; j < n; ++j) {
+        MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
+      }
+      if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); }
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Stores the results
   if (tid < n) {
-    x[tid*x_inc + x_offset] = sx[tid];
+    x[tid*x_inc + x_offset] = xlm[tid];
   }
 }
 
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index 6e651e33..b0e4c5ae 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -44,6 +44,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
   const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
   const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
                               (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
+  const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0;
 
   // The data is either in the upper or lower triangle
   const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
@@ -66,6 +67,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
   kernel.SetArgument(9, static_cast<int>(x_inc));
   kernel.SetArgument(10, static_cast<int>(is_transposed));
   kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
+  kernel.SetArgument(12, static_cast<int>(do_conjugate));
 
   // Launches the kernel
   const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};

From e7cbb5915aef16f3a64566292459eaede5a600e5 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 5 Feb 2017 14:36:31 +0100
Subject: [PATCH 11/24] Fixed complex version of the TRSV kernel

---
 src/kernels/common.opencl       | 11 +++++++++--
 src/kernels/level2/xtrsv.opencl | 16 ++++++++++++----
 test/correctness/tester.cpp     |  4 ++++
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 8e59a5fe..c052e94f 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -214,13 +214,20 @@ R"(
   #define MultiplySubtract(c, a, b) c -= a * b
 #endif
 
-// The scalar division function
+// The scalar division function: real-value only
 #if PRECISION == 3232 || PRECISION == 6464
-  #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.x
+  #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.y
 #else
   #define DivideReal(c, a, b) c = a / b
 #endif
 
+// The scalar division function: full division
+#if PRECISION == 3232 || PRECISION == 6464
+  #define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom
+#else
+  #define DivideFull(c, a, b) c = a / b
+#endif
+
 // The scalar AXPBY function
 #if PRECISION == 3232 || PRECISION == 6464
   #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
index fd5de200..ebea77a3 100644
--- a/src/kernels/level2/xtrsv.opencl
+++ b/src/kernels/level2/xtrsv.opencl
@@ -55,7 +55,6 @@ void trsv_forward(int n,
     if (is_transposed == 0) {
       for (int i = 0; i < n; ++i) {
         alm[i][tid] = A[i + tid*a_ld + a_offset];
-        if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); }
       }
     }
     else {
@@ -63,6 +62,11 @@ void trsv_forward(int n,
         alm[i][tid] = A[tid + i*a_ld + a_offset];
       }
     }
+    if (do_conjugate) {
+      for (int i = 0; i < n; ++i) {
+        COMPLEX_CONJUGATE(alm[i][tid]);
+      }
+    }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -72,7 +76,7 @@ void trsv_forward(int n,
       for (int j = 0; j < i; ++j) {
         MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
       }
-      if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); }
+      if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
     }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -99,7 +103,6 @@ void trsv_backward(int n,
     if (is_transposed == 0) {
       for (int i = 0; i < n; ++i) {
         alm[i][tid] = A[i + tid*a_ld + a_offset];
-        if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); }
       }
     }
     else {
@@ -107,6 +110,11 @@ void trsv_backward(int n,
         alm[i][tid] = A[tid + i*a_ld + a_offset];
       }
     }
+    if (do_conjugate) {
+      for (int i = 0; i < n; ++i) {
+        COMPLEX_CONJUGATE(alm[i][tid]);
+      }
+    }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -116,7 +124,7 @@ void trsv_backward(int n,
       for (int j = i + 1; j < n; ++j) {
         MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
       }
-      if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); }
+      if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
     }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index c449b09d..dc0f842e 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -410,6 +410,10 @@ bool TestSimilarityNear(const T val1, const T val2,
   if (val1 == val2) {
     return true;
   }
+  // Handles cases with both results NaN
+  else if (std::isnan(val1) && std::isnan(val2)) {
+    return true;
+  }
   // The values are zero or very small: the relative error is less meaningful
   else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) {
     return (difference < error_margin_absolute);

From 133ebfc83414c4aa0e8fce9f6f7b9d43e6774380 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 19 Feb 2017 17:43:26 +0100
Subject: [PATCH 12/24] Added data-preparation function for the TRSV tests and
 special nan/inf checks in the error checking to make the tests pass

---
 src/utilities/utilities.cpp    |  9 +++++++++
 src/utilities/utilities.hpp    |  4 ++++
 test/correctness/tester.cpp    | 21 +++++++++++++++++++--
 test/routines/level2/xtrsv.hpp | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index fd198c0d..b2ed2f0c 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -18,6 +18,7 @@
 #include <chrono>
 #include <random>
 #include <iomanip>
+#include <cmath>
 
 namespace clblast {
 // =================================================================================================
@@ -118,6 +119,14 @@ double2 ConstantNegOne() {
   return {-1.0, 0.0};
 }
 
+// Returns the absolute value of a scalar
+template <typename T> T AbsoluteValue(const T value) { return std::fabs(value); }
+template float AbsoluteValue<float>(const float);
+template double AbsoluteValue<double>(const double);
+template <> half AbsoluteValue(const half value) { return FloatToHalf(std::fabs(HalfToFloat(value))); }
+template <> float2 AbsoluteValue(const float2 value) { return std::abs(value); }
+template <> double2 AbsoluteValue(const double2 value) { return std::abs(value); }
+
 // =================================================================================================
 
 // Implements the string conversion using std::to_string if possible
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 757f1b5e..2c13658b 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -114,6 +114,10 @@ T ConstantOne();
 template <typename T>
 T ConstantNegOne();
 
+// Returns the absolute value of a scalar
+template <typename T>
+T AbsoluteValue(const T value);
+
 // =================================================================================================
 
 // Structure containing all possible arguments for test clients, including their default values
diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index c0ed5ba4..dd2f3f99 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -40,6 +40,12 @@ float getAbsoluteErrorMargin<half>() {
   return 0.10f; // especially small values are inaccurate for half-precision
 }
 
+// Error margin: numbers beyond this value are considered equal to inf or NaN
+template <typename T>
+T getAlmostInfNumber() {
+  return T{1e35}; // used for correctness testing of TRSV and TRSM routines
+}
+
 // Maximum number of test results printed on a single line
 template <typename T, typename U> const size_t Tester<T,U>::kResultsPerLine = size_t{64};
 
@@ -426,8 +432,19 @@ bool TestSimilarityNear(const T val1, const T val2,
   if (val1 == val2) {
     return true;
   }
-  // Handles cases with both results NaN
-  else if (std::isnan(val1) && std::isnan(val2)) {
+  // Handles cases with both results NaN or inf
+  else if ((std::isnan(val1) && std::isnan(val2)) || (std::isinf(val1) && std::isinf(val2))) {
+    return true;
+  }
+  // Also considers it OK if one of the results in NaN and the other is inf
+  // Note: for TRSV and TRSM routines
+  else if ((std::isnan(val1) && std::isinf(val2)) || (std::isinf(val1) && std::isnan(val2))) {
+    return true;
+  }
+  // Also considers it OK if one of the values is super large and the other is inf or NaN
+  // Note: for TRSV and TRSM routines
+  else if ((std::abs(val1) > getAlmostInfNumber<T>() && (std::isinf(val2) || std::isnan(val2))) ||
+           (std::abs(val2) > getAlmostInfNumber<T>() && (std::isinf(val1) || std::isnan(val1)))) {
     return true;
   }
   // The values are zero or very small: the relative error is less meaningful
diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp
index 67094b3d..811feac5 100644
--- a/test/routines/level2/xtrsv.hpp
+++ b/test/routines/level2/xtrsv.hpp
@@ -29,6 +29,35 @@
 namespace clblast {
 // =================================================================================================
 
+// Prepares the data
+template <typename T>
+void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  if (args.a_ld < args.n) { return; }
+
+  // Copies input buffers to the host
+  std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+  std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
+  buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+  buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
+
+  // Generates 'proper' input for the TRSV routine
+  // TODO: Improve this, currently loosely based on clBLAS's implementation
+  for (auto i = size_t{0}; i < args.n; ++i) {
+    auto diagonal = a_mat_cpu[i*args.a_ld + i + args.a_offset];
+    diagonal = AbsoluteValue(diagonal) + static_cast<T>(args.n / size_t{4});
+    for (auto j = size_t{0}; j < args.n; ++j) {
+      a_mat_cpu[j*args.a_ld + i + args.a_offset] /= T{2.0};
+    }
+    a_mat_cpu[i*args.a_ld + i + args.a_offset] = diagonal;
+    x_vec_cpu[i * args.x_inc + args.x_offset] /= T{2.0};
+  }
+
+  // Copies input buffers back to the OpenCL device
+  buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+  buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
+  return;
+}
+
 // See comment at top of file for a description of the class
 template <typename T>
 class TestXtrsv {
@@ -71,6 +100,7 @@ class TestXtrsv {
 
   // Describes how to run the CLBlast routine
   static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    PrepareData(args, buffers, queue);
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@@ -85,6 +115,7 @@ class TestXtrsv {
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
     static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      PrepareData(args, buffers, queue);
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtrsv<T>(convertToCLBLAS(args.layout),
@@ -103,6 +134,7 @@ class TestXtrsv {
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
     static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      PrepareData(args, buffers, queue);
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
       buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);

From 1e5b5157bcc9ed1630ce731b3ce41ed7712d951d Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Wed, 22 Feb 2017 20:31:33 +0100
Subject: [PATCH 13/24] Fixed a few issues with the TRSM routine; some tests
 still failing

---
 src/routines/level3/xtrsm.cpp   | 23 ++++++++++-------------
 src/routines/levelx/xinvert.cpp |  6 +++---
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 3a910261..8fe33b64 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -54,11 +54,6 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
   // Checks for validity of the triangular A matrix
   TestMatrixA(k, k, a_buffer, a_offset, a_ld);
 
-  // Determines which kernels to run based on the layout (the kernels assume column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
-  const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-
   // Checks for validity of the input B matrix
   const auto b_one = (layout == Layout::kRowMajor) ? n : m;
   const auto b_two = (layout == Layout::kRowMajor) ? m : n;
@@ -91,15 +86,17 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
                                       k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer);
   diagonal_invert_event.WaitForCompletion();
 
-  // Lower of upper triangular
-  const bool condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) ||
-                          (triangle == Triangle::kLower && a_transpose == Transpose::kNo));
+  // Derives properties based on the arguments
+  const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
+                         (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
+  const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
+                              (layout != Layout::kColMajor && a_transpose != Transpose::kNo));
 
   // Left side
   if (side == Side::kLeft) {
 
     // True when (lower triangular) or (upper triangular and transposed)
-    if (condition) {
+    if ((!is_upper && !is_transposed) || (is_upper && is_transposed)) {
       for (auto i = size_t{0}; i < m; i += block_size) {
         const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
         const auto current_block_size = std::min(m - i, block_size);
@@ -125,7 +122,7 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
       for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
         const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
         DoGemm(layout, a_transpose, Transpose::kNo,
-               block_size, n, block_size, gemm_alpha,
+               current_block_size, n, current_block_size, gemm_alpha,
                a_inv_buffer, i * block_size, block_size,
                b_buffer, i, b_ld, ConstantZero<T>(),
                x_buffer, i, x_ld);
@@ -144,20 +141,20 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
   else {
 
     // True when (lower triangular) or (upper triangular and transposed)
-    if (condition) {
+    if ((!is_upper && !is_transposed) || (is_upper && is_transposed)) {
       const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size);
       const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size);
       for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
         const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
         DoGemm(layout, Transpose::kNo, a_transpose,
-               m, block_size, block_size, gemm_alpha,
+               m, current_block_size, current_block_size, gemm_alpha,
                b_buffer, i * b_ld, b_ld,
                a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
                x_buffer, i * x_ld, x_ld);
         if (i - static_cast<int>(block_size) < 0) { break; }
         const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
         DoGemm(layout, Transpose::kNo, a_transpose,
-               m, i, block_size, ConstantNegOne<T>(),
+               m, i, current_block_size, ConstantNegOne<T>(),
                x_buffer, i * x_ld, x_ld,
                a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
                b_buffer, 0, b_ld);
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 696e694a..bcc3706d 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -41,8 +41,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
                                             const Buffer<T> &src, const size_t offset, const size_t ld_src,
                                             Buffer<T> &dest) {
 
-  // Makes sure all dimensions are larger than zero and the block size is smaller than n
-  if ((block_size == 0) || (n == 0) || (block_size > n)) {
+  // Makes sure all dimensions are larger than zero
+  if ((block_size == 0) || (n == 0)) {
     throw BLASError(StatusCode::kInvalidDimension);
   }
 
@@ -56,7 +56,7 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
   // This routine only supports block sizes which are a multiple of the internal block size and
   // block sizes up to and including 128
   if ((block_size % internal_block_size != 0) || (block_size > 128)) {
-    throw BLASError(StatusCode::kInvalidDimension);
+    throw BLASError(StatusCode::kUnknownError);
   }
 
   // Checks for validity of the source and destination matrices

From 2f2a510c38c811b53474dd8cc1a3dfff88053cf0 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Fri, 24 Feb 2017 21:08:44 +0100
Subject: [PATCH 14/24] Implemented a simple row-major to col-major problem
 conversion for TRSM

---
 src/routines/level3/xtrsm.cpp | 81 +++++++++++++++++++++++------------
 src/routines/level3/xtrsm.hpp | 12 +++++-
 2 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 8fe33b64..42855362 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -10,7 +10,7 @@
 // This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based
 // on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
 // Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
-// and Jack Dongarra.
+// and Jack Dongarra and the OpenCL implementation in clBLAS.
 //
 // =================================================================================================
 
@@ -29,17 +29,48 @@ Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name):
     Xgemm<T>(queue, event, name) {
 }
 
+// =================================================================================================
+
+// The entry point: transforming into col-major (if needed) and then running the col-major version
+template <typename T>
+void Xtrsm<T>::DoTrsm(const Layout layout, Side side, Triangle triangle,
+                      const Transpose a_transpose, const Diagonal diagonal,
+                      size_t m, size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+  // Converts row-major to a col-major problem:
+  // The idea is that
+  //   B = A*X
+  // can be computed as
+  //   B' = (A*X)' = X'*A'
+  // Since changing the order is basically a transpose on each matrix, the formula becomes:
+  //   B = X*A
+  // So only the side (left/right) and the triangle (upper/lower) are changed and M/N are swapped
+  if (layout == Layout::kRowMajor) {
+    std::swap(m, n);
+    side = (side == Side::kLeft) ? Side::kRight : Side::kLeft;
+    triangle = (triangle == Triangle::kLower) ? Triangle::kUpper : Triangle::kLower;
+  }
+
+  // Runs the col-major version of TRSM
+  TrsmColMajor(side, triangle, a_transpose, diagonal,
+               m, n, alpha,
+               a_buffer, a_offset, a_ld,
+               b_buffer, b_offset, b_ld);
+}
 
 // =================================================================================================
 
 // The main routine
 template <typename T>
-void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle triangle,
-                      const Transpose a_transpose, const Diagonal diagonal,
-                      const size_t m, const size_t n,
-                      const T alpha,
-                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
 
   // Settings
   constexpr auto block_size = size_t{32}; // tuneable
@@ -55,13 +86,11 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
   TestMatrixA(k, k, a_buffer, a_offset, a_ld);
 
   // Checks for validity of the input B matrix
-  const auto b_one = (layout == Layout::kRowMajor) ? n : m;
-  const auto b_two = (layout == Layout::kRowMajor) ? m : n;
-  TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
+  TestMatrixB(m, n, b_buffer, b_offset, b_ld);
 
   // Creates a copy of B to avoid overwriting input in GEMM while computing output
-  const auto b_size = b_ld * (b_two - 1) + b_one + b_offset;
-  const auto x_one = b_one;
+  const auto b_size = b_ld * (n - 1) + m + b_offset;
+  const auto x_one = m;
   const auto x_size = b_size;
   const auto x_ld = b_ld;
   const auto x_offset = b_offset;
@@ -82,32 +111,30 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
   // Inverts the diagonal blocks
   auto diagonal_invert_event = Event();
   auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer());
-  inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal,
+  inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, triangle, diagonal,
                                       k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer);
   diagonal_invert_event.WaitForCompletion();
 
   // Derives properties based on the arguments
-  const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
-                         (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
-  const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
-                              (layout != Layout::kColMajor && a_transpose != Transpose::kNo));
+  const auto condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) ||
+                          (triangle == Triangle::kLower && a_transpose == Transpose::kNo));
 
   // Left side
   if (side == Side::kLeft) {
 
     // True when (lower triangular) or (upper triangular and transposed)
-    if ((!is_upper && !is_transposed) || (is_upper && is_transposed)) {
+    if (condition) {
       for (auto i = size_t{0}; i < m; i += block_size) {
         const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
         const auto current_block_size = std::min(m - i, block_size);
-        DoGemm(layout, a_transpose, Transpose::kNo,
+        DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
                current_block_size, n, current_block_size, gemm_alpha,
                a_inv_buffer, i * block_size, block_size,
                b_buffer, i, b_ld, ConstantZero<T>(),
                x_buffer, i, x_ld);
         if (i + block_size >= m) { break; }
         const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld;
-        DoGemm(layout, a_transpose, Transpose::kNo,
+        DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
                m - i - block_size, n, block_size, ConstantNegOne<T>(),
                a_buffer, this_a_offset, a_ld,
                x_buffer, i, x_ld, ConstantOne<T>(),
@@ -121,14 +148,14 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
       const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size);
       for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
         const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
-        DoGemm(layout, a_transpose, Transpose::kNo,
+        DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
                current_block_size, n, current_block_size, gemm_alpha,
                a_inv_buffer, i * block_size, block_size,
                b_buffer, i, b_ld, ConstantZero<T>(),
                x_buffer, i, x_ld);
         if (i - static_cast<int>(block_size) < 0) { break; }
         const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i;
-        DoGemm(layout, a_transpose, Transpose::kNo,
+        DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
                i, n, block_size, ConstantNegOne<T>(),
                a_buffer, this_a_offset, a_ld,
                x_buffer, i, x_ld, ConstantOne<T>(),
@@ -141,19 +168,19 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
   else {
 
     // True when (lower triangular) or (upper triangular and transposed)
-    if ((!is_upper && !is_transposed) || (is_upper && is_transposed)) {
+    if (condition) {
       const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size);
       const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size);
       for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
         const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
-        DoGemm(layout, Transpose::kNo, a_transpose,
+        DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
                m, current_block_size, current_block_size, gemm_alpha,
                b_buffer, i * b_ld, b_ld,
                a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
                x_buffer, i * x_ld, x_ld);
         if (i - static_cast<int>(block_size) < 0) { break; }
         const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
-        DoGemm(layout, Transpose::kNo, a_transpose,
+        DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
                m, i, current_block_size, ConstantNegOne<T>(),
                x_buffer, i * x_ld, x_ld,
                a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
@@ -166,14 +193,14 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
       for (auto i = size_t{0}; i < n; i += block_size) {
         const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
         const auto current_block_size = std::min(n - i, block_size);
-        DoGemm(layout, Transpose::kNo, a_transpose,
+        DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
                m, current_block_size, current_block_size, gemm_alpha,
                b_buffer, i * b_ld, b_ld,
                a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
                x_buffer, i * x_ld, x_ld);
         if (i + block_size >= n) { break; }
         const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld;
-        DoGemm(layout, Transpose::kNo, a_transpose,
+        DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
                m, n - i - block_size, block_size, ConstantNegOne<T>(),
                x_buffer, i * x_ld, x_ld,
                a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp
index b9d5432a..5b42398e 100644
--- a/src/routines/level3/xtrsm.hpp
+++ b/src/routines/level3/xtrsm.hpp
@@ -37,12 +37,20 @@ class Xtrsm: public Xgemm<T> {
   Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM");
 
   // Templated-precision implementation of the routine
-  void DoTrsm(const Layout layout, const Side side, const Triangle triangle,
+  void DoTrsm(const Layout layout, Side side, Triangle triangle,
               const Transpose a_transpose, const Diagonal diagonal,
-              const size_t m, const size_t n,
+              size_t m, size_t n,
               const T alpha,
               const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
               const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+
+  // Implementation of the column-major version
+  void TrsmColMajor(const Side side, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
 };
 
 // =================================================================================================

From e47d95887c6671d6cee248ab4aa7b4a6bda715cd Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 25 Feb 2017 12:23:04 +0100
Subject: [PATCH 15/24] Added PrepareData function for TRSM to create proper
 test input

---
 .../level3/invert_diagonal_blocks.opencl      |   4 +-
 src/utilities/utilities.cpp                   | 116 ++++++------------
 src/utilities/utilities.hpp                   |  26 ++--
 test/routines/level2/xtrsv.hpp                |   4 +-
 test/routines/level3/xtrsm.hpp                |  34 +++++
 test/routines/levelx/xinvert.hpp              |   7 +-
 6 files changed, 94 insertions(+), 97 deletions(-)

diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index e94b4d30..d43b9b7c 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -100,7 +100,9 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   if (unit_diagonal == 0) {
     const real diagonal_value = lm[thread_index][thread_index];
     if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
-      DivideReal(inverted_diagonal, inverted_diagonal, diagonal_value);
+      real constant_one;
+      SetToOne(constant_one);
+      DivideReal(inverted_diagonal, constant_one, diagonal_value);
     }
   }
   lm[thread_index][thread_index] = inverted_diagonal;
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index b2ed2f0c..9cf75490 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -24,100 +24,52 @@ namespace clblast {
 // =================================================================================================
 
 // Returns a scalar with a default value
-template <typename T>
-T GetScalar() {
-  return static_cast<T>(2.0);
-}
+template <typename T> T GetScalar() { return static_cast<T>(2.0); }
 template float GetScalar<float>();
 template double GetScalar<double>();
-
-// Specialized version of the above for half-precision
-template <>
-half GetScalar() {
-  return FloatToHalf(2.0f);
-}
-
-// Specialized versions of the above for complex data-types
-template <>
-float2 GetScalar() {
-  return {2.0f, 0.5f};
-}
-template <>
-double2 GetScalar() {
-  return {2.0, 0.5};
-}
+template <> half GetScalar() { return FloatToHalf(2.0f); }
+template <> float2 GetScalar() { return {2.0f, 0.5f}; }
+template <> double2 GetScalar() { return {2.0, 0.5}; }
 
 // Returns a scalar of value 0
-template <typename T>
-T ConstantZero() {
-  return static_cast<T>(0.0);
-}
+template <typename T> T ConstantZero() { return static_cast<T>(0.0); }
 template float ConstantZero<float>();
 template double ConstantZero<double>();
-
-// Specialized version of the above for half-precision
-template <>
-half ConstantZero() {
-  return FloatToHalf(0.0f);
-}
-
-// Specialized versions of the above for complex data-types
-template <>
-float2 ConstantZero() {
-  return {0.0f, 0.0f};
-}
-template <>
-double2 ConstantZero() {
-  return {0.0, 0.0};
-}
+template <> half ConstantZero() { return FloatToHalf(0.0f); }
+template <> float2 ConstantZero() { return {0.0f, 0.0f}; }
+template <> double2 ConstantZero() { return {0.0, 0.0}; }
 
 // Returns a scalar of value 1
-template <typename T>
-T ConstantOne() {
-  return static_cast<T>(1.0);
-}
+template <typename T> T ConstantOne() { return static_cast<T>(1.0); }
 template float ConstantOne<float>();
 template double ConstantOne<double>();
-
-// Specialized version of the above for half-precision
-template <>
-half ConstantOne() {
-  return FloatToHalf(1.0f);
-}
-
-// Specialized versions of the above for complex data-types
-template <>
-float2 ConstantOne() {
-  return {1.0f, 0.0f};
-}
-template <>
-double2 ConstantOne() {
-  return {1.0, 0.0};
-}
+template <> half ConstantOne() { return FloatToHalf(1.0f); }
+template <> float2 ConstantOne() { return {1.0f, 0.0f}; }
+template <> double2 ConstantOne() { return {1.0, 0.0}; }
 
 // Returns a scalar of value -1
-template <typename T>
-T ConstantNegOne() {
-  return static_cast<T>(-1.0);
-}
+template <typename T> T ConstantNegOne() { return static_cast<T>(-1.0); }
 template float ConstantNegOne<float>();
 template double ConstantNegOne<double>();
+template <> half ConstantNegOne() { return FloatToHalf(-1.0f); }
+template <> float2 ConstantNegOne() { return {-1.0f, 0.0f}; }
+template <> double2 ConstantNegOne() { return {-1.0, 0.0}; }
 
-// Specialized version of the above for half-precision
-template <>
-half ConstantNegOne() {
-  return FloatToHalf(-1.0f);
-}
+// Returns a scalar of value 1
+template <typename T> T ConstantTwo() { return static_cast<T>(2.0); }
+template float ConstantTwo<float>();
+template double ConstantTwo<double>();
+template <> half ConstantTwo() { return FloatToHalf(2.0f); }
+template <> float2 ConstantTwo() { return {2.0f, 0.0f}; }
+template <> double2 ConstantTwo() { return {2.0, 0.0}; }
 
-// Specialized versions of the above for complex data-types
-template <>
-float2 ConstantNegOne() {
-  return {-1.0f, 0.0f};
-}
-template <>
-double2 ConstantNegOne() {
-  return {-1.0, 0.0};
-}
+// Returns a small scalar value just larger than 0
+template <typename T> T SmallConstant() { return static_cast<T>(1e7); }
+template float SmallConstant<float>();
+template double SmallConstant<double>();
+template <> half SmallConstant() { return FloatToHalf(1e7); }
+template <> float2 SmallConstant() { return {1e7, 0.0f}; }
+template <> double2 SmallConstant() { return {1e7, 0.0}; }
 
 // Returns the absolute value of a scalar
 template <typename T> T AbsoluteValue(const T value) { return std::fabs(value); }
@@ -127,6 +79,14 @@ template <> half AbsoluteValue(const half value) { return FloatToHalf(std::fabs(
 template <> float2 AbsoluteValue(const float2 value) { return std::abs(value); }
 template <> double2 AbsoluteValue(const double2 value) { return std::abs(value); }
 
+// Returns whether a scalar is close to zero
+template <typename T> bool IsCloseToZero(const T value) { return (value > -SmallConstant<T>()) && (value < SmallConstant<T>()); }
+template bool IsCloseToZero<float>(const float);
+template bool IsCloseToZero<double>(const double);
+template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); }
+template <> bool IsCloseToZero(const float2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
+template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
+
 // =================================================================================================
 
 // Implements the string conversion using std::to_string if possible
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 2c13658b..044955ea 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -99,24 +99,20 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
 // =================================================================================================
 
 // Returns a scalar with a default value
-template <typename T>
-T GetScalar();
+template <typename T> T GetScalar();
 
-// Returns a scalar of value 0
-template <typename T>
-T ConstantZero();
-
-// Returns a scalar of value 1
-template <typename T>
-T ConstantOne();
-
-// Returns a scalar of value -1
-template <typename T>
-T ConstantNegOne();
+// Fixed value scalars
+template <typename T> T ConstantZero();
+template <typename T> T ConstantOne();
+template <typename T> T ConstantNegOne();
+template <typename T> T ConstantTwo();
+template <typename T> T SmallConstant();
 
 // Returns the absolute value of a scalar
-template <typename T>
-T AbsoluteValue(const T value);
+template <typename T> T AbsoluteValue(const T value);
+
+// Returns whether a scalar is close to zero
+template <typename T> bool IsCloseToZero(const T value);
 
 // =================================================================================================
 
diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp
index 811feac5..72ebdf9e 100644
--- a/test/routines/level2/xtrsv.hpp
+++ b/test/routines/level2/xtrsv.hpp
@@ -46,10 +46,10 @@ void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
     auto diagonal = a_mat_cpu[i*args.a_ld + i + args.a_offset];
     diagonal = AbsoluteValue(diagonal) + static_cast<T>(args.n / size_t{4});
     for (auto j = size_t{0}; j < args.n; ++j) {
-      a_mat_cpu[j*args.a_ld + i + args.a_offset] /= T{2.0};
+      a_mat_cpu[j*args.a_ld + i + args.a_offset] /= ConstantTwo<T>();
     }
     a_mat_cpu[i*args.a_ld + i + args.a_offset] = diagonal;
-    x_vec_cpu[i * args.x_inc + args.x_offset] /= T{2.0};
+    x_vec_cpu[i * args.x_inc + args.x_offset] /= ConstantTwo<T>();
   }
 
   // Copies input buffers back to the OpenCL device
diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp
index e59c96ff..246cb930 100644
--- a/test/routines/level3/xtrsm.hpp
+++ b/test/routines/level3/xtrsm.hpp
@@ -29,6 +29,37 @@
 namespace clblast {
 // =================================================================================================
 
+// Prepares the data
+template <typename T>
+void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  const auto k = (args.side == Side::kLeft) ? args.m : args.n;
+  if (args.a_ld < k) { return; }
+
+  // Copies input buffers to the host
+  std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+  std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+  buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+  buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+
+  // Generates 'proper' input for the TRSM routine
+  // TODO: Improve this
+  for (auto i = size_t{0}; i < k; ++i) {
+    for (auto j = size_t{0}; j < k; ++j) {
+      auto value = a_mat_cpu[j*args.a_ld + i + args.a_offset];
+      value *= ConstantTwo<T>();
+      if (IsCloseToZero(value)) { value += ConstantOne<T>(); }
+      a_mat_cpu[j*args.a_ld + i + args.a_offset] = value;
+    }
+  }
+
+  // Copies input buffers back to the OpenCL device
+  buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+  buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+  return;
+}
+
+// =================================================================================================
+
 // See comment at top of file for a description of the class
 template <typename T>
 class TestXtrsm {
@@ -75,6 +106,7 @@ class TestXtrsm {
 
   // Describes how to run the CLBlast routine
   static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    PrepareData(args, buffers, queue);
     auto queue_plain = queue();
     auto event = cl_event{};
     auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
@@ -89,6 +121,7 @@ class TestXtrsm {
   // Describes how to run the clBLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CLBLAS
     static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      PrepareData(args, buffers, queue);
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXtrsm(convertToCLBLAS(args.layout),
@@ -108,6 +141,7 @@ class TestXtrsm {
   // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
   #ifdef CLBLAST_REF_CBLAS
     static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      PrepareData(args, buffers, queue);
       std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
       std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
       buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp
index 4408e8d5..c6ce4b07 100644
--- a/test/routines/levelx/xinvert.hpp
+++ b/test/routines/levelx/xinvert.hpp
@@ -41,9 +41,14 @@ StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &qu
   const auto num_blocks = CeilDiv(args.n, block_size);
   const auto a_ld = args.a_ld;
   const auto b_ld = block_size;
-  if ((block_size == 0) || (args.n == 0) || (block_size > args.n)) {
+
+  // Checks for valid arguments
+  if ((block_size == 0) || (args.n == 0)) {
     return StatusCode::kInvalidDimension;
   }
+  if ((block_size % 16 != 0) || (block_size > 128)) {
+    return StatusCode::kUnknownError;
+  }
 
   // Loops over the amount of diagonal blocks of size args.m by args.m each
   for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) {

From 492ee3d0a5f042db6fa6932158ea0779a4d9a5a5 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 25 Feb 2017 12:28:13 +0100
Subject: [PATCH 16/24] Removed the invert routine from the tests

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1c35a4c5..f9f452f0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -159,7 +159,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
-set(LEVELX_ROUTINES xomatcopy xinvert)
+set(LEVELX_ROUTINES xomatcopy)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464 16)
 
@@ -175,6 +175,7 @@ set(SOURCES
   src/clblast.cpp
   src/clblast_c.cpp
   src/routine.cpp
+  src/routines/levelx/xinvert.cpp  # only source, don't include it as a test
 )
 if(NETLIB)
   set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)

From ccac957f1735354fc1ad06a6e329c1cdabbad969 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 25 Feb 2017 13:02:15 +0100
Subject: [PATCH 17/24] Added documentation for the TRSV and TRSM routines

---
 CHANGELOG | 4 ++++
 README.md | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG b/CHANGELOG
index 20f17807..e8470206 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,10 @@ Development version (next release)
 - Tests now also exit with an error code when OpenCL errors or compilation errors occur
 - Various minor fixes and enhancements
 - Added tuned parameters for various devices (see README)
+- Added level-2 routines:
+  * STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized)
+- Added level-3 routines:
+  * STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized)
 
 Version 0.10.0
 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
diff --git a/README.md b/README.md
index 35e79db8..13b67300 100644
--- a/README.md
+++ b/README.md
@@ -254,6 +254,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
 | xSPR     | ✔ | ✔ | - | - | ✔ |
 | xSYR2    | ✔ | ✔ | - | - | ✔ |
 | xSPR2    | ✔ | ✔ | - | - | ✔ |
+| xTRSV    | ✔ | ✔ | ✔ | ✔ | ✔ | (experimental, un-optimized)
 
 | Level-3  | S | D | C | Z | H |
 | ---------|---|---|---|---|---|
@@ -265,6 +266,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
 | xSYR2K   | ✔ | ✔ | ✔ | ✔ | ✔ |
 | xHER2K   | - | - | ✔ | ✔ | - |
 | xTRMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xTRSM    | ✔ | ✔ | ✔ | ✔ | ✔ | (experimental, un-optimized)
 
 In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
 
@@ -275,7 +277,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
 | IxMIN      | ✔ | ✔ | ✔ | ✔ | ✔ |
 | xOMATCOPY  | ✔ | ✔ | ✔ | ✔ | ✔ |
 
-Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTRSV, xTBSV, xTPSV, and xTRSM.
+Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
 
 
 Half precision (fp16)

From a433987441e09684fb7b6f6c75962fd128cbfdbd Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 26 Feb 2017 10:18:45 +0100
Subject: [PATCH 18/24] Fixes division in the kernel for inversion of complex
 numbers

---
 src/kernels/common.opencl                        | 7 -------
 src/kernels/level3/invert_diagonal_blocks.opencl | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 0ce4f367..32e3fbb9 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -212,13 +212,6 @@ R"(
   #define MultiplySubtract(c, a, b) c -= a * b
 #endif
 
-// The scalar division function: real-value only
-#if PRECISION == 3232 || PRECISION == 6464
-  #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.y
-#else
-  #define DivideReal(c, a, b) c = a / b
-#endif
-
 // The scalar division function: full division
 #if PRECISION == 3232 || PRECISION == 6464
   #define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index d43b9b7c..c59bcbcb 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -102,7 +102,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
     if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
       real constant_one;
       SetToOne(constant_one);
-      DivideReal(inverted_diagonal, constant_one, diagonal_value);
+      DivideFull(inverted_diagonal, constant_one, diagonal_value);
     }
   }
   lm[thread_index][thread_index] = inverted_diagonal;

From 70d8c4bad7911b3688ac4514fedc44e5e0f1f2d8 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 26 Feb 2017 10:19:53 +0100
Subject: [PATCH 19/24] Improved the correctness tests for complex numbers in
 case either real or imag is much larger than the other

---
 test/correctness/tester.cpp | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp
index dd2f3f99..eb79008d 100644
--- a/test/correctness/tester.cpp
+++ b/test/correctness/tester.cpp
@@ -473,15 +473,21 @@ template bool TestSimilarity<double>(const double, const double);
 // Specialisations for non-standard data-types
 template <>
 bool TestSimilarity(const float2 val1, const float2 val2) {
-  auto real = TestSimilarity(val1.real(), val2.real());
-  auto imag = TestSimilarity(val1.imag(), val2.imag());
-  return (real && imag);
+  const auto real = TestSimilarity(val1.real(), val2.real());
+  const auto imag = TestSimilarity(val1.imag(), val2.imag());
+  if (real && imag) { return true; }
+  // also OK if one is good and the combined is good (indicates a big diff between real & imag)
+  if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); }
+  return false; // neither real nor imag is good, return false
 }
 template <>
 bool TestSimilarity(const double2 val1, const double2 val2) {
-  auto real = TestSimilarity(val1.real(), val2.real());
-  auto imag = TestSimilarity(val1.imag(), val2.imag());
-  return (real && imag);
+  const auto real = TestSimilarity(val1.real(), val2.real());
+  const auto imag = TestSimilarity(val1.imag(), val2.imag());
+  if (real && imag) { return true; }
+  // also OK if one is good and the combined is good (indicates a big diff between real & imag)
+  if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); }
+  return false; // neither real nor imag is good, return false
 }
 template <>
 bool TestSimilarity(const half val1, const half val2) {

From b7310036eda482e8871d6a9d1e1660f93be1fd49 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 26 Feb 2017 12:56:21 +0100
Subject: [PATCH 20/24] Removed half-precision support from the TRSM routine;
 too unstable

---
 README.md                                  |  4 ++--
 doc/clblast.md                             |  6 ------
 include/clblast.h                          |  2 +-
 include/clblast_c.h                        |  8 +-------
 include/clblast_netlib_c.h                 |  2 +-
 scripts/generator/generator.py             |  2 +-
 src/clblast.cpp                            |  9 +--------
 src/clblast_c.cpp                          | 21 ---------------------
 test/correctness/routines/level3/xtrsm.cpp |  1 -
 test/performance/routines/level3/xtrsm.cpp |  3 +--
 test/wrapper_cblas.hpp                     | 14 --------------
 test/wrapper_clblas.hpp                    | 18 ------------------
 12 files changed, 8 insertions(+), 82 deletions(-)

diff --git a/README.md b/README.md
index 13b67300..5500fc9d 100644
--- a/README.md
+++ b/README.md
@@ -254,7 +254,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
 | xSPR     | ✔ | ✔ | - | - | ✔ |
 | xSYR2    | ✔ | ✔ | - | - | ✔ |
 | xSPR2    | ✔ | ✔ | - | - | ✔ |
-| xTRSV    | ✔ | ✔ | ✔ | ✔ | ✔ | (experimental, un-optimized)
+| xTRSV    | ✔ | ✔ | ✔ | ✔ |   | (experimental, un-optimized)
 
 | Level-3  | S | D | C | Z | H |
 | ---------|---|---|---|---|---|
@@ -266,7 +266,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
 | xSYR2K   | ✔ | ✔ | ✔ | ✔ | ✔ |
 | xHER2K   | - | - | ✔ | ✔ | - |
 | xTRMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
-| xTRSM    | ✔ | ✔ | ✔ | ✔ | ✔ | (experimental, un-optimized)
+| xTRSM    | ✔ | ✔ | ✔ | ✔ |   | (experimental, un-optimized)
 
 In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
 
diff --git a/doc/clblast.md b/doc/clblast.md
index d90cb61b..909bd823 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -2807,12 +2807,6 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                cl_command_queue* queue, cl_event* event)
-CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
-                               const size_t m, const size_t n,
-                               const cl_half alpha,
-                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
-                               cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to TRSM:
diff --git a/include/clblast.h b/include/clblast.h
index 7b2021d8..43a3fbf3 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -583,7 +583,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
                 cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
 template <typename T>
 StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t m, const size_t n,
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 72f50d83..63b6c941 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -1258,7 +1258,7 @@ CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBl
                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           cl_command_queue* queue, cl_event* event);
 
-// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
 CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                                           const size_t m, const size_t n,
                                           const float alpha,
@@ -1283,12 +1283,6 @@ CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBl
                                           const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                           cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                           cl_command_queue* queue, cl_event* event);
-CLBlastStatusCode PUBLIC_API CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
-                                          const size_t m, const size_t n,
-                                          const cl_half alpha,
-                                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                          cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
-                                          cl_command_queue* queue, cl_event* event);
 
 // =================================================================================================
 // Extra non-BLAS routines (level-X)
diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h
index b5577cfa..384fab20 100644
--- a/include/clblast_netlib_c.h
+++ b/include/clblast_netlib_c.h
@@ -862,7 +862,7 @@ void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side,
                             const void* a, const int a_ld,
                             void* b, const int b_ld);
 
-// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
 void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
                             const int m, const int n,
                             const float alpha,
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 8624938c..ebbaa6e6 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -154,7 +154,7 @@ ROUTINES = [
   Routine(True,  True,  "3", "syr2k", T,  [S,D,C,Z,H],     ["n","k"],            ["layout","triangle","ab_transpose"],                  ["a","b"],  ["c"],                        [ankab,bnkab,cn],["alpha","beta"], "",    "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
   Routine(True,  True,  "3", "her2k", TU, [Ccs,Zzd],       ["n","k"],            ["layout","triangle","ab_transpose"],                  ["a","b"],  ["c"],                        [ankab,bnkab,cn],["alpha","beta"], "",    "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
   Routine(True,  True,  "3", "trmm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        [amns,bmn],      ["alpha"],        "",    "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
-  Routine(True,  True,  "3", "trsm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        [amns,bmn],      ["alpha"],        "",    "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []),
+  Routine(True,  True,  "3", "trsm",  T,  [S,D,C,Z],       ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        [amns,bmn],      ["alpha"],        "",    "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []),
 ],
 [  # Level X: extra routines (not part of BLAS)
   Routine(True,  True,  "x", "omatcopy", T, [S,D,C,Z,H],   ["m","n"],            ["layout","a_transpose"],                              ["a"],      ["b"],                        [amn,bnma],      ["alpha"],        "",    "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
diff --git a/src/clblast.cpp b/src/clblast.cpp
index 20ce1ba4..48bec54d 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -2075,7 +2075,7 @@ template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triang
                                           cl_mem, const size_t, const size_t,
                                           cl_command_queue*, cl_event*);
 
-// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
 template <typename T>
 StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t m, const size_t n,
@@ -2118,12 +2118,6 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
                                              const cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
-template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
-                                          const size_t, const size_t,
-                                          const half,
-                                          const cl_mem, const size_t, const size_t,
-                                          cl_mem, const size_t, const size_t,
-                                          cl_command_queue*, cl_event*);
 
 // =================================================================================================
 // Extra non-BLAS routines (level-X)
@@ -2178,7 +2172,6 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
                                               const cl_mem, const size_t, const size_t,
                                               cl_mem, const size_t, const size_t,
                                               cl_command_queue*, cl_event*);
-
 // =================================================================================================
 
 // Clears the cache of stored binaries
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index e4f2b3ed..07331e3a 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3349,27 +3349,6 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
     );
   } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
 }
-CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
-                               const size_t m, const size_t n,
-                               const cl_half alpha,
-                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
-                               cl_command_queue* queue, cl_event* event) {
-  try {
-    return static_cast<CLBlastStatusCode>(
-      clblast::Trsm(static_cast<clblast::Layout>(layout),
-                    static_cast<clblast::Side>(side),
-                    static_cast<clblast::Triangle>(triangle),
-                    static_cast<clblast::Transpose>(a_transpose),
-                    static_cast<clblast::Diagonal>(diagonal),
-                    m, n,
-                    alpha,
-                    a_buffer, a_offset, a_ld,
-                    b_buffer, b_offset, b_ld,
-                    queue, event)
-    );
-  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
-}
 
 // =================================================================================================
 // Extra non-BLAS routines (level-X)
diff --git a/test/correctness/routines/level3/xtrsm.cpp b/test/correctness/routines/level3/xtrsm.cpp
index 6119bd17..bc45a8bf 100644
--- a/test/correctness/routines/level3/xtrsm.cpp
+++ b/test/correctness/routines/level3/xtrsm.cpp
@@ -23,7 +23,6 @@ int main(int argc, char *argv[]) {
   errors += clblast::RunTests<clblast::TestXtrsm<double>, double, double>(argc, argv, true, "DTRSM");
   errors += clblast::RunTests<clblast::TestXtrsm<float2>, float2, float2>(argc, argv, true, "CTRSM");
   errors += clblast::RunTests<clblast::TestXtrsm<double2>, double2, double2>(argc, argv, true, "ZTRSM");
-  errors += clblast::RunTests<clblast::TestXtrsm<half>, half, half>(argc, argv, true, "HTRSM");
   if (errors > 0) { return 1; } else { return 0; }
 }
 
diff --git a/test/performance/routines/level3/xtrsm.cpp b/test/performance/routines/level3/xtrsm.cpp
index ef094891..342274b7 100644
--- a/test/performance/routines/level3/xtrsm.cpp
+++ b/test/performance/routines/level3/xtrsm.cpp
@@ -20,8 +20,7 @@ using double2 = clblast::double2;
 int main(int argc, char *argv[]) {
   const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
   switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf:
-      clblast::RunClient<clblast::TestXtrsm<half>, half, half>(argc, argv); break;
+    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXtrsm<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp
index 5f1db54e..dd610a6c 100644
--- a/test/wrapper_cblas.hpp
+++ b/test/wrapper_cblas.hpp
@@ -2103,20 +2103,6 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL
               reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
               reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld);
 }
-void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
-                const size_t m, const size_t n,
-                const half alpha,
-                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
-                std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld) {
-  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
-  auto b_buffer_bis = HalfToFloatBuffer(b_buffer);
-  cblasXtrsm(layout, side, triangle, a_transpose, diagonal,
-             m, n,
-             HalfToFloat(alpha),
-             a_buffer_bis, a_offset, a_ld,
-             b_buffer_bis, b_offset, b_ld);
-  FloatToHalfBuffer(b_buffer, b_buffer_bis);
-}
 
 // =================================================================================================
 } // namespace clblast
diff --git a/test/wrapper_clblas.hpp b/test/wrapper_clblas.hpp
index f1923784..f1b3a0c4 100644
--- a/test/wrapper_clblas.hpp
+++ b/test/wrapper_clblas.hpp
@@ -2865,24 +2865,6 @@ clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const
                      b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
-clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
-                         const size_t m, const size_t n,
-                         const half alpha,
-                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
-                         Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
-                         cl_uint num_queues, cl_command_queue *queues,
-                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
-  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
-  auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal,
-                            m, n,
-                            HalfToFloat(alpha),
-                            a_buffer_bis, a_offset, a_ld,
-                            b_buffer_bis, b_offset, b_ld,
-                            num_queues, queues, num_wait_events, wait_events, events);
-  FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]);
-  return status;
-}
 
 // =================================================================================================
 } // namespace clblast

From df7638c3058c59b173f04cadef78c1955ac008f6 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 26 Feb 2017 14:31:05 +0100
Subject: [PATCH 21/24] Fixed an out-of-bounds memory access when filling a
 matrix with a constant

---
 src/kernels/level3/level3.opencl |  4 ++--
 src/routines/common.hpp          | 15 ++++++++-------
 src/routines/level3/xtrsm.cpp    |  3 ++-
 src/routines/levelx/xinvert.cpp  |  2 +-
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl
index 0f5a8607..5ba8cf29 100644
--- a/src/kernels/level3/level3.opencl
+++ b/src/kernels/level3/level3.opencl
@@ -77,12 +77,12 @@ R"(
 #if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
 
 __kernel __attribute__((reqd_work_group_size(8, 8, 1)))
-void FillMatrix(const int n, const int ld, const int offset,
+void FillMatrix(const int m, const int n, const int ld, const int offset,
                 __global real* restrict dest, const real_arg arg_value) {
   const real value = GetRealArg(arg_value);
   const int id_one = get_global_id(0);
   const int id_two = get_global_id(1);
-  if (id_one < ld && id_two < n) {
+  if (id_one < m && id_two < n) {
     dest[id_two*ld + id_one + offset] = value;
   }
 }
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index bdea0086..47d62027 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -38,17 +38,18 @@ template <typename T>
 void FillMatrix(Queue &queue, const Device &device,
                 const Program &program, const Database &,
                 EventPointer event, const std::vector<Event> &waitForEvents,
-                const size_t n, const size_t ld, const size_t offset,
+                const size_t m, const size_t n, const size_t ld, const size_t offset,
                 const Buffer<T> &dest,
                 const T constant_value) {
   auto kernel = Kernel(program, "FillMatrix");
-  kernel.SetArgument(0, static_cast<int>(n));
-  kernel.SetArgument(1, static_cast<int>(ld));
-  kernel.SetArgument(2, static_cast<int>(offset));
-  kernel.SetArgument(3, dest());
-  kernel.SetArgument(4, GetRealArg(constant_value));
+  kernel.SetArgument(0, static_cast<int>(m));
+  kernel.SetArgument(1, static_cast<int>(n));
+  kernel.SetArgument(2, static_cast<int>(ld));
+  kernel.SetArgument(3, static_cast<int>(offset));
+  kernel.SetArgument(4, dest());
+  kernel.SetArgument(5, GetRealArg(constant_value));
   auto local = std::vector<size_t>{8, 8};
-  auto global = std::vector<size_t>{Ceil(ld, 8), Ceil(n, 8)};
+  auto global = std::vector<size_t>{Ceil(m, 8), Ceil(n, 8)};
   RunKernel(kernel, queue, device, global, local, event, waitForEvents);
 }
 
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 42855362..b734dd2d 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -91,6 +91,7 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
   // Creates a copy of B to avoid overwriting input in GEMM while computing output
   const auto b_size = b_ld * (n - 1) + m + b_offset;
   const auto x_one = m;
+  const auto x_two = n;
   const auto x_size = b_size;
   const auto x_ld = b_ld;
   const auto x_offset = b_offset;
@@ -105,7 +106,7 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
   auto eventWaitList = std::vector<Event>();
   auto fill_matrix_event = Event();
   FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList,
-             x_one, x_ld, x_offset, x_buffer, ConstantZero<T>());
+             x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>());
   fill_matrix_event.WaitForCompletion();
 
   // Inverts the diagonal blocks
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index bcc3706d..5c21d5ce 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -73,7 +73,7 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
   auto event_wait_list = std::vector<Event>();
   auto fill_matrix_event = Event();
   FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list,
-             num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
+             block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
   event_wait_list.push_back(fill_matrix_event);
 
   // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group

From a145890aaac0087d36b414bd59c247ae4b70b3e5 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 26 Feb 2017 14:37:29 +0100
Subject: [PATCH 22/24] Added a guard against invalid buffer sizes in the
 prepare-data functions for tests

---
 test/routines/level2/xtrsv.hpp | 1 +
 test/routines/level3/xtrsm.hpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp
index 72ebdf9e..fed4378a 100644
--- a/test/routines/level2/xtrsv.hpp
+++ b/test/routines/level2/xtrsv.hpp
@@ -33,6 +33,7 @@ namespace clblast {
 template <typename T>
 void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
   if (args.a_ld < args.n) { return; }
+  if (args.a_size <= 0 || args.x_size <= 0) { return; }
 
   // Copies input buffers to the host
   std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp
index 246cb930..1ffaef35 100644
--- a/test/routines/level3/xtrsm.hpp
+++ b/test/routines/level3/xtrsm.hpp
@@ -34,6 +34,7 @@ template <typename T>
 void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
   const auto k = (args.side == Side::kLeft) ? args.m : args.n;
   if (args.a_ld < k) { return; }
+  if (args.a_size <= 0 || args.b_size <= 0) { return; }
 
   // Copies input buffers to the host
   std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));

From dde67ac79e2f03ab9bdd89883477f2edf39afe46 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 26 Feb 2017 14:53:58 +0100
Subject: [PATCH 23/24] Minor fix to the generator script

---
 scripts/generator/generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 741aa80c..09c743bb 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -42,7 +42,7 @@ FILES = [
     "/src/clblast_netlib_c.cpp",
 ]
 HEADER_LINES = [121, 75, 125, 23, 29, 41, 65, 32]
-FOOTER_LINES = [25, 139, 27, 38, 6, 6, 9, 2]
+FOOTER_LINES = [25, 138, 27, 38, 6, 6, 9, 2]
 HEADER_LINES_DOC = 0
 FOOTER_LINES_DOC = 63
 

From e09c26c7069d81e437ba110175ccb558023cf46a Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 26 Feb 2017 15:03:12 +0100
Subject: [PATCH 24/24] Split the GEMM kernel further up to prevent C1091 in
 MSVC

---
 src/routines/level3/xgemm.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 7bd388c1..dc8c64bc 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -33,10 +33,11 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/convert_symmetric.opencl"
     #include "../../kernels/level3/convert_triangular.opencl"
     #include "../../kernels/level3/convert_hermitian.opencl"
+    , // separated in multiple parts to prevent C1091 in MSVC 2013
     #include "../../kernels/level3/xgemm_direct_part1.opencl"
     #include "../../kernels/level3/xgemm_direct_part2.opencl"
     #include "../../kernels/level3/xgemm_direct_part3.opencl"
-    , // separated in two parts to prevent C1091 in MSVC 2013
+    , // separated in multiple parts to prevent C1091 in MSVC 2013
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
     #include "../../kernels/level3/xgemm_part3.opencl"