Merge pull request #125 from CNugteren/netlib_blas_api

Netlib CBLAS API for CLBlast
2024-07-02 12:26:57 +02:00 · 2016-11-24 19:35:59 +01:00 · 2016-11-24 19:35:59 +01:00 · cb398f0e42
parent 88ba1f4db9 2ff3f77392
commit cb398f0e42
13 changed files with 5984 additions and 62 deletions
--- a/1
+++ b/1
@ -2,6 +2,7 @@
 Development version (next release)
 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
 - Changed the enums in the C API to avoid potential name clashes with external code
+- Added a Netlib CBLAS compatible API (not recommended for full control over performance)
 - Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
 - Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
 - Fixed a bug in the tests and samples related to waiting for an invalid event
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -27,6 +27,7 @@ option(SAMPLES "Enable compilation of the examples" OFF)
 option(TUNERS "Enable compilation of the tuners" OFF)
 option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
 option(TESTS "Enable compilation of the correctness tests" OFF)
+option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)

 # Compile in verbose mode with additional diagnostic messages
 option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
@ -151,6 +152,9 @@ set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
            xgemm xgemm_direct xgemv)
 set(SAMPLE_PROGRAMS_CPP sgemm)
 set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
+if(NETLIB)
+  set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
+endif()
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
                    xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@ -172,6 +176,9 @@ set(SOURCES
  src/clblast_c.cpp
  src/routine.cpp
 )
+if(NETLIB)
+  set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
+endif()
 foreach(ROUTINE ${LEVEL1_ROUTINES})
  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
 endforeach()
@ -213,6 +220,9 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
 install(FILES include/clblast.h DESTINATION include)
 install(FILES include/clblast_c.h DESTINATION include)
 install(FILES include/clblast_half.h DESTINATION include)
+if(NETLIB)
+  install(FILES include/clblast_netlib_c.h DESTINATION include)
+endif()

 # Installs the config for find_package in dependent projects
 install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
--- a/README.md
+++ b/README.md
@ -96,6 +96,10 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t

 Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.

+There is also a Netlib CBLAS C API available. It is, however, not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions, performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
+
+    #include <clblast_netlib_c.h>
+

 Using the tuners (optional)
 -------------
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@ -117,11 +117,6 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
                                CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
 typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;

-// Precision scoped enum (values in bits)
-typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32,
-                                 CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232,
-                                 CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision;
-
 // =================================================================================================
 // BLAS level-1 (vector-vector) routines
 // =================================================================================================
--- a/include/clblast_netlib_c.h
+++ b/include/clblast_netlib_c.h
@ -0,0 +1,920 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer
+// copies automatically and running on the default OpenCL platform and device. For full control over
+// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLBLAST_NETLIB_C_H_
+#define CLBLAST_CLBLAST_NETLIB_C_H_
+
+// Exports library functions under Windows when building a DLL. See also:
+// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
+#if defined(_WIN32) && defined(CLBLAST_DLL)
+  #if defined(COMPILING_DLL)
+    #define PUBLIC_API __declspec(dllexport)
+  #else
+    #define PUBLIC_API __declspec(dllimport)
+  #endif
+#else
+  #define PUBLIC_API
+#endif
+
+// The C interface
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// =================================================================================================
+
+// Matrix layout and transpose types
+typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101,
+                              CLBlastLayoutColMajor = 102 } CLBlastLayout;
+typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112,
+                                 CLBlastTransposeConjugate = 113 } CLBlastTranspose;
+typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121,
+                                CLBlastTriangleLower = 122 } CLBlastTriangle;
+typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
+                                CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
+typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
+
+// For full compatibility with CBLAS
+typedef CLBlastLayout CBLAS_ORDER;
+typedef CLBlastTranspose CBLAS_TRANSPOSE;
+typedef CLBlastTriangle CBLAS_UPLO;
+typedef CLBlastDiagonal CBLAS_DIAG;
+typedef CLBlastSide CBLAS_SIDE;
+#define CblasRowMajor CLBlastLayoutRowMajor
+#define CblasColMajor CLBlastLayoutColMajor
+#define CblasNoTrans CLBlastTransposeNo
+#define CblasTrans CLBlastTransposeYes
+#define CblasConjTrans CLBlastTransposeConjugate
+#define CblasUpper CLBlastTriangleUpper
+#define CblasLower CLBlastTriangleLower
+#define CblasNonUnit CLBlastDiagonalNonUnit
+#define CblasUnit CLBlastDiagonalUnit
+#define CblasLeft CLBlastSideLeft
+#define CblasRight CLBlastSideRight
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+// =================================================================================================
+
+// Generate Givens plane rotation: SROTG/DROTG
+void PUBLIC_API cblas_srotg(float* sa,
+                            float* sb,
+                            float* sc,
+                            float* ss);
+void PUBLIC_API cblas_drotg(double* sa,
+                            double* sb,
+                            double* sc,
+                            double* ss);
+
+// Generate modified Givens plane rotation: SROTMG/DROTMG
+void PUBLIC_API cblas_srotmg(float* sd1,
+                             float* sd2,
+                             float* sx1,
+                             const float sy1,
+                             float* sparam);
+void PUBLIC_API cblas_drotmg(double* sd1,
+                             double* sd2,
+                             double* sx1,
+                             const double sy1,
+                             double* sparam);
+
+// Apply Givens plane rotation: SROT/DROT
+void PUBLIC_API cblas_srot(const int n,
+                           float* x, const int x_inc,
+                           float* y, const int y_inc,
+                           const float cos,
+                           const float sin);
+void PUBLIC_API cblas_drot(const int n,
+                           double* x, const int x_inc,
+                           double* y, const int y_inc,
+                           const double cos,
+                           const double sin);
+
+// Apply modified Givens plane rotation: SROTM/DROTM
+void PUBLIC_API cblas_srotm(const int n,
+                            float* x, const int x_inc,
+                            float* y, const int y_inc,
+                            float* sparam);
+void PUBLIC_API cblas_drotm(const int n,
+                            double* x, const int x_inc,
+                            double* y, const int y_inc,
+                            double* sparam);
+
+// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
+void PUBLIC_API cblas_sswap(const int n,
+                            float* x, const int x_inc,
+                            float* y, const int y_inc);
+void PUBLIC_API cblas_dswap(const int n,
+                            double* x, const int x_inc,
+                            double* y, const int y_inc);
+void PUBLIC_API cblas_cswap(const int n,
+                            void* x, const int x_inc,
+                            void* y, const int y_inc);
+void PUBLIC_API cblas_zswap(const int n,
+                            void* x, const int x_inc,
+                            void* y, const int y_inc);
+
+// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
+void PUBLIC_API cblas_sscal(const int n,
+                            const float alpha,
+                            float* x, const int x_inc);
+void PUBLIC_API cblas_dscal(const int n,
+                            const double alpha,
+                            double* x, const int x_inc);
+void PUBLIC_API cblas_cscal(const int n,
+                            const void* alpha,
+                            void* x, const int x_inc);
+void PUBLIC_API cblas_zscal(const int n,
+                            const void* alpha,
+                            void* x, const int x_inc);
+
+// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
+void PUBLIC_API cblas_scopy(const int n,
+                            const float* x, const int x_inc,
+                            float* y, const int y_inc);
+void PUBLIC_API cblas_dcopy(const int n,
+                            const double* x, const int x_inc,
+                            double* y, const int y_inc);
+void PUBLIC_API cblas_ccopy(const int n,
+                            const void* x, const int x_inc,
+                            void* y, const int y_inc);
+void PUBLIC_API cblas_zcopy(const int n,
+                            const void* x, const int x_inc,
+                            void* y, const int y_inc);
+
+// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
+void PUBLIC_API cblas_saxpy(const int n,
+                            const float alpha,
+                            const float* x, const int x_inc,
+                            float* y, const int y_inc);
+void PUBLIC_API cblas_daxpy(const int n,
+                            const double alpha,
+                            const double* x, const int x_inc,
+                            double* y, const int y_inc);
+void PUBLIC_API cblas_caxpy(const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            void* y, const int y_inc);
+void PUBLIC_API cblas_zaxpy(const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            void* y, const int y_inc);
+
+// Dot product of two vectors: SDOT/DDOT/HDOT
+float PUBLIC_API cblas_sdot(const int n,
+                            const float* x, const int x_inc,
+                            const float* y, const int y_inc);
+double PUBLIC_API cblas_ddot(const int n,
+                             const double* x, const int x_inc,
+                             const double* y, const int y_inc);
+
+// Dot product of two complex vectors: CDOTU/ZDOTU
+void PUBLIC_API cblas_cdotu_sub(const int n,
+                                const void* x, const int x_inc,
+                                const void* y, const int y_inc,
+                                void* dot);
+void PUBLIC_API cblas_zdotu_sub(const int n,
+                                const void* x, const int x_inc,
+                                const void* y, const int y_inc,
+                                void* dot);
+
+// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
+void PUBLIC_API cblas_cdotc_sub(const int n,
+                                const void* x, const int x_inc,
+                                const void* y, const int y_inc,
+                                void* dot);
+void PUBLIC_API cblas_zdotc_sub(const int n,
+                                const void* x, const int x_inc,
+                                const void* y, const int y_inc,
+                                void* dot);
+
+// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
+float PUBLIC_API cblas_snrm2(const int n,
+                             const float* x, const int x_inc);
+double PUBLIC_API cblas_dnrm2(const int n,
+                              const double* x, const int x_inc);
+float PUBLIC_API cblas_scnrm2(const int n,
+                             const void* x, const int x_inc);
+double PUBLIC_API cblas_dznrm2(const int n,
+                              const void* x, const int x_inc);
+
+// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
+float PUBLIC_API cblas_sasum(const int n,
+                             const float* x, const int x_inc);
+double PUBLIC_API cblas_dasum(const int n,
+                              const double* x, const int x_inc);
+float PUBLIC_API cblas_scasum(const int n,
+                             const void* x, const int x_inc);
+double PUBLIC_API cblas_dzasum(const int n,
+                              const void* x, const int x_inc);
+
+// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
+float PUBLIC_API cblas_ssum(const int n,
+                            const float* x, const int x_inc);
+double PUBLIC_API cblas_dsum(const int n,
+                             const double* x, const int x_inc);
+float PUBLIC_API cblas_scsum(const int n,
+                            const void* x, const int x_inc);
+double PUBLIC_API cblas_dzsum(const int n,
+                             const void* x, const int x_inc);
+
+// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
+int PUBLIC_API cblas_isamax(const int n,
+                           const float* x, const int x_inc);
+int PUBLIC_API cblas_idamax(const int n,
+                           const double* x, const int x_inc);
+int PUBLIC_API cblas_icamax(const int n,
+                           const void* x, const int x_inc);
+int PUBLIC_API cblas_izamax(const int n,
+                           const void* x, const int x_inc);
+
+// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
+int PUBLIC_API cblas_ismax(const int n,
+                          const float* x, const int x_inc);
+int PUBLIC_API cblas_idmax(const int n,
+                          const double* x, const int x_inc);
+int PUBLIC_API cblas_icmax(const int n,
+                          const void* x, const int x_inc);
+int PUBLIC_API cblas_izmax(const int n,
+                          const void* x, const int x_inc);
+
+// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
+int PUBLIC_API cblas_ismin(const int n,
+                          const float* x, const int x_inc);
+int PUBLIC_API cblas_idmin(const int n,
+                          const double* x, const int x_inc);
+int PUBLIC_API cblas_icmin(const int n,
+                          const void* x, const int x_inc);
+int PUBLIC_API cblas_izmin(const int n,
+                          const void* x, const int x_inc);
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
+void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                            const int m, const int n,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            const float* x, const int x_inc,
+                            const float beta,
+                            float* y, const int y_inc);
+void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                            const int m, const int n,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            const double* x, const int x_inc,
+                            const double beta,
+                            double* y, const int y_inc);
+void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+
+// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
+void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                            const int m, const int n, const int kl, const int ku,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            const float* x, const int x_inc,
+                            const float beta,
+                            float* y, const int y_inc);
+void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                            const int m, const int n, const int kl, const int ku,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            const double* x, const int x_inc,
+                            const double beta,
+                            double* y, const int y_inc);
+void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                            const int m, const int n, const int kl, const int ku,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                            const int m, const int n, const int kl, const int ku,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+
+// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
+void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+
+// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
+void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n, const int k,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n, const int k,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+
+// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
+void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const void* alpha,
+                            const void* ap,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const void* alpha,
+                            const void* ap,
+                            const void* x, const int x_inc,
+                            const void* beta,
+                            void* y, const int y_inc);
+
+// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
+void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            const float* x, const int x_inc,
+                            const float beta,
+                            float* y, const int y_inc);
+void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            const double* x, const int x_inc,
+                            const double beta,
+                            double* y, const int y_inc);
+
+// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
+void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n, const int k,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            const float* x, const int x_inc,
+                            const float beta,
+                            float* y, const int y_inc);
+void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n, const int k,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            const double* x, const int x_inc,
+                            const double beta,
+                            double* y, const int y_inc);
+
+// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
+void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const float alpha,
+                            const float* ap,
+                            const float* x, const int x_inc,
+                            const float beta,
+                            float* y, const int y_inc);
+void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const double alpha,
+                            const double* ap,
+                            const double* x, const int x_inc,
+                            const double beta,
+                            double* y, const int y_inc);
+
+// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
+void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const float* a, const int a_ld,
+                            float* x, const int x_inc);
+void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const double* a, const int a_ld,
+                            double* x, const int x_inc);
+void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const void* a, const int a_ld,
+                            void* x, const int x_inc);
+void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const void* a, const int a_ld,
+                            void* x, const int x_inc);
+
+// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
+void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n, const int k,
+                            const float* a, const int a_ld,
+                            float* x, const int x_inc);
+void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n, const int k,
+                            const double* a, const int a_ld,
+                            double* x, const int x_inc);
+void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n, const int k,
+                            const void* a, const int a_ld,
+                            void* x, const int x_inc);
+void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n, const int k,
+                            const void* a, const int a_ld,
+                            void* x, const int x_inc);
+
+// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
+void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const float* ap,
+                            float* x, const int x_inc);
+void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const double* ap,
+                            double* x, const int x_inc);
+void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const void* ap,
+                            void* x, const int x_inc);
+void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const void* ap,
+                            void* x, const int x_inc);
+
+// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
+void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const float* a, const int a_ld,
+                            float* x, const int x_inc);
+void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const double* a, const int a_ld,
+                            double* x, const int x_inc);
+void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const void* a, const int a_ld,
+                            void* x, const int x_inc);
+void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const void* a, const int a_ld,
+                            void* x, const int x_inc);
+
+// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
+void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n, const int k,
+                            const float* a, const int a_ld,
+                            float* x, const int x_inc);
+void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n, const int k,
+                            const double* a, const int a_ld,
+                            double* x, const int x_inc);
+void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n, const int k,
+                            const void* a, const int a_ld,
+                            void* x, const int x_inc);
+void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n, const int k,
+                            const void* a, const int a_ld,
+                            void* x, const int x_inc);
+
+// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
+void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const float* ap,
+                            float* x, const int x_inc);
+void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const double* ap,
+                            double* x, const int x_inc);
+void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const void* ap,
+                            void* x, const int x_inc);
+void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int n,
+                            const void* ap,
+                            void* x, const int x_inc);
+
+// General rank-1 matrix update: SGER/DGER (half-precision HGER is not declared here)
+void PUBLIC_API cblas_sger(const CLBlastLayout layout,
+                           const int m, const int n,
+                           const float alpha,
+                           const float* x, const int x_inc,
+                           const float* y, const int y_inc,
+                           float* a, const int a_ld);
+void PUBLIC_API cblas_dger(const CLBlastLayout layout,
+                           const int m, const int n,
+                           const double alpha,
+                           const double* x, const int x_inc,
+                           const double* y, const int y_inc,
+                           double* a, const int a_ld);
+
+// General rank-1 complex matrix update: CGERU/ZGERU
+void PUBLIC_API cblas_cgeru(const CLBlastLayout layout,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            const void* y, const int y_inc,
+                            void* a, const int a_ld);
+void PUBLIC_API cblas_zgeru(const CLBlastLayout layout,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            const void* y, const int y_inc,
+                            void* a, const int a_ld);
+
+// General rank-1 complex conjugated matrix update: CGERC/ZGERC
+void PUBLIC_API cblas_cgerc(const CLBlastLayout layout,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            const void* y, const int y_inc,
+                            void* a, const int a_ld);
+void PUBLIC_API cblas_zgerc(const CLBlastLayout layout,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            const void* y, const int y_inc,
+                            void* a, const int a_ld);
+
+// Hermitian rank-1 matrix update: CHER/ZHER
+void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                           const int n,
+                           const float alpha,
+                           const void* x, const int x_inc,
+                           void* a, const int a_ld);
+void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                           const int n,
+                           const double alpha,
+                           const void* x, const int x_inc,
+                           void* a, const int a_ld);
+
+// Hermitian packed rank-1 matrix update: CHPR/ZHPR
+void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                           const int n,
+                           const float alpha,
+                           const void* x, const int x_inc,
+                           void* ap);
+void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                           const int n,
+                           const double alpha,
+                           const void* x, const int x_inc,
+                           void* ap);
+
+// Hermitian rank-2 matrix update: CHER2/ZHER2
+void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            const void* y, const int y_inc,
+                            void* a, const int a_ld);
+void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            const void* y, const int y_inc,
+                            void* a, const int a_ld);
+
+// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
+void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            const void* y, const int y_inc,
+                            void* ap);
+void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const void* alpha,
+                            const void* x, const int x_inc,
+                            const void* y, const int y_inc,
+                            void* ap);
+
+// Symmetric rank-1 matrix update: SSYR/DSYR (half-precision HSYR is not declared here)
+void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                           const int n,
+                           const float alpha,
+                           const float* x, const int x_inc,
+                           float* a, const int a_ld);
+void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                           const int n,
+                           const double alpha,
+                           const double* x, const int x_inc,
+                           double* a, const int a_ld);
+
+// Symmetric packed rank-1 matrix update: SSPR/DSPR (half-precision HSPR is not declared here)
+void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                           const int n,
+                           const float alpha,
+                           const float* x, const int x_inc,
+                           float* ap);
+void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                           const int n,
+                           const double alpha,
+                           const double* x, const int x_inc,
+                           double* ap);
+
+// Symmetric rank-2 matrix update: SSYR2/DSYR2 (half-precision HSYR2 is not declared here)
+void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const float alpha,
+                            const float* x, const int x_inc,
+                            const float* y, const int y_inc,
+                            float* a, const int a_ld);
+void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const double alpha,
+                            const double* x, const int x_inc,
+                            const double* y, const int y_inc,
+                            double* a, const int a_ld);
+
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 (half-precision HSPR2 is not declared here)
+void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const float alpha,
+                            const float* x, const int x_inc,
+                            const float* y, const int y_inc,
+                            float* ap);
+void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                            const int n,
+                            const double alpha,
+                            const double* x, const int x_inc,
+                            const double* y, const int y_inc,
+                            double* ap);
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+// =================================================================================================
+
+// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM (half-precision HGEMM is not declared here)
+void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                            const int m, const int n, const int k,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            const float* b, const int b_ld,
+                            const float beta,
+                            float* c, const int c_ld);
+void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                            const int m, const int n, const int k,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            const double* b, const int b_ld,
+                            const double beta,
+                            double* c, const int c_ld);
+void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                            const int m, const int n, const int k,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* b, const int b_ld,
+                            const void* beta,
+                            void* c, const int c_ld);
+void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                            const int m, const int n, const int k,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* b, const int b_ld,
+                            const void* beta,
+                            void* c, const int c_ld);
+
+// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM (half-precision HSYMM is not declared here)
+void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                            const int m, const int n,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            const float* b, const int b_ld,
+                            const float beta,
+                            float* c, const int c_ld);
+void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                            const int m, const int n,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            const double* b, const int b_ld,
+                            const double beta,
+                            double* c, const int c_ld);
+void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* b, const int b_ld,
+                            const void* beta,
+                            void* c, const int c_ld);
+void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* b, const int b_ld,
+                            const void* beta,
+                            void* c, const int c_ld);
+
+// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
+void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* b, const int b_ld,
+                            const void* beta,
+                            void* c, const int c_ld);
+void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* b, const int b_ld,
+                            const void* beta,
+                            void* c, const int c_ld);
+
+// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK (half-precision HSYRK is not declared here)
+void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                            const int n, const int k,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            const float beta,
+                            float* c, const int c_ld);
+void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                            const int n, const int k,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            const double beta,
+                            double* c, const int c_ld);
+void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                            const int n, const int k,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* beta,
+                            void* c, const int c_ld);
+void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                            const int n, const int k,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            const void* beta,
+                            void* c, const int c_ld);
+
+// Rank-K update of a hermitian matrix: CHERK/ZHERK
+void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                            const int n, const int k,
+                            const float alpha,
+                            const void* a, const int a_ld,
+                            const float beta,
+                            void* c, const int c_ld);
+void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                            const int n, const int k,
+                            const double alpha,
+                            const void* a, const int a_ld,
+                            const double beta,
+                            void* c, const int c_ld);
+
+// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K (half-precision HSYR2K is not declared here)
+void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                             const int n, const int k,
+                             const float alpha,
+                             const float* a, const int a_ld,
+                             const float* b, const int b_ld,
+                             const float beta,
+                             float* c, const int c_ld);
+void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                             const int n, const int k,
+                             const double alpha,
+                             const double* a, const int a_ld,
+                             const double* b, const int b_ld,
+                             const double beta,
+                             double* c, const int c_ld);
+void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                             const int n, const int k,
+                             const void* alpha,
+                             const void* a, const int a_ld,
+                             const void* b, const int b_ld,
+                             const void* beta,
+                             void* c, const int c_ld);
+void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                             const int n, const int k,
+                             const void* alpha,
+                             const void* a, const int a_ld,
+                             const void* b, const int b_ld,
+                             const void* beta,
+                             void* c, const int c_ld);
+
+// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
+void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                             const int n, const int k,
+                             const void* alpha,
+                             const void* a, const int a_ld,
+                             const void* b, const int b_ld,
+                             const float beta,
+                             void* c, const int c_ld);
+void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                             const int n, const int k,
+                             const void* alpha,
+                             const void* a, const int a_ld,
+                             const void* b, const int b_ld,
+                             const double beta,
+                             void* c, const int c_ld);
+
+// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM (half-precision HTRMM is not declared here)
+void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int m, const int n,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            float* b, const int b_ld);
+void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int m, const int n,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            double* b, const int b_ld);
+void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            void* b, const int b_ld);
+void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            void* b, const int b_ld);
+
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM (half-precision HTRSM is not declared here)
+void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int m, const int n,
+                            const float alpha,
+                            const float* a, const int a_ld,
+                            float* b, const int b_ld);
+void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int m, const int n,
+                            const double alpha,
+                            const double* a, const int a_ld,
+                            double* b, const int b_ld);
+void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            void* b, const int b_ld);
+void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                            const int m, const int n,
+                            const void* alpha,
+                            const void* a, const int a_ld,
+                            void* b, const int b_ld);
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY (half-precision HOMATCOPY is not declared here)
+void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                const int m, const int n,
+                                const float alpha,
+                                const float* a, const int a_ld,
+                                float* b, const int b_ld);
+void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                const int m, const int n,
+                                const double alpha,
+                                const double* a, const int a_ld,
+                                double* b, const int b_ld);
+void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                const int m, const int n,
+                                const void* alpha,
+                                const void* a, const int a_ld,
+                                void* b, const int b_ld);
+void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                const int m, const int n,
+                                const void* alpha,
+                                const void* a, const int a_ld,
+                                void* b, const int b_ld);
+
+// =================================================================================================
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+// CLBLAST_CLBLAST_NETLIB_C_H_
+#endif
--- a/samples/sgemm_netlib.c
+++ b/samples/sgemm_netlib.c
@ -0,0 +1,69 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not
+// recommended if you want full control over performance: it will internally copy buffers from and
+// to the OpenCL device.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+// Includes the CLBlast library (Netlib CBLAS interface)
+#include <clblast_netlib_c.h>
+
+// =================================================================================================
+
+// Example use of the single-precision routine SGEMM through the Netlib CBLAS interface. Returns 0
+// once the call has completed, or 1 when the host matrices could not be allocated.
+int main(void) {
+
+  // Example SGEMM arguments. The host buffers below are sized as m*k (A), n*k (B) and m*n (C)
+  // elements; with the row-major layout used in the call the leading dimensions are set to the
+  // number of columns of each matrix.
+  const int m = 128;
+  const int n = 64;
+  const int k = 512;
+  const float alpha = 0.7f;
+  const float beta = 1.0f;
+  const int a_ld = k;
+  const int b_ld = n;
+  const int c_ld = n;
+
+  // Allocates the host matrices
+  float* host_a = (float*)malloc(sizeof(float)*m*k);
+  float* host_b = (float*)malloc(sizeof(float)*n*k);
+  float* host_c = (float*)malloc(sizeof(float)*m*n);
+
+  // Stops here if any allocation failed, before the loops below write through a NULL pointer.
+  // Calling free() on a NULL pointer is a no-op, so all three buffers can be released safely.
+  if (host_a == NULL || host_b == NULL || host_c == NULL) {
+    fprintf(stderr, "Failed to allocate the host matrices\n");
+    free(host_a);
+    free(host_b);
+    free(host_c);
+    return 1;
+  }
+
+  // Populate host matrices with some example data
+  for (int i=0; i<m*k; ++i) { host_a[i] = 12.193f; }
+  for (int i=0; i<n*k; ++i) { host_b[i] = -8.199f; }
+  for (int i=0; i<m*n; ++i) { host_c[i] = 0.0f; }
+
+  // Call the SGEMM routine. Only host pointers are passed: no OpenCL objects are needed here.
+  cblas_sgemm(CLBlastLayoutRowMajor,
+              CLBlastTransposeNo, CLBlastTransposeNo,
+              m, n, k,
+              alpha,
+              host_a, a_ld,
+              host_b, b_ld,
+              beta,
+              host_c, c_ld);
+
+  // Example completed
+  printf("Completed SGEMM\n");
+
+  // Clean-up
+  free(host_a);
+  free(host_b);
+  free(host_c);
+  return 0;
+}
+
+// =================================================================================================
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@ -12,6 +12,8 @@
 #    clblast.cpp
 #    clblast_c.h
 #    clblast_c.cpp
+#    clblast_netlib_c.h
+#    clblast_netlib_c.cpp
 #    wrapper_clblas.h
 #    wrapper_cblas.h
 # It also generates the main functions for the correctness and performance tests as found in
@ -36,9 +38,11 @@ FILES = [
    "/src/clblast_c.cpp",
    "/test/wrapper_clblas.hpp",
    "/test/wrapper_cblas.hpp",
+    "/include/clblast_netlib_c.h",
+    "/src/clblast_netlib_c.cpp",
 ]
-HEADER_LINES = [117, 73, 118, 22, 29, 41]
-FOOTER_LINES = [17, 80, 19, 18, 6, 6]
+HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32]
+FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]

 # Different possibilities for requirements
 ald_m = "The value of `a_ld` must be at least `m`."
@ -55,70 +59,105 @@ bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at leas
 cld_m = "The value of `c_ld` must be at least `m`."
 cld_n = "The value of `c_ld` must be at least `n`."

+
+# Helper functions to compute vector and matrix sizes
+def size_helper(condition, size_one, size_two, multiplier):
+    """Builds the text of a C ternary expression: when 'condition' holds it yields 'size_one' times
+    'multiplier', otherwise 'size_two' times 'multiplier'"""
+    when_true = size_one + " * " + multiplier
+    when_false = size_two + " * " + multiplier
+    return "(" + condition + ") ? " + when_true + " : " + when_false
+
+
+def layout_transpose_condition(prefix):
+    """Returns C condition text that holds for a column-major layout with '<prefix>_transpose' not
+    equal to CLBlastTransposeNo, or for a row-major layout with it equal to CLBlastTransposeNo"""
+    flag = prefix + "_transpose"
+    col_major_case = "(layout == CLBlastLayoutColMajor && " + flag + " != CLBlastTransposeNo)"
+    row_major_case = "(layout == CLBlastLayoutRowMajor && " + flag + " == CLBlastTransposeNo)"
+    return col_major_case + " || " + row_major_case
+
+
+# Different possibilities for the vector and matrix sizes. Each value is the text of a C expression:
+# vectors use a dimension times their increment, matrices a dimension times their leading
+# dimension, and 'apn' uses n*(n+1)/2.
+
+# Condition and dimension fragments shared by several of the expressions below
+_row_major = "layout == CLBlastLayoutRowMajor"
+_a_transposed = "a_transpose != CLBlastTransposeNo"
+_side_dim = "((side == CLBlastSideLeft) ? m : n)"
+_cond_a = layout_transpose_condition("a")
+_cond_b = layout_transpose_condition("b")
+_cond_ab = layout_transpose_condition("ab")
+
+# Sizes that do not depend on any option
+xn, xm = "n * x_inc", "m * x_inc"
+yn, ym = "n * y_inc", "m * y_inc"
+an, cn = "n * a_ld", "n * c_ld"
+apn = "((n*(n+1)) / 2)"
+
+# Sizes selected by the transpose, layout and/or side arguments
+xmn = size_helper(_a_transposed, "m", "n", "x_inc")
+ynm = size_helper(_a_transposed, "n", "m", "y_inc")
+amn = size_helper(_row_major, "m", "n", "a_ld")
+amns = size_helper("side == CLBlastSideLeft", "m", "n", "a_ld")
+amk = size_helper(_cond_a, "m", "k", "a_ld")
+ank = size_helper(_cond_a, "n", "k", "a_ld")
+ankab = size_helper(_cond_ab, "n", "k", "a_ld")
+bkn = size_helper(_cond_b, "k", "n", "b_ld")
+bnkab = size_helper(_cond_ab, "n", "k", "b_ld")
+bmn = size_helper(_row_major, "m", "n", "b_ld")
+bnma = size_helper(_cond_a, "n", "m", "b_ld")
+cmn = size_helper(_row_major, "m", "n", "c_ld")
+ammn = size_helper(_row_major, "m", _side_dim, "a_ld")
+bmnn = size_helper(_row_major, _side_dim, "n", "b_ld")
+
 # ==================================================================================================

 # Populates a list of routines
 ROUTINES = [
 [  # Level 1: vector-vector
-  Routine(False, True,  "1", "rotg",  T, [S,D],            [],                  [],                                                     [],         ["sa","sb","sc","ss"],        [],               "",    "Generate givens plane rotation", "", []),
-  Routine(False, True,  "1", "rotmg", T, [S,D],            [],                  [],                                                     ["sy1"],    ["sd1","sd2","sx1","sparam"], [],               "",    "Generate modified givens plane rotation", "", []),
-  Routine(False, True,  "1", "rot",   T, [S,D],            ["n"],               [],                                                     [],         ["x","y"],                    ["cos","sin"],    "",    "Apply givens plane rotation", "", []),
-  Routine(False, True,  "1", "rotm",  T, [S,D],            ["n"],               [],                                                     [],         ["x","y","sparam"],           [],               "",    "Apply modified givens plane rotation", "", []),
-  Routine(True,  True,  "1", "swap",  T, [S,D,C,Z,H],      ["n"],               [],                                                     [],         ["x","y"],                    [],               "",    "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
-  Routine(True,  True,  "1", "scal",  T, [S,D,C,Z,H],      ["n"],               [],                                                     [],         ["x"],                        ["alpha"],        "",    "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
-  Routine(True,  True,  "1", "copy",  T, [S,D,C,Z,H],      ["n"],               [],                                                     ["x"],      ["y"],                        [],               "",    "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
-  Routine(True,  True,  "1", "axpy",  T, [S,D,C,Z,H],      ["n"],               [],                                                     ["x"],      ["y"],                        ["alpha"],        "",    "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
-  Routine(True,  True,  "1", "dot",   T, [S,D,H],          ["n"],               [],                                                     ["x","y"],  ["dot"],                      [],               "n",   "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
-  Routine(True,  True,  "1", "dotu",  T, [C,Z],            ["n"],               [],                                                     ["x","y"],  ["dot"],                      [],               "n",   "Dot product of two complex vectors", "See the regular xDOT routine.", []),
-  Routine(True,  True,  "1", "dotc",  T, [C,Z],            ["n"],               [],                                                     ["x","y"],  ["dot"],                      [],               "n",   "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
-  Routine(True,  True,  "1", "nrm2",  T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["nrm2"],                     [],               "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
-  Routine(True,  True,  "1", "asum",  T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["asum"],                     [],               "n",   "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
-  Routine(True,  False, "1", "sum",   T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["sum"],                      [],               "n",   "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
-  Routine(True,  True,  "1", "amax",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [],               "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
-  Routine(True,  False, "1", "max",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [],               "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
-  Routine(True,  False, "1", "min",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [],               "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
+  Routine(False, True,  "1", "rotg",  T, [S,D],            [],                  [],                                                     [],         ["sa","sb","sc","ss"],        ["1","1","1","1"], [],       "",    "Generate givens plane rotation", "", []),
+  Routine(False, True,  "1", "rotmg", T, [S,D],            [],                  [],                                                     ["sy1"],    ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [],   "",    "Generate modified givens plane rotation", "", []),
+  Routine(False, True,  "1", "rot",   T, [S,D],            ["n"],               [],                                                     [],         ["x","y"],                    [xn,yn],       ["cos","sin"],"",    "Apply givens plane rotation", "", []),
+  Routine(False, True,  "1", "rotm",  T, [S,D],            ["n"],               [],                                                     [],         ["x","y","sparam"],           [xn,yn,"1"],   [],           "",    "Apply modified givens plane rotation", "", []),
+  Routine(True,  True,  "1", "swap",  T, [S,D,C,Z,H],      ["n"],               [],                                                     [],         ["x","y"],                    [xn,yn],       [],           "",    "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
+  Routine(True,  True,  "1", "scal",  T, [S,D,C,Z,H],      ["n"],               [],                                                     [],         ["x"],                        [xn],          ["alpha"],    "",    "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
+  Routine(True,  True,  "1", "copy",  T, [S,D,C,Z,H],      ["n"],               [],                                                     ["x"],      ["y"],                        [xn,yn],       [],           "",    "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
+  Routine(True,  True,  "1", "axpy",  T, [S,D,C,Z,H],      ["n"],               [],                                                     ["x"],      ["y"],                        [xn,yn],       ["alpha"],    "",    "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
+  Routine(True,  True,  "1", "dot",   T, [S,D,H],          ["n"],               [],                                                     ["x","y"],  ["dot"],                      [xn,yn,"1"],   [],           "n",   "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
+  Routine(True,  True,  "1", "dotu",  T, [C,Z],            ["n"],               [],                                                     ["x","y"],  ["dot"],                      [xn,yn,"1"],   [],           "n",   "Dot product of two complex vectors", "See the regular xDOT routine.", []),
+  Routine(True,  True,  "1", "dotc",  T, [C,Z],            ["n"],               [],                                                     ["x","y"],  ["dot"],                      [xn,yn,"1"],   [],           "n",   "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
+  Routine(True,  True,  "1", "nrm2",  T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["nrm2"],                     [xn,"1"],      [],           "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
+  Routine(True,  True,  "1", "asum",  T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["asum"],                     [xn,"1"],      [],           "n",   "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
+  Routine(True,  False, "1", "sum",   T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["sum"],                      [xn,"1"],      [],           "n",   "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
+  Routine(True,  True,  "1", "amax",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
+  Routine(True,  False, "1", "max",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
+  Routine(True,  False, "1", "min",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
 ],
 [  # Level 2: matrix-vector
-  Routine(True,  True,  "2a", "gemv",  T,  [S,D,C,Z,H],    ["m","n"],           ["layout","a_transpose"],                               ["a","x"],  ["y"],                        ["alpha","beta"], "",    "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
-  Routine(True,  True,  "2a", "gbmv",  T,  [S,D,C,Z,H],    ["m","n","kl","ku"], ["layout","a_transpose"],                               ["a","x"],  ["y"],                        ["alpha","beta"], "",    "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
-  Routine(True,  True,  "2a", "hemv",  T,  [C,Z],          ["n"],               ["layout","triangle"],                                  ["a","x"],  ["y"],                        ["alpha","beta"], "",    "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
-  Routine(True,  True,  "2a", "hbmv",  T,  [C,Z],          ["n","k"],           ["layout","triangle"],                                  ["a","x"],  ["y"],                        ["alpha","beta"], "",    "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
-  Routine(True,  True,  "2a", "hpmv",  T,  [C,Z],          ["n"],               ["layout","triangle"],                                  ["ap","x"], ["y"],                        ["alpha","beta"], "",    "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
-  Routine(True,  True,  "2a", "symv",  T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["a","x"],  ["y"],                        ["alpha","beta"], "",    "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
-  Routine(True,  True,  "2a", "sbmv",  T,  [S,D,H],        ["n","k"],           ["layout","triangle"],                                  ["a","x"],  ["y"],                        ["alpha","beta"], "",    "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
-  Routine(True,  True,  "2a", "spmv",  T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["ap","x"], ["y"],                        ["alpha","beta"], "",    "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
-  Routine(True,  True,  "2a", "trmv",  T,  [S,D,C,Z,H],    ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [],               "n",   "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
-  Routine(True,  True,  "2a", "tbmv",  T,  [S,D,C,Z,H],    ["n","k"],           ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [],               "n",   "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
-  Routine(True,  True,  "2a", "tpmv",  T,  [S,D,C,Z,H],    ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["ap"],     ["x"],                        [],               "n",   "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
-  Routine(False, True,  "2a", "trsv",  T,  [S,D,C,Z],      ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [],               "",    "Solves a triangular system of equations", "", []),
-  Routine(False, True,  "2a", "tbsv",  T,  [S,D,C,Z],      ["n","k"],           ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [],               "",    "Solves a banded triangular system of equations", "", [ald_k_one]),
-  Routine(False, True,  "2a", "tpsv",  T,  [S,D,C,Z],      ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["ap"],     ["x"],                        [],               "",    "Solves a packed triangular system of equations", "", []),
+  Routine(True,  True,  "2a", "gemv",  T,  [S,D,C,Z,H],    ["m","n"],           ["layout","a_transpose"],                               ["a","x"],  ["y"],                        [amn,xmn,ynm], ["alpha","beta"], "",    "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
+  Routine(True,  True,  "2a", "gbmv",  T,  [S,D,C,Z,H],    ["m","n","kl","ku"], ["layout","a_transpose"],                               ["a","x"],  ["y"],                        [amn,xmn,ynm], ["alpha","beta"], "",    "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
+  Routine(True,  True,  "2a", "hemv",  T,  [C,Z],          ["n"],               ["layout","triangle"],                                  ["a","x"],  ["y"],                        [an,xn,yn],    ["alpha","beta"], "",    "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
+  Routine(True,  True,  "2a", "hbmv",  T,  [C,Z],          ["n","k"],           ["layout","triangle"],                                  ["a","x"],  ["y"],                        [an,xn,yn],    ["alpha","beta"], "",    "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
+  Routine(True,  True,  "2a", "hpmv",  T,  [C,Z],          ["n"],               ["layout","triangle"],                                  ["ap","x"], ["y"],                        [apn,xn,yn],   ["alpha","beta"], "",    "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2a", "symv",  T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["a","x"],  ["y"],                        [an,xn,yn],    ["alpha","beta"], "",    "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
+  Routine(True,  True,  "2a", "sbmv",  T,  [S,D,H],        ["n","k"],           ["layout","triangle"],                                  ["a","x"],  ["y"],                        [an,xn,yn],    ["alpha","beta"], "",    "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
+  Routine(True,  True,  "2a", "spmv",  T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["ap","x"], ["y"],                        [apn,xn,yn],   ["alpha","beta"], "",    "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2a", "trmv",  T,  [S,D,C,Z,H],    ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "n",   "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
+  Routine(True,  True,  "2a", "tbmv",  T,  [S,D,C,Z,H],    ["n","k"],           ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "n",   "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
+  Routine(True,  True,  "2a", "tpmv",  T,  [S,D,C,Z,H],    ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["ap"],     ["x"],                        [apn,xn],      [],               "n",   "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
+  Routine(False, True,  "2a", "trsv",  T,  [S,D,C,Z],      ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "",    "Solves a triangular system of equations", "", []),
+  Routine(False, True,  "2a", "tbsv",  T,  [S,D,C,Z],      ["n","k"],           ["layout","triangle","a_transpose","diagonal"],         ["a"],      ["x"],                        [an,xn],       [],               "",    "Solves a banded triangular system of equations", "", [ald_k_one]),
+  Routine(False, True,  "2a", "tpsv",  T,  [S,D,C,Z],      ["n"],               ["layout","triangle","a_transpose","diagonal"],         ["ap"],     ["x"],                        [apn,xn],      [],               "",    "Solves a packed triangular system of equations", "", []),
  # Level 2: matrix update
-  Routine(True,  True,  "2b", "ger",   T,  [S,D,H],        ["m","n"],           ["layout"],                                             ["x","y"],  ["a"],                        ["alpha"],        "",    "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
-  Routine(True,  True,  "2b", "geru",  T,  [C,Z],          ["m","n"],           ["layout"],                                             ["x","y"],  ["a"],                        ["alpha"],        "",    "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
-  Routine(True,  True,  "2b", "gerc",  T,  [C,Z],          ["m","n"],           ["layout"],                                             ["x","y"],  ["a"],                        ["alpha"],        "",    "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
-  Routine(True,  True,  "2b", "her",   Tc, [Css,Zdd],      ["n"],               ["layout","triangle"],                                  ["x"],      ["a"],                        ["alpha"],        "",    "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
-  Routine(True,  True,  "2b", "hpr",   Tc, [Css,Zdd],      ["n"],               ["layout","triangle"],                                  ["x"],      ["ap"],                       ["alpha"],        "",    "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
-  Routine(True,  True,  "2b", "her2",  T,  [C,Z],          ["n"],               ["layout","triangle"],                                  ["x","y"],  ["a"],                        ["alpha"],        "",    "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
-  Routine(True,  True,  "2b", "hpr2",  T,  [C,Z],          ["n"],               ["layout","triangle"],                                  ["x","y"],  ["ap"],                       ["alpha"],        "",    "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
-  Routine(True,  True,  "2b", "syr",   T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["x"],      ["a"],                        ["alpha"],        "",    "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
-  Routine(True,  True,  "2b", "spr",   T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["x"],      ["ap"],                       ["alpha"],        "",    "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
-  Routine(True,  True,  "2b", "syr2",  T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["x","y"],  ["a"],                        ["alpha"],        "",    "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
-  Routine(True,  True,  "2b", "spr2",  T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["x","y"],  ["ap"],                       ["alpha"],        "",    "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2b", "ger",   T,  [S,D,H],        ["m","n"],           ["layout"],                                             ["x","y"],  ["a"],                        [xm,yn,amn],   ["alpha"],        "",    "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
+  Routine(True,  True,  "2b", "geru",  T,  [C,Z],          ["m","n"],           ["layout"],                                             ["x","y"],  ["a"],                        [xm,yn,amn],   ["alpha"],        "",    "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
+  Routine(True,  True,  "2b", "gerc",  T,  [C,Z],          ["m","n"],           ["layout"],                                             ["x","y"],  ["a"],                        [xm,yn,amn],   ["alpha"],        "",    "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
+  Routine(True,  True,  "2b", "her",   Tc, [Css,Zdd],      ["n"],               ["layout","triangle"],                                  ["x"],      ["a"],                        [xn,an],       ["alpha"],        "",    "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
+  Routine(True,  True,  "2b", "hpr",   Tc, [Css,Zdd],      ["n"],               ["layout","triangle"],                                  ["x"],      ["ap"],                       [xn,apn],      ["alpha"],        "",    "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2b", "her2",  T,  [C,Z],          ["n"],               ["layout","triangle"],                                  ["x","y"],  ["a"],                        [xn,yn,an],    ["alpha"],        "",    "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
+  Routine(True,  True,  "2b", "hpr2",  T,  [C,Z],          ["n"],               ["layout","triangle"],                                  ["x","y"],  ["ap"],                       [xn,yn,apn],   ["alpha"],        "",    "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2b", "syr",   T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["x"],      ["a"],                        [xn,an],       ["alpha"],        "",    "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
+  Routine(True,  True,  "2b", "spr",   T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["x"],      ["ap"],                       [xn,apn],      ["alpha"],        "",    "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2b", "syr2",  T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["x","y"],  ["a"],                        [xn,yn,an],    ["alpha"],        "",    "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
+  Routine(True,  True,  "2b", "spr2",  T,  [S,D,H],        ["n"],               ["layout","triangle"],                                  ["x","y"],  ["ap"],                       [xn,yn,apn],   ["alpha"],        "",    "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
 ],
 [  # Level 3: matrix-matrix
-  Routine(True,  True,  "3", "gemm",  T,  [S,D,C,Z,H],     ["m","n","k"],        ["layout","a_transpose","b_transpose"],                ["a","b"],  ["c"],                        ["alpha","beta"], "",    "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
-  Routine(True,  True,  "3", "symm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle"],                          ["a","b"],  ["c"],                        ["alpha","beta"], "",    "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
-  Routine(True,  True,  "3", "hemm",  T,  [C,Z],           ["m","n"],            ["layout","side","triangle"],                          ["a","b"],  ["c"],                        ["alpha","beta"], "",    "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
-  Routine(True,  True,  "3", "syrk",  T,  [S,D,C,Z,H],     ["n","k"],            ["layout","triangle","a_transpose"],                   ["a"],      ["c"],                        ["alpha","beta"], "",    "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
-  Routine(True,  True,  "3", "herk",  Tc, [Css,Zdd],       ["n","k"],            ["layout","triangle","a_transpose"],                   ["a"],      ["c"],                        ["alpha","beta"], "",    "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
-  Routine(True,  True,  "3", "syr2k", T,  [S,D,C,Z,H],     ["n","k"],            ["layout","triangle","ab_transpose"],                  ["a","b"],  ["c"],                        ["alpha","beta"], "",    "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
-  Routine(True,  True,  "3", "her2k", TU, [Ccs,Zzd],       ["n","k"],            ["layout","triangle","ab_transpose"],                  ["a","b"],  ["c"],                        ["alpha","beta"], "",    "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
-  Routine(True,  True,  "3", "trmm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        ["alpha"],        "",    "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
-  Routine(False, True,  "3", "trsm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        ["alpha"],        "",    "Solves a triangular system of equations", "", []),
+  Routine(True,  True,  "3", "gemm",  T,  [S,D,C,Z,H],     ["m","n","k"],        ["layout","a_transpose","b_transpose"],                ["a","b"],  ["c"],                        [amk,bkn,cmn],   ["alpha","beta"], "",    "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
+  Routine(True,  True,  "3", "symm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle"],                          ["a","b"],  ["c"],                        [ammn,bmnn,cmn], ["alpha","beta"], "",    "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
+  Routine(True,  True,  "3", "hemm",  T,  [C,Z],           ["m","n"],            ["layout","side","triangle"],                          ["a","b"],  ["c"],                        [ammn,bmnn,cmn], ["alpha","beta"], "",    "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
+  Routine(True,  True,  "3", "syrk",  T,  [S,D,C,Z,H],     ["n","k"],            ["layout","triangle","a_transpose"],                   ["a"],      ["c"],                        [ank,cn],        ["alpha","beta"], "",    "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
+  Routine(True,  True,  "3", "herk",  Tc, [Css,Zdd],       ["n","k"],            ["layout","triangle","a_transpose"],                   ["a"],      ["c"],                        [ank,cn],        ["alpha","beta"], "",    "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
+  Routine(True,  True,  "3", "syr2k", T,  [S,D,C,Z,H],     ["n","k"],            ["layout","triangle","ab_transpose"],                  ["a","b"],  ["c"],                        [ankab,bnkab,cn],["alpha","beta"], "",    "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+  Routine(True,  True,  "3", "her2k", TU, [Ccs,Zzd],       ["n","k"],            ["layout","triangle","ab_transpose"],                  ["a","b"],  ["c"],                        [ankab,bnkab,cn],["alpha","beta"], "",    "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+  Routine(True,  True,  "3", "trmm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        [amns,bmn],      ["alpha"],        "",    "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
+  Routine(False, True,  "3", "trsm",  T,  [S,D,C,Z,H],     ["m","n"],            ["layout","side","triangle","a_transpose","diagonal"], ["a"],      ["b"],                        [amns,bmn],      ["alpha"],        "",    "Solves a triangular system of equations", "", []),
 ],
 [  # Level X: extra routines (not part of BLAS)
-  Routine(True,  True,  "x", "omatcopy", T, [S,D,C,Z,H],   ["m","n"],            ["layout","a_transpose"],                              ["a"],      ["b"],                        ["alpha"],        "",    "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
+  Routine(True,  True,  "x", "omatcopy", T, [S,D,C,Z,H],   ["m","n"],            ["layout","a_transpose"],                              ["a"],      ["b"],                        [amn,bnma],      ["alpha"],        "",    "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
 ]]


@ -165,6 +204,10 @@ def main(argv):
                        body += cpp.wrapper_clblas(routine)
                    if i == 5:
                        body += cpp.wrapper_cblas(routine)
+                    if i == 6:
+                        body += cpp.clblast_netlib_c_h(routine)
+                    if i == 7:
+                        body += cpp.clblast_netlib_c_cc(routine)
            f.write("".join(file_header))
            f.write(body)
            f.write("".join(file_footer))
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@ -95,6 +95,79 @@ def clblast_c_cc(routine):
    return result


+def clblast_netlib_c_h(routine):
+    """Generates the declarations of one routine for the Netlib CBLAS API header (.h)"""
+    header = NL + "// " + routine.description + ": " + routine.short_names() + NL
+
+    # Only flavours with an S, D, C or Z precision get a declaration
+    declarations = [routine.routine_header_netlib(flavour, 20, " PUBLIC_API") + ";" + NL
+                    for flavour in routine.flavours
+                    if flavour.precision_name in ["S", "D", "C", "Z"]]
+    return header + "".join(declarations)
+
+
+def clblast_netlib_c_cc(routine):
+    """The Netlib CBLAS API implementation (.cpp). Generates one C++ function per S/D/C/Z flavour of the
+    routine: the function creates a device, context and queue, creates and fills a buffer per input/output,
+    calls the CLBlast routine, throws on a non-success status, and reads the outputs back."""
+    result = NL + "// " + routine.name.upper() + NL
+    for flavour in routine.flavours:
+
+        # There is a version available in CBLAS
+        if flavour.precision_name in ["S", "D", "C", "Z"]:
+            template = "<" + flavour.template + ">" if routine.no_scalars() else ""
+            # Aligns the continuation lines of the generated CLBlast call
+            indent = " " * (21 + routine.length() + len(template))
+            result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL
+
+            # Initialize OpenCL
+            result += "  auto device = get_device();" + NL
+            result += "  auto context = clblast::Context(device);" + NL
+            result += "  auto queue = clblast::Queue(context, device);" + NL
+
+            # Set alpha and beta
+            result += "".join("  " + s + NL for s in routine.scalar_create_cpp(flavour))
+
+            # Copy data structures to the device
+            for i, name in enumerate(routine.inputs + routine.outputs):
+                result += "  " + routine.set_size(name, routine.buffer_sizes[i]) + NL
+            for name in routine.inputs + routine.outputs:
+                buffer_type = routine.get_buffer_type(name, flavour)
+                result += "  " + routine.create_buffer(name, buffer_type) + NL
+                # Arguments passed by value are wrapped in a one-element array so that they can be
+                # written to their buffer through a pointer (see the '_vec' postfix in write_buffer)
+                if name in routine.scalar_buffers_second_non_pointer():
+                    result += "  " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL
+            # The first scalar buffers are not written to the device
+            for name in routine.inputs + routine.outputs:
+                if name not in routine.scalar_buffers_first():
+                    prefix = "" if name in routine.outputs else "const "
+                    buffer_type = routine.get_buffer_type(name, flavour)
+                    result += "  " + routine.write_buffer(name, prefix + buffer_type) + NL
+
+            # The function call
+            result += "  auto queue_cl = queue();" + NL
+            result += "  auto s = clblast::" + routine.name.capitalize() + template + "("
+            result += ("," + NL + indent).join(routine.arguments_netlib(flavour, indent))
+            result += "," + NL + indent + "&queue_cl);" + NL
+
+            # Error handling
+            result += "  if (s != clblast::StatusCode::kSuccess) {" + NL
+            result += "    throw std::runtime_error(\"CLBlast returned with error code \" + clblast::ToString(s));" + NL
+            result += "  }" + NL
+
+            # Copy back and clean-up
+            # Results that are returned by value first need a local array to read the buffer into
+            for name in routine.outputs:
+                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
+                    buffer_type = routine.get_buffer_type(name, flavour)
+                    result += "  " + buffer_type + " " + name + "[" + name + "_size];" + NL
+            for name in routine.outputs:
+                buffer_type = routine.get_buffer_type(name, flavour)
+                result += "  " + routine.read_buffer(name, buffer_type) + NL
+            for name in routine.outputs:
+                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
+                    result += "  return " + name + "[0]"
+                    # For float2/double2 flavours only the real part of a non-index result is returned
+                    if flavour.buffer_type in ["float2", "double2"]:
+                        if name not in routine.index_buffers():
+                            result += ".real()"
+                    result += ";" + NL
+            result += "}" + NL
+    return result
+
+
 def wrapper_clblas(routine):
    """The wrapper to the reference clBLAS routines (for performance/correctness testing)"""
    result = ""
--- a/scripts/generator/generator/datatype.py
+++ b/scripts/generator/generator/datatype.py
@ -54,6 +54,22 @@ class DataType:
            return self.beta_cl + "{{beta.real(), beta.imag()}}"
        return "beta"

+    def use_alpha_clblast(self):
+        """Returns the C++ expression that turns the Netlib CBLAS 'alpha' argument into its CLBlast form"""
+        # For the float2/double2 types the value is rebuilt from two consecutive elements read via 'alpha'
+        for pair_type, element_type in [(D_FLOAT2, "float"), (D_DOUBLE2, "double")]:
+            if self.alpha_cpp == pair_type:
+                pointer = "reinterpret_cast<const " + element_type + "*>(alpha)"
+                return self.alpha_cpp + "{" + pointer + "[0], " + pointer + "[1]}"
+        return "alpha"
+
+    def use_beta_clblast(self):
+        """Returns the C++ expression that turns the Netlib CBLAS 'beta' argument into its CLBlast form"""
+        # For the float2/double2 types the value is rebuilt from two consecutive elements read via 'beta'
+        for pair_type, element_type in [(D_FLOAT2, "float"), (D_DOUBLE2, "double")]:
+            if self.beta_cpp == pair_type:
+                pointer = "reinterpret_cast<const " + element_type + "*>(beta)"
+                return self.beta_cpp + "{" + pointer + "[0], " + pointer + "[1]}"
+        return "beta"
+
    def test_template(self):
        """Returns the template as used in the correctness/performance tests"""
        if self.buffer_type != self.beta_cpp:
@ -65,6 +81,10 @@ class DataType:
        return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or
                (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2]))

+    def is_non_standard(self):
+        """Tells whether the buffer type is one of the half, float2 or double2 types"""
+        non_standard_types = (D_HALF, D_FLOAT2, D_DOUBLE2)
+        return self.buffer_type in non_standard_types
+

 # Regular data-types
 H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF)  # half (16)
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@ -13,7 +13,8 @@ import generator.convert as convert
 class Routine:
    """Class holding routine-specific information (e.g. name, which arguments, which precisions)"""
    def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
-                 inputs, outputs, scalars, scratch, description, details, requirements):
+                 inputs, outputs, buffer_sizes, scalars, scratch,
+                 description, details, requirements):
        self.implemented = implemented
        self.has_tests = has_tests
        self.level = level
@ -24,6 +25,7 @@ class Routine:
        self.options = options
        self.inputs = inputs
        self.outputs = outputs
+        self.buffer_sizes = buffer_sizes
        self.scalars = scalars
        self.scratch = scratch  # Scratch buffer (e.g. for xDOT)
        self.description = description
@ -40,6 +42,11 @@ class Routine:
        """List of scalar buffers"""
        return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"]

+    @staticmethod
+    def scalar_buffers_second_non_pointer():
+        """List of scalar buffers that are passed as plain scalars rather than as pointers"""
+        passed_by_value = ["sy1"]
+        return passed_by_value
+
    @staticmethod
    def other_scalars():
        """List of scalars other than alpha and beta"""
@ -65,6 +72,34 @@ class Routine:
        """Distinguish between vectors and matrices"""
        return ["a", "b", "c", "ap"]

+    @staticmethod
+    def routines_scalar_no_return():
+        """List of routines that, in the Netlib CBLAS API, get a '_sub' name postfix and receive their scalar
+        result buffer as an extra argument instead of returning the value"""
+        routine_names = ["dotu", "dotc"]
+        return routine_names
+
+    @staticmethod
+    def set_size(name, size):
+        """Generates the C++ statement defining '<name>_size' as the given size expression"""
+        return "".join(["const auto ", name, "_size = ", size, ";"])
+
+    @staticmethod
+    def create_buffer(name, template):
+        """Generates the C++ statement constructing '<name>_buffer' as a new CLCudaAPI buffer of size '<name>_size'"""
+        buffer_class = "clblast::Buffer<" + template + ">"
+        return "auto " + name + "_buffer = " + buffer_class + "(context, " + name + "_size);"
+
+    def write_buffer(self, name, template):
+        """Generates the C++ statement writing the host data of 'name' to its CLCudaAPI buffer"""
+        # Arguments passed as scalars are written from their '_vec' variable instead
+        source = name + "_vec" if name in self.scalar_buffers_second_non_pointer() else name
+        host_pointer = "reinterpret_cast<" + template + "*>(" + source + ")"
+        return "".join([name, "_buffer.Write(queue, ", name, "_size, ", host_pointer, ");"])
+
+    @staticmethod
+    def read_buffer(name, template):
+        """Generates the C++ statement reading the CLCudaAPI buffer of 'name' back into its host data"""
+        host_pointer = "reinterpret_cast<" + template + "*>(" + name + ")"
+        return "".join([name, "_buffer.Read(queue, ", name, "_size, ", host_pointer, ");"])
+
    def non_index_inputs(self):
        """Lists of input/output buffers not index (integer)"""
        buffers = self.inputs[:]  # make a copy
@ -85,6 +120,11 @@ class Routine:
        """List of buffers without 'inc' or 'ld'"""
        return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"]

+    def get_buffer_type(self, name, flavour):
+        """Retrieves the element type of a buffer: 'int' for index buffers, the flavour's buffer type otherwise"""
+        return "int" if name in self.index_buffers() else flavour.buffer_type
+
    def length(self):
        """Retrieves the number of characters in the routine's name"""
        return len(self.name)
@ -133,6 +173,15 @@ class Routine:
            return [", ".join(a + b + c)]
        return []

+    def buffer_zero_offset(self, name):
+        """Retrieves a buffer as a call argument ('<name>_buffer()'), using a fixed offset of zero"""
+        if name not in self.inputs and name not in self.outputs:
+            return []
+        parts = [name + "_buffer()", "0"]
+        # Only buffers with an 'inc' or 'ld' get that extra argument
+        if name not in self.buffers_without_ld_inc():
+            parts.append(name + "_" + self.postfix(name))
+        return [", ".join(parts)]
+
    def buffer_def(self, name):
        """As above but with data-types"""
        prefix = "const " if name in self.inputs else ""
@ -163,6 +212,17 @@ class Routine:
            return [", ".join(a + b + c)]
        return []

+    def buffer_def_pointer(self, name, flavour):
+        """Retrieves the definition of a buffer as a plain C pointer (or as a plain value for the buffers
+        listed in scalar_buffers_second_non_pointer), followed by its 'inc' or 'ld' argument if it has one"""
+        if name not in self.inputs and name not in self.outputs:
+            return []
+        constness = "const " if name in self.inputs else ""
+        # Non-standard types are exposed as 'void'
+        data_type = "void" if flavour.is_non_standard() else flavour.buffer_type
+        star = "" if name in self.scalar_buffers_second_non_pointer() else "*"
+        parts = [constness + data_type + star + " " + name]
+        if name not in self.buffers_without_ld_inc():
+            parts.append("const int " + name + "_" + self.postfix(name))
+        return [", ".join(parts)]
+
    def buffer_clcudaapi(self, name):
        """As above but with CLCudaAPI buffers"""
        if name in self.inputs or name in self.outputs:
@ -238,6 +298,12 @@ class Routine:
            return [name]
        return []

+    def scalar_cpp(self, name):
+        """Retrieves the name of a scalar with '_cpp' appended, or nothing if the routine lacks that scalar"""
+        return [name + "_cpp"] if name in self.scalars else []
+
    def scalar_half_to_float(self, name):
        """As above, but converts from float to half"""
        if name in self.scalars:
@ -288,6 +354,16 @@ class Routine:
            return ["const " + flavour.beta_cpp + " " + name]
        return []

+    def scalar_def_void(self, name, flavour):
+        """Retrieves the definition of a scalar (alpha/beta), declared as a void pointer when it is complex"""
+        if name not in self.scalars:
+            return []
+        # Any scalar other than 'alpha' is handled as 'beta'
+        key = "alpha" if name == "alpha" else "beta"
+        if flavour.is_complex(key):
+            data_type = "void*"
+        elif key == "alpha":
+            data_type = flavour.alpha_cpp
+        else:
+            data_type = flavour.beta_cpp
+        return ["const " + data_type + " " + name]
+
    def scalar_type(self, name, flavour):
        """Retrieves the type of a scalar (alpha/beta)"""
        if name in self.scalars:
@ -304,6 +380,16 @@ class Routine:
            return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."]
        return []

+    def scalar_create_cpp(self, flavour):
+        """Generates the C++ statements defining 'alpha_cpp' and/or 'beta_cpp' from the CBLAS arguments"""
+        conversions = {"alpha": flavour.use_alpha_clblast, "beta": flavour.use_beta_clblast}
+        # Scalars other than alpha and beta produce no statement
+        return ["const auto " + name + "_cpp = " + conversions[name]() + ";"
+                for name in self.scalars if name in conversions]
+
    def sizes_list(self):
        """Retrieves a list of comma-separated sizes (m, n, k)"""
        if self.sizes:
@ -316,6 +402,12 @@ class Routine:
            return [", ".join(["const size_t " + s for s in self.sizes])]
        return []

+    def sizes_def_netlib(self):
+        """Retrieves the definition of the sizes (m,n,k) as 'const int' arguments for the CBLAS API"""
+        if not self.sizes:
+            return []
+        definitions = ["const int " + size for size in self.sizes]
+        return [", ".join(definitions)]
+
    def sizes_type(self):
        """Retrieves the types of the sizes (m,n,k)"""
        if self.sizes:
@ -428,6 +520,17 @@ class Routine:
                list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
                list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])))

+    def arguments_netlib(self, flavour, indent):
+        """Retrieves the arguments for the CLBlast call made from the Netlib CBLAS API: buffers are passed
+        with a zero offset and alpha/beta through their '_cpp' variables"""
+        def zero_offset(names):
+            """Collects the zero-offset arguments of all buffers in 'names'"""
+            return list(chain(*[self.buffer_zero_offset(b) for b in names]))
+
+        arguments = self.options_cast(indent) + self.sizes_list()
+        arguments += zero_offset(self.scalar_buffers_first())
+        arguments += self.scalar_cpp("alpha")
+        arguments += zero_offset(self.buffers_first())
+        arguments += self.scalar_cpp("beta")
+        arguments += zero_offset(self.buffers_second())
+        arguments += zero_offset(self.scalar_buffers_second())
+        arguments += list(chain(*[self.scalar(s) for s in self.other_scalars()]))
+        return arguments
+
    def arguments_wrapper_clblas(self, flavour):
        """As above, but for the clBLAS wrapper"""
        return (self.options_list() + self.sizes_list() +
@ -460,6 +563,19 @@ class Routine:
                list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
                list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))

+    def arguments_def_netlib(self, flavour):
+        """Retrieves the argument definitions for the Netlib CBLAS API, with buffers as plain C pointers"""
+        def pointer_defs(names):
+            """Collects the pointer definitions of all buffers in 'names'"""
+            return list(chain(*[self.buffer_def_pointer(b, flavour) for b in names]))
+
+        definitions = self.options_def_c() + self.sizes_def_netlib()
+        definitions += self.scalar_def_void("alpha", flavour)
+        definitions += pointer_defs(self.buffers_first())
+        definitions += self.scalar_def_void("beta", flavour)
+        definitions += pointer_defs(self.buffers_second())
+        definitions += pointer_defs(self.scalar_buffers_second())
+        definitions += list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))
+
+        # Routines that do not return their scalar result take its buffer as the last argument(s)
+        if self.name in self.routines_scalar_no_return():
+            definitions += pointer_defs(self.scalar_buffers_first())
+        return definitions
+
    def arguments_def_c(self, flavour):
        """As above, but for the C API"""
        return (self.options_def_c() + self.sizes_def() +
@ -546,6 +662,25 @@ class Routine:
        result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
        return result

+    def routine_header_netlib(self, flavour, spaces, extra_qualifier):
+        """Builds the function header (return type, 'cblas_'-prefixed name and argument definitions) for the
+        original Netlib CBLAS API"""
+        result_as_argument = self.name in self.routines_scalar_no_return()
+
+        # The first output that is an index buffer or a returned scalar buffer decides the return type
+        return_type = "void"
+        for output in self.outputs:
+            if output in self.index_buffers():
+                return_type = "int"
+                break
+            if output in self.scalar_buffers_first() and not result_as_argument:
+                return_type = flavour.buffer_type.replace("2", "")
+                break
+
+        # Continuation lines are indented by 'spaces' plus the lengths of the return type and routine name
+        padding = spaces + len(return_type) + self.length()
+        routine_name = self.name
+        if result_as_argument:
+            routine_name = routine_name + "_sub"
+            padding += 4  # four extra spaces, matching the length of the "_sub" postfix
+        separator = ",\n" + " " * padding
+        opening = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "("
+        return opening + separator.join(self.arguments_def_netlib(flavour)) + ")"
+
    def routine_header_wrapper_clblas(self, flavour, def_only, spaces):
        """As above, but now for the clBLAS wrapper"""
        template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else ""
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@ -55,12 +55,12 @@ void Xscal<T>::DoScal(const size_t n, const T alpha,
  // Sets the kernel arguments
  if (use_fast_kernel) {
    kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, alpha);
+    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, x_buffer());
  }
  else {
    kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, alpha);
+    kernel.SetArgument(1, GetRealArg(alpha));
    kernel.SetArgument(2, x_buffer());
    kernel.SetArgument(3, static_cast<int>(x_offset));
    kernel.SetArgument(4, static_cast<int>(x_inc));
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@ -151,6 +151,10 @@ std::string ToString(Precision value) {
    case Precision::kComplexDouble: return ToString(static_cast<int>(value))+" (complex-double)";
  }
 }
+// Specialisation for status codes: yields the decimal text of the code's integer value
+template <>
+std::string ToString(StatusCode value) {
+  const auto code = static_cast<int>(value);
+  return std::to_string(code);
+}

 // =================================================================================================