Merge pull request #69 from CNugteren/refactoring

Refactoring of the Routine class and file-renaming
2024-07-07 12:23:46 +02:00 · 2016-06-19 14:03:53 +02:00 · 2016-06-19 14:03:53 +02:00 · 395a0ef34e
parent 52ccaf5b25 61203453aa
commit 395a0ef34e
276 changed files with 1274 additions and 1727 deletions
--- a/1
+++ b/1
@ -4,6 +4,7 @@ Development version (next release)
 - Made it possible to compile the performance tests (clients) separately from the correctness tests
 - Made a reference BLAS and head-to-head performance comparison optional in the clients
 - Increased the verbosity of the "-verbose" option in the correctness tests
+- Refactored the host code for better compilation times and fewer lines of code
 - Improved the API documentation
 - Various minor fixes and enhancements
 - Added tuned parameters for various devices (see README)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -121,7 +121,7 @@ endif()
 # ==================================================================================================

 # Includes directories: CLBlast and OpenCL
-include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
+include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS})

 # ==================================================================================================

@ -140,19 +140,26 @@ set(PRECISIONS 32 64 3232 6464)
 # ==================================================================================================

 # Gathers all source-files
-set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/cache.cc
-            src/utilities.cc src/clblast_c.cc)
+set(SOURCES
+  src/database/database.cpp
+  src/routines/common.cpp
+  src/cache.cpp
+  src/clblast.cpp
+  src/clblast_c.cpp
+  src/routine.cpp
+  src/utilities.cpp
+)
 foreach(ROUTINE ${LEVEL1_ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
+  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
 endforeach()
 foreach(ROUTINE ${LEVEL2_ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
+  set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cpp)
 endforeach()
 foreach(ROUTINE ${LEVEL3_ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
+  set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cpp)
 endforeach()
 foreach(ROUTINE ${LEVELX_ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cc)
+  set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cpp)
 endforeach()

 # Creates and links the library
@ -186,7 +193,7 @@ if(SAMPLES)

  # Adds sample programs (C++)
  foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
-    add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
+    add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp)
    target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
    install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
  endforeach()
@ -211,7 +218,7 @@ if(TUNERS)

  # Adds tuning executables
  foreach(KERNEL ${KERNELS})
-    add_executable(clblast_tuner_${KERNEL} src/tuning/${KERNEL}.cc)
+    add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cpp)
    target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
    install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
  endforeach()
@ -257,7 +264,7 @@ if(CLIENTS OR TESTS)
  endif()

  # Sets the include directories
-  include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
+  include_directories(${clblast_SOURCE_DIR} ${REF_INCLUDES})

 endif()

@ -268,24 +275,24 @@ endif()
 if(CLIENTS)

  # Creates the common performance-tests objects (requires CMake 2.8.8)
-  add_library(test_performance_common OBJECT test/performance/client.cc)
+  add_library(test_performance_common OBJECT test/performance/client.cpp)

  # Compiles the performance-tests
  foreach(ROUTINE ${LEVEL1_ROUTINES})
    add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/level1/${ROUTINE}.cc)
+                   test/performance/routines/level1/${ROUTINE}.cpp)
  endforeach()
  foreach(ROUTINE ${LEVEL2_ROUTINES})
    add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/level2/${ROUTINE}.cc)
+                   test/performance/routines/level2/${ROUTINE}.cpp)
  endforeach()
  foreach(ROUTINE ${LEVEL3_ROUTINES})
    add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/level3/${ROUTINE}.cc)
+                   test/performance/routines/level3/${ROUTINE}.cpp)
  endforeach()
  foreach(ROUTINE ${LEVELX_ROUTINES})
    add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/levelx/${ROUTINE}.cc)
+                   test/performance/routines/levelx/${ROUTINE}.cpp)
  endforeach()
  foreach(ROUTINE ${ROUTINES})
    target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
@ -303,24 +310,24 @@ if(TESTS)

  # Creates the common correctness-tests objects (requires CMake 2.8.8)
  add_library(test_correctness_common OBJECT
-              test/correctness/tester.cc test/correctness/testblas.cc)
+              test/correctness/tester.cpp test/correctness/testblas.cpp)

  # Compiles the correctness-tests
  foreach(ROUTINE ${LEVEL1_ROUTINES})
    add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   test/correctness/routines/level1/${ROUTINE}.cc)
+                   test/correctness/routines/level1/${ROUTINE}.cpp)
  endforeach()
  foreach(ROUTINE ${LEVEL2_ROUTINES})
    add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   test/correctness/routines/level2/${ROUTINE}.cc)
+                   test/correctness/routines/level2/${ROUTINE}.cpp)
  endforeach()
  foreach(ROUTINE ${LEVEL3_ROUTINES})
    add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   test/correctness/routines/level3/${ROUTINE}.cc)
+                   test/correctness/routines/level3/${ROUTINE}.cpp)
  endforeach()
  foreach(ROUTINE ${LEVELX_ROUTINES})
    add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   test/correctness/routines/levelx/${ROUTINE}.cc)
+                   test/correctness/routines/levelx/${ROUTINE}.cpp)
  endforeach()
  foreach(ROUTINE ${ROUTINES})
    target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
--- a/README.md
+++ b/README.md
@ -136,7 +136,7 @@ Note that CLBlast's tuners are based on the CLTune auto-tuning library, which ha

 Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.

-The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
+The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).

 In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):

--- a/include/clblast.h
+++ b/include/clblast.h
@ -68,8 +68,8 @@ enum class StatusCode {
  kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
  kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
  kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
-  kInvalidVectorDot          = -2043, // Vector dot is not a valid OpenCL buffer
-  kInsufficientMemoryDot     = -2042, // Vector dot's OpenCL buffer is too small
+  kInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
+  kInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
 };

 // Matrix layout and transpose types
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@ -77,8 +77,8 @@ typedef enum StatusCode_ {
  kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
  kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
  kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
-  kInvalidVectorDot          = -2043, // Vector dot is not a valid OpenCL buffer
-  kInsufficientMemoryDot     = -2042, // Vector dot's OpenCL buffer is too small
+  kInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
+  kInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
 } StatusCode;

 // Matrix layout and transpose types
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@ -1,144 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements all the basic functionality for the BLAS routines. This class serves as a
-// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
-// compiling the OpenCL kernel, connecting to the database, etc.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINE_H_
-#define CLBLAST_ROUTINE_H_
-
-#include <string>
-#include <vector>
-
-#include "internal/cache.h"
-#include "internal/utilities.h"
-#include "internal/database.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Routine {
- public:
-
-  // Helper functions which check for errors in the status code
-  static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
-
-  // Base class constructor
-  explicit Routine(Queue &queue, EventPointer event, const std::string &name,
-                   const std::vector<std::string> &routines, const Precision precision);
-
-  // Set-up phase of the kernel
-  StatusCode SetUp();
-
- protected:
-  
-  // Runs a kernel given the global and local thread sizes
-  StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global,
-                       const std::vector<size_t> &local, EventPointer event,
-                       std::vector<Event>& waitForEvents);
-
-  // As above, but without an event waiting list
-  StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global,
-                       const std::vector<size_t> &local, EventPointer event);
-
-  // Tests for valid inputs of matrices A, B, and C
-  StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
-                         const size_t offset, const size_t ld, const size_t data_size);
-  StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
-                         const size_t offset, const size_t ld, const size_t data_size);
-  StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
-                         const size_t offset, const size_t ld, const size_t data_size);
-  StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
-                          const size_t offset, const size_t data_size);
-
-  // Tests for valid inputs of vector X and Y
-  StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                         const size_t inc, const size_t data_size);
-  StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                         const size_t inc, const size_t data_size);
-
-  // Tests for valid inputs of other vectors
-  StatusCode TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                           const size_t data_size);
-  StatusCode TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
-                             const size_t offset, const size_t data_size);
-
-  // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write
-  // to symmetric and triangular matrices through optional arguments.
-  StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
-                                    const size_t src_one, const size_t src_two,
-                                    const size_t src_ld, const size_t src_offset,
-                                    const Buffer<T> &src,
-                                    const size_t dest_one, const size_t dest_two,
-                                    const size_t dest_ld, const size_t dest_offset,
-                                    const Buffer<T> &dest,
-                                    const T alpha,
-                                    const Program &program, const bool do_pad,
-                                    const bool do_transpose, const bool do_conjugate,
-                                    const bool upper = false, const bool lower = false,
-                                    const bool diagonal_imag_zero = false);
-
-  // Stores a newly compiled binary/program into the cache
-  void StoreBinaryToCache(const std::string& binary) const {
-    cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
-  }
-  void StoreProgramToCache(const Program& program) const {
-    cache::StoreProgramToCache(program, context_, precision_, routine_name_);
-  }
-
-  // Queries the cache and retrieve either a matching binary/program or a boolean whether a match
-  // exists. The first assumes that the binary/program is available in the cache and will throw an
-  // exception otherwise.
-  std::string GetBinaryFromCache() const {
-    return cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
-  }
-  Program GetProgramFromCache() const {
-    return cache::GetProgramFromCache(context_, precision_, routine_name_);
-  }
-  bool BinaryIsInCache() const {
-    return cache::BinaryIsInCache(device_name_, precision_, routine_name_);
-  }
-  bool ProgramIsInCache() const {
-    return cache::ProgramIsInCache(context_, precision_, routine_name_);
-  }
-
-  // Non-static variable for the precision. Note that the same variable (but static) might exist in
-  // a derived class.
-  const Precision precision_;
-
-  // The routine's name and its kernel-source in string form
-  const std::string routine_name_;
-  std::string source_string_;
-
-  // The OpenCL objects, accessible only from derived classes
-  Queue queue_;
-  EventPointer event_;
-  const Context context_;
-  const Device device_;
-
-  // OpenCL device properties
-  const std::string device_name_;
-  const size_t max_work_item_dimensions_;
-  const std::vector<size_t> max_work_item_sizes_;
-  const size_t max_work_group_size_;
-
-  // Connection to the database for all the device-specific parameters
-  const Database db_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINE_H_
-#endif
--- a/samples/sgemm.cpp
+++ b/samples/sgemm.cpp
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@ -310,7 +310,7 @@ defaults = CalculateDefaults(bests)
 bests = ConcatenateData(bests, defaults)

 # Outputs the data as a C++ database
-path_cpp_database = os.path.join(path_clblast, "include", "internal", "database")
+path_cpp_database = os.path.join(path_clblast, "src", "database", "kernels")
 print("## Producing a C++ database in '"+path_cpp_database+"'...")
 PrintData(bests, path_cpp_database)

--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@ -10,14 +10,14 @@
 # This script automatically generates the bodies of the following files, creating the full CLBlast
 # API interface and implementation (C, C++, and reference BLAS wrappers):
 #    clblast.h
-#    clblast.cc
+#    clblast.cpp
 #    clblast_c.h
-#    clblast_c.cc
+#    clblast_c.cpp
 #    wrapper_clblas.h
 #    wrapper_cblas.h
 # It also generates the main functions for the correctness and performance tests as found in
-#    test/correctness/routines/levelX/xYYYY.cc
-#    test/performance/routines/levelX/xYYYY.cc
+#    test/correctness/routines/levelX/xYYYY.cpp
+#    test/performance/routines/levelX/xYYYY.cpp
 # It also produces the API documentation found in doc/clblast.md
 #
 # ==================================================================================================
@ -200,7 +200,7 @@ def clblast_h(routines):
 		result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n"
 	return result

-# The C++ API implementation (.cc)
+# The C++ API implementation (.cpp)
 def clblast_cc(routines):
 	result = ""
 	for routine in routines:
@ -237,7 +237,7 @@ def clblast_c_h(routines):
 			result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n"
 	return result

-# The C API implementation (.cc)
+# The C API implementation (.cpp)
 def clblast_c_cc(routines):
 	result = ""
 	for routine in routines:
@ -379,14 +379,14 @@ if len(sys.argv) != 2:
 path_clblast = sys.argv[1]
 files = [
  path_clblast+"/include/clblast.h",
-  path_clblast+"/src/clblast.cc",
+  path_clblast+"/src/clblast.cpp",
  path_clblast+"/include/clblast_c.h",
-  path_clblast+"/src/clblast_c.cc",
-  path_clblast+"/test/wrapper_clblas.h",
-  path_clblast+"/test/wrapper_cblas.h",
+  path_clblast+"/src/clblast_c.cpp",
+  path_clblast+"/test/wrapper_clblas.hpp",
+  path_clblast+"/test/wrapper_cblas.hpp",
 ]
 header_lines = [84, 74, 93, 22, 29, 41]
-footer_lines = [17, 71, 19, 14, 6, 6]
+footer_lines = [17, 75, 19, 14, 6, 6]

 # Checks whether the command-line arguments are valid; exits otherwise
 for f in files:
@ -433,11 +433,11 @@ for i in xrange(0,len(files)):
 for level in [1,2,3,4]:
 	for routine in routines[level-1]:
 		if routine.has_tests:
-			filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc"
+			filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
 			with open(filename, "w") as f:
 				body = ""
-				body += "#include \"correctness/testblas.h\"\n"
-				body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n"
+				body += "#include \"test/correctness/testblas.hpp\"\n"
+				body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
 				body += "// Shortcuts to the clblast namespace\n"
 				body += "using float2 = clblast::float2;\n"
 				body += "using double2 = clblast::double2;\n\n"
@ -459,11 +459,11 @@ for level in [1,2,3,4]:
 for level in [1,2,3,4]:
 	for routine in routines[level-1]:
 		if routine.has_tests:
-			filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc"
+			filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
 			with open(filename, "w") as f:
 				body = ""
-				body += "#include \"performance/client.h\"\n"
-				body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n"
+				body += "#include \"test/performance/client.hpp\"\n"
+				body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
 				body += "// Shortcuts to the clblast namespace\n"
 				body += "using float2 = clblast::float2;\n"
 				body += "using double2 = clblast::double2;\n\n"
--- a/src/buffer_test.hpp
+++ b/src/buffer_test.hpp
@ -0,0 +1,121 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
+// templated and thus header-only.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_BUFFER_TEST_H_
+#define CLBLAST_BUFFER_TEST_H_
+
+#include "clblast.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Tests matrix 'A' for validity
+template <typename T>
+StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
+  try {
+    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
+    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
+  } catch (...) { return StatusCode::kInvalidMatrixA; }
+  return StatusCode::kSuccess;
+}
+
+// Tests matrix 'B' for validity
+template <typename T>
+StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
+  try {
+    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
+    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; }
+  } catch (...) { return StatusCode::kInvalidMatrixB; }
+  return StatusCode::kSuccess;
+}
+
+// Tests matrix 'C' for validity
+template <typename T>
+StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
+  try {
+    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
+    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; }
+  } catch (...) { return StatusCode::kInvalidMatrixC; }
+  return StatusCode::kSuccess;
+}
+
+// Tests matrix 'AP' for validity
+template <typename T>
+StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+  try {
+    const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
+    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
+  } catch (...) { return StatusCode::kInvalidMatrixA; }
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Tests vector 'X' for validity
+template <typename T>
+StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
+                       const size_t inc) {
+  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
+  try {
+    const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
+    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; }
+  } catch (...) { return StatusCode::kInvalidVectorX; }
+  return StatusCode::kSuccess;
+}
+
+// Tests vector 'Y' for validity
+template <typename T>
+StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
+                       const size_t inc) {
+  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
+  try {
+    const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
+    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; }
+  } catch (...) { return StatusCode::kInvalidVectorY; }
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Tests vector 'scalar' for validity
+template <typename T>
+StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+  try {
+    const auto required_size = (n + offset) * sizeof(T);
+    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
+  } catch (...) { return StatusCode::kInvalidVectorScalar; }
+  return StatusCode::kSuccess;
+}
+
+// Tests vector 'index' for validity (failures are reported with the 'scalar' status codes)
+template <typename T>
+StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+  try {
+    const auto required_size = (n + offset) * sizeof(T);
+    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
+  } catch (...) { return StatusCode::kInvalidVectorScalar; }
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_BUFFER_TEST_H_
+#endif
--- a/src/cache.cpp
+++ b/src/cache.cpp
@ -15,10 +15,9 @@
 #include <vector>
 #include <mutex>

-#include "internal/cache.h"
+#include "cache.hpp"

 namespace clblast {
-namespace cache {
 // =================================================================================================

 // Stores the compiled binary or IR in the cache
@ -98,7 +97,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
 // =================================================================================================

 // Clears the cache of stored binaries and programs
-StatusCode ClearCache() {
+StatusCode CacheClearAll() {
  binary_cache_mutex_.lock();
  binary_cache_.clear();
  binary_cache_mutex_.unlock();
@ -109,5 +108,4 @@ StatusCode ClearCache() {
 }

 // =================================================================================================
-} // namespace cache
 } // namespace clblast
--- a/include/internal/cache.h
+++ b/include/internal/cache.h
@ -18,10 +18,9 @@
 #include <vector>
 #include <mutex>

-#include "internal/utilities.h"
+#include "utilities.hpp"

 namespace clblast {
-namespace cache {
 // =================================================================================================

 // The cache of compiled OpenCL binaries, along with some meta-data
@ -90,10 +89,9 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
 // =================================================================================================

 // Clears the cache of stored binaries
-StatusCode ClearCache();
+StatusCode CacheClearAll();

 // =================================================================================================
-} // namespace cache
 } // namespace clblast

 // CLBLAST_CACHE_H_
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@ -16,60 +16,60 @@
 #include <string>

 #include "clblast.h"
-#include "internal/public_api.h"
-#include "internal/cache.h"
+#include "public_api.hpp"
+#include "cache.hpp"

 // BLAS level-1 includes
-#include "internal/routines/level1/xswap.h"
-#include "internal/routines/level1/xscal.h"
-#include "internal/routines/level1/xcopy.h"
-#include "internal/routines/level1/xaxpy.h"
-#include "internal/routines/level1/xdot.h"
-#include "internal/routines/level1/xdotu.h"
-#include "internal/routines/level1/xdotc.h"
-#include "internal/routines/level1/xnrm2.h"
-#include "internal/routines/level1/xasum.h"
-#include "internal/routines/level1/xsum.h" // non-BLAS function
-#include "internal/routines/level1/xamax.h"
-#include "internal/routines/level1/xmax.h" // non-BLAS function
-#include "internal/routines/level1/xmin.h" // non-BLAS function
+#include "routines/level1/xswap.hpp"
+#include "routines/level1/xscal.hpp"
+#include "routines/level1/xcopy.hpp"
+#include "routines/level1/xaxpy.hpp"
+#include "routines/level1/xdot.hpp"
+#include "routines/level1/xdotu.hpp"
+#include "routines/level1/xdotc.hpp"
+#include "routines/level1/xnrm2.hpp"
+#include "routines/level1/xasum.hpp"
+#include "routines/level1/xsum.hpp" // non-BLAS routine
+#include "routines/level1/xamax.hpp"
+#include "routines/level1/xmax.hpp" // non-BLAS routine
+#include "routines/level1/xmin.hpp" // non-BLAS routine

 // BLAS level-2 includes
-#include "internal/routines/level2/xgemv.h"
-#include "internal/routines/level2/xgbmv.h"
-#include "internal/routines/level2/xhemv.h"
-#include "internal/routines/level2/xhbmv.h"
-#include "internal/routines/level2/xhpmv.h"
-#include "internal/routines/level2/xsymv.h"
-#include "internal/routines/level2/xsbmv.h"
-#include "internal/routines/level2/xspmv.h"
-#include "internal/routines/level2/xtrmv.h"
-#include "internal/routines/level2/xtbmv.h"
-#include "internal/routines/level2/xtpmv.h"
-#include "internal/routines/level2/xger.h"
-#include "internal/routines/level2/xgeru.h"
-#include "internal/routines/level2/xgerc.h"
-#include "internal/routines/level2/xher.h"
-#include "internal/routines/level2/xhpr.h"
-#include "internal/routines/level2/xher2.h"
-#include "internal/routines/level2/xhpr2.h"
-#include "internal/routines/level2/xsyr.h"
-#include "internal/routines/level2/xspr.h"
-#include "internal/routines/level2/xsyr2.h"
-#include "internal/routines/level2/xspr2.h"
+#include "routines/level2/xgemv.hpp"
+#include "routines/level2/xgbmv.hpp"
+#include "routines/level2/xhemv.hpp"
+#include "routines/level2/xhbmv.hpp"
+#include "routines/level2/xhpmv.hpp"
+#include "routines/level2/xsymv.hpp"
+#include "routines/level2/xsbmv.hpp"
+#include "routines/level2/xspmv.hpp"
+#include "routines/level2/xtrmv.hpp"
+#include "routines/level2/xtbmv.hpp"
+#include "routines/level2/xtpmv.hpp"
+#include "routines/level2/xger.hpp"
+#include "routines/level2/xgeru.hpp"
+#include "routines/level2/xgerc.hpp"
+#include "routines/level2/xher.hpp"
+#include "routines/level2/xhpr.hpp"
+#include "routines/level2/xher2.hpp"
+#include "routines/level2/xhpr2.hpp"
+#include "routines/level2/xsyr.hpp"
+#include "routines/level2/xspr.hpp"
+#include "routines/level2/xsyr2.hpp"
+#include "routines/level2/xspr2.hpp"

 // BLAS level-3 includes
-#include "internal/routines/level3/xgemm.h"
-#include "internal/routines/level3/xsymm.h"
-#include "internal/routines/level3/xhemm.h"
-#include "internal/routines/level3/xsyrk.h"
-#include "internal/routines/level3/xherk.h"
-#include "internal/routines/level3/xsyr2k.h"
-#include "internal/routines/level3/xher2k.h"
-#include "internal/routines/level3/xtrmm.h"
+#include "routines/level3/xgemm.hpp"
+#include "routines/level3/xsymm.hpp"
+#include "routines/level3/xhemm.hpp"
+#include "routines/level3/xsyrk.hpp"
+#include "routines/level3/xherk.hpp"
+#include "routines/level3/xsyr2k.hpp"
+#include "routines/level3/xher2k.hpp"
+#include "routines/level3/xtrmm.hpp"

-// Extra includes (level-x)
-#include "internal/routines/levelx/xomatcopy.h"
+// Level-x includes (non-BLAS)
+#include "routines/levelx/xomatcopy.hpp"

 namespace clblast {

@ -2120,9 +2120,10 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
 // =================================================================================================

 // Clears the cache of stored binaries
-StatusCode ClearCache() { return cache::ClearCache(); }
+StatusCode ClearCache() { return CacheClearAll(); }

 // Fills the cache with all binaries for a specific device
+// TODO: Add half-precision FP16 set-up calls
 StatusCode FillCache(const cl_device_id device) {
  try {

@ -2171,7 +2172,7 @@ StatusCode FillCache(const cl_device_id device) {
    Xsyr2<float>(queue, nullptr).SetUp(); Xsyr2<double>(queue, nullptr).SetUp();
    Xspr2<float>(queue, nullptr).SetUp(); Xspr2<double>(queue, nullptr).SetUp();

-    // Runs all the level 1 set-up functions
+    // Runs all the level 3 set-up functions
    Xgemm<float>(queue, nullptr).SetUp(); Xgemm<double>(queue, nullptr).SetUp(); Xgemm<float2>(queue, nullptr).SetUp(); Xgemm<double2>(queue, nullptr).SetUp();
    Xsymm<float>(queue, nullptr).SetUp(); Xsymm<double>(queue, nullptr).SetUp(); Xsymm<float2>(queue, nullptr).SetUp(); Xsymm<double2>(queue, nullptr).SetUp();
    Xhemm<float2>(queue, nullptr).SetUp(); Xhemm<double2>(queue, nullptr).SetUp();
@ -2181,6 +2182,9 @@ StatusCode FillCache(const cl_device_id device) {
    Xher2k<float2,float>(queue, nullptr).SetUp(); Xher2k<double2,double>(queue, nullptr).SetUp();
    Xtrmm<float>(queue, nullptr).SetUp(); Xtrmm<double>(queue, nullptr).SetUp(); Xtrmm<float2>(queue, nullptr).SetUp(); Xtrmm<double2>(queue, nullptr).SetUp();

+    // Runs all the level-x (non-BLAS) set-up functions
+    Xomatcopy<float>(queue, nullptr).SetUp(); Xomatcopy<double>(queue, nullptr).SetUp(); Xomatcopy<float2>(queue, nullptr).SetUp(); Xomatcopy<double2>(queue, nullptr).SetUp();
+
  } catch (...) { return StatusCode::kBuildProgramFailure; }
  return StatusCode::kSuccess;
 }
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@ -15,7 +15,7 @@

 #include "clblast_c.h"
 #include "clblast.h"
-#include "internal/utilities.h"
+#include "utilities.hpp"

 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
--- a/include/internal/clpp11.h
+++ b/include/internal/clpp11.h
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@ -11,18 +11,18 @@
 //
 // =================================================================================================

-#include "internal/database.h"
-#include "internal/database/xaxpy.h"
-#include "internal/database/xdot.h"
-#include "internal/database/xgemv.h"
-#include "internal/database/xger.h"
-#include "internal/database/xgemm.h"
-#include "internal/database/copy.h"
-#include "internal/database/pad.h"
-#include "internal/database/transpose.h"
-#include "internal/database/padtranspose.h"
+#include "utilities.hpp"

-#include "internal/utilities.h"
+#include "database/database.hpp"
+#include "database/kernels/xaxpy.hpp"
+#include "database/kernels/xdot.hpp"
+#include "database/kernels/xgemv.hpp"
+#include "database/kernels/xger.hpp"
+#include "database/kernels/xgemm.hpp"
+#include "database/kernels/copy.hpp"
+#include "database/kernels/pad.hpp"
+#include "database/kernels/transpose.hpp"
+#include "database/kernels/padtranspose.hpp"

 namespace clblast {
 // =================================================================================================
--- a/include/internal/database.h
+++ b/include/internal/database.h
@ -21,7 +21,7 @@
 #include <vector>
 #include <unordered_map>

-#include "internal/utilities.h"
+#include "utilities.hpp"

 namespace clblast {
 // =================================================================================================
--- a/include/internal/database/copy.h
+++ b/include/internal/database/copy.h
--- a/include/internal/database/pad.h
+++ b/include/internal/database/pad.h
--- a/include/internal/database/padtranspose.h
+++ b/include/internal/database/padtranspose.h
--- a/include/internal/database/transpose.h
+++ b/include/internal/database/transpose.h
--- a/include/internal/database/xaxpy.h
+++ b/include/internal/database/xaxpy.h
--- a/include/internal/database/xdot.h
+++ b/include/internal/database/xdot.h
--- a/include/internal/database/xgemm.h
+++ b/include/internal/database/xgemm.h
--- a/include/internal/database/xgemv.h
+++ b/include/internal/database/xgemv.h
--- a/include/internal/database/xger.h
+++ b/include/internal/database/xger.h
--- a/include/internal/public_api.h
+++ b/include/internal/public_api.h
--- a/src/routine.cc
+++ b/src/routine.cc
@ -1,431 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Routine base class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: not much here, because no status codes can be returned
-template <typename T>
-Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
-                    const std::vector<std::string> &routines, const Precision precision):
-    precision_(precision),
-    routine_name_(name),
-    queue_(queue),
-    event_(event),
-    context_(queue_.GetContext()),
-    device_(queue_.GetDevice()),
-    device_name_(device_.Name()),
-    max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
-    max_work_item_sizes_(device_.MaxWorkItemSizes()),
-    max_work_group_size_(device_.MaxWorkGroupSize()),
-    db_(queue_, routines, precision_) {
-}
-
-// =================================================================================================
-
-// Separate set-up function to allow for status codes to be returned
-template <typename T>
-StatusCode Routine<T>::SetUp() {
-
-  // Queries the cache to see whether or not the program (context-specific) is already there
-  if (ProgramIsInCache()) { return StatusCode::kSuccess; }
-
-  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
-  // is, a program is created and stored in the cache
-  if (BinaryIsInCache()) {
-    try {
-      auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
-      auto program = Program(device_, context_, binary);
-      auto options = std::vector<std::string>();
-      program.Build(device_, options);
-      StoreProgramToCache(program);
-    } catch (...) { return StatusCode::kBuildProgramFailure; }
-    return StatusCode::kSuccess;
-  }
-
-  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
-  // program will be added to the cache.
-
-  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
-  const auto extensions = device_.Capabilities();
-  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
-    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
-      return StatusCode::kNoDoublePrecision;
-    }
-  }
-
-  // As above, but for cl_khr_fp16 (half precision)
-  if (precision_ == Precision::kHalf) {
-    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
-      return StatusCode::kNoHalfPrecision;
-    }
-  }
-
-  // Loads the common header (typedefs and defines and such)
-  std::string common_header =
-    #include "kernels/common.opencl"
-  ;
-
-  // Collects the parameters for this device in the form of defines, and adds the precision
-  auto defines = db_.GetDefines();
-  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-
-  // Adds the name of the routine as a define
-  defines += "#define ROUTINE_"+routine_name_+"\n";
-
-  // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
-  // performance, but might result in a reduced accuracy.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    defines += "#define USE_CL_MAD 1\n";
-  }
-
-  // For specific devices, use staggered/shuffled workgroup indices.
-  if (device_.IsAMD() && device_.IsGPU()) {
-    defines += "#define USE_STAGGERED_INDICES 1\n";
-  }
-
-  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
-  // performance through better cache behaviour
-  if (device_.IsARM() && device_.IsGPU()) {
-    defines += "#define GLOBAL_MEM_FENCE 1\n";
-  }
-
-  // Combines everything together into a single source string
-  const auto source_string = defines + common_header + source_string_;
-
-  // Compiles the kernel
-  try {
-    auto program = Program(context_, source_string);
-    auto options = std::vector<std::string>();
-    const auto build_status = program.Build(device_, options);
-
-    // Checks for compiler crashes/errors/warnings
-    if (build_status == BuildStatus::kError) {
-      const auto message = program.GetBuildInfo(device_);
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
-      return StatusCode::kBuildProgramFailure;
-    }
-    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
-
-    // Store the compiled binary and program in the cache
-    const auto binary = program.GetIR();
-    StoreBinaryToCache(binary);
-    StoreProgramToCache(program);
-  } catch (...) { return StatusCode::kBuildProgramFailure; }
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Enqueues a kernel, waits for completion, and checks for errors
-template <typename T>
-StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global,
-                                 const std::vector<size_t> &local, EventPointer event,
-                                 std::vector<Event>& waitForEvents) {
-
-  // Tests for validity of the local thread sizes
-  if (local.size() > max_work_item_dimensions_) {
-    return StatusCode::kInvalidLocalNumDimensions; 
-  }
-  for (auto i=size_t{0}; i<local.size(); ++i) {
-    if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; }
-  }
-  auto local_size = size_t{1};
-  for (auto &item: local) { local_size *= item; }
-  if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }
-
-  // Make sure the global thread sizes are at least equal to the local sizes
-  for (auto i=size_t{0}; i<global.size(); ++i) {
-    if (global[i] < local[i]) { global[i] = local[i]; }
-  }
-
-  // Tests for local memory usage
-  const auto local_mem_usage = kernel.LocalMemUsage(device_);
-  if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
-
-  // Launches the kernel (and checks for launch errors)
-  try {
-    kernel.Launch(queue_, global, local, event, waitForEvents);
-  } catch (...) { return StatusCode::kKernelLaunchError; }
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
-}
-
-// As above, but without an event waiting list
-template <typename T>
-StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global,
-                                 const std::vector<size_t> &local, EventPointer event) {
-  auto emptyWaitingList = std::vector<Event>();
-  return RunKernel(kernel, global, local, event, emptyWaitingList);
-}
-
-// =================================================================================================
-
-// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
-  try {
-    const auto required_size = (ld*(two-1) + one + offset)*data_size;
-    const auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
-  try {
-    const auto required_size = (ld*(two-1) + one + offset)*data_size;
-    const auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
-  } catch (...) { return StatusCode::kInvalidMatrixB; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
-  try {
-    const auto required_size = (ld*(two-1) + one + offset)*data_size;
-    const auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
-  } catch (...) { return StatusCode::kInvalidMatrixC; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size
-template <typename T>
-StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
-                                    const size_t offset, const size_t data_size) {
-  try {
-    const auto required_size = (((n*(n+1))/2) + offset)*data_size;
-    const auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                   const size_t inc, const size_t data_size) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
-  try {
-    const auto required_size = ((n-1)*inc + 1 + offset)*data_size;
-    const auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
-  } catch (...) { return StatusCode::kInvalidVectorX; }
-  return StatusCode::kSuccess;
-}
-
-// Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                   const size_t inc, const size_t data_size) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
-  try {
-    const auto required_size = ((n-1)*inc + 1 + offset)*data_size;
-    const auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
-  } catch (...) { return StatusCode::kInvalidVectorY; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector dot for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                     const size_t data_size) {
-  try {
-    const auto required_size = (n + offset)*data_size;
-    const auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
-  } catch (...) { return StatusCode::kInvalidVectorDot; }
-  return StatusCode::kSuccess;
-}
-
-// Tests vector index for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
-                                       const size_t offset, const size_t data_size) {
-  try {
-    const auto required_size = (n + offset)*data_size;
-    const auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
-  } catch (...) { return StatusCode::kInvalidVectorDot; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Copies or transposes a matrix and optionally pads/unpads it with zeros
-template <typename T>
-StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
-                                              const size_t src_one, const size_t src_two,
-                                              const size_t src_ld, const size_t src_offset,
-                                              const Buffer<T> &src,
-                                              const size_t dest_one, const size_t dest_two,
-                                              const size_t dest_ld, const size_t dest_offset,
-                                              const Buffer<T> &dest,
-                                              const T alpha,
-                                              const Program &program, const bool do_pad,
-                                              const bool do_transpose, const bool do_conjugate,
-                                              const bool upper, const bool lower,
-                                              const bool diagonal_imag_zero) {
-
-  // Determines whether or not the fast-version could potentially be used
-  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
-                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
-                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
-
-  // Determines the right kernel
-  auto kernel_name = std::string{};
-  if (do_transpose) {
-    if (use_fast_kernel &&
-        IsMultiple(src_ld, db_["TRA_WPT"]) &&
-        IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
-        IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) {
-      kernel_name = "TransposeMatrixFast";
-    }
-    else {
-      use_fast_kernel = false;
-      kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
-    }
-  }
-  else {
-    if (use_fast_kernel &&
-        IsMultiple(src_ld, db_["COPY_VW"]) &&
-        IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
-        IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) {
-      kernel_name = "CopyMatrixFast";
-    }
-    else {
-      use_fast_kernel = false;
-      kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
-    }
-  }
-
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &alpha);
-
-  // Retrieves the kernel from the compiled binary
-  try {
-    auto kernel = Kernel(program, kernel_name);
-
-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(src_ld));
-      kernel.SetArgument(1, src());
-      kernel.SetArgument(2, dest());
-      kernel.SetArgument(3, alpha_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(src_one));
-      kernel.SetArgument(1, static_cast<int>(src_two));
-      kernel.SetArgument(2, static_cast<int>(src_ld));
-      kernel.SetArgument(3, static_cast<int>(src_offset));
-      kernel.SetArgument(4, src());
-      kernel.SetArgument(5, static_cast<int>(dest_one));
-      kernel.SetArgument(6, static_cast<int>(dest_two));
-      kernel.SetArgument(7, static_cast<int>(dest_ld));
-      kernel.SetArgument(8, static_cast<int>(dest_offset));
-      kernel.SetArgument(9, dest());
-      kernel.SetArgument(10, alpha_buffer());
-      if (do_pad) {
-        kernel.SetArgument(11, static_cast<int>(do_conjugate));
-      }
-      else {
-        kernel.SetArgument(11, static_cast<int>(upper));
-        kernel.SetArgument(12, static_cast<int>(lower));
-        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
-      }
-    }
-
-    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
-    // parameters in the database.
-    if (do_transpose) {
-      if (use_fast_kernel) {
-        const auto global = std::vector<size_t>{
-          dest_one / db_["TRA_WPT"],
-          dest_two / db_["TRA_WPT"]
-        };
-        const auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
-        return RunKernel(kernel, global, local, event, waitForEvents);
-      }
-      else {
-        const auto global = std::vector<size_t>{
-          Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
-          Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])
-        };
-        const auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
-        return RunKernel(kernel, global, local, event, waitForEvents);
-      }
-    }
-    else {
-      if (use_fast_kernel) {
-        const auto global = std::vector<size_t>{
-          dest_one / db_["COPY_VW"],
-          dest_two / db_["COPY_WPT"]
-        };
-        const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
-        return RunKernel(kernel, global, local, event, waitForEvents);
-      }
-      else {
-        const auto global = std::vector<size_t>{
-          Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-          Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])
-        };
-        const auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-        return RunKernel(kernel, global, local, event, waitForEvents);
-      }
-    }
-  } catch (...) { return StatusCode::kInvalidKernel; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Routine<half>;
-template class Routine<float>;
-template class Routine<double>;
-template class Routine<float2>;
-template class Routine<double2>;
-
-// =================================================================================================
-} // namespace clblast
--- a/src/routine.cpp
+++ b/src/routine.cpp
@ -0,0 +1,131 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Routine base class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: only initialises members (no status codes can be returned here, see SetUp below)
+Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
+                 const std::vector<std::string> &routines, const Precision precision):
+    precision_(precision),
+    routine_name_(name),
+    queue_(queue),
+    event_(event),
+    context_(queue_.GetContext()),  // taken from the stored queue
+    device_(queue_.GetDevice()),  // taken from the stored queue
+    device_name_(device_.Name()),  // queried once; passed to the binary-cache look-ups in SetUp
+    db_(queue_, routines, precision_) {
+}
+
+// =================================================================================================
+
+// Set-up: ensures a built program is cached (split from the constructor to return status codes)
+StatusCode Routine::SetUp() {
+
+  // Queries the cache to see whether or not the program (context-specific) is already there
+  if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
+
+  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
+  // is, a program is created from that binary, built, and stored in the program cache
+  if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
+    try {
+      auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
+      auto program = Program(device_, context_, binary);
+      auto options = std::vector<std::string>();
+      program.Build(device_, options);  // NOTE(review): return value is ignored here
+      StoreProgramToCache(program, context_, precision_, routine_name_);
+    } catch (...) { return StatusCode::kBuildProgramFailure; }
+    return StatusCode::kSuccess;
+  }
+
+  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
+  // program will be added to the cache.
+
+  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
+  const auto extensions = device_.Capabilities();
+  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
+    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
+      return StatusCode::kNoDoublePrecision;
+    }
+  }
+
+  // As above, but for cl_khr_fp16 (half precision)
+  if (precision_ == Precision::kHalf) {
+    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
+      return StatusCode::kNoHalfPrecision;
+    }
+  }
+
+  // Loads the common header (typedefs and defines and such) by including the file as a literal
+  std::string common_header =
+    #include "kernels/common.opencl"
+  ;
+
+  // Collects the parameters for this device in the form of defines, and adds the precision
+  auto defines = db_.GetDefines();
+  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
+
+  // Adds the name of the routine as a define
+  defines += "#define ROUTINE_"+routine_name_+"\n";
+
+  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
+  // performance, but might result in a reduced accuracy.
+  if (device_.IsAMD() && device_.IsGPU()) {
+    defines += "#define USE_CL_MAD 1\n";
+  }
+
+  // For specific devices (same AMD GPU condition), use staggered/shuffled workgroup indices.
+  if (device_.IsAMD() && device_.IsGPU()) {
+    defines += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device_.IsARM() && device_.IsGPU()) {
+    defines += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Combines everything together into a single source string: defines, common header, kernel
+  const auto source_string = defines + common_header + source_string_;
+
+  // Compiles the kernel
+  try {
+    auto program = Program(context_, source_string);
+    auto options = std::vector<std::string>();
+    const auto build_status = program.Build(device_, options);
+
+    // Checks for compiler crashes/errors/warnings (the build log is printed to stdout)
+    if (build_status == BuildStatus::kError) {
+      const auto message = program.GetBuildInfo(device_);
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
+      return StatusCode::kBuildProgramFailure;
+    }
+    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
+
+    // Store the compiled binary (per device name) and program (per context) in the cache
+    const auto binary = program.GetIR();
+    StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
+    StoreProgramToCache(program, context_, precision_, routine_name_);
+  } catch (...) { return StatusCode::kBuildProgramFailure; }
+
+  // No errors, normal termination of this function
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+} // namespace clblast
--- a/src/routine.hpp
+++ b/src/routine.hpp
@ -0,0 +1,68 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the basic functionality for the BLAS routines. This class serves as a
+// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
+// compiling the OpenCL kernel, connecting to the database, etc.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINE_H_
+#define CLBLAST_ROUTINE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities.hpp"
+#include "cache.hpp"
+#include "buffer_test.hpp"
+#include "database/database.hpp"
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Non-templated base class for the routines; see comment at top of file for a description
+class Routine {
+ public:
+
+  // Base class constructor: stores the queue and event and creates the database for 'routines'
+  explicit Routine(Queue &queue, EventPointer event, const std::string &name,
+                   const std::vector<std::string> &routines, const Precision precision);
+
+  // Set-up phase: takes the program from the cache or compiles it; returns a status code
+  StatusCode SetUp();
+
+ protected:
+
+  // Precision of this routine instance (non-static, set by the constructor)
+  const Precision precision_;
+
+  // The routine's name and its kernel-source in string form
+  const std::string routine_name_;
+  std::string source_string_;  // not set by the base constructor; read by SetUp()
+
+  // The OpenCL objects, accessible only from derived classes (context_/device_ come from queue_)
+  Queue queue_;
+  EventPointer event_;
+  const Context context_;
+  const Device device_;
+
+  // Name of the OpenCL device, queried once in the constructor
+  const std::string device_name_;
+
+  // Connection to the database for all the device-specific parameters
+  const Database db_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINE_H_
+#endif
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@ -0,0 +1,65 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the common routine functions (see the header for more information).
+//
+// =================================================================================================
+
+#include <vector>
+
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Validates the requested thread sizes and local memory usage against the limits of 'device' and
+// then launches 'kernel' on 'queue'.
+//
+// The checks are performed in the order below and the first failure is returned:
+//   kInvalidLocalNumDimensions  'local' has more dimensions than the device supports
+//   kInvalidLocalThreadsDim     a local size exceeds the device maximum for that dimension
+//   kInvalidLocalThreadsTotal   the product of all local sizes exceeds the max work-group size
+//   kInvalidLocalMemUsage       the device rejects the kernel's local memory usage
+//   kKernelLaunchError          the launch itself threw an exception
+// Returns kSuccess otherwise.
+//
+// 'global' is taken by value because it may be enlarged before the launch. 'event' and
+// 'waitForEvents' are forwarded unchanged to Kernel::Launch.
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event, std::vector<Event>& waitForEvents) {
+
+  // Tests for validity of the local thread sizes. The dimension check comes first: it guarantees
+  // that indexing 'max_work_item_sizes' with the indices of 'local' stays in range.
+  if (local.size() > device.MaxWorkItemDimensions()) {
+    return StatusCode::kInvalidLocalNumDimensions;
+  }
+  const auto max_work_item_sizes = device.MaxWorkItemSizes();
+  for (auto i=size_t{0}; i<local.size(); ++i) {
+    if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+  }
+  auto local_size = size_t{1};
+  for (auto &item: local) { local_size *= item; }
+  if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
+
+  // Make sure the global thread sizes are at least equal to the local sizes. Only the dimensions
+  // present in both vectors are compared, so 'local' is never indexed past its end when 'global'
+  // has more entries.
+  for (auto i=size_t{0}; i<global.size() && i<local.size(); ++i) {
+    if (global[i] < local[i]) { global[i] = local[i]; }
+  }
+
+  // Tests for local memory usage
+  const auto local_mem_usage = kernel.LocalMemUsage(device);
+  if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
+
+  // Launches the kernel (and checks for launch errors): any exception is mapped to a status code
+  try {
+    kernel.Launch(queue, global, local, event, waitForEvents);
+  } catch (...) { return StatusCode::kKernelLaunchError; }
+
+  // No errors, normal termination of this function
+  return StatusCode::kSuccess;
+}
+
+// Convenience overload for launches that do not have to wait for other events: forwards to the
+// overload above with an empty waiting list.
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event) {
+  // A named vector is needed here: the other overload takes the list by non-const reference
+  std::vector<Event> no_dependencies{};
+  return RunKernel(kernel, queue, device, global, local, event, no_dependencies);
+}
+
+// =================================================================================================
+} // namespace clblast
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@ -0,0 +1,173 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the interfaces to common kernels, such as copying, padding, and
+// transposing a matrix. These functions are templated and thus header-only. This file also contains
+// other common functions to routines, such as a function to launch a kernel.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_COMMON_H_
+#define CLBLAST_ROUTINES_COMMON_H_
+
+#include <string>
+#include <vector>
+
+#include "clblast.h"
+#include "clpp11.hpp"
+#include "database/database.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Checks the local thread sizes and the kernel's local memory usage against the limits of
+// 'device' and then launches 'kernel' on 'queue' (implemented in common.cpp). Returns kSuccess,
+// or the status code of the first check that failed, or kKernelLaunchError if the launch threw.
+// 'global' is passed by value: the implementation may enlarge it to be at least as large as
+// 'local'. 'event' and 'waitForEvents' are handed to the kernel launch.
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event, std::vector<Event>& waitForEvents);
+
+// As above, but without an event waiting list (an empty list is used instead)
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event);
+
+// =================================================================================================
+
+// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
+// to write to symmetric and triangular matrices through optional arguments.
+//
+// Selects one of six kernels from 'program' based on the arguments:
+//   do_transpose:   TransposeMatrixFast, TransposePadMatrix (do_pad) or TransposeMatrix
+//   otherwise:      CopyMatrixFast, CopyPadMatrix (do_pad) or CopyMatrix
+// The "fast" variants are only chosen when both offsets are zero, no conjugation or
+// upper/lower/diagonal handling is requested, source and destination have identical sizes and
+// leading dimensions, and the sizes are multiples of the relevant tuning parameters in 'db'.
+//
+// Arguments:
+//   queue, device, context   OpenCL objects used to upload 'alpha' and to launch the kernel
+//   db                       tuning parameters (TRA_*, PADTRA_*, COPY_*, PAD_*)
+//   event, waitForEvents     forwarded to RunKernel
+//   src_* / dest_*           two sizes, leading dimension, offset and buffer of each matrix
+//   alpha                    scalar, uploaded to the device as a one-element buffer
+//   do_conjugate             only passed to the padding kernels (do_pad == true)
+//   upper, lower,
+//   diagonal_imag_zero       only passed to the non-padding, non-fast kernels
+//
+// Returns the status of RunKernel, or kInvalidKernel if creating the kernel, setting its arguments
+// or preparing the launch throws an exception.
+template <typename T>
+StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
+                                  const Database &db,
+                                  EventPointer event, std::vector<Event>& waitForEvents,
+                                  const size_t src_one, const size_t src_two,
+                                  const size_t src_ld, const size_t src_offset,
+                                  const Buffer<T> &src,
+                                  const size_t dest_one, const size_t dest_two,
+                                  const size_t dest_ld, const size_t dest_offset,
+                                  const Buffer<T> &dest,
+                                  const T alpha,
+                                  const Program &program, const bool do_pad,
+                                  const bool do_transpose, const bool do_conjugate,
+                                  const bool upper = false, const bool lower = false,
+                                  const bool diagonal_imag_zero = false) {
+
+  // Determines whether or not the fast-version could potentially be used
+  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
+
+  // Determines the right kernel
+  auto kernel_name = std::string{};
+  if (do_transpose) {
+    // The fast transpose kernel is launched below with (size / TRA_WPT) global threads in
+    // work-groups of TRA_DIM threads per dimension. Both sizes therefore have to be multiples of
+    // TRA_WPT*TRA_DIM for the global size to be a whole number of work-groups.
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["TRA_WPT"]) &&
+        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
+        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
+      kernel_name = "TransposeMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+    }
+  }
+  else {
+    // Same reasoning for the fast copy kernel: (size / COPY_VW) by (size / COPY_WPT) global
+    // threads in work-groups of COPY_DIMX by COPY_DIMY
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["COPY_VW"]) &&
+        IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
+        IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) {
+      kernel_name = "CopyMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+    }
+  }
+
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision).
+  // This happens outside of the try-block below: an exception here propagates to the caller.
+  auto alpha_buffer = Buffer<T>(context, 1);
+  alpha_buffer.Write(queue, 1, &alpha);
+
+  // Retrieves the kernel from the compiled binary
+  try {
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments. The fast kernels only take the leading dimension and the
+    // buffers; the general kernels take the full description of both matrices.
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(src_ld));
+      kernel.SetArgument(1, src());
+      kernel.SetArgument(2, dest());
+      kernel.SetArgument(3, alpha_buffer());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(src_one));
+      kernel.SetArgument(1, static_cast<int>(src_two));
+      kernel.SetArgument(2, static_cast<int>(src_ld));
+      kernel.SetArgument(3, static_cast<int>(src_offset));
+      kernel.SetArgument(4, src());
+      kernel.SetArgument(5, static_cast<int>(dest_one));
+      kernel.SetArgument(6, static_cast<int>(dest_two));
+      kernel.SetArgument(7, static_cast<int>(dest_ld));
+      kernel.SetArgument(8, static_cast<int>(dest_offset));
+      kernel.SetArgument(9, dest());
+      kernel.SetArgument(10, alpha_buffer());
+      // The trailing arguments differ: the padding kernels take the conjugation flag, the
+      // non-padding kernels take the triangular/diagonal flags instead
+      if (do_pad) {
+        kernel.SetArgument(11, static_cast<int>(do_conjugate));
+      }
+      else {
+        kernel.SetArgument(11, static_cast<int>(upper));
+        kernel.SetArgument(12, static_cast<int>(lower));
+        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
+      }
+    }
+
+    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
+    // parameters in the database.
+    if (do_transpose) {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{
+          dest_one / db["TRA_WPT"],
+          dest_two / db["TRA_WPT"]
+        };
+        const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+          Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+        };
+        const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+    else {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{
+          dest_one / db["COPY_VW"],
+          dest_two / db["COPY_WPT"]
+        };
+        const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+          Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+        };
+        const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_COMMON_H_
+#endif
--- a/src/routines/level1/xamax.cpp
+++ b/src/routines/level1/xamax.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xamax.h"
+#include "routines/level1/xamax.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xamax<half>::precision_ = Precision::kHalf;
-template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
-template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
-template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xamax<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xdot"}, precision_) {
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level1/xamax.opencl"
  ;
@ -49,14 +40,14 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorIndex(1, imax_buffer, imax_offset, sizeof(unsigned int));
+  status = TestVectorIndex(1, imax_buffer, imax_offset);
  if (ErrorIn(status)) { return status; }

  // Retrieves the Xamax kernels from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel1 = Kernel(program, "Xamax");
    auto kernel2 = Kernel(program, "XamaxEpilogue");

@ -80,7 +71,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
    auto local1 = std::vector<size_t>{db_["WGS1"]};
    auto kernelEvent = Event();
-    status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
    if (ErrorIn(status)) { return status; }
    eventWaitList.push_back(kernelEvent);

@ -93,7 +84,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
    // Launches the epilogue kernel
    auto global2 = std::vector<size_t>{db_["WGS2"]};
    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
    if (ErrorIn(status)) { return status; }

    // Succesfully finished the computation
--- a/include/internal/routines/level1/xamax.h
+++ b/include/internal/routines/level1/xamax.h
@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XAMAX_H_
 #define CLBLAST_ROUTINES_XAMAX_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xamax: public Routine<T> {
+class Xamax: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorIndex;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");

@ -43,10 +31,6 @@ class Xamax: public Routine<T> {
  StatusCode DoAmax(const size_t n,
                    const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level1/xasum.cpp
+++ b/src/routines/level1/xasum.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xasum.h"
+#include "routines/level1/xasum.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xasum<half>::precision_ = Precision::kHalf;
-template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
-template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
-template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xasum<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xdot"}, precision_) {
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level1/xasum.opencl"
  ;
@ -49,14 +40,14 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T));
+  status = TestVectorScalar(1, asum_buffer, asum_offset);
  if (ErrorIn(status)) { return status; }

  // Retrieves the Xasum kernels from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel1 = Kernel(program, "Xasum");
    auto kernel2 = Kernel(program, "XasumEpilogue");

@ -78,7 +69,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
    auto local1 = std::vector<size_t>{db_["WGS1"]};
    auto kernelEvent = Event();
-    status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
    if (ErrorIn(status)) { return status; }
    eventWaitList.push_back(kernelEvent);

@ -90,7 +81,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
    // Launches the epilogue kernel
    auto global2 = std::vector<size_t>{db_["WGS2"]};
    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
    if (ErrorIn(status)) { return status; }

    // Succesfully finished the computation
--- a/include/internal/routines/level1/xasum.h
+++ b/include/internal/routines/level1/xasum.h
@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XASUM_H_
 #define CLBLAST_ROUTINES_XASUM_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xasum: public Routine<T> {
+class Xasum: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorDot;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");

@ -43,10 +31,6 @@ class Xasum: public Routine<T> {
  StatusCode DoAsum(const size_t n,
                    const Buffer<T> &asum_buffer, const size_t asum_offset,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xaxpy.h"
+#include "routines/level1/xaxpy.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xaxpy<half>::precision_ = Precision::kHalf;
-template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
-template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
-template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xaxpy.opencl"
@ -50,9 +41,9 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // Determines whether or not the fast-version can be used
@ -65,7 +56,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,

  // Retrieves the Xaxpy kernel from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, kernel_name);

    // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
@ -94,13 +85,13 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
    if (use_fast_kernel) {
      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
    }
    else {
      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
    }
    if (ErrorIn(status)) { return status; }

--- a/include/internal/routines/level1/xaxpy.h
+++ b/include/internal/routines/level1/xaxpy.h
@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XAXPY_H_
 #define CLBLAST_ROUTINES_XAXPY_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xaxpy: public Routine<T> {
+class Xaxpy: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");

@ -43,10 +31,6 @@ class Xaxpy: public Routine<T> {
  StatusCode DoAxpy(const size_t n, const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level1/xcopy.cpp
+++ b/src/routines/level1/xcopy.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xcopy.h"
+#include "routines/level1/xcopy.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xcopy<half>::precision_ = Precision::kHalf;
-template <> const Precision Xcopy<float>::precision_ = Precision::kSingle;
-template <> const Precision Xcopy<double>::precision_ = Precision::kDouble;
-template <> const Precision Xcopy<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xcopy.opencl"
@ -50,9 +41,9 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // Determines whether or not the fast-version can be used
@ -65,7 +56,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,

  // Retrieves the Xcopy kernel from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments
@ -88,13 +79,13 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
    if (use_fast_kernel) {
      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
    }
    else {
      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
    }
    if (ErrorIn(status)) { return status; }

--- a/include/internal/routines/level1/xcopy.h
+++ b/include/internal/routines/level1/xcopy.h
@ -14,27 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XCOPY_H_
 #define CLBLAST_ROUTINES_XCOPY_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xcopy: public Routine<T> {
+class Xcopy: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");

@ -42,10 +31,6 @@ class Xcopy: public Routine<T> {
  StatusCode DoCopy(const size_t n,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level1/xdot.cpp
+++ b/src/routines/level1/xdot.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xdot.h"
+#include "routines/level1/xdot.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xdot<half>::precision_ = Precision::kHalf;
-template <> const Precision Xdot<float>::precision_ = Precision::kSingle;
-template <> const Precision Xdot<double>::precision_ = Precision::kDouble;
-template <> const Precision Xdot<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xdot"}, precision_) {
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level1/xdot.opencl"
  ;
@ -51,16 +42,16 @@ StatusCode Xdot<T>::DoDot(const size_t n,
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorDot(1, dot_buffer, dot_offset, sizeof(T));
+  status = TestVectorScalar(1, dot_buffer, dot_offset);
  if (ErrorIn(status)) { return status; }

  // Retrieves the Xdot kernels from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel1 = Kernel(program, "Xdot");
    auto kernel2 = Kernel(program, "XdotEpilogue");

@ -86,7 +77,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
    auto local1 = std::vector<size_t>{db_["WGS1"]};
    auto kernelEvent = Event();
-    status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
    if (ErrorIn(status)) { return status; }
    eventWaitList.push_back(kernelEvent);

@ -98,7 +89,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
    // Launches the epilogue kernel
    auto global2 = std::vector<size_t>{db_["WGS2"]};
    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
    if (ErrorIn(status)) { return status; }

    // Succesfully finished the computation
--- a/include/internal/routines/level1/xdot.h
+++ b/include/internal/routines/level1/xdot.h
@ -14,29 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XDOT_H_
 #define CLBLAST_ROUTINES_XDOT_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xdot: public Routine<T> {
+class Xdot: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::TestVectorDot;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");

@ -46,10 +33,6 @@ class Xdot: public Routine<T> {
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                   const bool do_conjugate = false);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level1/xdotc.cpp
+++ b/src/routines/level1/xdotc.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xdotc.h"
+#include "routines/level1/xdotc.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level1/xdotc.h
+++ b/include/internal/routines/level1/xdotc.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XDOTC_H_
 #define CLBLAST_ROUTINES_XDOTC_H_

-#include "internal/routines/level1/xdot.h"
+#include "routines/level1/xdot.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level1/xdotu.cpp
+++ b/src/routines/level1/xdotu.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xdotu.h"
+#include "routines/level1/xdotu.hpp"

 #include <string>

--- a/include/internal/routines/level1/xdotu.h
+++ b/include/internal/routines/level1/xdotu.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XDOTU_H_
 #define CLBLAST_ROUTINES_XDOTU_H_

-#include "internal/routines/level1/xdot.h"
+#include "routines/level1/xdot.hpp"

 namespace clblast {
 // =================================================================================================
--- a/include/internal/routines/level1/xmax.h
+++ b/include/internal/routines/level1/xmax.h
@ -14,8 +14,8 @@
 #ifndef CLBLAST_ROUTINES_XMAX_H_
 #define CLBLAST_ROUTINES_XMAX_H_

-#include "internal/routine.h"
-#include "internal/routines/level1/xamax.h"
+#include "routine.hpp"
+#include "routines/level1/xamax.hpp"

 namespace clblast {
 // =================================================================================================
--- a/include/internal/routines/level1/xmin.h
+++ b/include/internal/routines/level1/xmin.h
@ -14,8 +14,8 @@
 #ifndef CLBLAST_ROUTINES_XMIN_H_
 #define CLBLAST_ROUTINES_XMIN_H_

-#include "internal/routine.h"
-#include "internal/routines/level1/xamax.h"
+#include "routine.hpp"
+#include "routines/level1/xamax.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level1/xnrm2.cpp
+++ b/src/routines/level1/xnrm2.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xnrm2.h"
+#include "routines/level1/xnrm2.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xnrm2<half>::precision_ = Precision::kHalf;
-template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
-template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
-template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xdot"}, precision_) {
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level1/xnrm2.opencl"
  ;
@ -49,14 +40,14 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorDot(1, nrm2_buffer, nrm2_offset, sizeof(T));
+  status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
  if (ErrorIn(status)) { return status; }

  // Retrieves the Xnrm2 kernels from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel1 = Kernel(program, "Xnrm2");
    auto kernel2 = Kernel(program, "Xnrm2Epilogue");

@ -78,7 +69,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
    auto local1 = std::vector<size_t>{db_["WGS1"]};
    auto kernelEvent = Event();
-    status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
    if (ErrorIn(status)) { return status; }
    eventWaitList.push_back(kernelEvent);

@ -90,7 +81,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
    // Launches the epilogue kernel
    auto global2 = std::vector<size_t>{db_["WGS2"]};
    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
--- a/include/internal/routines/level1/xnrm2.h
+++ b/include/internal/routines/level1/xnrm2.h
@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XNRM2_H_
 #define CLBLAST_ROUTINES_XNRM2_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xnrm2: public Routine<T> {
+class Xnrm2: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorDot;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");

@ -43,10 +31,6 @@ class Xnrm2: public Routine<T> {
  StatusCode DoNrm2(const size_t n,
                    const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xscal.h"
+#include "routines/level1/xscal.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xscal<half>::precision_ = Precision::kHalf;
-template <> const Precision Xscal<float>::precision_ = Precision::kSingle;
-template <> const Precision Xscal<double>::precision_ = Precision::kDouble;
-template <> const Precision Xscal<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xscal.opencl"
@ -49,7 +40,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vector for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }

  // Determines whether or not the fast-version can be used
@ -61,7 +52,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,

  // Retrieves the Xscal kernel from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments
@ -82,13 +73,13 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
    if (use_fast_kernel) {
      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
    }
    else {
      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
    }
    if (ErrorIn(status)) { return status; }

--- a/include/internal/routines/level1/xscal.h
+++ b/include/internal/routines/level1/xscal.h
@ -14,36 +14,22 @@
 #ifndef CLBLAST_ROUTINES_XSCAL_H_
 #define CLBLAST_ROUTINES_XSCAL_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xscal: public Routine<T> {
+class Xscal: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");

  // Templated-precision implementation of the routine
  StatusCode DoScal(const size_t n, const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/include/internal/routines/level1/xsum.h
+++ b/include/internal/routines/level1/xsum.h
@ -14,8 +14,8 @@
 #ifndef CLBLAST_ROUTINES_XSUM_H_
 #define CLBLAST_ROUTINES_XSUM_H_

-#include "internal/routine.h"
-#include "internal/routines/level1/xasum.h"
+#include "routine.hpp"
+#include "routines/level1/xasum.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level1/xswap.cpp
+++ b/src/routines/level1/xswap.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level1/xswap.h"
+#include "routines/level1/xswap.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xswap<half>::precision_ = Precision::kHalf;
-template <> const Precision Xswap<float>::precision_ = Precision::kSingle;
-template <> const Precision Xswap<double>::precision_ = Precision::kDouble;
-template <> const Precision Xswap<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level1/level1.opencl"
    #include "../../kernels/level1/xswap.opencl"
@ -50,9 +41,9 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // Determines whether or not the fast-version can be used
@ -65,7 +56,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,

  // Retrieves the Xswap kernel from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments
@ -88,13 +79,13 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
    if (use_fast_kernel) {
      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
    }
    else {
      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
    }
    if (ErrorIn(status)) { return status; }

--- a/include/internal/routines/level1/xswap.h
+++ b/include/internal/routines/level1/xswap.h
@ -14,27 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XSWAP_H_
 #define CLBLAST_ROUTINES_XSWAP_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xswap: public Routine<T> {
+class Xswap: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");

@ -42,10 +31,6 @@ class Xswap: public Routine<T> {
  StatusCode DoSwap(const size_t n,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level2/xgbmv.cpp
+++ b/src/routines/level2/xgbmv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xgbmv.h"
+#include "routines/level2/xgbmv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xgbmv.h
+++ b/include/internal/routines/level2/xgbmv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XGBMV_H_
 #define CLBLAST_ROUTINES_XGBMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemv<half>::precision_ = Precision::kHalf;
-template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
+    Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level2/xgemv.opencl"
    #include "../../kernels/level2/xgemv_fast.opencl"
@ -101,12 +92,12 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,

  // Tests the matrix and the vectors for validity
  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // Determines whether or not the fast-version can be used
@ -143,7 +134,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,

  // Retrieves the Xgemv kernel from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments
@ -169,7 +160,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
    // Launches the kernel
    auto global = std::vector<size_t>{global_size};
    auto local = std::vector<size_t>{local_size};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
--- a/include/internal/routines/level2/xgemv.h
+++ b/include/internal/routines/level2/xgemv.h
@ -14,30 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XGEMV_H_
 #define CLBLAST_ROUTINES_XGEMV_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xgemv: public Routine<T> {
+class Xgemv: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixAP;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");

@ -61,10 +47,6 @@ class Xgemv: public Routine<T> {
                    bool fast_kernel, bool fast_kernel_rot,
                    const size_t parameter, const bool packed,
                    const size_t kl, const size_t ku);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level2/xger.cpp
+++ b/src/routines/level2/xger.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xger<half>::precision_ = Precision::kHalf;
-template <> const Precision Xger<float>::precision_ = Precision::kSingle;
-template <> const Precision Xger<double>::precision_ = Precision::kDouble;
-template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xger.opencl"
@ -58,11 +49,11 @@ StatusCode Xger<T>::DoGer(const Layout layout,
  const auto a_two = (a_is_rowmajor) ? m : n;

  // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(m, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
@ -71,7 +62,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,

  // Retrieves the kernel from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, "Xger");

    // Sets the kernel arguments
@ -94,7 +85,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
    auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
    auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
--- a/include/internal/routines/level2/xger.h
+++ b/include/internal/routines/level2/xger.h
@ -14,29 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XGER_H_
 #define CLBLAST_ROUTINES_XGER_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xger: public Routine<T> {
+class Xger: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xger(Queue &queue, EventPointer event, const std::string &name = "GER");

@ -47,10 +34,6 @@ class Xger: public Routine<T> {
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level2/xgerc.cpp
+++ b/src/routines/level2/xgerc.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xgerc.h"
+#include "routines/level2/xgerc.hpp"

 #include <string>

--- a/include/internal/routines/level2/xgerc.h
+++ b/include/internal/routines/level2/xgerc.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XGERC_H_
 #define CLBLAST_ROUTINES_XGERC_H_

-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xgeru.cpp
+++ b/src/routines/level2/xgeru.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xgeru.h"
+#include "routines/level2/xgeru.hpp"

 #include <string>

--- a/include/internal/routines/level2/xgeru.h
+++ b/include/internal/routines/level2/xgeru.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XGERU_H_
 #define CLBLAST_ROUTINES_XGERU_H_

-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xhbmv.cpp
+++ b/src/routines/level2/xhbmv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xhbmv.h"
+#include "routines/level2/xhbmv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xhbmv.h
+++ b/include/internal/routines/level2/xhbmv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XHBMV_H_
 #define CLBLAST_ROUTINES_XHBMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xhemv.cpp
+++ b/src/routines/level2/xhemv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xhemv.h"
+#include "routines/level2/xhemv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xhemv.h
+++ b/include/internal/routines/level2/xhemv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XHEMV_H_
 #define CLBLAST_ROUTINES_XHEMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xher.cpp
+++ b/src/routines/level2/xher.cpp
@ -11,26 +11,17 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"

 #include <string>

 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher<half, half>::precision_ = Precision::kHalf;
-template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
-template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
-template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T, typename U>
 Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xher.opencl"
@ -67,10 +58,10 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,

  // Tests the matrix and the vectors for validity
  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }

  // If alpha is zero an update is not required
@ -85,7 +76,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,

  // Retrieves the kernel from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, "Xher");

    // Sets the kernel arguments
@ -105,7 +96,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
    auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
    auto global = std::vector<size_t>{global_one, global_two};
    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
    if (ErrorIn(status)) { return status; }

    // Succesfully finished the computation
--- a/include/internal/routines/level2/xher.h
+++ b/include/internal/routines/level2/xher.h
@ -14,29 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XHER_H_
 #define CLBLAST_ROUTINES_XHER_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T, typename U>
-class Xher: public Routine<T> {
+class Xher: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixAP;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xher(Queue &queue, EventPointer event, const std::string &name = "HER");

@ -50,10 +37,6 @@ class Xher: public Routine<T> {
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                   const bool packed = false);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level2/xher2.cpp
+++ b/src/routines/level2/xher2.cpp
@ -11,26 +11,17 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"

 #include <string>

 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher2<half>::precision_ = Precision::kHalf;
-template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
-template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
-template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xher2.opencl"
@ -59,12 +50,12 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,

  // Tests the matrix and the vectors for validity
  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
@ -73,7 +64,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,

  // Retrieves the kernel from the compiled binary
  try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, "Xher2");

    // Sets the kernel arguments
@ -96,7 +87,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
    auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
    auto global = std::vector<size_t>{global_one, global_two};
    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
    if (ErrorIn(status)) { return status; }

    // Succesfully finished the computation
--- a/include/internal/routines/level2/xher2.h
+++ b/include/internal/routines/level2/xher2.h
@ -14,30 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XHER2_H_
 #define CLBLAST_ROUTINES_XHER2_H_

-#include "internal/routine.h"
+#include "routine.hpp"

 namespace clblast {
 // =================================================================================================

 // See comment at top of file for a description of the class
 template <typename T>
-class Xher2: public Routine<T> {
+class Xher2: public Routine {
 public:

-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixAP;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
  // Constructor
  Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");

@ -49,10 +35,6 @@ class Xher2: public Routine<T> {
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                    const bool packed = false);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };

 // =================================================================================================
--- a/src/routines/level2/xhpmv.cpp
+++ b/src/routines/level2/xhpmv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xhpmv.h"
+#include "routines/level2/xhpmv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xhpmv.h
+++ b/include/internal/routines/level2/xhpmv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XHPMV_H_
 #define CLBLAST_ROUTINES_XHPMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xhpr.cpp
+++ b/src/routines/level2/xhpr.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xhpr.h"
+#include "routines/level2/xhpr.hpp"

 #include <string>

--- a/include/internal/routines/level2/xhpr.h
+++ b/include/internal/routines/level2/xhpr.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XHPR_H_
 #define CLBLAST_ROUTINES_XHPR_H_

-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xhpr2.cpp
+++ b/src/routines/level2/xhpr2.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xhpr2.h"
+#include "routines/level2/xhpr2.hpp"

 #include <string>

--- a/include/internal/routines/level2/xhpr2.h
+++ b/include/internal/routines/level2/xhpr2.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XHPR2_H_
 #define CLBLAST_ROUTINES_XHPR2_H_

-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xsbmv.cpp
+++ b/src/routines/level2/xsbmv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xsbmv.h"
+#include "routines/level2/xsbmv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xsbmv.h
+++ b/include/internal/routines/level2/xsbmv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XSBMV_H_
 #define CLBLAST_ROUTINES_XSBMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xspmv.cpp
+++ b/src/routines/level2/xspmv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xspmv.h"
+#include "routines/level2/xspmv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xspmv.h
+++ b/include/internal/routines/level2/xspmv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XSPMV_H_
 #define CLBLAST_ROUTINES_XSPMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xspr.cpp
+++ b/src/routines/level2/xspr.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xspr.h"
+#include "routines/level2/xspr.hpp"

 #include <string>

--- a/include/internal/routines/level2/xspr.h
+++ b/include/internal/routines/level2/xspr.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XSPR_H_
 #define CLBLAST_ROUTINES_XSPR_H_

-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xspr2.cpp
+++ b/src/routines/level2/xspr2.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xspr2.h"
+#include "routines/level2/xspr2.hpp"

 #include <string>

--- a/include/internal/routines/level2/xspr2.h
+++ b/include/internal/routines/level2/xspr2.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XSPR2_H_
 #define CLBLAST_ROUTINES_XSPR2_H_

-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xsymv.cpp
+++ b/src/routines/level2/xsymv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xsymv.h"
+#include "routines/level2/xsymv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xsymv.h
+++ b/include/internal/routines/level2/xsymv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XSYMV_H_
 #define CLBLAST_ROUTINES_XSYMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xsyr.cpp
+++ b/src/routines/level2/xsyr.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xsyr.h"
+#include "routines/level2/xsyr.hpp"

 #include <string>

--- a/include/internal/routines/level2/xsyr.h
+++ b/include/internal/routines/level2/xsyr.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XSYR_H_
 #define CLBLAST_ROUTINES_XSYR_H_

-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xsyr2.cpp
+++ b/src/routines/level2/xsyr2.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xsyr2.h"
+#include "routines/level2/xsyr2.hpp"

 #include <string>

--- a/include/internal/routines/level2/xsyr2.h
+++ b/include/internal/routines/level2/xsyr2.h
@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XSYR2_H_
 #define CLBLAST_ROUTINES_XSYR2_H_

-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"

 namespace clblast {
 // =================================================================================================
--- a/src/routines/level2/xtbmv.cpp
+++ b/src/routines/level2/xtbmv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xtbmv.h"
+#include "routines/level2/xtbmv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xtbmv.h
+++ b/include/internal/routines/level2/xtbmv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XTBMV_H_
 #define CLBLAST_ROUTINES_XTBMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
@ -25,12 +25,10 @@ namespace clblast {
 template <typename T>
 class Xtbmv: public Xgemv<T> {
 public:
-  
-  // Members from the base class
-  using Routine<T>::queue_;
-  using Routine<T>::context_;

  // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
  using Xgemv<T>::MatVec;

  // Constructor
--- a/src/routines/level2/xtpmv.cpp
+++ b/src/routines/level2/xtpmv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xtpmv.h"
+#include "routines/level2/xtpmv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xtpmv.h
+++ b/include/internal/routines/level2/xtpmv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XTPMV_H_
 #define CLBLAST_ROUTINES_XTPMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
@ -25,12 +25,10 @@ namespace clblast {
 template <typename T>
 class Xtpmv: public Xgemv<T> {
 public:
-  
-  // Members from the base class
-  using Routine<T>::queue_;
-  using Routine<T>::context_;

  // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
  using Xgemv<T>::MatVec;

  // Constructor
--- a/src/routines/level2/xtrmv.cpp
+++ b/src/routines/level2/xtrmv.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level2/xtrmv.h"
+#include "routines/level2/xtrmv.hpp"

 #include <string>
 #include <vector>
--- a/include/internal/routines/level2/xtrmv.h
+++ b/include/internal/routines/level2/xtrmv.h
@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XTRMV_H_
 #define CLBLAST_ROUTINES_XTRMV_H_

-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"

 namespace clblast {
 // =================================================================================================
@ -25,12 +25,10 @@ namespace clblast {
 template <typename T>
 class Xtrmv: public Xgemv<T> {
 public:
-  
-  // Members from the base class
-  using Routine<T>::queue_;
-  using Routine<T>::context_;

  // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
  using Xgemv<T>::MatVec;

  // Constructor
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@ -11,7 +11,7 @@
 //
 // =================================================================================================

-#include "internal/routines/level3/xgemm.h"
+#include "routines/level3/xgemm.hpp"

 #include <string>
 #include <vector>
@ -19,19 +19,10 @@
 namespace clblast {
 // =================================================================================================

-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemm<half>::precision_ = Precision::kHalf;
-template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
+    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
  source_string_ =
    #include "../../kernels/level3/level3.opencl"
    #include "../../kernels/level3/copy_fast.opencl"
@ -96,11 +87,11 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
  //    matrix A cannot be less than K when rotated, or less than M when not-rotated
  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N when rotated, or less than M when not-rotated
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
+  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
  if (ErrorIn(status)) { return status; }

  // Calculates the ceiled versions of m, n, and k
@ -112,7 +103,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
  try {

    // Loads the program from the database
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);

    // Determines whether or not temporary matrices are needed
    auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
@ -142,7 +133,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
    // case nothing has to be done, these kernels can be skipped.
    if (!a_no_temp) {
      auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
                                      a_one, a_two, a_ld, a_offset, a_buffer,
                                      m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
                                      ConstantOne<T>(), program,
@ -154,7 +145,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
    // As above, but now for matrix B
    if (!b_no_temp) {
      auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
                                      b_one, b_two, b_ld, b_offset, b_buffer,
                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
                                      ConstantOne<T>(), program,
@ -166,7 +157,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
    // As above, but now for matrix C. This is only necessary if C is used both as input and output.
    if (!c_no_temp && beta != static_cast<T>(0)) {
      auto eventProcessC = Event();
-      status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
                                      c_one, c_two, c_ld, c_offset, c_buffer,
                                      m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
                                      ConstantOne<T>(), program,
@ -199,13 +190,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
      // Launches the kernel
      auto eventKernel = Event();
      auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
-      status = RunKernel(kernel, global, local, eventPointer, eventWaitList);
+      status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
      if (ErrorIn(status)) { return status; }

      // Runs the post-processing kernel if needed
      if (!c_no_temp) {
        eventWaitList.push_back(eventKernel);
-        status = PadCopyTransposeMatrix(event_, eventWaitList,
+        status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
                                        m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
                                        c_one, c_two, c_ld, c_offset, c_buffer,
                                        ConstantOne<T>(), program,
--- a/Show more
+++ b/Show more