treewide: use C++ exceptions properly

Since the codebase is designed around proper C++ idioms such as RAII, it makes sense to use only C++ exceptions internally instead of mixing exceptions and error codes. The exceptions are now caught at the top level (the public API entry points) and converted back to status codes, preserving compatibility with the existing error-code-based API. Note that in the C++ interface we deliberately catch neither C++ runtime errors (such as `std::bad_alloc`) nor logic errors (i.e. failed assertions), because no meaningful handling is possible for such errors. In the C interface, however, we do catch _all_ exceptions (`catch (...)`) and convert them into a catch-all error code (`kUnexpectedError`).
2016-10-22 05:14:19 +03:00 · 2016-10-22 05:14:19 +03:00 · b98af44fcf
parent 5d03d48f7a
commit b98af44fcf
105 changed files with 4285 additions and 3987 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -169,6 +169,7 @@ set(SOURCES
  src/routines/common.cpp
  src/cache.cpp
  src/clblast.cpp
+  src/clblast_exceptions.cpp
  src/clblast_c.cpp
  src/routine.cpp
  src/utilities.cpp
--- a/include/clblast.h
+++ b/include/clblast.h
@ -75,13 +75,14 @@ enum class StatusCode {
  kInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small

  // Custom additional status codes for CLBlast
-  kKernelLaunchError         = -2048, // Problem occurred when enqueuing the kernel
-  kKernelRunError            = -2047, // Problem occurred while running the kernel
  kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
  kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
  kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
  kInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
  kInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
+  kDatabaseError             = -2041, // Entry for the device was not found in the database
+  kUnknownError              = -2040, // A catch-all error code representing an unspecified error
+  kUnexpectedError           = -2039, // A catch-all error code representing an unexpected exception
 };

 // Matrix layout and transpose types
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@ -76,13 +76,14 @@ typedef enum StatusCode_ {
  kInsufficientMemoryY       = -1007, // Vector Y's OpenCL buffer is too small

  // Custom additional status codes for CLBlast
-  kKernelLaunchError         = -2048, // Problem occurred when enqueuing the kernel
-  kKernelRunError            = -2047, // Problem occurred while running the kernel
  kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
  kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
  kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
  kInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
  kInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
+  kDatabaseError             = -2041, // Entry for the device was not found in the database
+  kUnknownError              = -2040, // A catch-all error code representing an unspecified error
+  kUnexpectedError           = -2039, // A catch-all error code representing an unexpected exception
 } StatusCode;

 // Matrix layout and transpose types
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@ -30,8 +30,8 @@ from generator.routine import Routine
 from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU


-HEADER_LINES = [96, 73, 97, 22, 29, 41]
-FOOTER_LINES = [17, 75, 19, 14, 6, 6]
+HEADER_LINES = [97, 73, 98, 22, 29, 41]
+FOOTER_LINES = [17, 80, 19, 18, 6, 6]

 # Different possibilities for requirements
 ald_m = "The value of `a_ld` must be at least `m`."
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@ -45,17 +45,19 @@ def clblast_h(routine):

 def clblast_cc(routine):
    """The C++ API implementation (.cpp)"""
-    indent1 = " " * (20 + routine.length())
+    indent1 = " " * (15 + routine.length())
    result = NL + "// " + routine.description + ": " + routine.short_names() + NL
    if routine.implemented:
        result += routine.routine_header_cpp(12, "") + " {" + NL
-        result += "  auto queue_cpp = Queue(*queue);" + NL
-        result += "  auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
-        result += "  auto status = routine.SetUp();" + NL
-        result += "  if (status != StatusCode::kSuccess) { return status; }" + NL
-        result += "  return routine.Do" + routine.name.capitalize() + "("
+        result += "  try {" + NL
+        result += "    auto queue_cpp = Queue(*queue);" + NL
+        result += "    auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
+        result += "    routine.SetUp();" + NL
+        result += "    routine.Do" + routine.name.capitalize() + "("
        result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
        result += ");" + NL
+        result += "    return StatusCode::kSuccess;" + NL
+        result += "  } catch (...) { return DispatchException(); }" + NL
    else:
        result += routine.routine_header_type_cpp(12) + " {" + NL
        result += "  return StatusCode::kNotImplemented;" + NL
@ -81,12 +83,14 @@ def clblast_c_cc(routine):
    result = NL + "// " + routine.name.upper() + NL
    for flavour in routine.flavours:
        template = "<" + flavour.template + ">" if routine.no_scalars() else ""
-        indent = " " * (26 + routine.length() + len(template))
+        indent = " " * (45 + routine.length() + len(template))
        result += routine.routine_header_c(flavour, 20, "") + " {" + NL
-        result += "  auto status = clblast::" + routine.name.capitalize() + template + "("
+        result += "  try {" + NL
+        result += "    return static_cast<StatusCode>(clblast::" + routine.name.capitalize() + template + "("
        result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
-        result += "," + NL + indent + "queue, event);"
-        result += NL + "  return static_cast<StatusCode>(status);" + NL + "}" + NL
+        result += "," + NL + indent + "queue, event));" + NL
+        result += "  } catch (...) { return static_cast<StatusCode>(clblast::DispatchExceptionForC()); }" + NL
+        result += "}" + NL
    return result


--- a/src/buffer_test.hpp
+++ b/src/buffer_test.hpp
@ -22,96 +22,88 @@ namespace clblast {

 // Tests matrix 'A' for validity
 template <typename T>
-StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
+void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
+  if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); }
  try {
    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
+    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); }
+  } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); }
 }

 // Tests matrix 'B' for validity
 template <typename T>
-StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
+void TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
+  if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); }
  try {
    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; }
-  } catch (...) { return StatusCode::kInvalidMatrixB; }
-  return StatusCode::kSuccess;
+    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryB); }
+  } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixB, e.what()); }
 }

 // Tests matrix 'C' for validity
 template <typename T>
-StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
+void TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
+  if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimC); }
  try {
    const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; }
-  } catch (...) { return StatusCode::kInvalidMatrixC; }
-  return StatusCode::kSuccess;
+    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryC); }
+  } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixC, e.what()); }
 }

 // Tests matrix 'AP' for validity
 template <typename T>
-StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+void TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  try {
    const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
+    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); }
+  } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); }
 }

 // =================================================================================================

 // Tests vector 'X' for validity
 template <typename T>
-StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
+void TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
                       const size_t inc) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
+  if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementX); }
  try {
    const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; }
-  } catch (...) { return StatusCode::kInvalidVectorX; }
-  return StatusCode::kSuccess;
+    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryX); }
+  } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorX, e.what()); }
 }

 // Tests vector 'Y' for validity
 template <typename T>
-StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
+void TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
                       const size_t inc) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
+  if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementY); }
  try {
    const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; }
-  } catch (...) { return StatusCode::kInvalidVectorY; }
-  return StatusCode::kSuccess;
+    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryY); }
+  } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorY, e.what()); }
 }

 // =================================================================================================

 // Tests vector 'scalar' for validity
 template <typename T>
-StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+void TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  try {
    const auto required_size = (n + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
-  } catch (...) { return StatusCode::kInvalidVectorScalar; }
-  return StatusCode::kSuccess;
+    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); }
+  } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); }
 }

 // Tests vector 'index' for validity
 template <typename T>
-StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+void TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  try {
    const auto required_size = (n + offset) * sizeof(T);
-    if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
-  } catch (...) { return StatusCode::kInvalidVectorScalar; }
-  return StatusCode::kSuccess;
+    if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); }
+  } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); }
 }

 // =================================================================================================
--- a/src/cache.cpp
+++ b/src/cache.cpp
@ -57,7 +57,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
    }
  }
  binary_cache_mutex_.unlock();
-  throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
+  throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
 }

 // Queries the cache and retrieves a matching program. Assumes that the match is available, throws
@ -75,7 +75,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec
    }
  }
  program_cache_mutex_.unlock();
-  throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
+  throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
 }

 // Queries the cache to see whether or not the compiled kernel is already there
@ -109,14 +109,13 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
 // =================================================================================================

 // Clears the cache of stored binaries and programs
-StatusCode CacheClearAll() {
+void CacheClearAll() {
  binary_cache_mutex_.lock();
  binary_cache_.clear();
  binary_cache_mutex_.unlock();
  program_cache_mutex_.lock();
  program_cache_.clear();
  program_cache_mutex_.unlock();
-  return StatusCode::kSuccess;
 }

 // =================================================================================================
--- a/src/cache.hpp
+++ b/src/cache.hpp
@ -89,7 +89,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
 // =================================================================================================

 // Clears the cache of stored binaries and programs
-StatusCode CacheClearAll();
+void CacheClearAll();

 // =================================================================================================
 } // namespace clblast
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
--- a/src/clblast_exceptions.cpp
+++ b/src/clblast_exceptions.cpp
@ -0,0 +1,95 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Ivan Shapovalov <intelfx@intelfx.name>
+//
+// This file implements the exception hierarchy for CLBlast. It contains classes for exceptions
+// generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS
+// errors).
+//
+// =================================================================================================
+
+#include "clblast_exceptions.hpp"
+
+namespace {
+// =================================================================================================
+
+std::string MakeReason(const std::string &reason, const std::string &subreason) {
+  std::string r = reason;
+  if (!subreason.empty()) {
+    r += " (" + subreason + ")";
+  }
+  return r;
+}
+
+} // anonymous namespace
+
+namespace clblast {
+// =================================================================================================
+
+BLASError::BLASError(StatusCode status, const std::string &subreason):
+    ErrorCode(status,
+              subreason,
+              "BLAS error: " + MakeReason(std::to_string(static_cast<int>(status)), subreason)) {
+}
+
+RuntimeErrorCode::RuntimeErrorCode(StatusCode status, const std::string &subreason):
+    ErrorCode(status,
+              subreason,
+              MakeReason(std::to_string(static_cast<int>(status)), subreason)) {
+}
+
+// =================================================================================================
+
+StatusCode DispatchException()
+{
+  const char *message = nullptr;
+  StatusCode status;
+
+  try {
+    throw;
+  } catch (BLASError &e) {
+    // no message is printed for invalid argument errors
+    status = e.status();
+  } catch (CLError &e) {
+    message = e.what();
+    status = static_cast<StatusCode>(e.status());
+  } catch (RuntimeErrorCode &e) {
+    message = e.what();
+    status = e.status();
+  } catch (Error<std::runtime_error> &e) {
+    message = e.what();
+    status = StatusCode::kUnknownError;
+  }
+
+  if (message) {
+    fprintf(stderr, "CLBlast: %s\n", message);
+  }
+  return status;
+}
+
+// =================================================================================================
+
+StatusCode DispatchExceptionForC()
+{
+  const char *message = nullptr;
+
+  try {
+    throw;
+  } catch (std::exception &e) {
+    message = e.what();
+  } catch (...) {
+    message = "unknown exception";
+  }
+
+  fprintf (stderr, "CLBlast (unexpected): %s\n", message);
+  return StatusCode::kUnexpectedError;
+}
+
+// =================================================================================================
+
+} // namespace clblast
--- a/src/clblast_exceptions.hpp
+++ b/src/clblast_exceptions.hpp
@ -0,0 +1,50 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Ivan Shapovalov <intelfx@intelfx.name>
+//
+// This file implements the exception hierarchy for CLBlast. It contains classes for exceptions
+// generated by different parts of CLBlast (e.g. OpenCL API calls, internal logic, semantic BLAS
+// errors).
+//
+// =================================================================================================
+
+#ifndef CLBLAST_EXCEPTIONS_H_
+#define CLBLAST_EXCEPTIONS_H_
+
+#include "clblast.h"
+#include "clpp11.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Represents a semantic error in BLAS function arguments
+class PUBLIC_API BLASError : public ErrorCode<Error<std::invalid_argument>, StatusCode> {
+ public:
+  explicit BLASError(StatusCode status, const std::string &subreason = std::string{});
+};
+// =================================================================================================
+
+// Represents a runtime error generated by internal logic
+class PUBLIC_API RuntimeErrorCode : public ErrorCode<RuntimeError, StatusCode> {
+ public:
+  explicit RuntimeErrorCode(StatusCode status, const std::string &subreason = std::string{});
+};
+
+// =================================================================================================
+
+// Handles (most of the) runtime exceptions and converts them to StatusCode
+StatusCode DispatchException();
+
+// Handles remaining exceptions and converts them to StatusCode::kUnexpectedError
+StatusCode DispatchExceptionForC();
+
+// =================================================================================================
+
+} // namespace clblast
+
+#endif // CLBLAST_EXCEPTIONS_H_
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@ -41,7 +41,6 @@
 #include <string>    // std::string
 #include <vector>    // std::vector
 #include <memory>    // std::shared_ptr
-#include <stdexcept> // std::runtime_error
 #include <numeric>   // std::accumulate
 #include <cstring>   // std::strlen

@ -52,28 +51,41 @@
  #include <CL/opencl.h>
 #endif

+// Exception classes
+#include "cxpp11_common.hpp"
+
 namespace clblast {
 // =================================================================================================

-// Error occurred in the C++11 OpenCL header (this file)
-inline void Error(const std::string &message) {
-  throw std::runtime_error("Internal OpenCL error: "+message);
-}
+// Represents a runtime error returned by an OpenCL API function
+class CLError : public ErrorCode<DeviceError, cl_int> {
+ public:
+  explicit CLError(cl_int status, const std::string &where):
+      ErrorCode(status,
+                where,
+                "OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) {
+  }
+
+  static void Check(const cl_int status, const std::string &where) {
+    if (status != CL_SUCCESS) {
+      throw CLError(status, where);
+    }
+  }
+
+  static void CheckDtor(const cl_int status, const std::string &where) {
+    if (status != CL_SUCCESS) {
+      fprintf(stderr, "CLBlast: %s (ignoring)\n", CLError(status, where).what());
+    }
+  }
+};
+
+// =================================================================================================

 // Error occurred in OpenCL
-inline void CheckError(const cl_int status) {
-  if (status != CL_SUCCESS) {
-    throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
-  }
-}
+#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))

 // Error occurred in OpenCL (no-exception version for destructors)
-inline void CheckErrorDtor(const cl_int status) {
-  if (status != CL_SUCCESS) {
-    auto message = "Internal OpenCL Error: "+std::to_string(status) + " (ignoring)";
-    fprintf(stderr, "%s\n", message.c_str());
-  }
-}
+#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))

 // =================================================================================================

@ -140,10 +152,14 @@ class Platform {
  explicit Platform(const size_t platform_id) {
    auto num_platforms = cl_uint{0};
    CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
-    if (num_platforms == 0) { Error("no platforms found"); }
+    if (num_platforms == 0) {
+      throw RuntimeError("Platform: no platforms found");
+    }
+    if (platform_id >= num_platforms) {
+      throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
+    }
    auto platforms = std::vector<cl_platform_id>(num_platforms);
    CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
-    if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
    platform_ = platforms[platform_id];
  }

@ -183,11 +199,16 @@ class Device {
  // Initialize the device. Note that this constructor can throw exceptions!
  explicit Device(const Platform &platform, const size_t device_id) {
    auto num_devices = platform.NumDevices();
-    if (num_devices == 0) { Error("no devices found"); }
+    if (num_devices == 0) {
+      throw RuntimeError("Device: no devices found");
+    }
+    if (device_id >= num_devices) {
+      throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
+    }
+
    auto devices = std::vector<cl_device_id>(num_devices);
    CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
                              devices.data(), nullptr));
-    if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
    device_ = devices[device_id];
  }

@ -315,7 +336,7 @@ class Context {
    auto status = CL_SUCCESS;
    const cl_device_id dev = device();
    *context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
-    CheckError(status);
+    CLError::Check(status, "clCreateContext");
  }

  // Accessor to the private data-member
@ -346,7 +367,7 @@ class Program {
      source_ptr_(&source_[0]) {
    auto status = CL_SUCCESS;
    *program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
-    CheckError(status);
+    CLError::Check(status, "clCreateProgramWithSource");
  }

  // Binary-based constructor with memory management
@ -361,25 +382,15 @@ class Program {
    *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
                                          reinterpret_cast<const unsigned char**>(&source_ptr_),
                                          &status1, &status2);
-    CheckError(status1);
-    CheckError(status2);
+    CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
+    CLError::Check(status2, "clCreateProgramWithBinary");
  }

  // Compiles the device program; a failed build is reported by CheckError throwing a CLError
-  BuildStatus Build(const Device &device, std::vector<std::string> &options) {
+  void Build(const Device &device, std::vector<std::string> &options) {
    auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
    const cl_device_id dev = device();
-    auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
-    if (status == CL_BUILD_PROGRAM_FAILURE) {
-      return BuildStatus::kError;
-    }
-    else if (status == CL_INVALID_BINARY) {
-      return BuildStatus::kInvalid;
-    }
-    else {
-      CheckError(status);
-      return BuildStatus::kSuccess;
-    }
+    CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
  }

  // Retrieves the warning/error message from the compiler (if any)
@ -436,15 +447,17 @@ class Queue {
      {
        cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
        *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
+        CLError::Check(status, "clCreateCommandQueueWithProperties");
      }
      else
      {
        *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+        CLError::Check(status, "clCreateCommandQueue");
      }
    #else
      *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+      CLError::Check(status, "clCreateCommandQueue");
    #endif
-    CheckError(status);
  }

  // Synchronizes the queue
@ -536,7 +549,7 @@ class Buffer {
    if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
    auto status = CL_SUCCESS;
    *buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
-    CheckError(status);
+    CLError::Check(status, "clCreateBuffer");
  }

  // As above, but now with read/write access as a default
@ -557,18 +570,24 @@ class Buffer {

  // Copies from device to host: reading the device buffer a-synchronously
  void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
-    if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
+    if (access_ == BufferAccess::kWriteOnly) {
+      throw LogicError("Buffer: reading from a write-only buffer");
+    }
    CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
                                   host, 0, nullptr, nullptr));
  }
  void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
                 const size_t offset = 0) const {
-    if (host.size() < size) { Error("target host buffer is too small"); }
+    if (host.size() < size) {
+      throw LogicError("Buffer: target host buffer is too small");
+    }
    ReadAsync(queue, size, host.data(), offset);
  }
  void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
                 const size_t offset = 0) const {
-    if (host.size() < size) { Error("target host buffer is too small"); }
+    if (host.size() < size) {
+      throw LogicError("Buffer: target host buffer is too small");
+    }
    ReadAsync(queue, size, host.data(), offset);
  }

@ -588,8 +607,12 @@ class Buffer {

  // Copies from host to device: writing the device buffer a-synchronously
  void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
-    if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
-    if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
+    if (access_ == BufferAccess::kReadOnly) {
+      throw LogicError("Buffer: writing to a read-only buffer");
+    }
+    if (GetSize() < (offset+size)*sizeof(T)) {
+      throw LogicError("Buffer: target device buffer is too small");
+    }
    CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
                                    host, 0, nullptr, nullptr));
  }
@ -658,7 +681,7 @@ class Kernel {
      kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
    auto status = CL_SUCCESS;
    *kernel_ = clCreateKernel(program(), name.c_str(), &status);
-    CheckError(status);
+    CLError::Check(status, "clCreateKernel");
  }

  // Sets a kernel argument at the indicated position
--- a/src/cxpp11_common.hpp
+++ b/src/cxpp11_common.hpp
@ -0,0 +1,87 @@
+#ifndef CLBLAST_CXPP11_COMMON_H_
+#define CLBLAST_CXPP11_COMMON_H_
+
+// C++
+#include <string>    // std::string
+#include <stdexcept> // std::runtime_error
+
+namespace clblast {
+// =================================================================================================
+
+// Basic exception class: represents an error that happened inside our code
+// (as opposed to an error in the C++ runtime)
+template <typename Base>
+class Error : public Base {
+ public:
+  using Base::Base;
+};
+
+// =================================================================================================
+
+// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
+class DeviceError : public Error<std::runtime_error> {
+ public:
+  using Error<std::runtime_error>::Error;
+
+  static std::string TrimCallString(const char *where) {
+    const char *paren = strchr(where, '(');
+    if (paren) {
+      return std::string(where, paren);
+    } else {
+      return std::string(where);
+    }
+  }
+};
+
+// =================================================================================================
+
+// Represents a generic runtime error (aka environmental problem)
+class RuntimeError : public Error<std::runtime_error> {
+ public:
+  explicit RuntimeError(const std::string &reason):
+      Error("Run-time error: " + reason) {
+  }
+};
+
+// =================================================================================================
+
+// Represents a generic logic error (aka failed assertion)
+class LogicError : public Error<std::logic_error> {
+ public:
+  explicit LogicError(const std::string &reason):
+      Error("Internal logic error: " + reason) {
+  }
+};
+
+// =================================================================================================
+
+// Internal exception base class with a status field and a subclass-specific "details" field
+// which can be used to recreate an exception
+template <typename Base, typename Status>
+class ErrorCode : public Base {
+ public:
+  ErrorCode(Status status, const std::string &details, const std::string &reason):
+      Base(reason),
+      status_(status),
+      details_(details) {
+  }
+
+  Status status() const {
+    return status_;
+  }
+
+  const std::string& details() const {
+    return details_;
+  }
+
+ private:
+  const Status status_;
+  const std::string details_;
+};
+
+// =================================================================================================
+
+} // namespace clblast
+
+// CLBLAST_CXPP11_COMMON_H_
+#endif
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@ -92,7 +92,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
      }
    }

-    if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
+    if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
  }
 }

--- a/src/routine.cpp
+++ b/src/routine.cpp
@ -38,10 +38,10 @ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
 // =================================================================================================

 // Separate set-up function to allow for status codes to be returned
-StatusCode Routine::SetUp() {
+void Routine::SetUp() {

   // Queries the cache to see whether or not the program (context-specific) is already there
-  if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
+  if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }

   // Sets the build options from an environmental variable (if set)
   auto options = std::vector<std::string>();
@ -53,13 +53,11 @ StatusCode Routine::SetUp() {
   // Queries the cache to see whether or not the binary (device-specific) is already there. If it
   // is, a program is created and stored in the cache
   if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
-    try {
-      auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
-      auto program = Program(device_, context_, binary);
-      program.Build(device_, options);
-      StoreProgramToCache(program, context_, precision_, routine_name_);
-    } catch (...) { return StatusCode::kBuildProgramFailure; }
-    return StatusCode::kSuccess;
+    auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
+    auto program = Program(device_, context_, binary);
+    program.Build(device_, options);
+    StoreProgramToCache(program, context_, precision_, routine_name_);
+    return; // The program is now in the cache: skip the compilation from source below
   }

   // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
@ -69,14 +67,14 @ StatusCode Routine::SetUp() {
   const auto extensions = device_.Capabilities();
   if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
     if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
-      return StatusCode::kNoDoublePrecision;
+      throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
     }
   }

   // As above, but for cl_khr_fp16 (half precision)
   if (precision_ == Precision::kHalf) {
     if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
-      return StatusCode::kNoHalfPrecision;
+      throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
     }
   }

@ -120,23 +118,21 @ StatusCode Routine::SetUp() {
   #endif

   // Compiles the kernel
+  auto program = Program(context_, source_string);
   try {
-    auto program = Program(context_, source_string);
-    const auto build_status = program.Build(device_, options);
-
-    // Checks for compiler crashes/errors/warnings
-    if (build_status == BuildStatus::kError) {
-      const auto message = program.GetBuildInfo(device_);
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
-      return StatusCode::kBuildProgramFailure;
+    program.Build(device_, options);
+  } catch (const CLError &e) {
+    if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
+              program.GetBuildInfo(device_).c_str());
     }
-    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
+    throw;
+  }

-    // Store the compiled binary and program in the cache
-    const auto binary = program.GetIR();
-    StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
-    StoreProgramToCache(program, context_, precision_, routine_name_);
-  } catch (...) { return StatusCode::kBuildProgramFailure; }
+  // Store the compiled binary and program in the cache
+  const auto binary = program.GetIR();
+  StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
+  StoreProgramToCache(program, context_, precision_, routine_name_);

   // Prints the elapsed compilation time in case of debugging in verbose mode
   #ifdef VERBOSE
@ -144,9 +140,6 @ StatusCode Routine::SetUp() {
     const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
     printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
   #endif
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
 }

 // =================================================================================================
--- a/src/routine.hpp
+++ b/src/routine.hpp
@ -39,7 +39,7 @@ class Routine {
                   const std::vector<const Database::DatabaseEntry*> &userDatabase = {});

  // Set-up phase of the kernel
-  StatusCode SetUp();
+  void SetUp();

 protected:

--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@ -20,22 +20,26 @@ namespace clblast {
 // =================================================================================================

 // Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
-                     std::vector<size_t> global, const std::vector<size_t> &local,
-                     EventPointer event, const std::vector<Event> &waitForEvents) {
+void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+               std::vector<size_t> global, const std::vector<size_t> &local,
+               EventPointer event, const std::vector<Event> &waitForEvents) {

  if (!local.empty()) {
    // Tests for validity of the local thread sizes
    if (local.size() > device.MaxWorkItemDimensions()) {
-      return StatusCode::kInvalidLocalNumDimensions;
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
    }
    const auto max_work_item_sizes = device.MaxWorkItemSizes();
    for (auto i=size_t{0}; i<local.size(); ++i) {
-      if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+      if (local[i] > max_work_item_sizes[i]) {
+        throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
+      }
    }
    auto local_size = size_t{1};
    for (auto &item: local) { local_size *= item; }
-    if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
+    if (local_size > device.MaxWorkGroupSize()) {
+      throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
+    }

    // Make sure the global thread sizes are at least equal to the local sizes
    for (auto i=size_t{0}; i<global.size(); ++i) {
@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,

  // Tests for local memory usage
  const auto local_mem_usage = kernel.LocalMemUsage(device);
-  if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
+  if (!device.IsLocalMemoryValid(local_mem_usage)) {
+    throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
+  }

  // Prints the name of the kernel to launch in case of debugging in verbose mode
  #ifdef VERBOSE
@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
  #endif

  // Launches the kernel (and checks for launch errors)
-  try {
-    kernel.Launch(queue, global, local, event, waitForEvents);
-  } catch (...) { return StatusCode::kKernelLaunchError; }
+  kernel.Launch(queue, global, local, event, waitForEvents);

  // Prints the elapsed execution time in case of debugging in verbose mode
  #ifdef VERBOSE
@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
    printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
  #endif
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
 }

 // =================================================================================================
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@ -27,29 +27,29 @@ namespace clblast {
 // =================================================================================================

 // Enqueues a kernel, waits for completion, and checks for errors
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
-                     std::vector<size_t> global, const std::vector<size_t> &local,
-                     EventPointer event, const std::vector<Event> &waitForEvents = {});
+void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+               std::vector<size_t> global, const std::vector<size_t> &local,
+               EventPointer event, const std::vector<Event> &waitForEvents = {});

 // =================================================================================================

 // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
 // to write to symmetric and triangular matrices through optional arguments.
 template <typename T>
-StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
-                                  const Database &db,
-                                  EventPointer event, const std::vector<Event> &waitForEvents,
-                                  const size_t src_one, const size_t src_two,
-                                  const size_t src_ld, const size_t src_offset,
-                                  const Buffer<T> &src,
-                                  const size_t dest_one, const size_t dest_two,
-                                  const size_t dest_ld, const size_t dest_offset,
-                                  const Buffer<T> &dest,
-                                  const T alpha,
-                                  const Program &program, const bool do_pad,
-                                  const bool do_transpose, const bool do_conjugate,
-                                  const bool upper = false, const bool lower = false,
-                                  const bool diagonal_imag_zero = false) {
+void PadCopyTransposeMatrix(Queue &queue, const Device &device,
+                            const Database &db,
+                            EventPointer event, const std::vector<Event> &waitForEvents,
+                            const size_t src_one, const size_t src_two,
+                            const size_t src_ld, const size_t src_offset,
+                            const Buffer<T> &src,
+                            const size_t dest_one, const size_t dest_two,
+                            const size_t dest_ld, const size_t dest_offset,
+                            const Buffer<T> &dest,
+                            const T alpha,
+                            const Program &program, const bool do_pad,
+                            const bool do_transpose, const bool do_conjugate,
+                            const bool upper = false, const bool lower = false,
+                            const bool diagonal_imag_zero = false) {

  // Determines whether or not the fast-version could potentially be used
  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
  }

  // Retrieves the kernel from the compiled binary
-  try {
-    auto kernel = Kernel(program, kernel_name);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(src_ld));
+    kernel.SetArgument(1, src());
+    kernel.SetArgument(2, dest());
+    kernel.SetArgument(3, GetRealArg(alpha));
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(src_one));
+    kernel.SetArgument(1, static_cast<int>(src_two));
+    kernel.SetArgument(2, static_cast<int>(src_ld));
+    kernel.SetArgument(3, static_cast<int>(src_offset));
+    kernel.SetArgument(4, src());
+    kernel.SetArgument(5, static_cast<int>(dest_one));
+    kernel.SetArgument(6, static_cast<int>(dest_two));
+    kernel.SetArgument(7, static_cast<int>(dest_ld));
+    kernel.SetArgument(8, static_cast<int>(dest_offset));
+    kernel.SetArgument(9, dest());
+    kernel.SetArgument(10, GetRealArg(alpha));
+    if (do_pad) {
+      kernel.SetArgument(11, static_cast<int>(do_conjugate));
+    }
+    else {
+      kernel.SetArgument(11, static_cast<int>(upper));
+      kernel.SetArgument(12, static_cast<int>(lower));
+      kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
+    }
+  }
+
+  // Launches the kernel. Uses global and local thread sizes based on parameters in the
+  // database.
+  if (do_transpose) {
    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(src_ld));
-      kernel.SetArgument(1, src());
-      kernel.SetArgument(2, dest());
-      kernel.SetArgument(3, GetRealArg(alpha));
+      const auto global = std::vector<size_t>{
+        dest_one / db["TRA_WPT"],
+        dest_two / db["TRA_WPT"]
+      };
+      const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
    else {
-      kernel.SetArgument(0, static_cast<int>(src_one));
-      kernel.SetArgument(1, static_cast<int>(src_two));
-      kernel.SetArgument(2, static_cast<int>(src_ld));
-      kernel.SetArgument(3, static_cast<int>(src_offset));
-      kernel.SetArgument(4, src());
-      kernel.SetArgument(5, static_cast<int>(dest_one));
-      kernel.SetArgument(6, static_cast<int>(dest_two));
-      kernel.SetArgument(7, static_cast<int>(dest_ld));
-      kernel.SetArgument(8, static_cast<int>(dest_offset));
-      kernel.SetArgument(9, dest());
-      kernel.SetArgument(10, GetRealArg(alpha));
-      if (do_pad) {
-        kernel.SetArgument(11, static_cast<int>(do_conjugate));
-      }
-      else {
-        kernel.SetArgument(11, static_cast<int>(upper));
-        kernel.SetArgument(12, static_cast<int>(lower));
-        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
-      }
+      const auto global = std::vector<size_t>{
+        Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+        Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+      };
+      const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
-
-    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
-    // parameters in the database.
-    if (do_transpose) {
-      if (use_fast_kernel) {
-        const auto global = std::vector<size_t>{
-          dest_one / db["TRA_WPT"],
-          dest_two / db["TRA_WPT"]
-        };
-        const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
-        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-      }
-      else {
-        const auto global = std::vector<size_t>{
-          Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
-          Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
-        };
-        const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
-        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-      }
+  }
+  else {
+    if (use_fast_kernel) {
+      const auto global = std::vector<size_t>{
+        dest_one / db["COPY_VW"],
+        dest_two / db["COPY_WPT"]
+      };
+      const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
    else {
-      if (use_fast_kernel) {
-        const auto global = std::vector<size_t>{
-          dest_one / db["COPY_VW"],
-          dest_two / db["COPY_WPT"]
-        };
-        const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
-        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-      }
-      else {
-        const auto global = std::vector<size_t>{
-          Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
-          Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
-        };
-        const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
-        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
-      }
+      const auto global = std::vector<size_t>{
+        Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+        Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+      };
+      const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+      RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xamax.cpp
+++ b/src/routines/level1/xamax.cpp
@ -32,64 +32,55 @@ Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xamax<T>::DoAmax(const size_t n,
-                            const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xamax<T>::DoAmax(const size_t n,
+                      const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorIndex(1, imax_buffer, imax_offset);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorIndex(1, imax_buffer, imax_offset);

  // Retrieves the Xamax kernels from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel1 = Kernel(program, "Xamax");
-    auto kernel2 = Kernel(program, "XamaxEpilogue");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel1 = Kernel(program, "Xamax");
+  auto kernel2 = Kernel(program, "XamaxEpilogue");

-    // Creates the buffer for intermediate values
-    auto temp_size = 2*db_["WGS2"];
-    auto temp_buffer1 = Buffer<T>(context_, temp_size);
-    auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
+  // Creates the buffer for intermediate values
+  auto temp_size = 2*db_["WGS2"];
+  auto temp_buffer1 = Buffer<T>(context_, temp_size);
+  auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);

-    // Sets the kernel arguments
-    kernel1.SetArgument(0, static_cast<int>(n));
-    kernel1.SetArgument(1, x_buffer());
-    kernel1.SetArgument(2, static_cast<int>(x_offset));
-    kernel1.SetArgument(3, static_cast<int>(x_inc));
-    kernel1.SetArgument(4, temp_buffer1());
-    kernel1.SetArgument(5, temp_buffer2());
+  // Sets the kernel arguments
+  kernel1.SetArgument(0, static_cast<int>(n));
+  kernel1.SetArgument(1, x_buffer());
+  kernel1.SetArgument(2, static_cast<int>(x_offset));
+  kernel1.SetArgument(3, static_cast<int>(x_inc));
+  kernel1.SetArgument(4, temp_buffer1());
+  kernel1.SetArgument(5, temp_buffer2());

-    // Event waiting list
-    auto eventWaitList = std::vector<Event>();
+  // Event waiting list
+  auto eventWaitList = std::vector<Event>();

-    // Launches the main kernel
-    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
-    auto local1 = std::vector<size_t>{db_["WGS1"]};
-    auto kernelEvent = Event();
-    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(kernelEvent);
+  // Launches the main kernel
+  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+  auto local1 = std::vector<size_t>{db_["WGS1"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+  eventWaitList.push_back(kernelEvent);

-    // Sets the arguments for the epilogue kernel
-    kernel2.SetArgument(0, temp_buffer1());
-    kernel2.SetArgument(1, temp_buffer2());
-    kernel2.SetArgument(2, imax_buffer());
-    kernel2.SetArgument(3, static_cast<int>(imax_offset));
+  // Sets the arguments for the epilogue kernel
+  kernel2.SetArgument(0, temp_buffer1());
+  kernel2.SetArgument(1, temp_buffer2());
+  kernel2.SetArgument(2, imax_buffer());
+  kernel2.SetArgument(3, static_cast<int>(imax_offset));

-    // Launches the epilogue kernel
-    auto global2 = std::vector<size_t>{db_["WGS2"]};
-    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the epilogue kernel
+  auto global2 = std::vector<size_t>{db_["WGS2"]};
+  auto local2 = std::vector<size_t>{db_["WGS2"]};
+  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
 }

 // =================================================================================================
--- a/src/routines/level1/xamax.hpp
+++ b/src/routines/level1/xamax.hpp
@ -28,9 +28,9 @@ class Xamax: public Routine {
  Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");

  // Templated-precision implementation of the routine
-  StatusCode DoAmax(const size_t n,
-                    const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoAmax(const size_t n,
+              const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xasum.cpp
+++ b/src/routines/level1/xasum.cpp
@ -32,61 +32,52 @@ Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xasum<T>::DoAsum(const size_t n,
-                            const Buffer<T> &asum_buffer, const size_t asum_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xasum<T>::DoAsum(const size_t n,
+                      const Buffer<T> &asum_buffer, const size_t asum_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorScalar(1, asum_buffer, asum_offset);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorScalar(1, asum_buffer, asum_offset);

  // Retrieves the Xasum kernels from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel1 = Kernel(program, "Xasum");
-    auto kernel2 = Kernel(program, "XasumEpilogue");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel1 = Kernel(program, "Xasum");
+  auto kernel2 = Kernel(program, "XasumEpilogue");

-    // Creates the buffer for intermediate values
-    auto temp_size = 2*db_["WGS2"];
-    auto temp_buffer = Buffer<T>(context_, temp_size);
+  // Creates the buffer for intermediate values
+  auto temp_size = 2*db_["WGS2"];
+  auto temp_buffer = Buffer<T>(context_, temp_size);

-    // Sets the kernel arguments
-    kernel1.SetArgument(0, static_cast<int>(n));
-    kernel1.SetArgument(1, x_buffer());
-    kernel1.SetArgument(2, static_cast<int>(x_offset));
-    kernel1.SetArgument(3, static_cast<int>(x_inc));
-    kernel1.SetArgument(4, temp_buffer());
+  // Sets the kernel arguments
+  kernel1.SetArgument(0, static_cast<int>(n));
+  kernel1.SetArgument(1, x_buffer());
+  kernel1.SetArgument(2, static_cast<int>(x_offset));
+  kernel1.SetArgument(3, static_cast<int>(x_inc));
+  kernel1.SetArgument(4, temp_buffer());

-    // Event waiting list
-    auto eventWaitList = std::vector<Event>();
+  // Event waiting list
+  auto eventWaitList = std::vector<Event>();

-    // Launches the main kernel
-    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
-    auto local1 = std::vector<size_t>{db_["WGS1"]};
-    auto kernelEvent = Event();
-    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(kernelEvent);
+  // Launches the main kernel
+  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+  auto local1 = std::vector<size_t>{db_["WGS1"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+  eventWaitList.push_back(kernelEvent);

-    // Sets the arguments for the epilogue kernel
-    kernel2.SetArgument(0, temp_buffer());
-    kernel2.SetArgument(1, asum_buffer());
-    kernel2.SetArgument(2, static_cast<int>(asum_offset));
+  // Sets the arguments for the epilogue kernel
+  kernel2.SetArgument(0, temp_buffer());
+  kernel2.SetArgument(1, asum_buffer());
+  kernel2.SetArgument(2, static_cast<int>(asum_offset));

-    // Launches the epilogue kernel
-    auto global2 = std::vector<size_t>{db_["WGS2"]};
-    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the epilogue kernel
+  auto global2 = std::vector<size_t>{db_["WGS2"]};
+  auto local2 = std::vector<size_t>{db_["WGS2"]};
+  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
 }

 // =================================================================================================
--- a/src/routines/level1/xasum.hpp
+++ b/src/routines/level1/xasum.hpp
@ -28,9 +28,9 @@ class Xasum: public Routine {
  Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");

  // Templated-precision implementation of the routine
-  StatusCode DoAsum(const size_t n,
-                    const Buffer<T> &asum_buffer, const size_t asum_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoAsum(const size_t n,
+              const Buffer<T> &asum_buffer, const size_t asum_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@ -33,18 +33,16 @@ Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,45 +53,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
  auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";

  // Retrieves the Xaxpy kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, GetRealArg(alpha));
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, y_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, GetRealArg(alpha));
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, static_cast<int>(x_offset));
-      kernel.SetArgument(4, static_cast<int>(x_inc));
-      kernel.SetArgument(5, y_buffer());
-      kernel.SetArgument(6, static_cast<int>(y_offset));
-      kernel.SetArgument(7, static_cast<int>(y_inc));
-    }
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, GetRealArg(alpha));
+    kernel.SetArgument(2, x_buffer());
+    kernel.SetArgument(3, y_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, GetRealArg(alpha));
+    kernel.SetArgument(2, x_buffer());
+    kernel.SetArgument(3, static_cast<int>(x_offset));
+    kernel.SetArgument(4, static_cast<int>(x_inc));
+    kernel.SetArgument(5, y_buffer());
+    kernel.SetArgument(6, static_cast<int>(y_offset));
+    kernel.SetArgument(7, static_cast<int>(y_inc));
+  }

-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  if (use_fast_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xaxpy.hpp
+++ b/src/routines/level1/xaxpy.hpp
@@ -28,9 +28,9 @@ class Xaxpy: public Routine {
  Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");

  // Templated-precision implementation of the routine
-  StatusCode DoAxpy(const size_t n, const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoAxpy(const size_t n, const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xcopy.cpp
+++ b/src/routines/level1/xcopy.cpp
@@ -33,18 +33,16 @@ Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xcopy<T>::DoCopy(const size_t n,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xcopy<T>::DoCopy(const size_t n,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -55,43 +53,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
  auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";

  // Retrieves the Xcopy kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, x_buffer());
-      kernel.SetArgument(2, y_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, x_buffer());
-      kernel.SetArgument(2, static_cast<int>(x_offset));
-      kernel.SetArgument(3, static_cast<int>(x_inc));
-      kernel.SetArgument(4, y_buffer());
-      kernel.SetArgument(5, static_cast<int>(y_offset));
-      kernel.SetArgument(6, static_cast<int>(y_inc));
-    }
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, x_buffer());
+    kernel.SetArgument(2, y_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, x_buffer());
+    kernel.SetArgument(2, static_cast<int>(x_offset));
+    kernel.SetArgument(3, static_cast<int>(x_inc));
+    kernel.SetArgument(4, y_buffer());
+    kernel.SetArgument(5, static_cast<int>(y_offset));
+    kernel.SetArgument(6, static_cast<int>(y_inc));
+  }

-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  if (use_fast_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xcopy.hpp
+++ b/src/routines/level1/xcopy.hpp
@@ -28,9 +28,9 @@ class Xcopy: public Routine {
  Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");

  // Templated-precision implementation of the routine
-  StatusCode DoCopy(const size_t n,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoCopy(const size_t n,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xdot.cpp
+++ b/src/routines/level1/xdot.cpp
@@ -32,69 +32,59 @@ Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xdot<T>::DoDot(const size_t n,
-                          const Buffer<T> &dot_buffer, const size_t dot_offset,
-                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                          const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                          const bool do_conjugate) {
+void Xdot<T>::DoDot(const size_t n,
+                    const Buffer<T> &dot_buffer, const size_t dot_offset,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const bool do_conjugate) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorScalar(1, dot_buffer, dot_offset);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);
+  TestVectorScalar(1, dot_buffer, dot_offset);

  // Retrieves the Xdot kernels from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel1 = Kernel(program, "Xdot");
-    auto kernel2 = Kernel(program, "XdotEpilogue");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel1 = Kernel(program, "Xdot");
+  auto kernel2 = Kernel(program, "XdotEpilogue");

-    // Creates the buffer for intermediate values
-    auto temp_size = 2*db_["WGS2"];
-    auto temp_buffer = Buffer<T>(context_, temp_size);
+  // Creates the buffer for intermediate values
+  auto temp_size = 2*db_["WGS2"];
+  auto temp_buffer = Buffer<T>(context_, temp_size);

-    // Sets the kernel arguments
-    kernel1.SetArgument(0, static_cast<int>(n));
-    kernel1.SetArgument(1, x_buffer());
-    kernel1.SetArgument(2, static_cast<int>(x_offset));
-    kernel1.SetArgument(3, static_cast<int>(x_inc));
-    kernel1.SetArgument(4, y_buffer());
-    kernel1.SetArgument(5, static_cast<int>(y_offset));
-    kernel1.SetArgument(6, static_cast<int>(y_inc));
-    kernel1.SetArgument(7, temp_buffer());
-    kernel1.SetArgument(8, static_cast<int>(do_conjugate));
+  // Sets the kernel arguments
+  kernel1.SetArgument(0, static_cast<int>(n));
+  kernel1.SetArgument(1, x_buffer());
+  kernel1.SetArgument(2, static_cast<int>(x_offset));
+  kernel1.SetArgument(3, static_cast<int>(x_inc));
+  kernel1.SetArgument(4, y_buffer());
+  kernel1.SetArgument(5, static_cast<int>(y_offset));
+  kernel1.SetArgument(6, static_cast<int>(y_inc));
+  kernel1.SetArgument(7, temp_buffer());
+  kernel1.SetArgument(8, static_cast<int>(do_conjugate));

-    // Event waiting list
-    auto eventWaitList = std::vector<Event>();
+  // Event waiting list
+  auto eventWaitList = std::vector<Event>();

-    // Launches the main kernel
-    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
-    auto local1 = std::vector<size_t>{db_["WGS1"]};
-    auto kernelEvent = Event();
-    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(kernelEvent);
+  // Launches the main kernel
+  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+  auto local1 = std::vector<size_t>{db_["WGS1"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+  eventWaitList.push_back(kernelEvent);

-    // Sets the arguments for the epilogue kernel
-    kernel2.SetArgument(0, temp_buffer());
-    kernel2.SetArgument(1, dot_buffer());
-    kernel2.SetArgument(2, static_cast<int>(dot_offset));
+  // Sets the arguments for the epilogue kernel
+  kernel2.SetArgument(0, temp_buffer());
+  kernel2.SetArgument(1, dot_buffer());
+  kernel2.SetArgument(2, static_cast<int>(dot_offset));

-    // Launches the epilogue kernel
-    auto global2 = std::vector<size_t>{db_["WGS2"]};
-    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the epilogue kernel
+  auto global2 = std::vector<size_t>{db_["WGS2"]};
+  auto local2 = std::vector<size_t>{db_["WGS2"]};
+  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
 }

 // =================================================================================================
--- a/src/routines/level1/xdot.hpp
+++ b/src/routines/level1/xdot.hpp
@@ -28,11 +28,11 @@ class Xdot: public Routine {
  Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");

  // Templated-precision implementation of the routine
-  StatusCode DoDot(const size_t n,
-                   const Buffer<T> &dot_buffer, const size_t dot_offset,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                   const bool do_conjugate = false);
+  void DoDot(const size_t n,
+             const Buffer<T> &dot_buffer, const size_t dot_offset,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+             const bool do_conjugate = false);
 };

 // =================================================================================================
--- a/src/routines/level1/xdotc.cpp
+++ b/src/routines/level1/xdotc.cpp
@@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xdotc<T>::DoDotc(const size_t n,
-                            const Buffer<T> &dot_buffer, const size_t dot_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
-  return DoDot(n, dot_buffer, dot_offset,
-               x_buffer, x_offset, x_inc,
-               y_buffer, y_offset, y_inc,
-               true);
+void Xdotc<T>::DoDotc(const size_t n,
+                      const Buffer<T> &dot_buffer, const size_t dot_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+  DoDot(n, dot_buffer, dot_offset,
+        x_buffer, x_offset, x_inc,
+        y_buffer, y_offset, y_inc,
+        true);
 }

 // =================================================================================================
--- a/src/routines/level1/xdotc.hpp
+++ b/src/routines/level1/xdotc.hpp
@@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> {
  Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");

  // Templated-precision implementation of the routine
-  StatusCode DoDotc(const size_t n,
-                    const Buffer<T> &dot_buffer, const size_t dot_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoDotc(const size_t n,
+              const Buffer<T> &dot_buffer, const size_t dot_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xdotu.cpp
+++ b/src/routines/level1/xdotu.cpp
@@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xdotu<T>::DoDotu(const size_t n,
-                            const Buffer<T> &dot_buffer, const size_t dot_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
-  return DoDot(n, dot_buffer, dot_offset,
-               x_buffer, x_offset, x_inc,
-               y_buffer, y_offset, y_inc,
-               false);
+void Xdotu<T>::DoDotu(const size_t n,
+                      const Buffer<T> &dot_buffer, const size_t dot_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+  DoDot(n, dot_buffer, dot_offset,
+        x_buffer, x_offset, x_inc,
+        y_buffer, y_offset, y_inc,
+        false);
 }

 // =================================================================================================
--- a/src/routines/level1/xdotu.hpp
+++ b/src/routines/level1/xdotu.hpp
@@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> {
  Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");

  // Templated-precision implementation of the routine
-  StatusCode DoDotu(const size_t n,
-                    const Buffer<T> &dot_buffer, const size_t dot_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoDotu(const size_t n,
+              const Buffer<T> &dot_buffer, const size_t dot_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xmax.hpp
+++ b/src/routines/level1/xmax.hpp
@@ -35,10 +35,10 @@ class Xmax: public Xamax<T> {

  // Forwards to the regular absolute version. The implementation difference is realised in the
  // kernel through a pre-processor macro based on the name of the routine.
-  StatusCode DoMax(const size_t n,
-                   const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
-    return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
+  void DoMax(const size_t n,
+             const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
  }
 };

--- a/src/routines/level1/xmin.hpp
+++ b/src/routines/level1/xmin.hpp
@@ -35,10 +35,10 @@ class Xmin: public Xamax<T> {

  // Forwards to the regular max-absolute version. The implementation difference is realised in the
  // kernel through a pre-processor macro based on the name of the routine.
-  StatusCode DoMin(const size_t n,
-                   const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
-    return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
+  void DoMin(const size_t n,
+             const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
  }
 };

--- a/src/routines/level1/xnrm2.cpp
+++ b/src/routines/level1/xnrm2.cpp
@@ -32,61 +32,52 @@ Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xnrm2<T>::DoNrm2(const size_t n,
-                            const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xnrm2<T>::DoNrm2(const size_t n,
+                      const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorScalar(1, nrm2_buffer, nrm2_offset);

  // Retrieves the Xnrm2 kernels from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel1 = Kernel(program, "Xnrm2");
-    auto kernel2 = Kernel(program, "Xnrm2Epilogue");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel1 = Kernel(program, "Xnrm2");
+  auto kernel2 = Kernel(program, "Xnrm2Epilogue");

-    // Creates the buffer for intermediate values
-    auto temp_size = 2*db_["WGS2"];
-    auto temp_buffer = Buffer<T>(context_, temp_size);
+  // Creates the buffer for intermediate values
+  auto temp_size = 2*db_["WGS2"];
+  auto temp_buffer = Buffer<T>(context_, temp_size);

-    // Sets the kernel arguments
-    kernel1.SetArgument(0, static_cast<int>(n));
-    kernel1.SetArgument(1, x_buffer());
-    kernel1.SetArgument(2, static_cast<int>(x_offset));
-    kernel1.SetArgument(3, static_cast<int>(x_inc));
-    kernel1.SetArgument(4, temp_buffer());
+  // Sets the kernel arguments
+  kernel1.SetArgument(0, static_cast<int>(n));
+  kernel1.SetArgument(1, x_buffer());
+  kernel1.SetArgument(2, static_cast<int>(x_offset));
+  kernel1.SetArgument(3, static_cast<int>(x_inc));
+  kernel1.SetArgument(4, temp_buffer());

-    // Event waiting list
-    auto eventWaitList = std::vector<Event>();
+  // Event waiting list
+  auto eventWaitList = std::vector<Event>();

-    // Launches the main kernel
-    auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
-    auto local1 = std::vector<size_t>{db_["WGS1"]};
-    auto kernelEvent = Event();
-    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(kernelEvent);
+  // Launches the main kernel
+  auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
+  auto local1 = std::vector<size_t>{db_["WGS1"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
+  eventWaitList.push_back(kernelEvent);

-    // Sets the arguments for the epilogue kernel
-    kernel2.SetArgument(0, temp_buffer());
-    kernel2.SetArgument(1, nrm2_buffer());
-    kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
+  // Sets the arguments for the epilogue kernel
+  kernel2.SetArgument(0, temp_buffer());
+  kernel2.SetArgument(1, nrm2_buffer());
+  kernel2.SetArgument(2, static_cast<int>(nrm2_offset));

-    // Launches the epilogue kernel
-    auto global2 = std::vector<size_t>{db_["WGS2"]};
-    auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the epilogue kernel
+  auto global2 = std::vector<size_t>{db_["WGS2"]};
+  auto local2 = std::vector<size_t>{db_["WGS2"]};
+  RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
 }

 // =================================================================================================
--- a/src/routines/level1/xnrm2.hpp
+++ b/src/routines/level1/xnrm2.hpp
@@ -28,9 +28,9 @@ class Xnrm2: public Routine {
  Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");

  // Templated-precision implementation of the routine
-  StatusCode DoNrm2(const size_t n,
-                    const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoNrm2(const size_t n,
+              const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xscal.cpp
+++ b/src/routines/level1/xscal.cpp
@@ -33,15 +33,14 @@ Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xscal<T>::DoScal(const size_t n, const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vector for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -51,41 +50,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
  auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";

  // Retrieves the Xscal kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
-      kernel.SetArgument(2, x_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, static_cast<int>(x_offset));
-      kernel.SetArgument(4, static_cast<int>(x_inc));
-    }
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, alpha);
+    kernel.SetArgument(2, x_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, alpha);
+    kernel.SetArgument(2, x_buffer());
+    kernel.SetArgument(3, static_cast<int>(x_offset));
+    kernel.SetArgument(4, static_cast<int>(x_inc));
+  }

-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  if (use_fast_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xscal.hpp
+++ b/src/routines/level1/xscal.hpp
@@ -28,8 +28,8 @@ class Xscal: public Routine {
  Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");

  // Templated-precision implementation of the routine
-  StatusCode DoScal(const size_t n, const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoScal(const size_t n, const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level1/xsum.hpp
+++ b/src/routines/level1/xsum.hpp
@@ -35,10 +35,10 @@ class Xsum: public Xasum<T> {

  // Forwards to the regular absolute version. The implementation difference is realised in the
  // kernel through a pre-processor macro based on the name of the routine.
-  StatusCode DoSum(const size_t n,
-                   const Buffer<T> &sum_buffer, const size_t sum_offset,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
-    return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
+  void DoSum(const size_t n,
+             const Buffer<T> &sum_buffer, const size_t sum_offset,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+    DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
  }
 };

--- a/src/routines/level1/xswap.cpp
+++ b/src/routines/level1/xswap.cpp
@@ -33,18 +33,16 @@ Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xswap<T>::DoSwap(const size_t n,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xswap<T>::DoSwap(const size_t n,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@@ -55,43 +53,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
  auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";

  // Retrieves the Xswap kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, x_buffer());
-      kernel.SetArgument(2, y_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, x_buffer());
-      kernel.SetArgument(2, static_cast<int>(x_offset));
-      kernel.SetArgument(3, static_cast<int>(x_inc));
-      kernel.SetArgument(4, y_buffer());
-      kernel.SetArgument(5, static_cast<int>(y_offset));
-      kernel.SetArgument(6, static_cast<int>(y_inc));
-    }
+  // Sets the kernel arguments
+  if (use_fast_kernel) {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, x_buffer());
+    kernel.SetArgument(2, y_buffer());
+  }
+  else {
+    kernel.SetArgument(0, static_cast<int>(n));
+    kernel.SetArgument(1, x_buffer());
+    kernel.SetArgument(2, static_cast<int>(x_offset));
+    kernel.SetArgument(3, static_cast<int>(x_inc));
+    kernel.SetArgument(4, y_buffer());
+    kernel.SetArgument(5, static_cast<int>(y_offset));
+    kernel.SetArgument(6, static_cast<int>(y_inc));
+  }

-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, queue_, device_, global, local, event_);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  if (use_fast_kernel) {
+    auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
+  else {
+    auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+    auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
 }

 // =================================================================================================
--- a/src/routines/level1/xswap.hpp
+++ b/src/routines/level1/xswap.hpp
@ -28,9 +28,9 @@ class Xswap: public Routine {
  Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");

  // Templated-precision implementation of the routine
-  StatusCode DoSwap(const size_t n,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoSwap(const size_t n,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xgbmv.cpp
+++ b/src/routines/level2/xgbmv.cpp
@ -29,13 +29,13 @@ Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
-                            const size_t m, const size_t n, const size_t kl, const size_t ku,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
+                      const size_t m, const size_t n, const size_t kl, const size_t ku,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Reverses the upper and lower band count
  auto rotated = (layout == Layout::kRowMajor);
@ -46,13 +46,13 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
  // The specific banded matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_GBMV define.
  bool fast_kernels = false;
-  return MatVec(layout, a_transpose,
-                m, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                0, false, kl_real, ku_real);
+  MatVec(layout, a_transpose,
+         m, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         0, false, kl_real, ku_real);
 }

 // =================================================================================================
--- a/src/routines/level2/xgbmv.hpp
+++ b/src/routines/level2/xgbmv.hpp
@ -33,13 +33,13 @@ class Xgbmv: public Xgemv<T> {
  Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");

  // Templated-precision implementation of the routine
-  StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
-                    const size_t m, const size_t n, const size_t kl, const size_t ku,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoGbmv(const Layout layout, const Transpose a_transpose,
+              const size_t m, const size_t n, const size_t kl, const size_t ku,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@ -33,41 +33,41 @@ Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Performs the matrix-vector multiplication
-  return MatVec(layout, a_transpose,
-                m, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                true, true,
-                0, false, 0, 0); // N/A for this routine
+  MatVec(layout, a_transpose,
+         m, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         true, true,
+         0, false, 0, 0); // N/A for this routine
 }

 // =================================================================================================

 // The generic implementation, also suited for other (non general) matrix-vector multiplications
 template <typename T>
-StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            bool fast_kernel, bool fast_kernel_rot,
-                            const size_t parameter, const bool packed,
-                            const size_t kl, const size_t ku) {
+void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      bool fast_kernel, bool fast_kernel_rot,
+                      const size_t parameter, const bool packed,
+                      const size_t kl, const size_t ku) {

  // Makes sure all dimensions are larger than zero
-  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+  if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes whether or not the matrix has an alternative layout (row or column-major).
  auto a_altlayout = (layout == Layout::kRowMajor);
@ -91,14 +91,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
  auto a_conjugate = (a_transpose == Transpose::kConjugate);

  // Tests the matrix and the vectors for validity
-  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
-  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+  else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
+  TestVectorX(n_real, x_buffer, x_offset, x_inc);
+  TestVectorY(m_real, y_buffer, y_offset, y_inc);

  // Determines whether or not the fast-version can be used
  fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
@ -127,39 +123,33 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
  }

  // Retrieves the Xgemv kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, kernel_name);
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(m_real));
-    kernel.SetArgument(1, static_cast<int>(n_real));
-    kernel.SetArgument(2, GetRealArg(alpha));
-    kernel.SetArgument(3, GetRealArg(beta));
-    kernel.SetArgument(4, static_cast<int>(a_rotated));
-    kernel.SetArgument(5, a_buffer());
-    kernel.SetArgument(6, static_cast<int>(a_offset));
-    kernel.SetArgument(7, static_cast<int>(a_ld));
-    kernel.SetArgument(8, x_buffer());
-    kernel.SetArgument(9, static_cast<int>(x_offset));
-    kernel.SetArgument(10, static_cast<int>(x_inc));
-    kernel.SetArgument(11, y_buffer());
-    kernel.SetArgument(12, static_cast<int>(y_offset));
-    kernel.SetArgument(13, static_cast<int>(y_inc));
-    kernel.SetArgument(14, static_cast<int>(a_conjugate));
-    kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
-    kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
-    kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(m_real));
+  kernel.SetArgument(1, static_cast<int>(n_real));
+  kernel.SetArgument(2, GetRealArg(alpha));
+  kernel.SetArgument(3, GetRealArg(beta));
+  kernel.SetArgument(4, static_cast<int>(a_rotated));
+  kernel.SetArgument(5, a_buffer());
+  kernel.SetArgument(6, static_cast<int>(a_offset));
+  kernel.SetArgument(7, static_cast<int>(a_ld));
+  kernel.SetArgument(8, x_buffer());
+  kernel.SetArgument(9, static_cast<int>(x_offset));
+  kernel.SetArgument(10, static_cast<int>(x_inc));
+  kernel.SetArgument(11, y_buffer());
+  kernel.SetArgument(12, static_cast<int>(y_offset));
+  kernel.SetArgument(13, static_cast<int>(y_inc));
+  kernel.SetArgument(14, static_cast<int>(a_conjugate));
+  kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
+  kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
+  kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices

-    // Launches the kernel
-    auto global = std::vector<size_t>{global_size};
-    auto local = std::vector<size_t>{local_size};
-    status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  auto global = std::vector<size_t>{global_size};
+  auto local = std::vector<size_t>{local_size};
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xgemv.hpp
+++ b/src/routines/level2/xgemv.hpp
@ -28,25 +28,25 @@ class Xgemv: public Routine {
  Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");

  // Templated-precision implementation of the routine
-  StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoGemv(const Layout layout, const Transpose a_transpose,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);

  // Generic version used also for other matrix-vector multiplications
-  StatusCode MatVec(const Layout layout, const Transpose a_transpose,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    bool fast_kernel, bool fast_kernel_rot,
-                    const size_t parameter, const bool packed,
-                    const size_t kl, const size_t ku);
+  void MatVec(const Layout layout, const Transpose a_transpose,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              bool fast_kernel, bool fast_kernel_rot,
+              const size_t parameter, const bool packed,
+              const size_t kl, const size_t ku);
 };

 // =================================================================================================
--- a/src/routines/level2/xger.cpp
+++ b/src/routines/level2/xger.cpp
@ -33,15 +33,15 @@ Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xger<T>::DoGer(const Layout layout,
-                          const size_t m, const size_t n,
-                          const T alpha,
-                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                          const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xger<T>::DoGer(const Layout layout,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Makes sure all dimensions are larger than zero
-  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+  if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes whether or not the matrix has an alternative layout (row or column-major).
  const auto a_is_rowmajor = (layout == Layout::kRowMajor);
@ -49,44 +49,35 @@ StatusCode Xger<T>::DoGer(const Layout layout,
  const auto a_two = (a_is_rowmajor) ? m : n;

  // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(m, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+  TestVectorX(m, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Retrieves the kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, "Xger");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, "Xger");

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(a_one));
-    kernel.SetArgument(1, static_cast<int>(a_two));
-    kernel.SetArgument(2, GetRealArg(alpha));
-    kernel.SetArgument(3, x_buffer());
-    kernel.SetArgument(4, static_cast<int>(x_offset));
-    kernel.SetArgument(5, static_cast<int>(x_inc));
-    kernel.SetArgument(6, y_buffer());
-    kernel.SetArgument(7, static_cast<int>(y_offset));
-    kernel.SetArgument(8, static_cast<int>(y_inc));
-    kernel.SetArgument(9, a_buffer());
-    kernel.SetArgument(10, static_cast<int>(a_offset));
-    kernel.SetArgument(11, static_cast<int>(a_ld));
-    kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(a_one));
+  kernel.SetArgument(1, static_cast<int>(a_two));
+  kernel.SetArgument(2, GetRealArg(alpha));
+  kernel.SetArgument(3, x_buffer());
+  kernel.SetArgument(4, static_cast<int>(x_offset));
+  kernel.SetArgument(5, static_cast<int>(x_inc));
+  kernel.SetArgument(6, y_buffer());
+  kernel.SetArgument(7, static_cast<int>(y_offset));
+  kernel.SetArgument(8, static_cast<int>(y_inc));
+  kernel.SetArgument(9, a_buffer());
+  kernel.SetArgument(10, static_cast<int>(a_offset));
+  kernel.SetArgument(11, static_cast<int>(a_ld));
+  kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));

-    // Launches the kernel
-    auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
-    auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
-    auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
-    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
+  auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
+  auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
+  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xger.hpp
+++ b/src/routines/level2/xger.hpp
@ -28,12 +28,12 @@ class Xger: public Routine {
  Xger(Queue &queue, EventPointer event, const std::string &name = "GER");

  // Templated-precision implementation of the routine
-  StatusCode DoGer(const Layout layout,
-                   const size_t m, const size_t n,
-                   const T alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+  void DoGer(const Layout layout,
+             const size_t m, const size_t n,
+             const T alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+             const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };

 // =================================================================================================
--- a/src/routines/level2/xgerc.cpp
+++ b/src/routines/level2/xgerc.cpp
@ -28,19 +28,19 @@ Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xgerc<T>::DoGerc(const Layout layout,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xgerc<T>::DoGerc(const Layout layout,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Regular Ger operation on complex data, plus conjugation in the kernel guarded by the
  // ROUTINE_GERC guard.
-  return DoGer(layout, m, n, alpha,
-               x_buffer, x_offset, x_inc,
-               y_buffer, y_offset, y_inc,
-               a_buffer, a_offset, a_ld);
+  DoGer(layout, m, n, alpha,
+        x_buffer, x_offset, x_inc,
+        y_buffer, y_offset, y_inc,
+        a_buffer, a_offset, a_ld);
 }

 // =================================================================================================
--- a/src/routines/level2/xgerc.hpp
+++ b/src/routines/level2/xgerc.hpp
@ -31,12 +31,12 @@ class Xgerc: public Xger<T> {
  Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");

  // Templated-precision implementation of the routine
-  StatusCode DoGerc(const Layout layout,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+  void DoGerc(const Layout layout,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };

 // =================================================================================================
--- a/src/routines/level2/xgeru.cpp
+++ b/src/routines/level2/xgeru.cpp
@ -28,18 +28,18 @@ Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xgeru<T>::DoGeru(const Layout layout,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xgeru<T>::DoGeru(const Layout layout,
+                      const size_t m, const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // Regular Ger operation on complex data
-  return DoGer(layout, m, n, alpha,
-               x_buffer, x_offset, x_inc,
-               y_buffer, y_offset, y_inc,
-               a_buffer, a_offset, a_ld);
+  DoGer(layout, m, n, alpha,
+        x_buffer, x_offset, x_inc,
+        y_buffer, y_offset, y_inc,
+        a_buffer, a_offset, a_ld);
 }

 // =================================================================================================
--- a/src/routines/level2/xgeru.hpp
+++ b/src/routines/level2/xgeru.hpp
@ -31,12 +31,12 @@ class Xgeru: public Xger<T> {
  Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");

  // Templated-precision implementation of the routine
-  StatusCode DoGeru(const Layout layout,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+  void DoGeru(const Layout layout,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };

 // =================================================================================================
--- a/src/routines/level2/xhbmv.cpp
+++ b/src/routines/level2/xhbmv.cpp
@ -29,13 +29,13 @@ Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
-                            const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
+                      const size_t n, const size_t k,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
  // The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HBMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, false, k, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, false, k, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xhbmv.hpp
+++ b/src/routines/level2/xhbmv.hpp
@ -33,13 +33,13 @@ class Xhbmv: public Xgemv<T> {
  Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");

  // Templated-precision implementation of the routine
-  StatusCode DoHbmv(const Layout layout, const Triangle triangle,
-                    const size_t n, const size_t k,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoHbmv(const Layout layout, const Triangle triangle,
+              const size_t n, const size_t k,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xhemv.cpp
+++ b/src/routines/level2/xhemv.cpp
@ -29,13 +29,13 @@ Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
  // The specific hermitian matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HEMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, false, 0, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, false, 0, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xhemv.hpp
+++ b/src/routines/level2/xhemv.hpp
@ -33,13 +33,13 @@ class Xhemv: public Xgemv<T> {
  Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");

  // Templated-precision implementation of the routine
-  StatusCode DoHemv(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoHemv(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xher.cpp
+++ b/src/routines/level2/xher.cpp
@ -41,15 +41,15 @@ template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }

 // The main routine
 template <typename T, typename U>
-StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const U alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const bool packed) {
+void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const U alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const bool packed) {

  // Makes sure the dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // The data is either in the upper or lower triangle
  const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -57,47 +57,38 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
  const auto is_rowmajor = (layout == Layout::kRowMajor);

  // Tests the matrix and the vectors for validity
-  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
+  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+  else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
+  TestVectorX(n, x_buffer, x_offset, x_inc);

  // If alpha is zero an update is not required
-  if (alpha == U{0}) { return StatusCode::kSuccess; }
+  if (alpha == U{0}) { return; }

  // Creates a matching version of alpha
  const auto matching_alpha = GetAlpha(alpha);

  // Retrieves the kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, "Xher");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, "Xher");

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, GetRealArg(matching_alpha));
-    kernel.SetArgument(2, x_buffer());
-    kernel.SetArgument(3, static_cast<int>(x_offset));
-    kernel.SetArgument(4, static_cast<int>(x_inc));
-    kernel.SetArgument(5, a_buffer());
-    kernel.SetArgument(6, static_cast<int>(a_offset));
-    kernel.SetArgument(7, static_cast<int>(a_ld));
-    kernel.SetArgument(8, static_cast<int>(is_upper));
-    kernel.SetArgument(9, static_cast<int>(is_rowmajor));
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, GetRealArg(matching_alpha));
+  kernel.SetArgument(2, x_buffer());
+  kernel.SetArgument(3, static_cast<int>(x_offset));
+  kernel.SetArgument(4, static_cast<int>(x_inc));
+  kernel.SetArgument(5, a_buffer());
+  kernel.SetArgument(6, static_cast<int>(a_offset));
+  kernel.SetArgument(7, static_cast<int>(a_ld));
+  kernel.SetArgument(8, static_cast<int>(is_upper));
+  kernel.SetArgument(9, static_cast<int>(is_rowmajor));

-    // Launches the kernel
-    auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
-    auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
-    auto global = std::vector<size_t>{global_one, global_two};
-    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
+  auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
+  auto global = std::vector<size_t>{global_one, global_two};
+  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xher.hpp
+++ b/src/routines/level2/xher.hpp
@ -31,12 +31,12 @@ class Xher: public Routine {
  T GetAlpha(const U alpha);

  // Templated-precision implementation of the routine
-  StatusCode DoHer(const Layout layout, const Triangle triangle,
-                   const size_t n,
-                   const U alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                   const bool packed = false);
+  void DoHer(const Layout layout, const Triangle triangle,
+             const size_t n,
+             const U alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+             const bool packed = false);
 };

 // =================================================================================================
--- a/src/routines/level2/xher2.cpp
+++ b/src/routines/level2/xher2.cpp
@ -32,16 +32,16 @@ Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const bool packed) {
+void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const bool packed) {

  // Makes sure the dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
+  if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }

  // The data is either in the upper or lower triangle
  const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -49,46 +49,36 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
  const auto is_rowmajor = (layout == Layout::kRowMajor);

  // Tests the matrix and the vectors for validity
-  auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc);
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc);
-  if (ErrorIn(status)) { return status; }
+  if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
+  else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
+  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorY(n, y_buffer, y_offset, y_inc);

  // Retrieves the kernel from the compiled binary
-  try {
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-    auto kernel = Kernel(program, "Xher2");
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, "Xher2");

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, GetRealArg(alpha));
-    kernel.SetArgument(2, x_buffer());
-    kernel.SetArgument(3, static_cast<int>(x_offset));
-    kernel.SetArgument(4, static_cast<int>(x_inc));
-    kernel.SetArgument(5, y_buffer());
-    kernel.SetArgument(6, static_cast<int>(y_offset));
-    kernel.SetArgument(7, static_cast<int>(y_inc));
-    kernel.SetArgument(8, a_buffer());
-    kernel.SetArgument(9, static_cast<int>(a_offset));
-    kernel.SetArgument(10, static_cast<int>(a_ld));
-    kernel.SetArgument(11, static_cast<int>(is_upper));
-    kernel.SetArgument(12, static_cast<int>(is_rowmajor));
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, GetRealArg(alpha));
+  kernel.SetArgument(2, x_buffer());
+  kernel.SetArgument(3, static_cast<int>(x_offset));
+  kernel.SetArgument(4, static_cast<int>(x_inc));
+  kernel.SetArgument(5, y_buffer());
+  kernel.SetArgument(6, static_cast<int>(y_offset));
+  kernel.SetArgument(7, static_cast<int>(y_inc));
+  kernel.SetArgument(8, a_buffer());
+  kernel.SetArgument(9, static_cast<int>(a_offset));
+  kernel.SetArgument(10, static_cast<int>(a_ld));
+  kernel.SetArgument(11, static_cast<int>(is_upper));
+  kernel.SetArgument(12, static_cast<int>(is_rowmajor));

-    // Launches the kernel
-    auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
-    auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
-    auto global = std::vector<size_t>{global_one, global_two};
-    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
+  auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
+  auto global = std::vector<size_t>{global_one, global_two};
+  auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level2/xher2.hpp
+++ b/src/routines/level2/xher2.hpp
@ -28,13 +28,13 @@ class Xher2: public Routine {
  Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");

  // Templated-precision implementation of the routine
-  StatusCode DoHer2(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const bool packed = false);
+  void DoHer2(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const bool packed = false);
 };

 // =================================================================================================
--- a/src/routines/level2/xhpmv.cpp
+++ b/src/routines/level2/xhpmv.cpp
@ -29,13 +29,13 @@ Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
  // The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_HPMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                ap_buffer, ap_offset, n,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, true, 0, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         ap_buffer, ap_offset, n,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, true, 0, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xhpmv.hpp
+++ b/src/routines/level2/xhpmv.hpp
@ -33,13 +33,13 @@ class Xhpmv: public Xgemv<T> {
  Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");

  // Templated-precision implementation of the routine
-  StatusCode DoHpmv(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoHpmv(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &ap_buffer, const size_t ap_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xhpr.cpp
+++ b/src/routines/level2/xhpr.cpp
@ -28,17 +28,17 @@ Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T, typename U>
-StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const U alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const U alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xhpr functionality is implemented in the kernel using defines
-  return DoHer(layout, triangle, n, alpha,
-               x_buffer, x_offset, x_inc,
-               ap_buffer, ap_offset, n,
-               true); // packed matrix
+  DoHer(layout, triangle, n, alpha,
+        x_buffer, x_offset, x_inc,
+        ap_buffer, ap_offset, n,
+        true); // packed matrix
 }

 // =================================================================================================
--- a/src/routines/level2/xhpr.hpp
+++ b/src/routines/level2/xhpr.hpp
@ -31,11 +31,11 @@ class Xhpr: public Xher<T,U> {
  Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");

  // Templated-precision implementation of the routine
-  StatusCode DoHpr(const Layout layout, const Triangle triangle,
-                   const size_t n,
-                   const U alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &ap_buffer, const size_t ap_offset);
+  void DoHpr(const Layout layout, const Triangle triangle,
+             const size_t n,
+             const U alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &ap_buffer, const size_t ap_offset);
 };

 // =================================================================================================
--- a/src/routines/level2/xhpr2.cpp
+++ b/src/routines/level2/xhpr2.cpp
@ -28,19 +28,19 @@ Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xhpr2 functionality is implemented in the kernel using defines
-  return DoHer2(layout, triangle, n, alpha,
-                x_buffer, x_offset, x_inc,
-                y_buffer, y_offset, y_inc,
-                ap_buffer, ap_offset, n,
-                true); // packed matrix
+  DoHer2(layout, triangle, n, alpha,
+         x_buffer, x_offset, x_inc,
+         y_buffer, y_offset, y_inc,
+         ap_buffer, ap_offset, n,
+         true); // packed matrix
 }

 // =================================================================================================
--- a/src/routines/level2/xhpr2.hpp
+++ b/src/routines/level2/xhpr2.hpp
@ -31,12 +31,12 @@ class Xhpr2: public Xher2<T> {
  Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");

  // Templated-precision implementation of the routine
-  StatusCode DoHpr2(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset);
+  void DoHpr2(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &ap_buffer, const size_t ap_offset);
 };

 // =================================================================================================
--- a/src/routines/level2/xsbmv.cpp
+++ b/src/routines/level2/xsbmv.cpp
@ -29,13 +29,13 @@ Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
-                            const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
+                      const size_t n, const size_t k,
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
  // The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_SBMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, false, k, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, false, k, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xsbmv.hpp
+++ b/src/routines/level2/xsbmv.hpp
@ -33,13 +33,13 @@ class Xsbmv: public Xgemv<T> {
  Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");

  // Templated-precision implementation of the routine
-  StatusCode DoSbmv(const Layout layout, const Triangle triangle,
-                    const size_t n, const size_t k,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoSbmv(const Layout layout, const Triangle triangle,
+              const size_t n, const size_t k,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xspmv.cpp
+++ b/src/routines/level2/xspmv.cpp
@ -29,13 +29,13 @@ Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

  // The data is either in the upper or lower triangle
  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
  // The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
  // ROUTINE_SPMV define.
  bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                ap_buffer, ap_offset, n,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, true, 0, 0);
+  MatVec(layout, Transpose::kNo,
+         n, n, alpha,
+         ap_buffer, ap_offset, n,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels,
+         is_upper, true, 0, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xspmv.hpp
+++ b/src/routines/level2/xspmv.hpp
@ -33,13 +33,13 @@ class Xspmv: public Xgemv<T> {
  Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");

  // Templated-precision implementation of the routine
-  StatusCode DoSpmv(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoSpmv(const Layout layout, const Triangle triangle,
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &ap_buffer, const size_t ap_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xspr.cpp
+++ b/src/routines/level2/xspr.cpp
@ -28,17 +28,17 @@ Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
-                          const size_t n,
-                          const T alpha,
-                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                          const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xspr functionality is implemented in the kernel using defines
-  return DoHer(layout, triangle, n, alpha,
-               x_buffer, x_offset, x_inc,
-               ap_buffer, ap_offset, n,
-               true); // packed matrix
+  DoHer(layout, triangle, n, alpha,
+        x_buffer, x_offset, x_inc,
+        ap_buffer, ap_offset, n,
+        true); // packed matrix
 }

 // =================================================================================================
--- a/src/routines/level2/xspr.hpp
+++ b/src/routines/level2/xspr.hpp
@ -31,11 +31,11 @@ class Xspr: public Xher<T,T> {
  Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");

  // Templated-precision implementation of the routine
-  StatusCode DoSpr(const Layout layout, const Triangle triangle,
-                   const size_t n,
-                   const T alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &ap_buffer, const size_t ap_offset);
+  void DoSpr(const Layout layout, const Triangle triangle,
+             const size_t n,
+             const T alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &ap_buffer, const size_t ap_offset);
 };

 // =================================================================================================
--- a/src/routines/level2/xspr2.cpp
+++ b/src/routines/level2/xspr2.cpp
@ -28,19 +28,19 @@ Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset) {
+void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Specific Xspr2 functionality is implemented in the kernel using defines
-  return DoHer2(layout, triangle, n, alpha,
-                x_buffer, x_offset, x_inc,
-                y_buffer, y_offset, y_inc,
-                ap_buffer, ap_offset, n,
-                true); // packed matrix
+  DoHer2(layout, triangle, n, alpha,
+         x_buffer, x_offset, x_inc,
+         y_buffer, y_offset, y_inc,
+         ap_buffer, ap_offset, n,
+         true); // packed matrix
 }

 // =================================================================================================
--- a/src/routines/level2/xspr2.hpp
+++ b/src/routines/level2/xspr2.hpp
@ -31,12 +31,12 @ class Xspr2: public Xher2<T> {
   Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");

   // Templated-precision implementation of the routine
-  StatusCode DoSpr2(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset);
+  void DoSpr2(const Layout layout, const Triangle triangle, // no StatusCode: errors are thrown
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &ap_buffer, const size_t ap_offset);
 };

 // =================================================================================================
--- a/src/routines/level2/xsymv.cpp
+++ b/src/routines/level2/xsymv.cpp
@ -29,13 +29,13 @ Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
+void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
+                      const size_t n, // forwarded to MatVec as both size arguments
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const T beta,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {

   // The data is either in the upper or lower triangle
   size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
   // The specific symmetric matrix-accesses are implemented in the kernel guarded by the
   // ROUTINE_SYMV define.
   bool fast_kernels = false;
-  return MatVec(layout, Transpose::kNo,
-                n, n, alpha,
-                a_buffer, a_offset, a_ld,
-                x_buffer, x_offset, x_inc, beta,
-                y_buffer, y_offset, y_inc,
-                fast_kernels, fast_kernels,
-                is_upper, false, 0, 0);
+  MatVec(layout, Transpose::kNo, // nothing to return: failures are expected to be thrown
+         n, n, alpha, // n is passed for both size arguments
+         a_buffer, a_offset, a_ld,
+         x_buffer, x_offset, x_inc, beta,
+         y_buffer, y_offset, y_inc,
+         fast_kernels, fast_kernels, // the same flag (false, set above) is passed twice
+         is_upper, false, 0, 0);
 }

 // =================================================================================================
--- a/src/routines/level2/xsymv.hpp
+++ b/src/routines/level2/xsymv.hpp
@ -33,13 +33,13 @ class Xsymv: public Xgemv<T> {
   Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");

   // Templated-precision implementation of the routine
-  StatusCode DoSymv(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const T beta,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
+  void DoSymv(const Layout layout, const Triangle triangle, // no StatusCode: errors are thrown
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const T beta,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xsyr.cpp
+++ b/src/routines/level2/xsyr.cpp
@ -28,16 +28,16 @ Xsyr<T>::Xsyr(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
-                          const size_t n,
-                          const T alpha,
-                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
+                    const size_t n,
+                    const T alpha,
+                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

   // Specific Xsyr functionality is implemented in the kernel using defines
-  return DoHer(layout, triangle, n, alpha,
-               x_buffer, x_offset, x_inc,
-               a_buffer, a_offset, a_ld);
+  DoHer(layout, triangle, n, alpha, // nothing to return: failures are expected to be thrown
+        x_buffer, x_offset, x_inc,
+        a_buffer, a_offset, a_ld);
 }

 // =================================================================================================
--- a/src/routines/level2/xsyr.hpp
+++ b/src/routines/level2/xsyr.hpp
@ -31,11 +31,11 @ class Xsyr: public Xher<T,T> {
   Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");

   // Templated-precision implementation of the routine
-  StatusCode DoSyr(const Layout layout, const Triangle triangle,
-                   const size_t n,
-                   const T alpha,
-                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+  void DoSyr(const Layout layout, const Triangle triangle, // no StatusCode: errors are thrown
+             const size_t n,
+             const T alpha,
+             const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+             const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };

 // =================================================================================================
--- a/src/routines/level2/xsyr2.cpp
+++ b/src/routines/level2/xsyr2.cpp
@ -28,18 +28,18 @ Xsyr2<T>::Xsyr2(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
-                            const size_t n,
-                            const T alpha,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
+void Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
+                      const size_t n,
+                      const T alpha,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+                      const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

   // Specific Xsyr2 functionality is implemented in the kernel using defines
-  return DoHer2(layout, triangle, n, alpha,
-                x_buffer, x_offset, x_inc,
-                y_buffer, y_offset, y_inc,
-                a_buffer, a_offset, a_ld);
+  DoHer2(layout, triangle, n, alpha, // nothing to return: failures are expected to be thrown
+         x_buffer, x_offset, x_inc,
+         y_buffer, y_offset, y_inc,
+         a_buffer, a_offset, a_ld);
 }

 // =================================================================================================
--- a/src/routines/level2/xsyr2.hpp
+++ b/src/routines/level2/xsyr2.hpp
@ -31,12 +31,12 @ class Xsyr2: public Xher2<T> {
   Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");

   // Templated-precision implementation of the routine
-  StatusCode DoSyr2(const Layout layout, const Triangle triangle,
-                    const size_t n,
-                    const T alpha,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
-                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
+  void DoSyr2(const Layout layout, const Triangle triangle, // no StatusCode: errors are thrown
+              const size_t n,
+              const T alpha,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+              const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
 };

 // =================================================================================================
--- a/src/routines/level2/xtbmv.cpp
+++ b/src/routines/level2/xtbmv.cpp
@ -29,17 +29,21 @ Xtbmv<T>::Xtbmv(Queue &queue, EventPointer event, const std::string &name):

-// The main routine
+// The main routine: runs MatVec with alpha = 1 and beta = 0, reading a scratch copy of X and
+// writing the result back into X. Errors are reported by throwing; vector-Y errors coming out of
+// MatVec are renamed to their vector-X equivalents.
 template <typename T>
-StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
-                            const Transpose a_transpose, const Diagonal diagonal,
-                            const size_t n, const size_t k,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
+                      const Transpose a_transpose, const Diagonal diagonal,
+                      const size_t n, const size_t k,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

-  // Creates a copy of X: a temporary scratch buffer
-  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
-  try {
-    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
-  } catch (...) { } // Continues: error-code is returned in MatVec
+  // Creates a copy of X: a temporary scratch buffer. The last element of X sits at index
+  // x_offset + (n-1)*x_inc, so one more than that is all that gets allocated and copied. The
+  // former n*x_inc + x_offset count overshoots a tightly-sized X buffer whenever x_inc > 1, and a
+  // failing copy is no longer swallowed here. For n == 0 the old x_offset count is kept.
+  const auto x_size = (n == 0) ? x_offset : ((n - 1) * x_inc + 1) + x_offset;
+  auto scratch_buffer = Buffer<T>(context_, x_size);
+  x_buffer.CopyTo(queue_, x_size, scratch_buffer);

   // The data is either in the upper or lower triangle
   size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -52,20 +56,22 @ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
   // The specific triangular banded matrix-accesses are implemented in the kernel guarded by the
   // ROUTINE_TBMV define.
   auto fast_kernels = false;
-  auto status = MatVec(layout, a_transpose,
-                       n, n, static_cast<T>(1),
-                       a_buffer, a_offset, a_ld,
-                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
-                       x_buffer, x_offset, x_inc,
-                       fast_kernels, fast_kernels,
-                       parameter, false, k, 0);
-
-  // Returns the proper error code (renames vector Y to X)
-  switch(status) {
-    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
-    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
-    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
-    default: return status;
+  try {
+    MatVec(layout, a_transpose,
+           n, n, static_cast<T>(1),
+           a_buffer, a_offset, a_ld,
+           scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+           x_buffer, x_offset, x_inc,
+           fast_kernels, fast_kernels,
+           parameter, false, k, 0);
+  } catch (const BLASError &e) {
+    // Rethrows with the proper error code (renames vector Y to X); other errors pass through
+    switch (e.status()) {
+      case StatusCode::kInvalidVectorY:      throw BLASError(StatusCode::kInvalidVectorX, e.details());
+      case StatusCode::kInvalidIncrementY:   throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+      case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+      default:                               throw;
+    }
   }
 }

--- a/src/routines/level2/xtbmv.hpp
+++ b/src/routines/level2/xtbmv.hpp
@ -35,11 +35,11 @ class Xtbmv: public Xgemv<T> {
   Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");

   // Templated-precision implementation of the routine
-  StatusCode DoTbmv(const Layout layout, const Triangle triangle,
-                    const Transpose a_transpose, const Diagonal diagonal,
-                    const size_t n, const size_t k,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoTbmv(const Layout layout, const Triangle triangle, // no StatusCode: errors are thrown
+              const Transpose a_transpose, const Diagonal diagonal,
+              const size_t n, const size_t k,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xtpmv.cpp
+++ b/src/routines/level2/xtpmv.cpp
@ -29,17 +29,21 @ Xtpmv<T>::Xtpmv(Queue &queue, EventPointer event, const std::string &name):

-// The main routine
+// The main routine: runs MatVec with alpha = 1 and beta = 0, reading a scratch copy of X and
+// writing the result back into X. Errors are reported by throwing; vector-Y errors coming out of
+// MatVec are renamed to their vector-X equivalents.
 template <typename T>
-StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
-                            const Transpose a_transpose, const Diagonal diagonal,
-                            const size_t n,
-                            const Buffer<T> &ap_buffer, const size_t ap_offset,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
+                      const Transpose a_transpose, const Diagonal diagonal,
+                      const size_t n,
+                      const Buffer<T> &ap_buffer, const size_t ap_offset,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

-  // Creates a copy of X: a temporary scratch buffer
-  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
-  try {
-    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
-  } catch (...) { } // Continues: error-code is returned in MatVec
+  // Creates a copy of X: a temporary scratch buffer. The last element of X sits at index
+  // x_offset + (n-1)*x_inc, so one more than that is all that gets allocated and copied. The
+  // former n*x_inc + x_offset count overshoots a tightly-sized X buffer whenever x_inc > 1, and a
+  // failing copy is no longer swallowed here. For n == 0 the old x_offset count is kept.
+  const auto x_size = (n == 0) ? x_offset : ((n - 1) * x_inc + 1) + x_offset;
+  auto scratch_buffer = Buffer<T>(context_, x_size);
+  x_buffer.CopyTo(queue_, x_size, scratch_buffer);

   // The data is either in the upper or lower triangle
   size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -52,20 +56,22 @ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
   // The specific triangular packed matrix-accesses are implemented in the kernel guarded by the
   // ROUTINE_TPMV define.
   auto fast_kernels = false;
-  auto status = MatVec(layout, a_transpose,
-                       n, n, static_cast<T>(1),
-                       ap_buffer, ap_offset, n,
-                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
-                       x_buffer, x_offset, x_inc,
-                       fast_kernels, fast_kernels,
-                       parameter, true, 0, 0);
-
-  // Returns the proper error code (renames vector Y to X)
-  switch(status) {
-    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
-    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
-    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
-    default: return status;
+  try {
+    MatVec(layout, a_transpose,
+           n, n, static_cast<T>(1),
+           ap_buffer, ap_offset, n,
+           scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+           x_buffer, x_offset, x_inc,
+           fast_kernels, fast_kernels,
+           parameter, true, 0, 0);
+  } catch (const BLASError &e) {
+    // Rethrows with the proper error code (renames vector Y to X); other errors pass through
+    switch (e.status()) {
+      case StatusCode::kInvalidVectorY:      throw BLASError(StatusCode::kInvalidVectorX, e.details());
+      case StatusCode::kInvalidIncrementY:   throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+      case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+      default:                               throw;
+    }
   }
 }

--- a/src/routines/level2/xtpmv.hpp
+++ b/src/routines/level2/xtpmv.hpp
@ -35,11 +35,11 @ class Xtpmv: public Xgemv<T> {
   Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");

   // Templated-precision implementation of the routine
-  StatusCode DoTpmv(const Layout layout, const Triangle triangle,
-                    const Transpose a_transpose, const Diagonal diagonal,
-                    const size_t n,
-                    const Buffer<T> &ap_buffer, const size_t ap_offset,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoTpmv(const Layout layout, const Triangle triangle, // no StatusCode: errors are thrown
+              const Transpose a_transpose, const Diagonal diagonal,
+              const size_t n,
+              const Buffer<T> &ap_buffer, const size_t ap_offset,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level2/xtrmv.cpp
+++ b/src/routines/level2/xtrmv.cpp
@ -29,17 +29,21 @ Xtrmv<T>::Xtrmv(Queue &queue, EventPointer event, const std::string &name):

-// The main routine
+// The main routine: runs MatVec with alpha = 1 and beta = 0, reading a scratch copy of X and
+// writing the result back into X. Errors are reported by throwing; vector-Y errors coming out of
+// MatVec are renamed to their vector-X equivalents.
 template <typename T>
-StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
-                            const Transpose a_transpose, const Diagonal diagonal,
-                            const size_t n,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+void Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
+                      const Transpose a_transpose, const Diagonal diagonal,
+                      const size_t n,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

-  // Creates a copy of X: a temporary scratch buffer
-  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
-  try {
-    x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
-  } catch (...) { } // Continues: error-code is returned in MatVec
+  // Creates a copy of X: a temporary scratch buffer. The last element of X sits at index
+  // x_offset + (n-1)*x_inc, so one more than that is all that gets allocated and copied. The
+  // former n*x_inc + x_offset count overshoots a tightly-sized X buffer whenever x_inc > 1, and a
+  // failing copy is no longer swallowed here. For n == 0 the old x_offset count is kept.
+  const auto x_size = (n == 0) ? x_offset : ((n - 1) * x_inc + 1) + x_offset;
+  auto scratch_buffer = Buffer<T>(context_, x_size);
+  x_buffer.CopyTo(queue_, x_size, scratch_buffer);

   // The data is either in the upper or lower triangle
   size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -52,20 +56,22 @ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
   // The specific triangular matrix-accesses are implemented in the kernel guarded by the
   // ROUTINE_TRMV define.
   auto fast_kernels = false;
-  auto status = MatVec(layout, a_transpose,
-                       n, n, static_cast<T>(1),
-                       a_buffer, a_offset, a_ld,
-                       scratch_buffer, x_offset, x_inc, static_cast<T>(0),
-                       x_buffer, x_offset, x_inc,
-                       fast_kernels, fast_kernels,
-                       parameter, false, 0, 0);
-
-  // Returns the proper error code (renames vector Y to X)
-  switch(status) {
-    case StatusCode::kInvalidVectorY:      return StatusCode::kInvalidVectorX;
-    case StatusCode::kInvalidIncrementY:   return StatusCode::kInvalidIncrementX;
-    case StatusCode::kInsufficientMemoryY: return StatusCode::kInsufficientMemoryX;
-    default: return status;
+  try {
+    MatVec(layout, a_transpose,
+           n, n, static_cast<T>(1),
+           a_buffer, a_offset, a_ld,
+           scratch_buffer, x_offset, x_inc, static_cast<T>(0),
+           x_buffer, x_offset, x_inc,
+           fast_kernels, fast_kernels,
+           parameter, false, 0, 0);
+  } catch (const BLASError &e) {
+    // Rethrows with the proper error code (renames vector Y to X); other errors pass through
+    switch (e.status()) {
+      case StatusCode::kInvalidVectorY:      throw BLASError(StatusCode::kInvalidVectorX, e.details());
+      case StatusCode::kInvalidIncrementY:   throw BLASError(StatusCode::kInvalidIncrementX, e.details());
+      case StatusCode::kInsufficientMemoryY: throw BLASError(StatusCode::kInsufficientMemoryX, e.details());
+      default:                               throw;
+    }
   }
 }

--- a/src/routines/level2/xtrmv.hpp
+++ b/src/routines/level2/xtrmv.hpp
@ -35,11 +35,11 @ class Xtrmv: public Xgemv<T> {
   Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");

   // Templated-precision implementation of the routine
-  StatusCode DoTrmv(const Layout layout, const Triangle triangle,
-                    const Transpose a_transpose, const Diagonal diagonal,
-                    const size_t n,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+  void DoTrmv(const Layout layout, const Triangle triangle, // no StatusCode: errors are thrown
+              const Transpose a_transpose, const Diagonal diagonal,
+              const size_t n,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
 };

 // =================================================================================================
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@ -50,17 +50,17 @ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xgemm<T>::DoGemm(const Layout layout,
-                            const Transpose a_transpose, const Transpose b_transpose,
-                            const size_t m, const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                            const T beta,
-                            const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
+void Xgemm<T>::DoGemm(const Layout layout, // no StatusCode: errors are thrown
+                      const Transpose a_transpose, const Transpose b_transpose,
+                      const size_t m, const size_t n, const size_t k, // must all be non-zero
+                      const T alpha,
+                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                      const T beta,
+                      const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

   // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
+  if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

   // Computes whether or not the matrices are transposed in memory. This is based on their layout
   // (row or column-major) and whether or not they are requested to be pre-transposed. Note
@ -99,12 +99,9 @ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   //    matrix A cannot be less than K when rotated, or less than M when not-rotated
   //    matrix B cannot be less than N when rotated, or less than K when not-rotated
   //    matrix C cannot be less than N when rotated, or less than M when not-rotated
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); // each test is expected to throw on failure
+  TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
+  TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);

   // Selects which version of GEMM to run
   const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
@ -131,7 +128,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
 // requirements, but several pre and post-processing kernels take care of those. However, the
 // overhead of these extra kernels might not be ideal for certain devices/arguments.
 template <typename T>
-StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
+void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
                                  const T alpha,
                                  const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                                  const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
@ -142,8 +139,6 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
                                  const size_t a_one, const size_t a_two, const bool a_want_rotated,
                                  const size_t b_one, const size_t b_two, const bool b_want_rotated,
                                  const size_t c_one, const size_t c_two, const bool c_want_rotated) {
-  auto status = StatusCode::kSuccess;
-
  // Calculates the ceiled versions of m, n, and k
  const auto m_ceiled = Ceil(m, db_["MWG"]);
  const auto n_ceiled = Ceil(n, db_["NWG"]);
@ -158,109 +153,95 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k
  const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
  const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;

-  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
-  try {
+  // Loads the program from the database
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);

-    // Loads the program from the database
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  // Determines whether or not temporary matrices are needed
+  auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
+                   a_do_transpose == false && a_conjugate == false;
+  auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
+                   b_do_transpose == false && b_conjugate == false;
+  auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
+                   c_do_transpose == false;

-    // Determines whether or not temporary matrices are needed
-    auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
-                     a_do_transpose == false && a_conjugate == false;
-    auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
-                     b_do_transpose == false && b_conjugate == false;
-    auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
-                     c_do_transpose == false;
+  // Creates the temporary matrices
+  const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
+  const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
+  const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);

-    // Creates the temporary matrices
-    const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
-    const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
-    const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
+  // Events of all kernels (including pre/post processing kernels)
+  auto eventWaitList = std::vector<Event>();
+  auto emptyEventList = std::vector<Event>();

-    // Events of all kernels (including pre/post processing kernels)
-    auto eventWaitList = std::vector<Event>();
-    auto emptyEventList = std::vector<Event>();
+  // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+  // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+  // case nothing has to be done, these kernels can be skipped.
+  if (!a_no_temp) {
+    auto eventProcessA = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+                           a_one, a_two, a_ld, a_offset, a_buffer,
+                           a_one_i, a_two_i, a_one_i, 0, a_temp,
+                           ConstantOne<T>(), program,
+                           true, a_do_transpose, a_conjugate);
+    eventWaitList.push_back(eventProcessA);
+  }

-    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
-    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
-    // case nothing has to be done, these kernels can be skipped.
-    if (!a_no_temp) {
-      auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
-                                      a_one, a_two, a_ld, a_offset, a_buffer,
-                                      a_one_i, a_two_i, a_one_i, 0, a_temp,
-                                      ConstantOne<T>(), program,
-                                      true, a_do_transpose, a_conjugate);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventProcessA);
-    }
+  // As above, but now for matrix B
+  if (!b_no_temp) {
+    auto eventProcessB = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+                           b_one, b_two, b_ld, b_offset, b_buffer,
+                           b_one_i, b_two_i, b_one_i, 0, b_temp,
+                           ConstantOne<T>(), program,
+                           true, b_do_transpose, b_conjugate);
+    eventWaitList.push_back(eventProcessB);
+  }

-    // As above, but now for matrix B
-    if (!b_no_temp) {
-      auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
-                                      b_one, b_two, b_ld, b_offset, b_buffer,
-                                      b_one_i, b_two_i, b_one_i, 0, b_temp,
-                                      ConstantOne<T>(), program,
-                                      true, b_do_transpose, b_conjugate);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventProcessB);
-    }
+  // As above, but now for matrix C. This is only necessary if C is used both as input and output.
+  if (!c_no_temp && beta != static_cast<T>(0)) {
+    auto eventProcessC = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+                           c_one, c_two, c_ld, c_offset, c_buffer,
+                           c_one_i, c_two_i, c_one_i, 0, c_temp,
+                           ConstantOne<T>(), program,
+                           true, c_do_transpose, false);
+    eventWaitList.push_back(eventProcessC);
+  }

-    // As above, but now for matrix C. This is only necessary if C is used both as input and output.
-    if (!c_no_temp && beta != static_cast<T>(0)) {
-      auto eventProcessC = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
-                                      c_one, c_two, c_ld, c_offset, c_buffer,
-                                      c_one_i, c_two_i, c_one_i, 0, c_temp,
-                                      ConstantOne<T>(), program,
-                                      true, c_do_transpose, false);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventProcessC);
-    }
+  // Retrieves the Xgemm kernel from the compiled binary
+  auto kernel = Kernel(program, "Xgemm");

-    // Retrieves the Xgemm kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, "Xgemm");
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(m_ceiled));
+  kernel.SetArgument(1, static_cast<int>(n_ceiled));
+  kernel.SetArgument(2, static_cast<int>(k_ceiled));
+  kernel.SetArgument(3, GetRealArg(alpha));
+  kernel.SetArgument(4, GetRealArg(beta));
+  kernel.SetArgument(5, a_temp());
+  kernel.SetArgument(6, b_temp());
+  kernel.SetArgument(7, c_temp());

-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(m_ceiled));
-      kernel.SetArgument(1, static_cast<int>(n_ceiled));
-      kernel.SetArgument(2, static_cast<int>(k_ceiled));
-      kernel.SetArgument(3, GetRealArg(alpha));
-      kernel.SetArgument(4, GetRealArg(beta));
-      kernel.SetArgument(5, a_temp());
-      kernel.SetArgument(6, b_temp());
-      kernel.SetArgument(7, c_temp());
+  // Computes the global and local thread sizes
+  const auto global = std::vector<size_t>{
+    (c_one_i * db_["MDIMC"]) / db_["MWG"],
+    (c_two_i * db_["NDIMC"]) / db_["NWG"]
+  };
+  const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

-      // Computes the global and local thread sizes
-      const auto global = std::vector<size_t>{
-        (c_one_i * db_["MDIMC"]) / db_["MWG"],
-        (c_two_i * db_["NDIMC"]) / db_["NWG"]
-      };
-      const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+  // Launches the kernel
+  auto eventKernel = Event();
+  auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
+  RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);

-      // Launches the kernel
-      auto eventKernel = Event();
-      auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
-      status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel if needed
-      if (!c_no_temp) {
-        eventWaitList.push_back(eventKernel);
-        status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
-                                        c_one_i, c_two_i, c_one_i, 0, c_temp,
-                                        c_one, c_two, c_ld, c_offset, c_buffer,
-                                        ConstantOne<T>(), program,
-                                        false, c_do_transpose, false);
-        if (ErrorIn(status)) { return status; }
-      }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // Runs the post-processing kernel if needed
+  if (!c_no_temp) {
+    eventWaitList.push_back(eventKernel);
+    PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+                           c_one_i, c_two_i, c_one_i, 0, c_temp,
+                           c_one, c_two, c_ld, c_offset, c_buffer,
+                           ConstantOne<T>(), program,
+                           false, c_do_transpose, false);
+  }
 }


@ -268,7 +249,7 @@ StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k

 // The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
 template <typename T>
-StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
+void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
                                const T alpha,
                                const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                                const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
@ -281,46 +262,40 @@ StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);

  // Retrieves the proper XgemmDirect kernel from the compiled binary
-  try {
-    const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
-                                         (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
-    auto kernel = Kernel(program, name);
+  const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
+                                       (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
+  auto kernel = Kernel(program, name);

-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(m));
-    kernel.SetArgument(1, static_cast<int>(n));
-    kernel.SetArgument(2, static_cast<int>(k));
-    kernel.SetArgument(3, GetRealArg(alpha));
-    kernel.SetArgument(4, GetRealArg(beta));
-    kernel.SetArgument(5, a_buffer());
-    kernel.SetArgument(6, static_cast<int>(a_offset));
-    kernel.SetArgument(7, static_cast<int>(a_ld));
-    kernel.SetArgument(8, b_buffer());
-    kernel.SetArgument(9, static_cast<int>(b_offset));
-    kernel.SetArgument(10, static_cast<int>(b_ld));
-    kernel.SetArgument(11, c_buffer());
-    kernel.SetArgument(12, static_cast<int>(c_offset));
-    kernel.SetArgument(13, static_cast<int>(c_ld));
-    kernel.SetArgument(14, static_cast<int>(c_do_transpose));
-    kernel.SetArgument(15, static_cast<int>(a_conjugate));
-    kernel.SetArgument(16, static_cast<int>(b_conjugate));
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(m));
+  kernel.SetArgument(1, static_cast<int>(n));
+  kernel.SetArgument(2, static_cast<int>(k));
+  kernel.SetArgument(3, GetRealArg(alpha));
+  kernel.SetArgument(4, GetRealArg(beta));
+  kernel.SetArgument(5, a_buffer());
+  kernel.SetArgument(6, static_cast<int>(a_offset));
+  kernel.SetArgument(7, static_cast<int>(a_ld));
+  kernel.SetArgument(8, b_buffer());
+  kernel.SetArgument(9, static_cast<int>(b_offset));
+  kernel.SetArgument(10, static_cast<int>(b_ld));
+  kernel.SetArgument(11, c_buffer());
+  kernel.SetArgument(12, static_cast<int>(c_offset));
+  kernel.SetArgument(13, static_cast<int>(c_ld));
+  kernel.SetArgument(14, static_cast<int>(c_do_transpose));
+  kernel.SetArgument(15, static_cast<int>(a_conjugate));
+  kernel.SetArgument(16, static_cast<int>(b_conjugate));

-    // Computes the global and local thread sizes
-    const auto m_ceiled = Ceil(m, db_["WGD"]);
-    const auto n_ceiled = Ceil(n, db_["WGD"]);
-    const auto global = std::vector<size_t>{
-      (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
-      (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
-    };
-    const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
+  // Computes the global and local thread sizes
+  const auto m_ceiled = Ceil(m, db_["WGD"]);
+  const auto n_ceiled = Ceil(n, db_["WGD"]);
+  const auto global = std::vector<size_t>{
+    (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+    (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+  };
+  const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};

-    // Launches the kernel
-    auto status = RunKernel(kernel, queue_, device_, global, local, event_);
-    if (ErrorIn(status)) { return status; }
-
-    // Successfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
+  // Launches the kernel
+  RunKernel(kernel, queue_, device_, global, local, event_);
 }

 // =================================================================================================
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@ -28,36 +28,36 @@ class Xgemm: public Routine {
  Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");

  // Templated-precision implementation of the routine
-  StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
-                    const size_t m, const size_t n, const size_t k,
+  void DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+              const size_t m, const size_t n, const size_t k,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+              const T beta,
+              const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+  // Indirect version of GEMM (with pre and post-processing kernels)
+  void GemmIndirect(const size_t m, const size_t n, const size_t k,
                    const T alpha,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                    const T beta,
-                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
-  // Indirect version of GEMM (with pre and post-processing kernels)
-  StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
-                          const T alpha,
-                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                          const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                          const T beta,
-                          const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
-                          const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
-                          const bool a_conjugate, const bool b_conjugate,
-                          const size_t a_one, const size_t a_two, const bool a_want_rotated,
-                          const size_t b_one, const size_t b_two, const bool b_want_rotated,
-                          const size_t c_one, const size_t c_two, const bool c_want_rotated);
+                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                    const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                    const bool a_conjugate, const bool b_conjugate,
+                    const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                    const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                    const size_t c_one, const size_t c_two, const bool c_want_rotated);

  // Direct version of GEMM (no pre and post-processing kernels)
-  StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
-                        const T alpha,
-                        const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                        const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                        const T beta,
-                        const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
-                        const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
-                        const bool a_conjugate, const bool b_conjugate);
+  void GemmDirect(const size_t m, const size_t n, const size_t k,
+                  const T alpha,
+                  const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                  const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                  const T beta,
+                  const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                  const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                  const bool a_conjugate, const bool b_conjugate);
 };

 // =================================================================================================
--- a/src/routines/level3/xhemm.cpp
+++ b/src/routines/level3/xhemm.cpp
@ -29,7 +29,7 @@ Xhemm<T>::Xhemm(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
+void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@ -38,15 +38,14 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
                            const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+  if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
  // left) or B (on the right) in the Xgemm routine.
  auto k = (side == Side::kLeft) ? m : n;

  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(k, k, a_buffer, a_offset, a_ld);

  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
@ -55,73 +54,68 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";

  // Temporary buffer for a copy of the hermitian matrix
-  try {
-    auto temp_herm = Buffer<T>(context_, k*k);
+  auto temp_herm = Buffer<T>(context_, k*k);

-    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
-    // routine afterwards
+  // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
+  // routine afterwards
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);
+
+  // Sets the arguments for the hermitian-to-squared kernel
+  kernel.SetArgument(0, static_cast<int>(k));
+  kernel.SetArgument(1, static_cast<int>(a_ld));
+  kernel.SetArgument(2, static_cast<int>(a_offset));
+  kernel.SetArgument(3, a_buffer());
+  kernel.SetArgument(4, static_cast<int>(k));
+  kernel.SetArgument(5, static_cast<int>(k));
+  kernel.SetArgument(6, static_cast<int>(0));
+  kernel.SetArgument(7, temp_herm());
+
+  // Uses the common padding kernel's thread configuration. This is allowed, since the
+  // hermitian-to-squared kernel uses the same parameters.
+  auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                    Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+  auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
+
+  // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+  kernelEvent.WaitForCompletion();
+
+  // Runs the regular Xgemm code with either "C := AB+C" or ...
+  if (side == Side::kLeft) {
+    DoGemm(layout, Transpose::kNo, Transpose::kNo,
+           m, n, k,
+           alpha,
+           temp_herm, 0, k,
+           b_buffer, b_offset, b_ld,
+           beta,
+           c_buffer, c_offset, c_ld);
+  }
+
+  // ... with "C := BA+C". Note that A and B are now reversed.
+  else {
    try {
-      const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the hermitian-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(k));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(k));
-      kernel.SetArgument(5, static_cast<int>(k));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_herm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // hermitian-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      auto kernelEvent = Event();
-      status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
-      if (ErrorIn(status)) { return status; }
-
-      // Synchronize now: 'DoGemm' does not accept a list of events to wait for
-      kernelEvent.WaitForCompletion();
-
-      // Runs the regular Xgemm code with either "C := AB+C" or ...
-      if (side == Side::kLeft) {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        temp_herm, 0, k,
-                        b_buffer, b_offset, b_ld,
-                        beta,
-                        c_buffer, c_offset, c_ld);
+      DoGemm(layout, Transpose::kNo, Transpose::kNo,
+             m, n, k,
+             alpha,
+             b_buffer, b_offset, b_ld,
+             temp_herm, 0, k,
+             beta,
+             c_buffer, c_offset, c_ld);
+    } catch (BLASError &e) {
+      // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+      switch(e.status()) {
+        case StatusCode::kInvalidMatrixA:      throw BLASError(StatusCode::kInvalidMatrixB, e.details());
+        case StatusCode::kInvalidMatrixB:      throw BLASError(StatusCode::kInvalidMatrixA, e.details());
+        case StatusCode::kInvalidLeadDimA:     throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
+        case StatusCode::kInvalidLeadDimB:     throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
+        case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
+        case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
+        default:                               throw;
      }
-
-      // ... with "C := BA+C". Note that A and B are now reversed.
-      else {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        b_buffer, b_offset, b_ld,
-                        temp_herm, 0, k,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-
-        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
-        switch(status) {
-          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
-          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
-          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
-          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
-          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
-          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
-        }
-      }
-
-      // Return the status of the Xgemm routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+    }
+  }
 }

 // =================================================================================================
--- a/src/routines/level3/xhemm.hpp
+++ b/src/routines/level3/xhemm.hpp
@ -37,13 +37,13 @@ class Xhemm: public Xgemm<T> {
  Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");

  // Templated-precision implementation of the routine
-  StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                    const T beta,
-                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+  void DoHemm(const Layout layout, const Side side, const Triangle triangle,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+              const T beta,
+              const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
 };

 // =================================================================================================
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@ -39,16 +39,16 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T, typename U>
-StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                                const size_t n, const size_t k,
-                                const T alpha,
-                                const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                                const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                                const U beta,
-                                const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
+void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                          const size_t n, const size_t k,
+                          const T alpha,
+                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                          const U beta,
+                          const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+  if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }

  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
  // to matrix A (argument: conjugate transpose)
@ -71,12 +71,9 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N
-  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
+  TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
+  TestMatrixC(n, n, c_buffer, c_offset, c_ld);

  // Calculates the ceiled versions of n and k
  auto n_ceiled = Ceil(n, db_["NWG"]);
@ -85,145 +82,128 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
  // Decides which kernel to run: the upper-triangular or lower-triangular version
  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";

-  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
-  try {
+  // Loads the program from the database
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);

-    // Loads the program from the database
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  // Determines whether or not temporary matrices are needed
+  auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                    ab_rotated == false && ab_conjugate == false;
+  auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                    ab_rotated == false && ab_conjugate == true;
+  auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+                    ab_rotated == false && ab_conjugate == false;
+  auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+                    ab_rotated == false && ab_conjugate == true;

-    // Determines whether or not temporary matrices are needed
-    auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
-                      ab_rotated == false && ab_conjugate == false;
-    auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
-                      ab_rotated == false && ab_conjugate == true;
-    auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
-                      ab_rotated == false && ab_conjugate == false;
-    auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
-                      ab_rotated == false && ab_conjugate == true;
+  // Creates the temporary matrices
+  auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);

-    // Creates the temporary matrices
-    auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+  // Convert the arguments to complex versions
+  auto complex_beta = T{beta, static_cast<U>(0.0)};

-    // Convert the arguments to complex versions
-    auto complex_beta = T{beta, static_cast<U>(0.0)};
+  // Events of all kernels (including pre/post processing kernels)
+  auto eventWaitList = std::vector<Event>();
+  auto emptyEventList = std::vector<Event>();

-    // Events of all kernels (including pre/post processing kernels)
-    auto eventWaitList = std::vector<Event>();
-    auto emptyEventList = std::vector<Event>();
+  // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
+  // to fill them up until they reach a certain multiple of size (kernel parameter dependent). In
+  // case nothing has to be done, these kernels can be skipped.
+  if (!a1_no_temp) {
+    auto eventProcessA1 = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
+                           ab_one, ab_two, a_ld, a_offset, a_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
+                           ConstantOne<T>(), program,
+                           true, ab_rotated, ab_conjugate);
+    eventWaitList.push_back(eventProcessA1);
+  }
+  if (!a2_no_temp) {
+    auto eventProcessA2 = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
+                           ab_one, ab_two, a_ld, a_offset, a_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
+                           ConstantOne<T>(), program,
+                           true, ab_rotated, !ab_conjugate);
+    eventWaitList.push_back(eventProcessA2);
+  }
+  if (!b1_no_temp) {
+    auto eventProcessB1 = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
+                           ab_one, ab_two, b_ld, b_offset, b_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
+                           ConstantOne<T>(), program,
+                           true, ab_rotated, ab_conjugate);
+    eventWaitList.push_back(eventProcessB1);
+  }
+  if (!b2_no_temp) {
+    auto eventProcessB2 = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
+                           ab_one, ab_two, b_ld, b_offset, b_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
+                           ConstantOne<T>(), program,
+                           true, ab_rotated, !ab_conjugate);
+    eventWaitList.push_back(eventProcessB2);
+  }

-    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
-    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
-    // case nothing has to be done, these kernels can be skipped.
-    if (!a1_no_temp) {
-      auto eventProcessA1 = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
-                                      ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
-                                      ConstantOne<T>(), program,
-                                      true, ab_rotated, ab_conjugate);
-      eventWaitList.push_back(eventProcessA1);
-      if (ErrorIn(status)) { return status; }
-    }
-    if (!a2_no_temp) {
-      auto eventProcessA2 = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
-                                      ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
-                                      ConstantOne<T>(), program,
-                                      true, ab_rotated, !ab_conjugate);
-      eventWaitList.push_back(eventProcessA2);
-      if (ErrorIn(status)) { return status; }
-    }
-    if (!b1_no_temp) {
-      auto eventProcessB1 = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
-                                      ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
-                                      ConstantOne<T>(), program,
-                                      true, ab_rotated, ab_conjugate);
-      eventWaitList.push_back(eventProcessB1);
-      if (ErrorIn(status)) { return status; }
-    }
-    if (!b2_no_temp) {
-      auto eventProcessB2 = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
-                                      ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
-                                      ConstantOne<T>(), program,
-                                      true, ab_rotated, !ab_conjugate);
-      eventWaitList.push_back(eventProcessB2);
-      if (ErrorIn(status)) { return status; }
-    }
+  // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+  // modify the other triangle.
+  auto eventProcessC = Event();
+  PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+                         n, n, c_ld, c_offset, c_buffer,
+                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                         ConstantOne<T>(), program,
+                         true, c_rotated, false);
+  eventWaitList.push_back(eventProcessC);

-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
-                                    n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    ConstantOne<T>(), program,
-                                    true, c_rotated, false);
-    eventWaitList.push_back(eventProcessC);
-    if (ErrorIn(status)) { return status; }
+  // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+  auto kernel = Kernel(program, kernel_name);

-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n_ceiled));
+  kernel.SetArgument(1, static_cast<int>(k_ceiled));
+  kernel.SetArgument(2, GetRealArg(alpha));
+  kernel.SetArgument(3, GetRealArg(complex_beta));
+  kernel.SetArgument(4, a1_temp());
+  kernel.SetArgument(5, b2_temp());
+  kernel.SetArgument(6, c_temp());

-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, GetRealArg(alpha));
-      kernel.SetArgument(3, GetRealArg(complex_beta));
-      kernel.SetArgument(4, a1_temp());
-      kernel.SetArgument(5, b2_temp());
-      kernel.SetArgument(6, c_temp());
+  // Computes the global and local thread sizes
+  auto global = std::vector<size_t>{
+    (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+    (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+  };
+  auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+  // Launches the kernel
+  auto eventKernel1 = Event();
+  RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
+  eventWaitList.push_back(eventKernel1);

-      // Launches the kernel
-      auto eventKernel1 = Event();
-      status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventKernel1);
+  // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
+  auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
+  auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
+  kernel.SetArgument(2, GetRealArg(conjugate_alpha));
+  kernel.SetArgument(3, GetRealArg(complex_one));
+  kernel.SetArgument(4, b1_temp());
+  kernel.SetArgument(5, a2_temp());

-      // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
-      auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
-      auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
-      kernel.SetArgument(2, GetRealArg(conjugate_alpha));
-      kernel.SetArgument(3, GetRealArg(complex_one));
-      kernel.SetArgument(4, b1_temp());
-      kernel.SetArgument(5, a2_temp());
+  // Runs the kernel again
+  auto eventKernel2 = Event();
+  RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
+  eventWaitList.push_back(eventKernel2);

-      // Runs the kernel again
-      auto eventKernel2 = Event();
-      status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventKernel2);
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
-                                      n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      ConstantOne<T>(), program,
-                                      false, c_rotated, false, upper, lower, true);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // Runs the post-processing kernel
+  auto upper = (triangle == Triangle::kUpper);
+  auto lower = (triangle == Triangle::kLower);
+  PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                         n, n, c_ld, c_offset, c_buffer,
+                         ConstantOne<T>(), program,
+                         false, c_rotated, false, upper, lower, true);
 }

 // =================================================================================================
--- a/src/routines/level3/xher2k.hpp
+++ b/src/routines/level3/xher2k.hpp
@ -30,13 +30,13 @@ class Xher2k: public Routine {
  Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");

  // Templated-precision implementation of the routine
-  StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                     const size_t n, const size_t k,
-                     const T alpha,
-                     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                     const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                     const U beta,
-                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+  void DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+               const size_t n, const size_t k,
+               const T alpha,
+               const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+               const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+               const U beta,
+               const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
 };

 // =================================================================================================
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@ -39,7 +39,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T, typename U>
-StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                              const size_t n, const size_t k,
                              const U alpha,
                              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@ -47,7 +47,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
                              const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+  if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }

  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
  // to matrix A (argument: conjugate transpose)
@ -70,10 +70,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
  // space. Also tests that the leading dimensions of:
  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+  TestMatrixC(n, n, c_buffer, c_offset, c_ld);

  // Calculates the ceiled versions of n and k
  auto n_ceiled = Ceil(n, db_["NWG"]);
@ -82,106 +80,92 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
  // Decides which kernel to run: the upper-triangular or lower-triangular version
  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";

-  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
-  try {
+  // Loads the program from the database
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);

-    // Loads the program from the database
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  // Determines whether or not temporary matrices are needed
+  auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                   a_rotated == false && a_conjugate == false;
+  auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                   a_rotated == false && b_conjugate == false;

-    // Determines whether or not temporary matrices are needed
-    auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
-                     a_rotated == false && a_conjugate == false;
-    auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
-                     a_rotated == false && b_conjugate == false;
+  // Creates the temporary matrices
+  auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);

-    // Creates the temporary matrices
-    auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+  // Convert the arguments to complex versions
+  auto complex_alpha = T{alpha, static_cast<U>(0.0)};
+  auto complex_beta = T{beta, static_cast<U>(0.0)};

-    // Convert the arguments to complex versions
-    auto complex_alpha = T{alpha, static_cast<U>(0.0)};
-    auto complex_beta = T{beta, static_cast<U>(0.0)};
+  // Events of all kernels (including pre/post processing kernels)
+  auto eventWaitList = std::vector<Event>();
+  auto emptyEventList = std::vector<Event>();

-    // Events of all kernels (including pre/post processing kernels)
-    auto eventWaitList = std::vector<Event>();
-    auto emptyEventList = std::vector<Event>();
+  // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+  // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+  // case nothing has to be done, these kernels can be skipped. Two copies are created.
+  if (!a_no_temp) {
+    auto eventProcessA = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+                           a_one, a_two, a_ld, a_offset, a_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+                           ConstantOne<T>(), program,
+                           true, a_rotated, a_conjugate);
+    eventWaitList.push_back(eventProcessA);
+  }
+  if (!b_no_temp) {
+    auto eventProcessB = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+                           a_one, a_two, a_ld, a_offset, a_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+                           ConstantOne<T>(), program,
+                           true, a_rotated, b_conjugate);
+    eventWaitList.push_back(eventProcessB);
+  }

-    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
-    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
-    // case nothing has to be done, these kernels can be skipped. Two copies are created.
-    if (!a_no_temp) {
-      auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
-                                      a_one, a_two, a_ld, a_offset, a_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      ConstantOne<T>(), program,
-                                      true, a_rotated, a_conjugate);
-      eventWaitList.push_back(eventProcessA);
-      if (ErrorIn(status)) { return status; }
-    }
-    if (!b_no_temp) {
-      auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
-                                      a_one, a_two, a_ld, a_offset, a_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
-                                      ConstantOne<T>(), program,
-                                      true, a_rotated, b_conjugate);
-      eventWaitList.push_back(eventProcessB);
-      if (ErrorIn(status)) { return status; }
-    }
+  // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+  // modify the other triangle.
+  auto eventProcessC = Event();
+  PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+                         n, n, c_ld, c_offset, c_buffer,
+                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                         ConstantOne<T>(), program,
+                         true, c_rotated, false);
+  eventWaitList.push_back(eventProcessC);

-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
-                                    n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    ConstantOne<T>(), program,
-                                    true, c_rotated, false);
-    eventWaitList.push_back(eventProcessC);
-    if (ErrorIn(status)) { return status; }
+  // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+  auto kernel = Kernel(program, kernel_name);

-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n_ceiled));
+  kernel.SetArgument(1, static_cast<int>(k_ceiled));
+  kernel.SetArgument(2, GetRealArg(complex_alpha));
+  kernel.SetArgument(3, GetRealArg(complex_beta));
+  kernel.SetArgument(4, a_temp());
+  kernel.SetArgument(5, b_temp());
+  kernel.SetArgument(6, c_temp());

-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, GetRealArg(complex_alpha));
-      kernel.SetArgument(3, GetRealArg(complex_beta));
-      kernel.SetArgument(4, a_temp());
-      kernel.SetArgument(5, b_temp());
-      kernel.SetArgument(6, c_temp());
+  // Computes the global and local thread sizes
+  auto global = std::vector<size_t>{
+    (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+    (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+  };
+  auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+  // Launches the kernel
+  auto eventKernel = Event();
+  RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
+  eventWaitList.push_back(eventKernel);

-      // Launches the kernel
-      auto eventKernel = Event();
-      status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventKernel);
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
-                                      n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      ConstantOne<T>(), program,
-                                      false, c_rotated, false, upper, lower, true);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // Runs the post-processing kernel
+  auto upper = (triangle == Triangle::kUpper);
+  auto lower = (triangle == Triangle::kLower);
+  PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                         n, n, c_ld, c_offset, c_buffer,
+                         ConstantOne<T>(), program,
+                         false, c_rotated, false, upper, lower, true);
 }

 // =================================================================================================
--- a/src/routines/level3/xherk.hpp
+++ b/src/routines/level3/xherk.hpp
@ -30,12 +30,12 @@ class Xherk: public Routine {
  Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");

  // Templated-precision implementation of the routine
-  StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
-                    const size_t n, const size_t k,
-                    const U alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const U beta,
-                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+  void DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+              const size_t n, const size_t k,
+              const U alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const U beta,
+              const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
 };

 // =================================================================================================
--- a/src/routines/level3/xsymm.cpp
+++ b/src/routines/level3/xsymm.cpp
@ -29,7 +29,7 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
+void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@ -38,15 +38,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
                            const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+  if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
  // left) or B (on the right) in the Xgemm routine.
  auto k = (side == Side::kLeft) ? m : n;

  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(k, k, a_buffer, a_offset, a_ld);

  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
@@ -55,73 +54,68 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";

  // Temporary buffer for a copy of the symmetric matrix
-  try {
-    auto temp_symm = Buffer<T>(context_, k*k);
+  auto temp_symm = Buffer<T>(context_, k*k);

-    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
-    // routine afterwards
+  // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
+  // routine afterwards
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  auto kernel = Kernel(program, kernel_name);
+
+  // Sets the arguments for the symmetric-to-squared kernel
+  kernel.SetArgument(0, static_cast<int>(k));
+  kernel.SetArgument(1, static_cast<int>(a_ld));
+  kernel.SetArgument(2, static_cast<int>(a_offset));
+  kernel.SetArgument(3, a_buffer());
+  kernel.SetArgument(4, static_cast<int>(k));
+  kernel.SetArgument(5, static_cast<int>(k));
+  kernel.SetArgument(6, static_cast<int>(0));
+  kernel.SetArgument(7, temp_symm());
+
+  // Uses the common padding kernel's thread configuration. This is allowed, since the
+  // symmetric-to-squared kernel uses the same parameters.
+  auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                    Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+  auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+  auto kernelEvent = Event();
+  RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
+
+  // Synchronize now: 'DoGemm' does not accept a list of events to wait for
+  kernelEvent.WaitForCompletion();
+
+  // Runs the regular Xgemm code with either "C := AB+C" or ...
+  if (side == Side::kLeft) {
+    DoGemm(layout, Transpose::kNo, Transpose::kNo,
+           m, n, k,
+           alpha,
+           temp_symm, 0, k,
+           b_buffer, b_offset, b_ld,
+           beta,
+           c_buffer, c_offset, c_ld);
+  }
+
+  // ... with "C := BA+C". Note that A and B are now reversed.
+  else {
    try {
-      const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the symmetric-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(k));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(k));
-      kernel.SetArgument(5, static_cast<int>(k));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_symm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // symmetric-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      auto kernelEvent = Event();
-      status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
-      if (ErrorIn(status)) { return status; }
-
-      // Synchronize now: 'DoGemm' does not accept a list of events to wait for
-      kernelEvent.WaitForCompletion();
-
-      // Runs the regular Xgemm code with either "C := AB+C" or ...
-      if (side == Side::kLeft) {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        temp_symm, 0, k,
-                        b_buffer, b_offset, b_ld,
-                        beta,
-                        c_buffer, c_offset, c_ld);
+      DoGemm(layout, Transpose::kNo, Transpose::kNo,
+             m, n, k,
+             alpha,
+             b_buffer, b_offset, b_ld,
+             temp_symm, 0, k,
+             beta,
+             c_buffer, c_offset, c_ld);
+    } catch (BLASError &e) {
+      // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+      switch(e.status()) {
+        case StatusCode::kInvalidMatrixA:      throw BLASError(StatusCode::kInvalidMatrixB, e.details());
+        case StatusCode::kInvalidMatrixB:      throw BLASError(StatusCode::kInvalidMatrixA, e.details());
+        case StatusCode::kInvalidLeadDimA:     throw BLASError(StatusCode::kInvalidLeadDimB, e.details());
+        case StatusCode::kInvalidLeadDimB:     throw BLASError(StatusCode::kInvalidLeadDimA, e.details());
+        case StatusCode::kInsufficientMemoryA: throw BLASError(StatusCode::kInsufficientMemoryB, e.details());
+        case StatusCode::kInsufficientMemoryB: throw BLASError(StatusCode::kInsufficientMemoryA, e.details());
+        default:                               throw;
      }
-
-      // ... with "C := BA+C". Note that A and B are now reversed.
-      else {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        b_buffer, b_offset, b_ld,
-                        temp_symm, 0, k,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-
-        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
-        switch(status) {
-          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
-          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
-          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
-          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
-          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
-          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
-        }
-      }
-
-      // Return the status of the Xgemm routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+    }
+  }
 }

 // =================================================================================================
--- a/src/routines/level3/xsymm.hpp
+++ b/src/routines/level3/xsymm.hpp
@@ -39,13 +39,13 @@ class Xsymm: public Xgemm<T> {
  Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");

  // Templated-precision implementation of the routine
-  StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
-                    const size_t m, const size_t n,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                    const T beta,
-                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+  void DoSymm(const Layout layout, const Side side, const Triangle triangle,
+              const size_t m, const size_t n,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+              const T beta,
+              const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
 };

 // =================================================================================================
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -39,7 +39,7 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                              const size_t n, const size_t k,
                              const T alpha,
                              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -48,7 +48,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
                              const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+  if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes whether or not the matrices are transposed in memory. This is based on their layout
  // (row or column-major) and whether or not they are requested to be pre-transposed.
@@ -67,12 +67,9 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N
-  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
+  TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
+  TestMatrixC(n, n, c_buffer, c_offset, c_ld);

  // Calculates the ceiled versions of n and k
  auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -81,114 +78,99 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
  // Decides which kernel to run: the upper-triangular or lower-triangular version
  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";

-  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
-  try {
+  // Loads the program from the database
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);

-    // Loads the program from the database
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  // Determines whether or not temporary matrices are needed
+  auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                   ab_rotated == false;
+  auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+                   ab_rotated == false;

-    // Determines whether or not temporary matrices are needed
-    auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
-                     ab_rotated == false;
-    auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
-                     ab_rotated == false;
+  // Creates the temporary matrices
+  auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);

-    // Creates the temporary matrices
-    auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+  // Events of all kernels (including pre/post processing kernels)
+  auto eventWaitList = std::vector<Event>();
+  auto emptyEventList = std::vector<Event>();

-    // Events of all kernels (including pre/post processing kernels)
-    auto eventWaitList = std::vector<Event>();
-    auto emptyEventList = std::vector<Event>();
+  // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
+  // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+  // case nothing has to be done, these kernels can be skipped.
+  if (!a_no_temp) {
+    auto eventProcessA = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+                           ab_one, ab_two, a_ld, a_offset, a_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+                           ConstantOne<T>(), program,
+                           true, ab_rotated, false);
+    eventWaitList.push_back(eventProcessA);
+  }
+  if (!b_no_temp) {
+    auto eventProcessB = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
+                           ab_one, ab_two, b_ld, b_offset, b_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+                           ConstantOne<T>(), program,
+                           true, ab_rotated, false);
+    eventWaitList.push_back(eventProcessB);
+  }

-    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
-    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
-    // case nothing has to be done, these kernels can be skipped.
-    if (!a_no_temp) {
-      auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
-                                      ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      ConstantOne<T>(), program,
-                                      true, ab_rotated, false);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventProcessA);
-    }
-    if (!b_no_temp) {
-      auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
-                                      ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
-                                      ConstantOne<T>(), program,
-                                      true, ab_rotated, false);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventProcessB);
-    }
+  // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+  // modify the other triangle.
+  auto eventProcessC = Event();
+  PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+                         n, n, c_ld, c_offset, c_buffer,
+                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                         ConstantOne<T>(), program,
+                         true, c_rotated, false);
+  eventWaitList.push_back(eventProcessC);

-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
-                                    n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    ConstantOne<T>(), program,
-                                    true, c_rotated, false);
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(eventProcessC);
+  // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+  auto kernel = Kernel(program, kernel_name);

-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n_ceiled));
+  kernel.SetArgument(1, static_cast<int>(k_ceiled));
+  kernel.SetArgument(2, GetRealArg(alpha));
+  kernel.SetArgument(3, GetRealArg(beta));
+  kernel.SetArgument(4, a_temp());
+  kernel.SetArgument(5, b_temp());
+  kernel.SetArgument(6, c_temp());

-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, GetRealArg(alpha));
-      kernel.SetArgument(3, GetRealArg(beta));
-      kernel.SetArgument(4, a_temp());
-      kernel.SetArgument(5, b_temp());
-      kernel.SetArgument(6, c_temp());
+  // Computes the global and local thread sizes
+  auto global = std::vector<size_t>{
+    (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+    (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+  };
+  auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+  // Launches the kernel
+  auto eventKernel1 = Event();
+  RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
+  eventWaitList.push_back(eventKernel1);

-      // Launches the kernel
-      auto eventKernel1 = Event();
-      status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventKernel1);
+  // Swaps the arguments for matrices A and B, and sets 'beta' to 1
+  auto one = static_cast<T>(1);
+  kernel.SetArgument(3, GetRealArg(one));
+  kernel.SetArgument(4, b_temp());
+  kernel.SetArgument(5, a_temp());

-      // Swaps the arguments for matrices A and B, and sets 'beta' to 1
-      auto one = static_cast<T>(1);
-      kernel.SetArgument(3, GetRealArg(one));
-      kernel.SetArgument(4, b_temp());
-      kernel.SetArgument(5, a_temp());
+  // Runs the kernel again
+  auto eventKernel2 = Event();
+  RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
+  eventWaitList.push_back(eventKernel2);

-      // Runs the kernel again
-      auto eventKernel2 = Event();
-      status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventKernel2);
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
-                                      n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      ConstantOne<T>(), program,
-                                      false, c_rotated, false, upper, lower, false);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // Runs the post-processing kernel
+  auto upper = (triangle == Triangle::kUpper);
+  auto lower = (triangle == Triangle::kLower);
+  PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                         n, n, c_ld, c_offset, c_buffer,
+                         ConstantOne<T>(), program,
+                         false, c_rotated, false, upper, lower, false);
 }

 // =================================================================================================
--- a/src/routines/level3/xsyr2k.hpp
+++ b/src/routines/level3/xsyr2k.hpp
@@ -30,13 +30,13 @@ class Xsyr2k: public Routine {
  Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");

  // Templated-precision implementation of the routine
-  StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                     const size_t n, const size_t k,
-                     const T alpha,
-                     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                     const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                     const T beta,
-                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+  void DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+               const size_t n, const size_t k,
+               const T alpha,
+               const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+               const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+               const T beta,
+               const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
 };

 // =================================================================================================
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -39,7 +39,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):

 // The main routine
 template <typename T>
-StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                            const size_t n, const size_t k,
                            const T alpha,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
@@ -47,7 +47,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
                            const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+  if ((n == 0) || (k == 0) ) { throw BLASError(StatusCode::kInvalidDimension); }

  // Computes whether or not the matrices are transposed in memory. This is based on their layout
  // (row or column-major) and whether or not they are requested to be pre-transposed.
@@ -65,10 +65,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
  // space. Also tests that the leading dimensions of:
  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
  //    matrix C cannot be less than N
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
-  if (ErrorIn(status)) { return status; }
+  TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+  TestMatrixC(n, n, c_buffer, c_offset, c_ld);

  // Calculates the ceiled versions of n and k
  auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -77,90 +75,76 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
  // Decides which kernel to run: the upper-triangular or lower-triangular version
  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";

-  // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
-  try {
+  // Loads the program from the database
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);

-    // Loads the program from the database
-    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+  // Determines whether or not temporary matrices are needed
+  auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
+                   a_rotated == false;

-    // Determines whether or not temporary matrices are needed
-    auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
-                     a_rotated == false;
+  // Creates the temporary matrices
+  auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+  auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);

-    // Creates the temporary matrices
-    auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+  // Events of all kernels (including pre/post processing kernels)
+  auto eventWaitList = std::vector<Event>();
+  auto emptyEventList = std::vector<Event>();

-    // Events of all kernels (including pre/post processing kernels)
-    auto eventWaitList = std::vector<Event>();
-    auto emptyEventList = std::vector<Event>();
+  // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
+  // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
+  // case nothing has to be done, these kernels can be skipped.
+  if (!a_no_temp) {
+    auto eventProcessA = Event();
+    PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
+                           a_one, a_two, a_ld, a_offset, a_buffer,
+                           n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
+                           ConstantOne<T>(), program,
+                           true, a_rotated, false);
+    eventWaitList.push_back(eventProcessA);
+  }

-    // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
-    // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
-    // case nothing has to be done, these kernels can be skipped.
-    if (!a_no_temp) {
-      auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
-                                      a_one, a_two, a_ld, a_offset, a_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      ConstantOne<T>(), program,
-                                      true, a_rotated, false);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventProcessA);
-    }
+  // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+  // modify the other triangle.
+  auto eventProcessC = Event();
+  PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
+                         n, n, c_ld, c_offset, c_buffer,
+                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                         ConstantOne<T>(), program,
+                         true, c_rotated, false);
+  eventWaitList.push_back(eventProcessC);

-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
-                                    n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    ConstantOne<T>(), program,
-                                    true, c_rotated, false);
-    if (ErrorIn(status)) { return status; }
-    eventWaitList.push_back(eventProcessC);
+  // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+  auto kernel = Kernel(program, kernel_name);

-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n_ceiled));
+  kernel.SetArgument(1, static_cast<int>(k_ceiled));
+  kernel.SetArgument(2, GetRealArg(alpha));
+  kernel.SetArgument(3, GetRealArg(beta));
+  kernel.SetArgument(4, a_temp());
+  kernel.SetArgument(5, a_temp());
+  kernel.SetArgument(6, c_temp());

-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, GetRealArg(alpha));
-      kernel.SetArgument(3, GetRealArg(beta));
-      kernel.SetArgument(4, a_temp());
-      kernel.SetArgument(5, a_temp());
-      kernel.SetArgument(6, c_temp());
+  // Computes the global and local thread sizes
+  auto global = std::vector<size_t>{
+    (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+    (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+  };
+  auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};

-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+  // Launches the kernel
+  auto eventKernel = Event();
+  RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
+  eventWaitList.push_back(eventKernel);

-      // Launches the kernel
-      auto eventKernel = Event();
-      status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
-      if (ErrorIn(status)) { return status; }
-      eventWaitList.push_back(eventKernel);
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
-                                      n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      ConstantOne<T>(), program,
-                                      false, c_rotated, false, upper, lower, false);
-      if (ErrorIn(status)) { return status; }
-
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+  // Runs the post-processing kernel
+  auto upper = (triangle == Triangle::kUpper);
+  auto lower = (triangle == Triangle::kLower);
+  PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+                         n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
+                         n, n, c_ld, c_offset, c_buffer,
+                         ConstantOne<T>(), program,
+                         false, c_rotated, false, upper, lower, false);
 }

 // =================================================================================================
--- a/src/routines/level3/xsyrk.hpp
+++ b/src/routines/level3/xsyrk.hpp
@@ -32,12 +32,12 @@ class Xsyrk: public Routine {
  Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");

  // Templated-precision implementation of the routine
-  StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
-                    const size_t n, const size_t k,
-                    const T alpha,
-                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                    const T beta,
-                    const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+  void DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+              const size_t n, const size_t k,
+              const T alpha,
+              const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+              const T beta,
+              const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
 };

 // =================================================================================================
--- a/Show More
+++ b/Show More