Merge pull request #69 from CNugteren/refactoring

Refactoring of the Routine class and file-renaming
This commit is contained in:
Cedric Nugteren 2016-06-19 14:03:53 +02:00 committed by GitHub
commit 395a0ef34e
276 changed files with 1274 additions and 1727 deletions

View file

@ -4,6 +4,7 @@ Development version (next release)
- Made it possible to compile the performance tests (clients) separately from the correctness tests
- Made a reference BLAS and head-to-head performance comparison optional in the clients
- Increased the verbosity of the "-verbose" option in the correctness tests
- Refactored the host code for better compilation times and fewer lines of code
- Improved the API documentation
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)

View file

@ -121,7 +121,7 @@ endif()
# ==================================================================================================
# Includes directories: CLBlast and OpenCL
include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS})
# ==================================================================================================
@ -140,19 +140,26 @@ set(PRECISIONS 32 64 3232 6464)
# ==================================================================================================
# Gathers all source-files
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/cache.cc
src/utilities.cc src/clblast_c.cc)
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
src/cache.cpp
src/clblast.cpp
src/clblast_c.cpp
src/routine.cpp
src/utilities.cpp
)
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVELX_ROUTINES})
set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cc)
set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cpp)
endforeach()
# Creates and links the library
@ -186,7 +193,7 @@ if(SAMPLES)
# Adds sample programs (C++)
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp)
target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
endforeach()
@ -211,7 +218,7 @@ if(TUNERS)
# Adds tuning executables
foreach(KERNEL ${KERNELS})
add_executable(clblast_tuner_${KERNEL} src/tuning/${KERNEL}.cc)
add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cpp)
target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
@ -257,7 +264,7 @@ if(CLIENTS OR TESTS)
endif()
# Sets the include directories
include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
include_directories(${clblast_SOURCE_DIR} ${REF_INCLUDES})
endif()
@ -268,24 +275,24 @@ endif()
if(CLIENTS)
# Creates the common performance-tests objects (requires CMake 2.8.8)
add_library(test_performance_common OBJECT test/performance/client.cc)
add_library(test_performance_common OBJECT test/performance/client.cpp)
# Compiles the performance-tests
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level1/${ROUTINE}.cc)
test/performance/routines/level1/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level2/${ROUTINE}.cc)
test/performance/routines/level2/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level3/${ROUTINE}.cc)
test/performance/routines/level3/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVELX_ROUTINES})
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/levelx/${ROUTINE}.cc)
test/performance/routines/levelx/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
@ -303,24 +310,24 @@ if(TESTS)
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
test/correctness/tester.cc test/correctness/testblas.cc)
test/correctness/tester.cpp test/correctness/testblas.cpp)
# Compiles the correctness-tests
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level1/${ROUTINE}.cc)
test/correctness/routines/level1/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level2/${ROUTINE}.cc)
test/correctness/routines/level2/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level3/${ROUTINE}.cc)
test/correctness/routines/level3/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${LEVELX_ROUTINES})
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/levelx/${ROUTINE}.cc)
test/correctness/routines/levelx/${ROUTINE}.cpp)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})

View file

@ -136,7 +136,7 @@ Note that CLBlast's tuners are based on the CLTune auto-tuning library, which ha
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):

View file

@ -68,8 +68,8 @@ enum class StatusCode {
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorDot = -2043, // Vector dot is not a valid OpenCL buffer
kInsufficientMemoryDot = -2042, // Vector dot's OpenCL buffer is too small
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
};
// Matrix layout and transpose types

View file

@ -77,8 +77,8 @@ typedef enum StatusCode_ {
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorDot = -2043, // Vector dot is not a valid OpenCL buffer
kInsufficientMemoryDot = -2042, // Vector dot's OpenCL buffer is too small
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
} StatusCode;
// Matrix layout and transpose types

View file

@ -1,144 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements all the basic functionality for the BLAS routines. This class serves as a
// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
// compiling the OpenCL kernel, connecting to the database, etc.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINE_H_
#define CLBLAST_ROUTINE_H_
#include <string>
#include <vector>
#include "internal/cache.h"
#include "internal/utilities.h"
#include "internal/database.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Base class template for the BLAS routines. It declares the shared state (queue, event, context,
// device, cached device properties, and the database connection) together with helpers for
// validating input buffers, launching kernels, and accessing the binary/program cache.
template <typename T>
class Routine {
public:
// Helper functions which check for errors in the status code: returns true for every status other
// than StatusCode::kSuccess
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
// Base class constructor: takes the queue and event to work with, the name of the routine, the
// list of routine/kernel names passed on to the database, and the precision
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision);
// Set-up phase of the kernel
StatusCode SetUp();
protected:
// Runs a kernel given the global and local thread sizes. Note that 'global' is taken by value
// whereas 'local' is taken by const-reference.
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents);
// As above, but without an event waiting list
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global,
const std::vector<size_t> &local, EventPointer event);
// Tests for valid inputs of matrices A, B, and C. Each takes the two matrix dimensions, the
// buffer, its offset, its leading dimension, and the size of a single data element.
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld, const size_t data_size);
StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld, const size_t data_size);
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld, const size_t data_size);
// As above, but for the matrix 'AP', which takes a single dimension and no leading dimension
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
const size_t offset, const size_t data_size);
// Tests for valid inputs of vector X and Y (size, buffer, offset, increment, element size)
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc, const size_t data_size);
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc, const size_t data_size);
// Tests for valid inputs of other vectors. These take no increment; the index vector is a buffer
// of unsigned integers rather than of type T.
StatusCode TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t data_size);
StatusCode TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
const size_t offset, const size_t data_size);
// Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
// to symmetric and triangular matrices through optional arguments (upper, lower, and
// diagonal_imag_zero, which all default to false).
StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false);
// Stores a newly compiled binary/program into the cache. Binaries are keyed on the device name,
// programs on the context; both are additionally keyed on the precision and the routine name.
void StoreBinaryToCache(const std::string& binary) const {
cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
}
void StoreProgramToCache(const Program& program) const {
cache::StoreProgramToCache(program, context_, precision_, routine_name_);
}
// Queries the cache and retrieve either a matching binary/program or a boolean whether a match
// exists. The first assumes that the binary/program is available in the cache and will throw an
// exception otherwise.
std::string GetBinaryFromCache() const {
return cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
}
Program GetProgramFromCache() const {
return cache::GetProgramFromCache(context_, precision_, routine_name_);
}
bool BinaryIsInCache() const {
return cache::BinaryIsInCache(device_name_, precision_, routine_name_);
}
bool ProgramIsInCache() const {
return cache::ProgramIsInCache(context_, precision_, routine_name_);
}
// Non-static variable for the precision. Note that the same variable (but static) might exist in
// a derived class.
const Precision precision_;
// The routine's name and its kernel-source in string form
const std::string routine_name_;
std::string source_string_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;
EventPointer event_;
const Context context_;
const Device device_;
// OpenCL device properties
const std::string device_name_;
const size_t max_work_item_dimensions_;
const std::vector<size_t> max_work_item_sizes_;
const size_t max_work_group_size_;
// Connection to the database for all the device-specific parameters
const Database db_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINE_H_
#endif

View file

@ -310,7 +310,7 @@ defaults = CalculateDefaults(bests)
bests = ConcatenateData(bests, defaults)
# Outputs the data as a C++ database
path_cpp_database = os.path.join(path_clblast, "include", "internal", "database")
path_cpp_database = os.path.join(path_clblast, "src", "database", "kernels")
print("## Producing a C++ database in '"+path_cpp_database+"'...")
PrintData(bests, path_cpp_database)

View file

@ -10,14 +10,14 @@
# This script automatically generates the bodies of the following files, creating the full CLBlast
# API interface and implementation (C, C++, and reference BLAS wrappers):
# clblast.h
# clblast.cc
# clblast.cpp
# clblast_c.h
# clblast_c.cc
# clblast_c.cpp
# wrapper_clblas.h
# wrapper_cblas.h
# It also generates the main functions for the correctness and performance tests as found in
# test/correctness/routines/levelX/xYYYY.cc
# test/performance/routines/levelX/xYYYY.cc
# test/correctness/routines/levelX/xYYYY.cpp
# test/performance/routines/levelX/xYYYY.cpp
# It also produces the API documentation found in doc/clblast.md
#
# ==================================================================================================
@ -200,7 +200,7 @@ def clblast_h(routines):
result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n"
return result
# The C++ API implementation (.cc)
# The C++ API implementation (.cpp)
def clblast_cc(routines):
result = ""
for routine in routines:
@ -237,7 +237,7 @@ def clblast_c_h(routines):
result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n"
return result
# The C API implementation (.cc)
# The C API implementation (.cpp)
def clblast_c_cc(routines):
result = ""
for routine in routines:
@ -379,14 +379,14 @@ if len(sys.argv) != 2:
path_clblast = sys.argv[1]
files = [
path_clblast+"/include/clblast.h",
path_clblast+"/src/clblast.cc",
path_clblast+"/src/clblast.cpp",
path_clblast+"/include/clblast_c.h",
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
path_clblast+"/test/wrapper_cblas.h",
path_clblast+"/src/clblast_c.cpp",
path_clblast+"/test/wrapper_clblas.hpp",
path_clblast+"/test/wrapper_cblas.hpp",
]
header_lines = [84, 74, 93, 22, 29, 41]
footer_lines = [17, 71, 19, 14, 6, 6]
footer_lines = [17, 75, 19, 14, 6, 6]
# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
@ -433,11 +433,11 @@ for i in xrange(0,len(files)):
for level in [1,2,3,4]:
for routine in routines[level-1]:
if routine.has_tests:
filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc"
filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
with open(filename, "w") as f:
body = ""
body += "#include \"correctness/testblas.h\"\n"
body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n"
body += "#include \"test/correctness/testblas.hpp\"\n"
body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
@ -459,11 +459,11 @@ for level in [1,2,3,4]:
for level in [1,2,3,4]:
for routine in routines[level-1]:
if routine.has_tests:
filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc"
filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
with open(filename, "w") as f:
body = ""
body += "#include \"performance/client.h\"\n"
body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n"
body += "#include \"test/performance/client.hpp\"\n"
body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"

121
src/buffer_test.hpp Normal file
View file

@ -0,0 +1,121 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
// templated and thus header-only.
//
// =================================================================================================
#ifndef CLBLAST_BUFFER_TEST_H_
#define CLBLAST_BUFFER_TEST_H_
#include "clblast.h"
namespace clblast {
// =================================================================================================
// Validates matrix 'A': checks the leading dimension against the first matrix dimension and
// verifies that the OpenCL buffer is large enough to hold the matrix at the given offset.
template <typename T>
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {

  // The leading dimension may not be smaller than the first dimension
  if (ld < one) { return StatusCode::kInvalidLeadDimA; }

  // Number of bytes needed: 'two - 1' full strides of 'ld' elements, one last run of 'one'
  // elements, plus the offset
  const size_t num_elements = ld * (two - 1) + one + offset;
  const size_t required_bytes = num_elements * sizeof(T);

  // Any exception raised while querying the buffer size is reported as an invalid matrix
  try {
    return (buffer.GetSize() < required_bytes) ? StatusCode::kInsufficientMemoryA
                                               : StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidMatrixA;
  }
}
// Validates matrix 'B': checks the leading dimension against the first matrix dimension and
// verifies that the OpenCL buffer is large enough to hold the matrix at the given offset.
template <typename T>
StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {

  // The leading dimension may not be smaller than the first dimension
  if (ld < one) { return StatusCode::kInvalidLeadDimB; }

  // Number of bytes needed: 'two - 1' full strides of 'ld' elements, one last run of 'one'
  // elements, plus the offset
  const size_t num_elements = ld * (two - 1) + one + offset;
  const size_t required_bytes = num_elements * sizeof(T);

  // Any exception raised while querying the buffer size is reported as an invalid matrix
  try {
    return (buffer.GetSize() < required_bytes) ? StatusCode::kInsufficientMemoryB
                                               : StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidMatrixB;
  }
}
// Validates matrix 'C': checks the leading dimension against the first matrix dimension and
// verifies that the OpenCL buffer is large enough to hold the matrix at the given offset.
template <typename T>
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {

  // The leading dimension may not be smaller than the first dimension
  if (ld < one) { return StatusCode::kInvalidLeadDimC; }

  // Number of bytes needed: 'two - 1' full strides of 'ld' elements, one last run of 'one'
  // elements, plus the offset
  const size_t num_elements = ld * (two - 1) + one + offset;
  const size_t required_bytes = num_elements * sizeof(T);

  // Any exception raised while querying the buffer size is reported as an invalid matrix
  try {
    return (buffer.GetSize() < required_bytes) ? StatusCode::kInsufficientMemoryC
                                               : StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidMatrixC;
  }
}
// Validates matrix 'AP': verifies that the OpenCL buffer can hold n*(n+1)/2 elements at the given
// offset. Failures are reported through the status codes of matrix 'A'.
template <typename T>
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {

  // Number of bytes needed for the n*(n+1)/2 elements plus the offset
  const size_t packed_elements = (n * (n + 1)) / 2;
  const size_t required_bytes = (packed_elements + offset) * sizeof(T);

  // Any exception raised while querying the buffer size is reported as an invalid matrix
  try {
    return (buffer.GetSize() < required_bytes) ? StatusCode::kInsufficientMemoryA
                                               : StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidMatrixA;
  }
}
// =================================================================================================
// Validates vector 'X': rejects a zero increment and verifies that the OpenCL buffer is large
// enough to hold 'n' strided elements at the given offset.
template <typename T>
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
                       const size_t inc) {

  // A stride of zero is not allowed
  if (inc == 0) { return StatusCode::kInvalidIncrementX; }

  // Number of bytes needed: the position of the last element, that element itself, and the offset
  const size_t last_position = (n - 1) * inc;
  const size_t required_bytes = (last_position + 1 + offset) * sizeof(T);

  // Any exception raised while querying the buffer size is reported as an invalid vector
  try {
    return (buffer.GetSize() < required_bytes) ? StatusCode::kInsufficientMemoryX
                                               : StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidVectorX;
  }
}
// Validates vector 'Y': rejects a zero increment and verifies that the OpenCL buffer is large
// enough to hold 'n' strided elements at the given offset.
template <typename T>
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
                       const size_t inc) {

  // A stride of zero is not allowed
  if (inc == 0) { return StatusCode::kInvalidIncrementY; }

  // Number of bytes needed: the position of the last element, that element itself, and the offset
  const size_t last_position = (n - 1) * inc;
  const size_t required_bytes = (last_position + 1 + offset) * sizeof(T);

  // Any exception raised while querying the buffer size is reported as an invalid vector
  try {
    return (buffer.GetSize() < required_bytes) ? StatusCode::kInsufficientMemoryY
                                               : StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidVectorY;
  }
}
// =================================================================================================
// Validates vector 'scalar': verifies that the OpenCL buffer can hold 'n' contiguous elements at
// the given offset.
template <typename T>
StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {

  // Number of bytes needed for 'n' elements plus the offset
  const size_t required_bytes = (n + offset) * sizeof(T);

  // Any exception raised while querying the buffer size is reported as an invalid vector
  try {
    return (buffer.GetSize() < required_bytes) ? StatusCode::kInsufficientMemoryScalar
                                               : StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidVectorScalar;
  }
}
// Validates vector 'index': verifies that the OpenCL buffer can hold 'n' contiguous elements at
// the given offset. Failures are reported through the status codes of the 'scalar' vector.
template <typename T>
StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {

  // Number of bytes needed for 'n' elements plus the offset
  const size_t required_bytes = (n + offset) * sizeof(T);

  // Any exception raised while querying the buffer size is reported as an invalid vector
  try {
    return (buffer.GetSize() < required_bytes) ? StatusCode::kInsufficientMemoryScalar
                                               : StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidVectorScalar;
  }
}
// =================================================================================================
} // namespace clblast
// CLBLAST_BUFFER_TEST_H_
#endif

View file

@ -15,10 +15,9 @@
#include <vector>
#include <mutex>
#include "internal/cache.h"
#include "cache.hpp"
namespace clblast {
namespace cache {
// =================================================================================================
// Stores the compiled binary or IR in the cache
@ -98,7 +97,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode ClearCache() {
StatusCode CacheClearAll() {
binary_cache_mutex_.lock();
binary_cache_.clear();
binary_cache_mutex_.unlock();
@ -109,5 +108,4 @@ StatusCode ClearCache() {
}
// =================================================================================================
} // namespace cache
} // namespace clblast

View file

@ -18,10 +18,9 @@
#include <vector>
#include <mutex>
#include "internal/utilities.h"
#include "utilities.hpp"
namespace clblast {
namespace cache {
// =================================================================================================
// The cache of compiled OpenCL binaries, along with some meta-data
@ -90,10 +89,9 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries
StatusCode ClearCache();
StatusCode CacheClearAll();
// =================================================================================================
} // namespace cache
} // namespace clblast
// CLBLAST_CACHE_H_

View file

@ -16,60 +16,60 @@
#include <string>
#include "clblast.h"
#include "internal/public_api.h"
#include "internal/cache.h"
#include "public_api.hpp"
#include "cache.hpp"
// BLAS level-1 includes
#include "internal/routines/level1/xswap.h"
#include "internal/routines/level1/xscal.h"
#include "internal/routines/level1/xcopy.h"
#include "internal/routines/level1/xaxpy.h"
#include "internal/routines/level1/xdot.h"
#include "internal/routines/level1/xdotu.h"
#include "internal/routines/level1/xdotc.h"
#include "internal/routines/level1/xnrm2.h"
#include "internal/routines/level1/xasum.h"
#include "internal/routines/level1/xsum.h" // non-BLAS function
#include "internal/routines/level1/xamax.h"
#include "internal/routines/level1/xmax.h" // non-BLAS function
#include "internal/routines/level1/xmin.h" // non-BLAS function
#include "routines/level1/xswap.hpp"
#include "routines/level1/xscal.hpp"
#include "routines/level1/xcopy.hpp"
#include "routines/level1/xaxpy.hpp"
#include "routines/level1/xdot.hpp"
#include "routines/level1/xdotu.hpp"
#include "routines/level1/xdotc.hpp"
#include "routines/level1/xnrm2.hpp"
#include "routines/level1/xasum.hpp"
#include "routines/level1/xsum.hpp" // non-BLAS routine
#include "routines/level1/xamax.hpp"
#include "routines/level1/xmax.hpp" // non-BLAS routine
#include "routines/level1/xmin.hpp" // non-BLAS routine
// BLAS level-2 includes
#include "internal/routines/level2/xgemv.h"
#include "internal/routines/level2/xgbmv.h"
#include "internal/routines/level2/xhemv.h"
#include "internal/routines/level2/xhbmv.h"
#include "internal/routines/level2/xhpmv.h"
#include "internal/routines/level2/xsymv.h"
#include "internal/routines/level2/xsbmv.h"
#include "internal/routines/level2/xspmv.h"
#include "internal/routines/level2/xtrmv.h"
#include "internal/routines/level2/xtbmv.h"
#include "internal/routines/level2/xtpmv.h"
#include "internal/routines/level2/xger.h"
#include "internal/routines/level2/xgeru.h"
#include "internal/routines/level2/xgerc.h"
#include "internal/routines/level2/xher.h"
#include "internal/routines/level2/xhpr.h"
#include "internal/routines/level2/xher2.h"
#include "internal/routines/level2/xhpr2.h"
#include "internal/routines/level2/xsyr.h"
#include "internal/routines/level2/xspr.h"
#include "internal/routines/level2/xsyr2.h"
#include "internal/routines/level2/xspr2.h"
#include "routines/level2/xgemv.hpp"
#include "routines/level2/xgbmv.hpp"
#include "routines/level2/xhemv.hpp"
#include "routines/level2/xhbmv.hpp"
#include "routines/level2/xhpmv.hpp"
#include "routines/level2/xsymv.hpp"
#include "routines/level2/xsbmv.hpp"
#include "routines/level2/xspmv.hpp"
#include "routines/level2/xtrmv.hpp"
#include "routines/level2/xtbmv.hpp"
#include "routines/level2/xtpmv.hpp"
#include "routines/level2/xger.hpp"
#include "routines/level2/xgeru.hpp"
#include "routines/level2/xgerc.hpp"
#include "routines/level2/xher.hpp"
#include "routines/level2/xhpr.hpp"
#include "routines/level2/xher2.hpp"
#include "routines/level2/xhpr2.hpp"
#include "routines/level2/xsyr.hpp"
#include "routines/level2/xspr.hpp"
#include "routines/level2/xsyr2.hpp"
#include "routines/level2/xspr2.hpp"
// BLAS level-3 includes
#include "internal/routines/level3/xgemm.h"
#include "internal/routines/level3/xsymm.h"
#include "internal/routines/level3/xhemm.h"
#include "internal/routines/level3/xsyrk.h"
#include "internal/routines/level3/xherk.h"
#include "internal/routines/level3/xsyr2k.h"
#include "internal/routines/level3/xher2k.h"
#include "internal/routines/level3/xtrmm.h"
#include "routines/level3/xgemm.hpp"
#include "routines/level3/xsymm.hpp"
#include "routines/level3/xhemm.hpp"
#include "routines/level3/xsyrk.hpp"
#include "routines/level3/xherk.hpp"
#include "routines/level3/xsyr2k.hpp"
#include "routines/level3/xher2k.hpp"
#include "routines/level3/xtrmm.hpp"
// Extra includes (level-x)
#include "internal/routines/levelx/xomatcopy.h"
// Level-x includes (non-BLAS)
#include "routines/levelx/xomatcopy.hpp"
namespace clblast {
@ -2120,9 +2120,10 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
// =================================================================================================
// Clears the cache of stored binaries
StatusCode ClearCache() { return cache::ClearCache(); }
StatusCode ClearCache() { return CacheClearAll(); }
// Fills the cache with all binaries for a specific device
// TODO: Add half-precision FP16 set-up calls
StatusCode FillCache(const cl_device_id device) {
try {
@ -2171,7 +2172,7 @@ StatusCode FillCache(const cl_device_id device) {
Xsyr2<float>(queue, nullptr).SetUp(); Xsyr2<double>(queue, nullptr).SetUp();
Xspr2<float>(queue, nullptr).SetUp(); Xspr2<double>(queue, nullptr).SetUp();
// Runs all the level 1 set-up functions
// Runs all the level 3 set-up functions
Xgemm<float>(queue, nullptr).SetUp(); Xgemm<double>(queue, nullptr).SetUp(); Xgemm<float2>(queue, nullptr).SetUp(); Xgemm<double2>(queue, nullptr).SetUp();
Xsymm<float>(queue, nullptr).SetUp(); Xsymm<double>(queue, nullptr).SetUp(); Xsymm<float2>(queue, nullptr).SetUp(); Xsymm<double2>(queue, nullptr).SetUp();
Xhemm<float2>(queue, nullptr).SetUp(); Xhemm<double2>(queue, nullptr).SetUp();
@ -2181,6 +2182,9 @@ StatusCode FillCache(const cl_device_id device) {
Xher2k<float2,float>(queue, nullptr).SetUp(); Xher2k<double2,double>(queue, nullptr).SetUp();
Xtrmm<float>(queue, nullptr).SetUp(); Xtrmm<double>(queue, nullptr).SetUp(); Xtrmm<float2>(queue, nullptr).SetUp(); Xtrmm<double2>(queue, nullptr).SetUp();
// Runs all the level-x (non-BLAS) set-up functions
Xomatcopy<float>(queue, nullptr).SetUp(); Xomatcopy<double>(queue, nullptr).SetUp(); Xomatcopy<float2>(queue, nullptr).SetUp(); Xomatcopy<double2>(queue, nullptr).SetUp();
} catch (...) { return StatusCode::kBuildProgramFailure; }
return StatusCode::kSuccess;
}

View file

@ -15,7 +15,7 @@
#include "clblast_c.h"
#include "clblast.h"
#include "internal/utilities.h"
#include "utilities.hpp"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;

View file

@ -11,18 +11,18 @@
//
// =================================================================================================
#include "internal/database.h"
#include "internal/database/xaxpy.h"
#include "internal/database/xdot.h"
#include "internal/database/xgemv.h"
#include "internal/database/xger.h"
#include "internal/database/xgemm.h"
#include "internal/database/copy.h"
#include "internal/database/pad.h"
#include "internal/database/transpose.h"
#include "internal/database/padtranspose.h"
#include "utilities.hpp"
#include "internal/utilities.h"
#include "database/database.hpp"
#include "database/kernels/xaxpy.hpp"
#include "database/kernels/xdot.hpp"
#include "database/kernels/xgemv.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xgemm.hpp"
#include "database/kernels/copy.hpp"
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
namespace clblast {
// =================================================================================================

View file

@ -21,7 +21,7 @@
#include <vector>
#include <unordered_map>
#include "internal/utilities.h"
#include "utilities.hpp"
namespace clblast {
// =================================================================================================

View file

@ -1,431 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Routine base class (see the header for information about the class).
//
// =================================================================================================
#include <string>
#include <vector>
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// Constructor: not much here, because no status codes can be returned. It only stores its
// arguments and queries the queue/device for the properties derived from them; the program itself
// is obtained later in SetUp().
//   queue:     OpenCL queue the routine runs on (also the source of the context and the device)
//   event:     event pointer stored for use by the routine
//   name:      name of the routine, used for cache look-ups and the ROUTINE_<name> define
//   routines:  names passed on to the database constructor
//   precision: precision this routine instance operates in
// NOTE(review): the initialisers below use previously initialised members (e.g. context_ reads
// queue_), which presumably matches the declaration order in the header -- confirm when reordering.
template <typename T>
Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision):
precision_(precision),
routine_name_(name),
queue_(queue),
event_(event),
// Context and device are taken from the stored queue
context_(queue_.GetContext()),
device_(queue_.GetDevice()),
device_name_(device_.Name()),
// Device limits are queried once here; RunKernel() validates the local thread sizes against them
max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
max_work_item_sizes_(device_.MaxWorkItemSizes()),
max_work_group_size_(device_.MaxWorkGroupSize()),
db_(queue_, routines, precision_) {
}
// =================================================================================================
// Separate set-up function to allow for status codes to be returned. Makes sure that a built
// program for this routine is present in the program cache, trying in order:
//   1) the program cache (context-specific): nothing to do,
//   2) the binary cache (device-specific): a program is built from the cached binary,
//   3) compilation from source: both the binary and the program are stored in the caches.
// Returns kSuccess when a program is available afterwards, kNoDoublePrecision/kNoHalfPrecision
// when the device lacks the required extension, kInvalidBinary when the build reports an invalid
// binary, and kBuildProgramFailure for build errors and for any exception thrown on the way.
template <typename T>
StatusCode Routine<T>::SetUp() {

  // Queries the cache to see whether or not the program (context-specific) is already there
  if (ProgramIsInCache()) { return StatusCode::kSuccess; }

  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
  // is, a program is created and stored in the cache
  if (BinaryIsInCache()) {
    try {
      auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
      auto program = Program(device_, context_, binary);
      auto options = std::vector<std::string>();

      // The build status is inspected (as in the compile-from-source path below) such that a
      // program which failed to build is never stored in the cache and reported as a success
      const auto build_status = program.Build(device_, options);
      if (build_status == BuildStatus::kError) { return StatusCode::kBuildProgramFailure; }
      if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
      StoreProgramToCache(program);
    } catch (...) { return StatusCode::kBuildProgramFailure; }
    return StatusCode::kSuccess;
  }

  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
  // program will be added to the cache.

  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
  const auto extensions = device_.Capabilities();
  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
      return StatusCode::kNoDoublePrecision;
    }
  }

  // As above, but for cl_khr_fp16 (half precision)
  if (precision_ == Precision::kHalf) {
    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
      return StatusCode::kNoHalfPrecision;
    }
  }

  // Loads the common header (typedefs and defines and such). The included file has to provide a
  // complete string-literal expression, since it is pasted in as the initialiser.
  std::string common_header =
    #include "kernels/common.opencl"
  ;

  // Collects the parameters for this device in the form of defines, and adds the precision
  auto defines = db_.GetDefines();
  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";

  // Adds the name of the routine as a define
  defines += "#define ROUTINE_"+routine_name_+"\n";

  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
  // performance, but might result in a reduced accuracy.
  if (device_.IsAMD() && device_.IsGPU()) {
    defines += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device_.IsAMD() && device_.IsGPU()) {
    defines += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device_.IsARM() && device_.IsGPU()) {
    defines += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Combines everything together into a single source string: the defines have to come first
  // because the header and the kernel source are configured through them
  const auto source_string = defines + common_header + source_string_;

  // Compiles the kernel
  try {
    auto program = Program(context_, source_string);
    auto options = std::vector<std::string>();
    const auto build_status = program.Build(device_, options);

    // Checks for compiler crashes/errors/warnings
    if (build_status == BuildStatus::kError) {
      const auto message = program.GetBuildInfo(device_);
      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
      return StatusCode::kBuildProgramFailure;
    }
    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }

    // Store the compiled binary and program in the cache
    const auto binary = program.GetIR();
    StoreBinaryToCache(binary);
    StoreProgramToCache(program);
  } catch (...) { return StatusCode::kBuildProgramFailure; }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}
// =================================================================================================
// Validates the thread configuration and the local memory usage of a kernel and then launches it.
//   kernel:        the kernel to launch (its arguments are expected to be set already)
//   global:        global thread sizes; taken by value because entries smaller than the matching
//                  local size are raised to that local size before launching
//   local:         local (work-group) thread sizes
//   event:         event pointer passed on to the launch
//   waitForEvents: events passed on to the launch as waiting list
// Returns kSuccess after a successful launch, one of the kInvalidLocal* codes when the thread
// configuration or the local memory usage is not valid for the device, and kKernelLaunchError
// when the launch itself throws.
template <typename T>
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global,
                                 const std::vector<size_t> &local, EventPointer event,
                                 std::vector<Event>& waitForEvents) {

  // Tests for validity of the local thread sizes. A local size is needed for every global
  // dimension: the comparison loop below reads local[i] for each global[i], which would be an
  // out-of-bounds read otherwise.
  if (local.size() > max_work_item_dimensions_ || local.size() < global.size()) {
    return StatusCode::kInvalidLocalNumDimensions;
  }
  for (auto i=size_t{0}; i<local.size(); ++i) {
    if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; }
  }
  auto local_size = size_t{1};
  for (auto &item: local) { local_size *= item; }
  if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }

  // Make sure the global thread sizes are at least equal to the local sizes
  for (auto i=size_t{0}; i<global.size(); ++i) {
    if (global[i] < local[i]) { global[i] = local[i]; }
  }

  // Tests for local memory usage
  const auto local_mem_usage = kernel.LocalMemUsage(device_);
  if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }

  // Launches the kernel (and checks for launch errors)
  try {
    kernel.Launch(queue_, global, local, event, waitForEvents);
  } catch (...) { return StatusCode::kKernelLaunchError; }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}
// Convenience overload for launches without dependencies: forwards to the RunKernel overload
// taking a waiting list, passing an empty one.
template <typename T>
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global,
                                 const std::vector<size_t> &local, EventPointer event) {
  std::vector<Event> no_dependencies;
  return RunKernel(kernel, global, local, event, no_dependencies);
}
// =================================================================================================
// Validates matrix A. Returns kInvalidLeadDimA when the leading dimension 'ld' is smaller than
// 'one', kInsufficientMemoryA when the buffer holds fewer than (ld*(two-1) + one + offset)
// elements of 'data_size' bytes each, kInvalidMatrixA when querying the buffer size throws, and
// kSuccess otherwise.
template <typename T>
StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
                                   const size_t offset, const size_t ld, const size_t data_size) {
  if (one > ld) { return StatusCode::kInvalidLeadDimA; }
  try {
    // Index one past the last element that is accessed, counted from the start of the buffer
    const auto num_elements = ld*(two-1) + one + offset;
    if (buffer.GetSize() < num_elements*data_size) { return StatusCode::kInsufficientMemoryA; }
  }
  catch (...) {
    return StatusCode::kInvalidMatrixA;
  }
  return StatusCode::kSuccess;
}
// Validates matrix B. Returns kInvalidLeadDimB when the leading dimension 'ld' is smaller than
// 'one', kInsufficientMemoryB when the buffer holds fewer than (ld*(two-1) + one + offset)
// elements of 'data_size' bytes each, kInvalidMatrixB when querying the buffer size throws, and
// kSuccess otherwise.
template <typename T>
StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
                                   const size_t offset, const size_t ld, const size_t data_size) {
  if (one > ld) { return StatusCode::kInvalidLeadDimB; }
  try {
    // Index one past the last element that is accessed, counted from the start of the buffer
    const auto num_elements = ld*(two-1) + one + offset;
    if (buffer.GetSize() < num_elements*data_size) { return StatusCode::kInsufficientMemoryB; }
  }
  catch (...) {
    return StatusCode::kInvalidMatrixB;
  }
  return StatusCode::kSuccess;
}
// Validates matrix C. Returns kInvalidLeadDimC when the leading dimension 'ld' is smaller than
// 'one', kInsufficientMemoryC when the buffer holds fewer than (ld*(two-1) + one + offset)
// elements of 'data_size' bytes each, kInvalidMatrixC when querying the buffer size throws, and
// kSuccess otherwise.
template <typename T>
StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
                                   const size_t offset, const size_t ld, const size_t data_size) {
  if (one > ld) { return StatusCode::kInvalidLeadDimC; }
  try {
    // Index one past the last element that is accessed, counted from the start of the buffer
    const auto num_elements = ld*(two-1) + one + offset;
    if (buffer.GetSize() < num_elements*data_size) { return StatusCode::kInsufficientMemoryC; }
  }
  catch (...) {
    return StatusCode::kInvalidMatrixC;
  }
  return StatusCode::kSuccess;
}
// Validates the packed matrix AP, which needs n*(n+1)/2 elements after 'offset'. Returns
// kInsufficientMemoryA when the buffer is smaller than that (in units of 'data_size' bytes),
// kInvalidMatrixA when querying the buffer size throws, and kSuccess otherwise. Note that the
// status codes of matrix A are re-used here.
template <typename T>
StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
                                    const size_t offset, const size_t data_size) {
  try {
    const auto num_elements = ((n*(n+1))/2) + offset;
    if (buffer.GetSize() < num_elements*data_size) { return StatusCode::kInsufficientMemoryA; }
  }
  catch (...) {
    return StatusCode::kInvalidMatrixA;
  }
  return StatusCode::kSuccess;
}
// =================================================================================================
// Validates vector X. Returns kInvalidIncrementX for a zero increment, kInsufficientMemoryX when
// the buffer holds fewer than ((n-1)*inc + 1 + offset) elements of 'data_size' bytes each,
// kInvalidVectorX when querying the buffer size throws, and kSuccess otherwise.
template <typename T>
StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
                                   const size_t inc, const size_t data_size) {
  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
  try {
    // Index one past the last element that is accessed, counted from the start of the buffer
    const auto num_elements = (n-1)*inc + 1 + offset;
    if (buffer.GetSize() < num_elements*data_size) { return StatusCode::kInsufficientMemoryX; }
  }
  catch (...) {
    return StatusCode::kInvalidVectorX;
  }
  return StatusCode::kSuccess;
}
// Validates vector Y. Returns kInvalidIncrementY for a zero increment, kInsufficientMemoryY when
// the buffer holds fewer than ((n-1)*inc + 1 + offset) elements of 'data_size' bytes each,
// kInvalidVectorY when querying the buffer size throws, and kSuccess otherwise.
template <typename T>
StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
                                   const size_t inc, const size_t data_size) {
  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
  try {
    // Index one past the last element that is accessed, counted from the start of the buffer
    const auto num_elements = (n-1)*inc + 1 + offset;
    if (buffer.GetSize() < num_elements*data_size) { return StatusCode::kInsufficientMemoryY; }
  }
  catch (...) {
    return StatusCode::kInvalidVectorY;
  }
  return StatusCode::kSuccess;
}
// =================================================================================================
// Validates the 'dot' output vector (which has no increment). Returns kInsufficientMemoryDot when
// the buffer holds fewer than (n + offset) elements of 'data_size' bytes each, kInvalidVectorDot
// when querying the buffer size throws, and kSuccess otherwise.
template <typename T>
StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
                                     const size_t data_size) {
  try {
    const auto num_elements = n + offset;
    if (buffer.GetSize() < num_elements*data_size) { return StatusCode::kInsufficientMemoryDot; }
  }
  catch (...) {
    return StatusCode::kInvalidVectorDot;
  }
  return StatusCode::kSuccess;
}
// Validates the index output vector (a buffer of unsigned integers, without an increment).
// Returns kInsufficientMemoryDot when the buffer holds fewer than (n + offset) elements of
// 'data_size' bytes each, kInvalidVectorDot when querying the buffer size throws, and kSuccess
// otherwise. Note that the status codes of the 'dot' vector are re-used here.
template <typename T>
StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
                                       const size_t offset, const size_t data_size) {
  try {
    const auto num_elements = n + offset;
    if (buffer.GetSize() < num_elements*data_size) { return StatusCode::kInsufficientMemoryDot; }
  }
  catch (...) {
    return StatusCode::kInvalidVectorDot;
  }
  return StatusCode::kSuccess;
}
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. One out of six kernels
// is selected: a "fast" copy/transpose kernel when source and destination have the same shape and
// no extra processing is requested, and otherwise the general copy/transpose kernel in its padding
// (do_pad) or non-padding form. Returns the status of the kernel launch, or kInvalidKernel when
// creating the kernel, setting its arguments or computing the thread sizes throws.
template <typename T>
StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
                                              const size_t src_one, const size_t src_two,
                                              const size_t src_ld, const size_t src_offset,
                                              const Buffer<T> &src,
                                              const size_t dest_one, const size_t dest_two,
                                              const size_t dest_ld, const size_t dest_offset,
                                              const Buffer<T> &dest,
                                              const T alpha,
                                              const Program &program, const bool do_pad,
                                              const bool do_transpose, const bool do_conjugate,
                                              const bool upper, const bool lower,
                                              const bool diagonal_imag_zero) {

  // The fast kernels only receive the leading dimension and the two buffers, so they can only be
  // considered without offsets, with identical source/destination shapes and without any of the
  // conjugate/triangular/diagonal options
  const auto no_offsets = (src_offset == 0) && (dest_offset == 0);
  const auto same_shape = (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
  const auto no_extras = !do_conjugate && !upper && !lower && !diagonal_imag_zero;
  auto use_fast_kernel = no_offsets && same_shape && no_extras;

  // On top of that, the sizes have to be multiples of the tuning parameters of the fast kernels.
  // The database is only consulted when the fast kernel is still a candidate.
  if (use_fast_kernel) {
    if (do_transpose) {
      use_fast_kernel = IsMultiple(src_ld, db_["TRA_WPT"]) &&
                        IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
                        IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"]);
    }
    else {
      use_fast_kernel = IsMultiple(src_ld, db_["COPY_VW"]) &&
                        IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
                        IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"]);
    }
  }

  // Picks the name of the kernel to run
  auto kernel_name = std::string{};
  if (do_transpose) {
    if (use_fast_kernel) { kernel_name = "TransposeMatrixFast"; }
    else if (do_pad) { kernel_name = "TransposePadMatrix"; }
    else { kernel_name = "TransposeMatrix"; }
  }
  else {
    if (use_fast_kernel) { kernel_name = "CopyMatrixFast"; }
    else if (do_pad) { kernel_name = "CopyPadMatrix"; }
    else { kernel_name = "CopyMatrix"; }
  }

  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
  auto alpha_buffer = Buffer<T>(context_, 1);
  alpha_buffer.Write(queue_, 1, &alpha);

  try {

    // Retrieves the kernel from the compiled binary
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments: a short list for the fast kernels, the full list otherwise
    if (use_fast_kernel) {
      kernel.SetArgument(0, static_cast<int>(src_ld));
      kernel.SetArgument(1, src());
      kernel.SetArgument(2, dest());
      kernel.SetArgument(3, alpha_buffer());
    }
    else {
      kernel.SetArgument(0, static_cast<int>(src_one));
      kernel.SetArgument(1, static_cast<int>(src_two));
      kernel.SetArgument(2, static_cast<int>(src_ld));
      kernel.SetArgument(3, static_cast<int>(src_offset));
      kernel.SetArgument(4, src());
      kernel.SetArgument(5, static_cast<int>(dest_one));
      kernel.SetArgument(6, static_cast<int>(dest_two));
      kernel.SetArgument(7, static_cast<int>(dest_ld));
      kernel.SetArgument(8, static_cast<int>(dest_offset));
      kernel.SetArgument(9, dest());
      kernel.SetArgument(10, alpha_buffer());

      // The padding kernels take the conjugate flag, the non-padding ones the triangular flags
      if (do_pad) {
        kernel.SetArgument(11, static_cast<int>(do_conjugate));
      }
      else {
        kernel.SetArgument(11, static_cast<int>(upper));
        kernel.SetArgument(12, static_cast<int>(lower));
        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
      }
    }

    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
    // parameters in the database. Case 1 of 4: fast transpose.
    if (do_transpose && use_fast_kernel) {
      const auto global = std::vector<size_t>{
        dest_one / db_["TRA_WPT"],
        dest_two / db_["TRA_WPT"]
      };
      const auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
      return RunKernel(kernel, global, local, event, waitForEvents);
    }

    // Case 2 of 4: general (pad-)transpose
    if (do_transpose) {
      const auto global = std::vector<size_t>{
        Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
        Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])
      };
      const auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
      return RunKernel(kernel, global, local, event, waitForEvents);
    }

    // Case 3 of 4: fast copy
    if (use_fast_kernel) {
      const auto global = std::vector<size_t>{
        dest_one / db_["COPY_VW"],
        dest_two / db_["COPY_WPT"]
      };
      const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
      return RunKernel(kernel, global, local, event, waitForEvents);
    }

    // Case 4 of 4: general (pad-)copy
    const auto global = std::vector<size_t>{
      Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
      Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])
    };
    const auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
    return RunKernel(kernel, global, local, event, waitForEvents);

  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class: explicit instantiations of Routine for the five element types
// listed here (half, single and double precision, plus single and double precision complex). The
// member functions are defined in this source file, so presumably only these instantiations are
// available to code that includes just the header -- add a line here to support another type.
template class Routine<half>;
template class Routine<float>;
template class Routine<double>;
template class Routine<float2>;
template class Routine<double2>;
// =================================================================================================
} // namespace clblast

131
src/routine.cpp Normal file
View file

@ -0,0 +1,131 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Routine base class (see the header for information about the class).
//
// =================================================================================================
#include <string>
#include <vector>
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// Constructor: not much here, because no status codes can be returned. It only stores its
// arguments and queries the queue/device for the properties derived from them; the program itself
// is obtained later in SetUp().
//   queue:     OpenCL queue the routine runs on (also the source of the context and the device)
//   event:     event pointer stored for use by the routine
//   name:      name of the routine, used for cache look-ups and the ROUTINE_<name> define
//   routines:  names passed on to the database constructor
//   precision: precision this routine instance operates in
// The initialisers below use previously initialised members (e.g. context_ reads queue_), so they
// depend on the order in which the members are declared in the class.
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision):
precision_(precision),
routine_name_(name),
queue_(queue),
event_(event),
// Context and device are taken from the stored queue
context_(queue_.GetContext()),
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_) {
}
// =================================================================================================
// Separate set-up function to allow for status codes to be returned. Makes sure that a built
// program for this routine is present in the program cache, trying in order:
//   1) the program cache (context-specific): nothing to do,
//   2) the binary cache (device-specific): a program is built from the cached binary,
//   3) compilation from source: both the binary and the program are stored in the caches.
// Returns kSuccess when a program is available afterwards, kNoDoublePrecision/kNoHalfPrecision
// when the device lacks the required extension, kInvalidBinary when the build reports an invalid
// binary, and kBuildProgramFailure for build errors and for any exception thrown on the way.
StatusCode Routine::SetUp() {

  // Queries the cache to see whether or not the program (context-specific) is already there
  if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }

  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
  // is, a program is created and stored in the cache
  if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
    try {
      auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
      auto program = Program(device_, context_, binary);
      auto options = std::vector<std::string>();

      // The build status is inspected (as in the compile-from-source path below) such that a
      // program which failed to build is never stored in the cache and reported as a success
      const auto build_status = program.Build(device_, options);
      if (build_status == BuildStatus::kError) { return StatusCode::kBuildProgramFailure; }
      if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
      StoreProgramToCache(program, context_, precision_, routine_name_);
    } catch (...) { return StatusCode::kBuildProgramFailure; }
    return StatusCode::kSuccess;
  }

  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
  // program will be added to the cache.

  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
  const auto extensions = device_.Capabilities();
  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
      return StatusCode::kNoDoublePrecision;
    }
  }

  // As above, but for cl_khr_fp16 (half precision)
  if (precision_ == Precision::kHalf) {
    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
      return StatusCode::kNoHalfPrecision;
    }
  }

  // Loads the common header (typedefs and defines and such). The included file has to provide a
  // complete string-literal expression, since it is pasted in as the initialiser.
  std::string common_header =
    #include "kernels/common.opencl"
  ;

  // Collects the parameters for this device in the form of defines, and adds the precision
  auto defines = db_.GetDefines();
  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";

  // Adds the name of the routine as a define
  defines += "#define ROUTINE_"+routine_name_+"\n";

  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
  // performance, but might result in a reduced accuracy.
  if (device_.IsAMD() && device_.IsGPU()) {
    defines += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device_.IsAMD() && device_.IsGPU()) {
    defines += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device_.IsARM() && device_.IsGPU()) {
    defines += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Combines everything together into a single source string: the defines have to come first
  // because the header and the kernel source are configured through them
  const auto source_string = defines + common_header + source_string_;

  // Compiles the kernel
  try {
    auto program = Program(context_, source_string);
    auto options = std::vector<std::string>();
    const auto build_status = program.Build(device_, options);

    // Checks for compiler crashes/errors/warnings
    if (build_status == BuildStatus::kError) {
      const auto message = program.GetBuildInfo(device_);
      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
      return StatusCode::kBuildProgramFailure;
    }
    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }

    // Store the compiled binary and program in the cache
    const auto binary = program.GetIR();
    StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
    StoreProgramToCache(program, context_, precision_, routine_name_);
  } catch (...) { return StatusCode::kBuildProgramFailure; }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}
// =================================================================================================
} // namespace clblast

68
src/routine.hpp Normal file
View file

@ -0,0 +1,68 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements all the basic functionality for the BLAS routines. This class serves as a
// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
// compiling the OpenCL kernel, connecting to the database, etc.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINE_H_
#define CLBLAST_ROUTINE_H_
#include <string>
#include <vector>
#include "utilities.hpp"
#include "cache.hpp"
#include "buffer_test.hpp"
#include "database/database.hpp"
#include "routines/common.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class. The class itself is not templated:
// the precision is a run-time value stored per instance (see precision_ below).
class Routine {
public:
// Base class constructor. Stores the queue and event, derives the context, device and device name
// from the queue, and sets up the database from 'routines' and 'precision'. 'name' identifies the
// routine in the program/binary caches.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision);
// Set-up phase of the kernel: makes sure a built program for this routine is in the cache,
// compiling it when needed. Returns a status code, which is why this is not done in the constructor.
StatusCode SetUp();
protected:
// NOTE: the members below are initialised in the order in which they are declared, and the
// constructor relies on that (context_, device_ and db_ read queue_; device_name_ reads device_).
// Keep this order when adding or moving members.
// Non-static variable for the precision
const Precision precision_;
// The routine's name and its kernel-source in string form. The source string is not set by the
// base class constructor; SetUp() appends it to the defines and the common header when compiling.
const std::string routine_name_;
std::string source_string_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;
EventPointer event_;
const Context context_;
const Device device_;
// OpenCL device properties
const std::string device_name_;
// Connection to the database for all the device-specific parameters
const Database db_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINE_H_
#endif

65
src/routines/common.cpp Normal file
View file

@ -0,0 +1,65 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the common routine functions (see the header for more information).
//
// =================================================================================================
#include <vector>
#include "routines/common.hpp"
namespace clblast {
// =================================================================================================
// Validates the thread configuration and the local memory usage of a kernel and then launches it.
//   kernel:        the kernel to launch (its arguments are expected to be set already)
//   queue:         the queue to launch the kernel on
//   device:        the device, queried for its work-item/work-group limits and local memory
//   global:        global thread sizes; taken by value because entries smaller than the matching
//                  local size are raised to that local size before launching
//   local:         local (work-group) thread sizes
//   event:         event pointer passed on to the launch
//   waitForEvents: events passed on to the launch as waiting list
// Returns kSuccess after a successful launch, one of the kInvalidLocal* codes when the thread
// configuration or the local memory usage is not valid for the device, and kKernelLaunchError
// when the launch itself throws.
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                     std::vector<size_t> global, const std::vector<size_t> &local,
                     EventPointer event, std::vector<Event>& waitForEvents) {

  // Tests for validity of the local thread sizes. A local size is needed for every global
  // dimension: the comparison loop below reads local[i] for each global[i], which would be an
  // out-of-bounds read otherwise.
  if (local.size() > device.MaxWorkItemDimensions() || local.size() < global.size()) {
    return StatusCode::kInvalidLocalNumDimensions;
  }
  const auto max_work_item_sizes = device.MaxWorkItemSizes();
  for (auto i=size_t{0}; i<local.size(); ++i) {
    if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
  }
  auto local_size = size_t{1};
  for (auto &item: local) { local_size *= item; }
  if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }

  // Make sure the global thread sizes are at least equal to the local sizes
  for (auto i=size_t{0}; i<global.size(); ++i) {
    if (global[i] < local[i]) { global[i] = local[i]; }
  }

  // Tests for local memory usage
  const auto local_mem_usage = kernel.LocalMemUsage(device);
  if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }

  // Launches the kernel (and checks for launch errors)
  try {
    kernel.Launch(queue, global, local, event, waitForEvents);
  } catch (...) { return StatusCode::kKernelLaunchError; }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}
// Convenience overload for launches without dependencies: forwards to the RunKernel overload
// taking a waiting list, passing an empty one.
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                     std::vector<size_t> global, const std::vector<size_t> &local,
                     EventPointer event) {
  std::vector<Event> no_dependencies;
  return RunKernel(kernel, queue, device, global, local, event, no_dependencies);
}
// =================================================================================================
} // namespace clblast

173
src/routines/common.hpp Normal file
View file

@ -0,0 +1,173 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the interfaces to common kernels, such as copying, padding, and
// transposing a matrix. These functions are templated and thus header-only. This file also contains
// other common functions to routines, such as a function to launch a kernel.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_COMMON_H_
#define CLBLAST_ROUTINES_COMMON_H_
#include <string>
#include <vector>
#include "clblast.h"
#include "clpp11.hpp"
#include "database/database.hpp"
namespace clblast {
// =================================================================================================
// Launches a kernel on the given queue and returns a status code. 'global' and 'local' are the
// global and local (work-group) thread sizes, 'event' is the event pointer handed to the launch,
// and 'waitForEvents' is the waiting list for the launch. 'global' is taken by value, so the
// caller's vector is never modified.
// NOTE(review): the original description also mentions waiting for completion and error checking;
// confirm against the implementation in routines/common.cpp.
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, std::vector<Event>& waitForEvents);
// As above, but without an event waiting list
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event);
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments. One out of six kernels
// is selected: a "fast" copy/transpose kernel when source and destination have the same shape and
// no extra processing is requested, and otherwise the general copy/transpose kernel in its padding
// (do_pad) or non-padding form. Returns the status of the kernel launch, or kInvalidKernel when
// creating the kernel, setting its arguments or computing the thread sizes throws.
template <typename T>
StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
                                  const Database &db,
                                  EventPointer event, std::vector<Event>& waitForEvents,
                                  const size_t src_one, const size_t src_two,
                                  const size_t src_ld, const size_t src_offset,
                                  const Buffer<T> &src,
                                  const size_t dest_one, const size_t dest_two,
                                  const size_t dest_ld, const size_t dest_offset,
                                  const Buffer<T> &dest,
                                  const T alpha,
                                  const Program &program, const bool do_pad,
                                  const bool do_transpose, const bool do_conjugate,
                                  const bool upper = false, const bool lower = false,
                                  const bool diagonal_imag_zero = false) {

  // The fast kernels only receive the leading dimension and the two buffers, so they can only be
  // considered without offsets, with identical source/destination shapes and without any of the
  // conjugate/triangular/diagonal options
  const auto no_offsets = (src_offset == 0) && (dest_offset == 0);
  const auto same_shape = (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
  const auto no_extras = !do_conjugate && !upper && !lower && !diagonal_imag_zero;
  auto use_fast_kernel = no_offsets && same_shape && no_extras;

  // On top of that, the sizes have to be multiples of the tuning parameters of the fast kernels.
  // The database is only consulted when the fast kernel is still a candidate.
  if (use_fast_kernel) {
    if (do_transpose) {
      use_fast_kernel = IsMultiple(src_ld, db["TRA_WPT"]) &&
                        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) &&
                        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"]);
    }
    else {
      use_fast_kernel = IsMultiple(src_ld, db["COPY_VW"]) &&
                        IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
                        IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"]);
    }
  }

  // Picks the name of the kernel to run
  auto kernel_name = std::string{};
  if (do_transpose) {
    if (use_fast_kernel) { kernel_name = "TransposeMatrixFast"; }
    else if (do_pad) { kernel_name = "TransposePadMatrix"; }
    else { kernel_name = "TransposeMatrix"; }
  }
  else {
    if (use_fast_kernel) { kernel_name = "CopyMatrixFast"; }
    else if (do_pad) { kernel_name = "CopyPadMatrix"; }
    else { kernel_name = "CopyMatrix"; }
  }

  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
  auto alpha_buffer = Buffer<T>(context, 1);
  alpha_buffer.Write(queue, 1, &alpha);

  try {

    // Retrieves the kernel from the compiled binary
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments: a short list for the fast kernels, the full list otherwise
    if (use_fast_kernel) {
      kernel.SetArgument(0, static_cast<int>(src_ld));
      kernel.SetArgument(1, src());
      kernel.SetArgument(2, dest());
      kernel.SetArgument(3, alpha_buffer());
    }
    else {
      kernel.SetArgument(0, static_cast<int>(src_one));
      kernel.SetArgument(1, static_cast<int>(src_two));
      kernel.SetArgument(2, static_cast<int>(src_ld));
      kernel.SetArgument(3, static_cast<int>(src_offset));
      kernel.SetArgument(4, src());
      kernel.SetArgument(5, static_cast<int>(dest_one));
      kernel.SetArgument(6, static_cast<int>(dest_two));
      kernel.SetArgument(7, static_cast<int>(dest_ld));
      kernel.SetArgument(8, static_cast<int>(dest_offset));
      kernel.SetArgument(9, dest());
      kernel.SetArgument(10, alpha_buffer());

      // The padding kernels take the conjugate flag, the non-padding ones the triangular flags
      if (do_pad) {
        kernel.SetArgument(11, static_cast<int>(do_conjugate));
      }
      else {
        kernel.SetArgument(11, static_cast<int>(upper));
        kernel.SetArgument(12, static_cast<int>(lower));
        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
      }
    }

    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
    // parameters in the database. Case 1 of 4: fast transpose.
    if (do_transpose && use_fast_kernel) {
      const auto global = std::vector<size_t>{
        dest_one / db["TRA_WPT"],
        dest_two / db["TRA_WPT"]
      };
      const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
      return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }

    // Case 2 of 4: general (pad-)transpose
    if (do_transpose) {
      const auto global = std::vector<size_t>{
        Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
        Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
      };
      const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
      return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }

    // Case 3 of 4: fast copy
    if (use_fast_kernel) {
      const auto global = std::vector<size_t>{
        dest_one / db["COPY_VW"],
        dest_two / db["COPY_WPT"]
      };
      const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
      return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
    }

    // Case 4 of 4: general (pad-)copy
    const auto global = std::vector<size_t>{
      Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
      Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
    };
    const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
    return RunKernel(kernel, queue, device, global, local, event, waitForEvents);

  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_COMMON_H_
#endif

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xamax.h"
#include "routines/level1/xamax.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xamax<half>::precision_ = Precision::kHalf;
template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xamax<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level1/xamax.opencl"
;
@ -49,14 +40,14 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorIndex(1, imax_buffer, imax_offset, sizeof(unsigned int));
status = TestVectorIndex(1, imax_buffer, imax_offset);
if (ErrorIn(status)) { return status; }
// Retrieves the Xamax kernels from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
@ -80,7 +71,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
@ -93,7 +84,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

View file

@ -14,28 +14,16 @@
#ifndef CLBLAST_ROUTINES_XAMAX_H_
#define CLBLAST_ROUTINES_XAMAX_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xamax: public Routine<T> {
class Xamax: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorIndex;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
@ -43,10 +31,6 @@ class Xamax: public Routine<T> {
StatusCode DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xasum.h"
#include "routines/level1/xasum.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xasum<half>::precision_ = Precision::kHalf;
template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xasum<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level1/xasum.opencl"
;
@ -49,14 +40,14 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T));
status = TestVectorScalar(1, asum_buffer, asum_offset);
if (ErrorIn(status)) { return status; }
// Retrieves the Xasum kernels from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
@ -78,7 +69,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
@ -90,7 +81,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

View file

@ -14,28 +14,16 @@
#ifndef CLBLAST_ROUTINES_XASUM_H_
#define CLBLAST_ROUTINES_XASUM_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xasum: public Routine<T> {
class Xasum: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
@ -43,10 +31,6 @@ class Xasum: public Routine<T> {
StatusCode DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xaxpy.h"
#include "routines/level1/xaxpy.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xaxpy<half>::precision_ = Precision::kHalf;
template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xaxpy.opencl"
@ -50,9 +41,9 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Determines whether or not the fast-version can be used
@ -65,7 +56,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
// Retrieves the Xaxpy kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
@ -94,13 +85,13 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }

View file

@ -14,28 +14,16 @@
#ifndef CLBLAST_ROUTINES_XAXPY_H_
#define CLBLAST_ROUTINES_XAXPY_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xaxpy: public Routine<T> {
class Xaxpy: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
@ -43,10 +31,6 @@ class Xaxpy: public Routine<T> {
StatusCode DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xcopy.h"
#include "routines/level1/xcopy.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xcopy<half>::precision_ = Precision::kHalf;
template <> const Precision Xcopy<float>::precision_ = Precision::kSingle;
template <> const Precision Xcopy<double>::precision_ = Precision::kDouble;
template <> const Precision Xcopy<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xcopy.opencl"
@ -50,9 +41,9 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Determines whether or not the fast-version can be used
@ -65,7 +56,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
// Retrieves the Xcopy kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -88,13 +79,13 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }

View file

@ -14,27 +14,16 @@
#ifndef CLBLAST_ROUTINES_XCOPY_H_
#define CLBLAST_ROUTINES_XCOPY_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xcopy: public Routine<T> {
class Xcopy: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
@ -42,10 +31,6 @@ class Xcopy: public Routine<T> {
StatusCode DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xdot.h"
#include "routines/level1/xdot.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xdot<half>::precision_ = Precision::kHalf;
template <> const Precision Xdot<float>::precision_ = Precision::kSingle;
template <> const Precision Xdot<double>::precision_ = Precision::kDouble;
template <> const Precision Xdot<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level1/xdot.opencl"
;
@ -51,16 +42,16 @@ StatusCode Xdot<T>::DoDot(const size_t n,
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorDot(1, dot_buffer, dot_offset, sizeof(T));
status = TestVectorScalar(1, dot_buffer, dot_offset);
if (ErrorIn(status)) { return status; }
// Retrieves the Xdot kernels from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
@ -86,7 +77,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
@ -98,7 +89,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

View file

@ -14,29 +14,16 @@
#ifndef CLBLAST_ROUTINES_XDOT_H_
#define CLBLAST_ROUTINES_XDOT_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xdot: public Routine<T> {
class Xdot: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
@ -46,10 +33,6 @@ class Xdot: public Routine<T> {
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xdotc.h"
#include "routines/level1/xdotc.hpp"
#include <string>
#include <vector>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XDOTC_H_
#define CLBLAST_ROUTINES_XDOTC_H_
#include "internal/routines/level1/xdot.h"
#include "routines/level1/xdot.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xdotu.h"
#include "routines/level1/xdotu.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XDOTU_H_
#define CLBLAST_ROUTINES_XDOTU_H_
#include "internal/routines/level1/xdot.h"
#include "routines/level1/xdot.hpp"
namespace clblast {
// =================================================================================================

View file

@ -14,8 +14,8 @@
#ifndef CLBLAST_ROUTINES_XMAX_H_
#define CLBLAST_ROUTINES_XMAX_H_
#include "internal/routine.h"
#include "internal/routines/level1/xamax.h"
#include "routine.hpp"
#include "routines/level1/xamax.hpp"
namespace clblast {
// =================================================================================================

View file

@ -14,8 +14,8 @@
#ifndef CLBLAST_ROUTINES_XMIN_H_
#define CLBLAST_ROUTINES_XMIN_H_
#include "internal/routine.h"
#include "internal/routines/level1/xamax.h"
#include "routine.hpp"
#include "routines/level1/xamax.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xnrm2.h"
#include "routines/level1/xnrm2.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xnrm2<half>::precision_ = Precision::kHalf;
template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level1/xnrm2.opencl"
;
@ -49,14 +40,14 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorDot(1, nrm2_buffer, nrm2_offset, sizeof(T));
status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
if (ErrorIn(status)) { return status; }
// Retrieves the Xnrm2 kernels from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
@ -78,7 +69,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
@ -90,7 +81,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

View file

@ -14,28 +14,16 @@
#ifndef CLBLAST_ROUTINES_XNRM2_H_
#define CLBLAST_ROUTINES_XNRM2_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xnrm2: public Routine<T> {
class Xnrm2: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
@ -43,10 +31,6 @@ class Xnrm2: public Routine<T> {
StatusCode DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xscal.h"
#include "routines/level1/xscal.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xscal<half>::precision_ = Precision::kHalf;
template <> const Precision Xscal<float>::precision_ = Precision::kSingle;
template <> const Precision Xscal<double>::precision_ = Precision::kDouble;
template <> const Precision Xscal<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xscal.opencl"
@ -49,7 +40,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vector for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
// Determines whether or not the fast-version can be used
@ -61,7 +52,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
// Retrieves the Xscal kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -82,13 +73,13 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }

View file

@ -14,36 +14,22 @@
#ifndef CLBLAST_ROUTINES_XSCAL_H_
#define CLBLAST_ROUTINES_XSCAL_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xscal: public Routine<T> {
class Xscal: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
StatusCode DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -14,8 +14,8 @@
#ifndef CLBLAST_ROUTINES_XSUM_H_
#define CLBLAST_ROUTINES_XSUM_H_
#include "internal/routine.h"
#include "internal/routines/level1/xasum.h"
#include "routine.hpp"
#include "routines/level1/xasum.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level1/xswap.h"
#include "routines/level1/xswap.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xswap<half>::precision_ = Precision::kHalf;
template <> const Precision Xswap<float>::precision_ = Precision::kSingle;
template <> const Precision Xswap<double>::precision_ = Precision::kDouble;
template <> const Precision Xswap<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xswap.opencl"
@ -50,9 +41,9 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Determines whether or not the fast-version can be used
@ -65,7 +56,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
// Retrieves the Xswap kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -88,13 +79,13 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }

View file

@ -14,27 +14,16 @@
#ifndef CLBLAST_ROUTINES_XSWAP_H_
#define CLBLAST_ROUTINES_XSWAP_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xswap: public Routine<T> {
class Xswap: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
@ -42,10 +31,6 @@ class Xswap: public Routine<T> {
StatusCode DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xgbmv.h"
#include "routines/level2/xgbmv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XGBMV_H_
#define CLBLAST_ROUTINES_XGBMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xgemv<half>::precision_ = Precision::kHalf;
template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
@ -101,12 +92,12 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Tests the matrix and the vectors for validity
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Determines whether or not the fast-version can be used
@ -143,7 +134,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Retrieves the Xgemv kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -169,7 +160,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

View file

@ -14,30 +14,16 @@
#ifndef CLBLAST_ROUTINES_XGEMV_H_
#define CLBLAST_ROUTINES_XGEMV_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xgemv: public Routine<T> {
class Xgemv: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestMatrixA;
using Routine<T>::TestMatrixAP;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
@ -61,10 +47,6 @@ class Xgemv: public Routine<T> {
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xger.h"
#include "routines/level2/xger.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xger<half>::precision_ = Precision::kHalf;
template <> const Precision Xger<float>::precision_ = Precision::kSingle;
template <> const Precision Xger<double>::precision_ = Precision::kDouble;
template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xger.opencl"
@ -58,11 +49,11 @@ StatusCode Xger<T>::DoGer(const Layout layout,
const auto a_two = (a_is_rowmajor) ? m : n;
// Tests the matrix and the vectors for validity
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
status = TestVectorX(m, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
@ -71,7 +62,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xger");
// Sets the kernel arguments
@ -94,7 +85,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

View file

@ -14,29 +14,16 @@
#ifndef CLBLAST_ROUTINES_XGER_H_
#define CLBLAST_ROUTINES_XGER_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xger: public Routine<T> {
class Xger: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestMatrixA;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
@ -47,10 +34,6 @@ class Xger: public Routine<T> {
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xgerc.h"
#include "routines/level2/xgerc.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XGERC_H_
#define CLBLAST_ROUTINES_XGERC_H_
#include "internal/routines/level2/xger.h"
#include "routines/level2/xger.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xgeru.h"
#include "routines/level2/xgeru.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XGERU_H_
#define CLBLAST_ROUTINES_XGERU_H_
#include "internal/routines/level2/xger.h"
#include "routines/level2/xger.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xhbmv.h"
#include "routines/level2/xhbmv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XHBMV_H_
#define CLBLAST_ROUTINES_XHBMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xhemv.h"
#include "routines/level2/xhemv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XHEMV_H_
#define CLBLAST_ROUTINES_XHEMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,26 +11,17 @@
//
// =================================================================================================
#include "internal/routines/level2/xher.h"
#include "routines/level2/xher.hpp"
#include <string>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xher<half, half>::precision_ = Precision::kHalf;
template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher.opencl"
@ -67,10 +58,10 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
// Tests the matrix and the vectors for validity
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
// If alpha is zero an update is not required
@ -85,7 +76,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher");
// Sets the kernel arguments
@ -105,7 +96,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

View file

@ -14,29 +14,16 @@
#ifndef CLBLAST_ROUTINES_XHER_H_
#define CLBLAST_ROUTINES_XHER_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T, typename U>
class Xher: public Routine<T> {
class Xher: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestMatrixA;
using Routine<T>::TestMatrixAP;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
@ -50,10 +37,6 @@ class Xher: public Routine<T> {
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,26 +11,17 @@
//
// =================================================================================================
#include "internal/routines/level2/xher2.h"
#include "routines/level2/xher2.hpp"
#include <string>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xher2<half>::precision_ = Precision::kHalf;
template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher2.opencl"
@ -59,12 +50,12 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
// Tests the matrix and the vectors for validity
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
@ -73,7 +64,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher2");
// Sets the kernel arguments
@ -96,7 +87,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local, event_);
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

View file

@ -14,30 +14,16 @@
#ifndef CLBLAST_ROUTINES_XHER2_H_
#define CLBLAST_ROUTINES_XHER2_H_
#include "internal/routine.h"
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xher2: public Routine<T> {
class Xher2: public Routine {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestMatrixA;
using Routine<T>::TestMatrixAP;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
@ -49,10 +35,6 @@ class Xher2: public Routine<T> {
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xhpmv.h"
#include "routines/level2/xhpmv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XHPMV_H_
#define CLBLAST_ROUTINES_XHPMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xhpr.h"
#include "routines/level2/xhpr.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XHPR_H_
#define CLBLAST_ROUTINES_XHPR_H_
#include "internal/routines/level2/xher.h"
#include "routines/level2/xher.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xhpr2.h"
#include "routines/level2/xhpr2.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XHPR2_H_
#define CLBLAST_ROUTINES_XHPR2_H_
#include "internal/routines/level2/xher2.h"
#include "routines/level2/xher2.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xsbmv.h"
#include "routines/level2/xsbmv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XSBMV_H_
#define CLBLAST_ROUTINES_XSBMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xspmv.h"
#include "routines/level2/xspmv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XSPMV_H_
#define CLBLAST_ROUTINES_XSPMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xspr.h"
#include "routines/level2/xspr.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XSPR_H_
#define CLBLAST_ROUTINES_XSPR_H_
#include "internal/routines/level2/xher.h"
#include "routines/level2/xher.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xspr2.h"
#include "routines/level2/xspr2.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XSPR2_H_
#define CLBLAST_ROUTINES_XSPR2_H_
#include "internal/routines/level2/xher2.h"
#include "routines/level2/xher2.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xsymv.h"
#include "routines/level2/xsymv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XSYMV_H_
#define CLBLAST_ROUTINES_XSYMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xsyr.h"
#include "routines/level2/xsyr.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XSYR_H_
#define CLBLAST_ROUTINES_XSYR_H_
#include "internal/routines/level2/xher.h"
#include "routines/level2/xher.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xsyr2.h"
#include "routines/level2/xsyr2.hpp"
#include <string>

View file

@ -14,7 +14,7 @@
#ifndef CLBLAST_ROUTINES_XSYR2_H_
#define CLBLAST_ROUTINES_XSYR2_H_
#include "internal/routines/level2/xher2.h"
#include "routines/level2/xher2.hpp"
namespace clblast {
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xtbmv.h"
#include "routines/level2/xtbmv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XTBMV_H_
#define CLBLAST_ROUTINES_XTBMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================
@ -25,12 +25,10 @@ namespace clblast {
template <typename T>
class Xtbmv: public Xgemv<T> {
public:
// Members from the base class
using Routine<T>::queue_;
using Routine<T>::context_;
// Uses the generic matrix-vector routine
using Xgemv<T>::queue_;
using Xgemv<T>::context_;
using Xgemv<T>::MatVec;
// Constructor

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xtpmv.h"
#include "routines/level2/xtpmv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XTPMV_H_
#define CLBLAST_ROUTINES_XTPMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================
@ -25,12 +25,10 @@ namespace clblast {
template <typename T>
class Xtpmv: public Xgemv<T> {
public:
// Members from the base class
using Routine<T>::queue_;
using Routine<T>::context_;
// Uses the generic matrix-vector routine
using Xgemv<T>::queue_;
using Xgemv<T>::context_;
using Xgemv<T>::MatVec;
// Constructor

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level2/xtrmv.h"
#include "routines/level2/xtrmv.hpp"
#include <string>
#include <vector>

View file

@ -16,7 +16,7 @@
#ifndef CLBLAST_ROUTINES_XTRMV_H_
#define CLBLAST_ROUTINES_XTRMV_H_
#include "internal/routines/level2/xgemv.h"
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================
@ -25,12 +25,10 @@ namespace clblast {
template <typename T>
class Xtrmv: public Xgemv<T> {
public:
// Members from the base class
using Routine<T>::queue_;
using Routine<T>::context_;
// Uses the generic matrix-vector routine
using Xgemv<T>::queue_;
using Xgemv<T>::context_;
using Xgemv<T>::MatVec;
// Constructor

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/level3/xgemm.h"
#include "routines/level3/xgemm.hpp"
#include <string>
#include <vector>
@ -19,19 +19,10 @@
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xgemm<half>::precision_ = Precision::kHalf;
template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/copy_fast.opencl"
@ -96,11 +87,11 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// matrix A cannot be less than K when rotated, or less than M when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N when rotated, or less than M when not-rotated
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
if (ErrorIn(status)) { return status; }
status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of m, n, and k
@ -112,7 +103,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
try {
// Loads the program from the database
const auto program = GetProgramFromCache();
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
@ -142,7 +133,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
ConstantOne<T>(), program,
@ -154,7 +145,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// As above, but now for matrix B
if (!b_no_temp) {
auto eventProcessB = Event();
status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
@ -166,7 +157,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@ -199,13 +190,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Launches the kernel
auto eventKernel = Event();
auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
status = RunKernel(kernel, global, local, eventPointer, eventWaitList);
status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel if needed
if (!c_no_temp) {
eventWaitList.push_back(eventKernel);
status = PadCopyTransposeMatrix(event_, eventWaitList,
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,

Some files were not shown because too many files have changed in this diff Show more