mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-07-07 12:23:46 +02:00
Merge pull request #69 from CNugteren/refactoring
Refactoring of the Routine class and file-renaming
This commit is contained in:
commit
395a0ef34e
|
@ -4,6 +4,7 @@ Development version (next release)
|
|||
- Made it possible to compile the performance tests (clients) separately from the correctness tests
|
||||
- Made a reference BLAS and head-to-head performance comparison optional in the clients
|
||||
- Increased the verbosity of the "-verbose" option in the correctness tests
|
||||
- Refactored the host code for better compilation times and fewer lines of code
|
||||
- Improved the API documentation
|
||||
- Various minor fixes and enhancements
|
||||
- Added tuned parameters for various devices (see README)
|
||||
|
|
|
@ -121,7 +121,7 @@ endif()
|
|||
# ==================================================================================================
|
||||
|
||||
# Includes directories: CLBlast and OpenCL
|
||||
include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
|
||||
include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS})
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
|
@ -140,19 +140,26 @@ set(PRECISIONS 32 64 3232 6464)
|
|||
# ==================================================================================================
|
||||
|
||||
# Gathers all source-files
|
||||
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/cache.cc
|
||||
src/utilities.cc src/clblast_c.cc)
|
||||
set(SOURCES
|
||||
src/database/database.cpp
|
||||
src/routines/common.cpp
|
||||
src/cache.cpp
|
||||
src/clblast.cpp
|
||||
src/clblast_c.cpp
|
||||
src/routine.cpp
|
||||
src/utilities.cpp
|
||||
)
|
||||
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
||||
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
|
||||
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVEL2_ROUTINES})
|
||||
set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
|
||||
set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVEL3_ROUTINES})
|
||||
set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
|
||||
set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVELX_ROUTINES})
|
||||
set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cc)
|
||||
set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
|
||||
# Creates and links the library
|
||||
|
@ -186,7 +193,7 @@ if(SAMPLES)
|
|||
|
||||
# Adds sample programs (C++)
|
||||
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
|
||||
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
|
||||
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp)
|
||||
target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
|
||||
install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
|
||||
endforeach()
|
||||
|
@ -211,7 +218,7 @@ if(TUNERS)
|
|||
|
||||
# Adds tuning executables
|
||||
foreach(KERNEL ${KERNELS})
|
||||
add_executable(clblast_tuner_${KERNEL} src/tuning/${KERNEL}.cc)
|
||||
add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cpp)
|
||||
target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
|
||||
endforeach()
|
||||
|
@ -257,7 +264,7 @@ if(CLIENTS OR TESTS)
|
|||
endif()
|
||||
|
||||
# Sets the include directories
|
||||
include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
|
||||
include_directories(${clblast_SOURCE_DIR} ${REF_INCLUDES})
|
||||
|
||||
endif()
|
||||
|
||||
|
@ -268,24 +275,24 @@ endif()
|
|||
if(CLIENTS)
|
||||
|
||||
# Creates the common performance-tests objects (requires CMake 2.8.8)
|
||||
add_library(test_performance_common OBJECT test/performance/client.cc)
|
||||
add_library(test_performance_common OBJECT test/performance/client.cpp)
|
||||
|
||||
# Compiles the performance-tests
|
||||
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
||||
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
|
||||
test/performance/routines/level1/${ROUTINE}.cc)
|
||||
test/performance/routines/level1/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVEL2_ROUTINES})
|
||||
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
|
||||
test/performance/routines/level2/${ROUTINE}.cc)
|
||||
test/performance/routines/level2/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVEL3_ROUTINES})
|
||||
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
|
||||
test/performance/routines/level3/${ROUTINE}.cc)
|
||||
test/performance/routines/level3/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVELX_ROUTINES})
|
||||
add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
|
||||
test/performance/routines/levelx/${ROUTINE}.cc)
|
||||
test/performance/routines/levelx/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES})
|
||||
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||
|
@ -303,24 +310,24 @@ if(TESTS)
|
|||
|
||||
# Creates the common correctness-tests objects (requires CMake 2.8.8)
|
||||
add_library(test_correctness_common OBJECT
|
||||
test/correctness/tester.cc test/correctness/testblas.cc)
|
||||
test/correctness/tester.cpp test/correctness/testblas.cpp)
|
||||
|
||||
# Compiles the correctness-tests
|
||||
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
||||
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
|
||||
test/correctness/routines/level1/${ROUTINE}.cc)
|
||||
test/correctness/routines/level1/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVEL2_ROUTINES})
|
||||
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
|
||||
test/correctness/routines/level2/${ROUTINE}.cc)
|
||||
test/correctness/routines/level2/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVEL3_ROUTINES})
|
||||
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
|
||||
test/correctness/routines/level3/${ROUTINE}.cc)
|
||||
test/correctness/routines/level3/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${LEVELX_ROUTINES})
|
||||
add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
|
||||
test/correctness/routines/levelx/${ROUTINE}.cc)
|
||||
test/correctness/routines/levelx/${ROUTINE}.cpp)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES})
|
||||
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||
|
|
|
@ -136,7 +136,7 @@ Note that CLBlast's tuners are based on the CLTune auto-tuning library, which ha
|
|||
|
||||
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
|
||||
|
||||
The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
|
||||
The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
|
||||
|
||||
In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
|
||||
|
||||
|
|
|
@ -68,8 +68,8 @@ enum class StatusCode {
|
|||
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
|
||||
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
|
||||
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
|
||||
kInvalidVectorDot = -2043, // Vector dot is not a valid OpenCL buffer
|
||||
kInsufficientMemoryDot = -2042, // Vector dot's OpenCL buffer is too small
|
||||
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
|
||||
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
|
||||
};
|
||||
|
||||
// Matrix layout and transpose types
|
||||
|
|
|
@ -77,8 +77,8 @@ typedef enum StatusCode_ {
|
|||
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
|
||||
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
|
||||
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
|
||||
kInvalidVectorDot = -2043, // Vector dot is not a valid OpenCL buffer
|
||||
kInsufficientMemoryDot = -2042, // Vector dot's OpenCL buffer is too small
|
||||
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
|
||||
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
|
||||
} StatusCode;
|
||||
|
||||
// Matrix layout and transpose types
|
||||
|
|
|
@ -1,144 +0,0 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements all the basic functionality for the BLAS routines. This class serves as a
|
||||
// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
|
||||
// compiling the OpenCL kernel, connecting to the database, etc.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINE_H_
|
||||
#define CLBLAST_ROUTINE_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "internal/cache.h"
|
||||
#include "internal/utilities.h"
|
||||
#include "internal/database.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Routine {
|
||||
public:
|
||||
|
||||
// Helper functions which check for errors in the status code
|
||||
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
|
||||
|
||||
// Base class constructor
|
||||
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
|
||||
const std::vector<std::string> &routines, const Precision precision);
|
||||
|
||||
// Set-up phase of the kernel
|
||||
StatusCode SetUp();
|
||||
|
||||
protected:
|
||||
|
||||
// Runs a kernel given the global and local thread sizes
|
||||
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global,
|
||||
const std::vector<size_t> &local, EventPointer event,
|
||||
std::vector<Event>& waitForEvents);
|
||||
|
||||
// As above, but without an event waiting list
|
||||
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global,
|
||||
const std::vector<size_t> &local, EventPointer event);
|
||||
|
||||
// Tests for valid inputs of matrices A, B, and C
|
||||
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld, const size_t data_size);
|
||||
StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld, const size_t data_size);
|
||||
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld, const size_t data_size);
|
||||
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t data_size);
|
||||
|
||||
// Tests for valid inputs of vector X and Y
|
||||
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||
const size_t inc, const size_t data_size);
|
||||
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||
const size_t inc, const size_t data_size);
|
||||
|
||||
// Tests for valid inputs of other vectors
|
||||
StatusCode TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||
const size_t data_size);
|
||||
StatusCode TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
|
||||
const size_t offset, const size_t data_size);
|
||||
|
||||
// Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
|
||||
// to symmetric and triangular matrices through optional arguments.
|
||||
StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
|
||||
const size_t src_one, const size_t src_two,
|
||||
const size_t src_ld, const size_t src_offset,
|
||||
const Buffer<T> &src,
|
||||
const size_t dest_one, const size_t dest_two,
|
||||
const size_t dest_ld, const size_t dest_offset,
|
||||
const Buffer<T> &dest,
|
||||
const T alpha,
|
||||
const Program &program, const bool do_pad,
|
||||
const bool do_transpose, const bool do_conjugate,
|
||||
const bool upper = false, const bool lower = false,
|
||||
const bool diagonal_imag_zero = false);
|
||||
|
||||
// Stores a newly compiled binary/program into the cache
|
||||
void StoreBinaryToCache(const std::string& binary) const {
|
||||
cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
|
||||
}
|
||||
void StoreProgramToCache(const Program& program) const {
|
||||
cache::StoreProgramToCache(program, context_, precision_, routine_name_);
|
||||
}
|
||||
|
||||
// Queries the cache and retrieves either a matching binary/program or a boolean whether a match
|
||||
// exists. The first assumes that the binary/program is available in the cache and will throw an
|
||||
// exception otherwise.
|
||||
std::string GetBinaryFromCache() const {
|
||||
return cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
|
||||
}
|
||||
Program GetProgramFromCache() const {
|
||||
return cache::GetProgramFromCache(context_, precision_, routine_name_);
|
||||
}
|
||||
bool BinaryIsInCache() const {
|
||||
return cache::BinaryIsInCache(device_name_, precision_, routine_name_);
|
||||
}
|
||||
bool ProgramIsInCache() const {
|
||||
return cache::ProgramIsInCache(context_, precision_, routine_name_);
|
||||
}
|
||||
|
||||
// Non-static variable for the precision. Note that the same variable (but static) might exist in
|
||||
// a derived class.
|
||||
const Precision precision_;
|
||||
|
||||
// The routine's name and its kernel-source in string form
|
||||
const std::string routine_name_;
|
||||
std::string source_string_;
|
||||
|
||||
// The OpenCL objects, accessible only from derived classes
|
||||
Queue queue_;
|
||||
EventPointer event_;
|
||||
const Context context_;
|
||||
const Device device_;
|
||||
|
||||
// OpenCL device properties
|
||||
const std::string device_name_;
|
||||
const size_t max_work_item_dimensions_;
|
||||
const std::vector<size_t> max_work_item_sizes_;
|
||||
const size_t max_work_group_size_;
|
||||
|
||||
// Connection to the database for all the device-specific parameters
|
||||
const Database db_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_ROUTINE_H_
|
||||
#endif
|
|
@ -310,7 +310,7 @@ defaults = CalculateDefaults(bests)
|
|||
bests = ConcatenateData(bests, defaults)
|
||||
|
||||
# Outputs the data as a C++ database
|
||||
path_cpp_database = os.path.join(path_clblast, "include", "internal", "database")
|
||||
path_cpp_database = os.path.join(path_clblast, "src", "database", "kernels")
|
||||
print("## Producing a C++ database in '"+path_cpp_database+"'...")
|
||||
PrintData(bests, path_cpp_database)
|
||||
|
||||
|
|
|
@ -10,14 +10,14 @@
|
|||
# This script automatically generates the bodies of the following files, creating the full CLBlast
|
||||
# API interface and implementation (C, C++, and reference BLAS wrappers):
|
||||
# clblast.h
|
||||
# clblast.cc
|
||||
# clblast.cpp
|
||||
# clblast_c.h
|
||||
# clblast_c.cc
|
||||
# clblast_c.cpp
|
||||
# wrapper_clblas.hpp
|
||||
# wrapper_cblas.hpp
|
||||
# It also generates the main functions for the correctness and performance tests as found in
|
||||
# test/correctness/routines/levelX/xYYYY.cc
|
||||
# test/performance/routines/levelX/xYYYY.cc
|
||||
# test/correctness/routines/levelX/xYYYY.cpp
|
||||
# test/performance/routines/levelX/xYYYY.cpp
|
||||
# It also produces the API documentation found in doc/clblast.md
|
||||
#
|
||||
# ==================================================================================================
|
||||
|
@ -200,7 +200,7 @@ def clblast_h(routines):
|
|||
result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n"
|
||||
return result
|
||||
|
||||
# The C++ API implementation (.cc)
|
||||
# The C++ API implementation (.cpp)
|
||||
def clblast_cc(routines):
|
||||
result = ""
|
||||
for routine in routines:
|
||||
|
@ -237,7 +237,7 @@ def clblast_c_h(routines):
|
|||
result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n"
|
||||
return result
|
||||
|
||||
# The C API implementation (.cc)
|
||||
# The C API implementation (.cpp)
|
||||
def clblast_c_cc(routines):
|
||||
result = ""
|
||||
for routine in routines:
|
||||
|
@ -379,14 +379,14 @@ if len(sys.argv) != 2:
|
|||
path_clblast = sys.argv[1]
|
||||
files = [
|
||||
path_clblast+"/include/clblast.h",
|
||||
path_clblast+"/src/clblast.cc",
|
||||
path_clblast+"/src/clblast.cpp",
|
||||
path_clblast+"/include/clblast_c.h",
|
||||
path_clblast+"/src/clblast_c.cc",
|
||||
path_clblast+"/test/wrapper_clblas.h",
|
||||
path_clblast+"/test/wrapper_cblas.h",
|
||||
path_clblast+"/src/clblast_c.cpp",
|
||||
path_clblast+"/test/wrapper_clblas.hpp",
|
||||
path_clblast+"/test/wrapper_cblas.hpp",
|
||||
]
|
||||
header_lines = [84, 74, 93, 22, 29, 41]
|
||||
footer_lines = [17, 71, 19, 14, 6, 6]
|
||||
footer_lines = [17, 75, 19, 14, 6, 6]
|
||||
|
||||
# Checks whether the command-line arguments are valid; exits otherwise
|
||||
for f in files:
|
||||
|
@ -433,11 +433,11 @@ for i in xrange(0,len(files)):
|
|||
for level in [1,2,3,4]:
|
||||
for routine in routines[level-1]:
|
||||
if routine.has_tests:
|
||||
filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc"
|
||||
filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
|
||||
with open(filename, "w") as f:
|
||||
body = ""
|
||||
body += "#include \"correctness/testblas.h\"\n"
|
||||
body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n"
|
||||
body += "#include \"test/correctness/testblas.hpp\"\n"
|
||||
body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
|
||||
body += "// Shortcuts to the clblast namespace\n"
|
||||
body += "using float2 = clblast::float2;\n"
|
||||
body += "using double2 = clblast::double2;\n\n"
|
||||
|
@ -459,11 +459,11 @@ for level in [1,2,3,4]:
|
|||
for level in [1,2,3,4]:
|
||||
for routine in routines[level-1]:
|
||||
if routine.has_tests:
|
||||
filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc"
|
||||
filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
|
||||
with open(filename, "w") as f:
|
||||
body = ""
|
||||
body += "#include \"performance/client.h\"\n"
|
||||
body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n"
|
||||
body += "#include \"test/performance/client.hpp\"\n"
|
||||
body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
|
||||
body += "// Shortcuts to the clblast namespace\n"
|
||||
body += "using float2 = clblast::float2;\n"
|
||||
body += "using double2 = clblast::double2;\n\n"
|
||||
|
|
121
src/buffer_test.hpp
Normal file
121
src/buffer_test.hpp
Normal file
|
@ -0,0 +1,121 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
|
||||
// templated and thus header-only.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_BUFFER_TEST_H_
|
||||
#define CLBLAST_BUFFER_TEST_H_
|
||||
|
||||
#include "clblast.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Tests matrix 'A' for validity
|
||||
template <typename T>
|
||||
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld) {
|
||||
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
|
||||
try {
|
||||
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
|
||||
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
|
||||
} catch (...) { return StatusCode::kInvalidMatrixA; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests matrix 'B' for validity
|
||||
template <typename T>
|
||||
StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld) {
|
||||
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
|
||||
try {
|
||||
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
|
||||
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; }
|
||||
} catch (...) { return StatusCode::kInvalidMatrixB; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests matrix 'C' for validity
|
||||
template <typename T>
|
||||
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld) {
|
||||
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
|
||||
try {
|
||||
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
|
||||
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; }
|
||||
} catch (...) { return StatusCode::kInvalidMatrixC; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests matrix 'AP' for validity
|
||||
template <typename T>
|
||||
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
|
||||
try {
|
||||
const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
|
||||
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
|
||||
} catch (...) { return StatusCode::kInvalidMatrixA; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Tests vector 'X' for validity
|
||||
template <typename T>
|
||||
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||
const size_t inc) {
|
||||
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
|
||||
try {
|
||||
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
|
||||
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; }
|
||||
} catch (...) { return StatusCode::kInvalidVectorX; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests vector 'Y' for validity
|
||||
template <typename T>
|
||||
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||
const size_t inc) {
|
||||
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
|
||||
try {
|
||||
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
|
||||
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; }
|
||||
} catch (...) { return StatusCode::kInvalidVectorY; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Tests vector 'scalar' for validity
|
||||
template <typename T>
|
||||
StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
|
||||
try {
|
||||
const auto required_size = (n + offset) * sizeof(T);
|
||||
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
|
||||
} catch (...) { return StatusCode::kInvalidVectorScalar; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests vector 'index' for validity (failures are reported using the 'scalar' status codes)
|
||||
template <typename T>
|
||||
StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
|
||||
try {
|
||||
const auto required_size = (n + offset) * sizeof(T);
|
||||
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
|
||||
} catch (...) { return StatusCode::kInvalidVectorScalar; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_BUFFER_TEST_H_
|
||||
#endif
|
|
@ -15,10 +15,9 @@
|
|||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
#include "internal/cache.h"
|
||||
#include "cache.hpp"
|
||||
|
||||
namespace clblast {
|
||||
namespace cache {
|
||||
// =================================================================================================
|
||||
|
||||
// Stores the compiled binary or IR in the cache
|
||||
|
@ -98,7 +97,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
|
|||
// =================================================================================================
|
||||
|
||||
// Clears the cache of stored binaries and programs
|
||||
StatusCode ClearCache() {
|
||||
StatusCode CacheClearAll() {
|
||||
binary_cache_mutex_.lock();
|
||||
binary_cache_.clear();
|
||||
binary_cache_mutex_.unlock();
|
||||
|
@ -109,5 +108,4 @@ StatusCode ClearCache() {
|
|||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace cache
|
||||
} // namespace clblast
|
|
@ -18,10 +18,9 @@
|
|||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
#include "internal/utilities.h"
|
||||
#include "utilities.hpp"
|
||||
|
||||
namespace clblast {
|
||||
namespace cache {
|
||||
// =================================================================================================
|
||||
|
||||
// The cache of compiled OpenCL binaries, along with some meta-data
|
||||
|
@ -90,10 +89,9 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
|
|||
// =================================================================================================
|
||||
|
||||
// Clears the cache of stored binaries
|
||||
StatusCode ClearCache();
|
||||
StatusCode CacheClearAll();
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace cache
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_CACHE_H_
|
|
@ -16,60 +16,60 @@
|
|||
#include <string>
|
||||
|
||||
#include "clblast.h"
|
||||
#include "internal/public_api.h"
|
||||
#include "internal/cache.h"
|
||||
#include "public_api.hpp"
|
||||
#include "cache.hpp"
|
||||
|
||||
// BLAS level-1 includes
|
||||
#include "internal/routines/level1/xswap.h"
|
||||
#include "internal/routines/level1/xscal.h"
|
||||
#include "internal/routines/level1/xcopy.h"
|
||||
#include "internal/routines/level1/xaxpy.h"
|
||||
#include "internal/routines/level1/xdot.h"
|
||||
#include "internal/routines/level1/xdotu.h"
|
||||
#include "internal/routines/level1/xdotc.h"
|
||||
#include "internal/routines/level1/xnrm2.h"
|
||||
#include "internal/routines/level1/xasum.h"
|
||||
#include "internal/routines/level1/xsum.h" // non-BLAS function
|
||||
#include "internal/routines/level1/xamax.h"
|
||||
#include "internal/routines/level1/xmax.h" // non-BLAS function
|
||||
#include "internal/routines/level1/xmin.h" // non-BLAS function
|
||||
#include "routines/level1/xswap.hpp"
|
||||
#include "routines/level1/xscal.hpp"
|
||||
#include "routines/level1/xcopy.hpp"
|
||||
#include "routines/level1/xaxpy.hpp"
|
||||
#include "routines/level1/xdot.hpp"
|
||||
#include "routines/level1/xdotu.hpp"
|
||||
#include "routines/level1/xdotc.hpp"
|
||||
#include "routines/level1/xnrm2.hpp"
|
||||
#include "routines/level1/xasum.hpp"
|
||||
#include "routines/level1/xsum.hpp" // non-BLAS routine
|
||||
#include "routines/level1/xamax.hpp"
|
||||
#include "routines/level1/xmax.hpp" // non-BLAS routine
|
||||
#include "routines/level1/xmin.hpp" // non-BLAS routine
|
||||
|
||||
// BLAS level-2 includes
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "internal/routines/level2/xgbmv.h"
|
||||
#include "internal/routines/level2/xhemv.h"
|
||||
#include "internal/routines/level2/xhbmv.h"
|
||||
#include "internal/routines/level2/xhpmv.h"
|
||||
#include "internal/routines/level2/xsymv.h"
|
||||
#include "internal/routines/level2/xsbmv.h"
|
||||
#include "internal/routines/level2/xspmv.h"
|
||||
#include "internal/routines/level2/xtrmv.h"
|
||||
#include "internal/routines/level2/xtbmv.h"
|
||||
#include "internal/routines/level2/xtpmv.h"
|
||||
#include "internal/routines/level2/xger.h"
|
||||
#include "internal/routines/level2/xgeru.h"
|
||||
#include "internal/routines/level2/xgerc.h"
|
||||
#include "internal/routines/level2/xher.h"
|
||||
#include "internal/routines/level2/xhpr.h"
|
||||
#include "internal/routines/level2/xher2.h"
|
||||
#include "internal/routines/level2/xhpr2.h"
|
||||
#include "internal/routines/level2/xsyr.h"
|
||||
#include "internal/routines/level2/xspr.h"
|
||||
#include "internal/routines/level2/xsyr2.h"
|
||||
#include "internal/routines/level2/xspr2.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
#include "routines/level2/xgbmv.hpp"
|
||||
#include "routines/level2/xhemv.hpp"
|
||||
#include "routines/level2/xhbmv.hpp"
|
||||
#include "routines/level2/xhpmv.hpp"
|
||||
#include "routines/level2/xsymv.hpp"
|
||||
#include "routines/level2/xsbmv.hpp"
|
||||
#include "routines/level2/xspmv.hpp"
|
||||
#include "routines/level2/xtrmv.hpp"
|
||||
#include "routines/level2/xtbmv.hpp"
|
||||
#include "routines/level2/xtpmv.hpp"
|
||||
#include "routines/level2/xger.hpp"
|
||||
#include "routines/level2/xgeru.hpp"
|
||||
#include "routines/level2/xgerc.hpp"
|
||||
#include "routines/level2/xher.hpp"
|
||||
#include "routines/level2/xhpr.hpp"
|
||||
#include "routines/level2/xher2.hpp"
|
||||
#include "routines/level2/xhpr2.hpp"
|
||||
#include "routines/level2/xsyr.hpp"
|
||||
#include "routines/level2/xspr.hpp"
|
||||
#include "routines/level2/xsyr2.hpp"
|
||||
#include "routines/level2/xspr2.hpp"
|
||||
|
||||
// BLAS level-3 includes
|
||||
#include "internal/routines/level3/xgemm.h"
|
||||
#include "internal/routines/level3/xsymm.h"
|
||||
#include "internal/routines/level3/xhemm.h"
|
||||
#include "internal/routines/level3/xsyrk.h"
|
||||
#include "internal/routines/level3/xherk.h"
|
||||
#include "internal/routines/level3/xsyr2k.h"
|
||||
#include "internal/routines/level3/xher2k.h"
|
||||
#include "internal/routines/level3/xtrmm.h"
|
||||
#include "routines/level3/xgemm.hpp"
|
||||
#include "routines/level3/xsymm.hpp"
|
||||
#include "routines/level3/xhemm.hpp"
|
||||
#include "routines/level3/xsyrk.hpp"
|
||||
#include "routines/level3/xherk.hpp"
|
||||
#include "routines/level3/xsyr2k.hpp"
|
||||
#include "routines/level3/xher2k.hpp"
|
||||
#include "routines/level3/xtrmm.hpp"
|
||||
|
||||
// Extra includes (level-x)
|
||||
#include "internal/routines/levelx/xomatcopy.h"
|
||||
// Level-x includes (non-BLAS)
|
||||
#include "routines/levelx/xomatcopy.hpp"
|
||||
|
||||
namespace clblast {
|
||||
|
||||
|
@ -2120,9 +2120,10 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
|
|||
// =================================================================================================
|
||||
|
||||
// Clears the cache of stored binaries
|
||||
StatusCode ClearCache() { return cache::ClearCache(); }
|
||||
StatusCode ClearCache() { return CacheClearAll(); }
|
||||
|
||||
// Fills the cache with all binaries for a specific device
|
||||
// TODO: Add half-precision FP16 set-up calls
|
||||
StatusCode FillCache(const cl_device_id device) {
|
||||
try {
|
||||
|
||||
|
@ -2171,7 +2172,7 @@ StatusCode FillCache(const cl_device_id device) {
|
|||
Xsyr2<float>(queue, nullptr).SetUp(); Xsyr2<double>(queue, nullptr).SetUp();
|
||||
Xspr2<float>(queue, nullptr).SetUp(); Xspr2<double>(queue, nullptr).SetUp();
|
||||
|
||||
// Runs all the level 1 set-up functions
|
||||
// Runs all the level 3 set-up functions
|
||||
Xgemm<float>(queue, nullptr).SetUp(); Xgemm<double>(queue, nullptr).SetUp(); Xgemm<float2>(queue, nullptr).SetUp(); Xgemm<double2>(queue, nullptr).SetUp();
|
||||
Xsymm<float>(queue, nullptr).SetUp(); Xsymm<double>(queue, nullptr).SetUp(); Xsymm<float2>(queue, nullptr).SetUp(); Xsymm<double2>(queue, nullptr).SetUp();
|
||||
Xhemm<float2>(queue, nullptr).SetUp(); Xhemm<double2>(queue, nullptr).SetUp();
|
||||
|
@ -2181,6 +2182,9 @@ StatusCode FillCache(const cl_device_id device) {
|
|||
Xher2k<float2,float>(queue, nullptr).SetUp(); Xher2k<double2,double>(queue, nullptr).SetUp();
|
||||
Xtrmm<float>(queue, nullptr).SetUp(); Xtrmm<double>(queue, nullptr).SetUp(); Xtrmm<float2>(queue, nullptr).SetUp(); Xtrmm<double2>(queue, nullptr).SetUp();
|
||||
|
||||
// Runs all the level-x (non-BLAS) set-up functions
|
||||
Xomatcopy<float>(queue, nullptr).SetUp(); Xomatcopy<double>(queue, nullptr).SetUp(); Xomatcopy<float2>(queue, nullptr).SetUp(); Xomatcopy<double2>(queue, nullptr).SetUp();
|
||||
|
||||
} catch (...) { return StatusCode::kBuildProgramFailure; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
#include "clblast_c.h"
|
||||
#include "clblast.h"
|
||||
#include "internal/utilities.h"
|
||||
#include "utilities.hpp"
|
||||
|
||||
// Shortcuts to the clblast namespace
|
||||
using float2 = clblast::float2;
|
|
@ -11,18 +11,18 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/database.h"
|
||||
#include "internal/database/xaxpy.h"
|
||||
#include "internal/database/xdot.h"
|
||||
#include "internal/database/xgemv.h"
|
||||
#include "internal/database/xger.h"
|
||||
#include "internal/database/xgemm.h"
|
||||
#include "internal/database/copy.h"
|
||||
#include "internal/database/pad.h"
|
||||
#include "internal/database/transpose.h"
|
||||
#include "internal/database/padtranspose.h"
|
||||
#include "utilities.hpp"
|
||||
|
||||
#include "internal/utilities.h"
|
||||
#include "database/database.hpp"
|
||||
#include "database/kernels/xaxpy.hpp"
|
||||
#include "database/kernels/xdot.hpp"
|
||||
#include "database/kernels/xgemv.hpp"
|
||||
#include "database/kernels/xger.hpp"
|
||||
#include "database/kernels/xgemm.hpp"
|
||||
#include "database/kernels/copy.hpp"
|
||||
#include "database/kernels/pad.hpp"
|
||||
#include "database/kernels/transpose.hpp"
|
||||
#include "database/kernels/padtranspose.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -21,7 +21,7 @@
|
|||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "internal/utilities.h"
|
||||
#include "utilities.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
431
src/routine.cc
431
src/routine.cc
|
@ -1,431 +0,0 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Routine base class (see the header for information about the class).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "internal/routine.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: not much here, because no status codes can be returned
|
||||
template <typename T>
|
||||
Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
|
||||
const std::vector<std::string> &routines, const Precision precision):
|
||||
precision_(precision),
|
||||
routine_name_(name),
|
||||
queue_(queue),
|
||||
event_(event),
|
||||
context_(queue_.GetContext()),
|
||||
device_(queue_.GetDevice()),
|
||||
device_name_(device_.Name()),
|
||||
max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
|
||||
max_work_item_sizes_(device_.MaxWorkItemSizes()),
|
||||
max_work_group_size_(device_.MaxWorkGroupSize()),
|
||||
db_(queue_, routines, precision_) {
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Separate set-up function to allow for status codes to be returned. Makes sure a built program
// for this routine is present in the program cache: either it is already there, or it is re-built
// from a cached binary, or it is compiled from the OpenCL source.
// Returns kSuccess when a program is available, or one of kBuildProgramFailure, kInvalidBinary,
// kNoDoublePrecision or kNoHalfPrecision otherwise.
template <typename T>
StatusCode Routine<T>::SetUp() {

  // Queries the cache to see whether or not the program (context-specific) is already there
  if (ProgramIsInCache()) { return StatusCode::kSuccess; }

  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
  // is, a program is created from it and stored in the cache.
  if (BinaryIsInCache()) {
    try {
      auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
      auto program = Program(device_, context_, binary);
      auto options = std::vector<std::string>();
      const auto build_status = program.Build(device_, options);

      // A program that failed to build must neither be cached nor be reported as a success
      if (build_status == BuildStatus::kError) { return StatusCode::kBuildProgramFailure; }
      if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
      StoreProgramToCache(program);
    } catch (...) { return StatusCode::kBuildProgramFailure; }
    return StatusCode::kSuccess;
  }

  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
  // program will be added to the cache.

  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
  const auto extensions = device_.Capabilities();
  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
      return StatusCode::kNoDoublePrecision;
    }
  }

  // As above, but for cl_khr_fp16 (half precision)
  if (precision_ == Precision::kHalf) {
    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
      return StatusCode::kNoHalfPrecision;
    }
  }

  // Loads the common header (typedefs and defines and such). The included file supplies the
  // initializer expression of the string, so the include has to stay on its own line.
  std::string common_header =
    #include "kernels/common.opencl"
  ;

  // Collects the parameters for this device in the form of defines, and adds the precision
  auto defines = db_.GetDefines();
  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";

  // Adds the name of the routine as a define
  defines += "#define ROUTINE_"+routine_name_+"\n";

  // For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can
  // improve performance, but might result in a reduced accuracy.
  if (device_.IsAMD() && device_.IsGPU()) {
    defines += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device_.IsAMD() && device_.IsGPU()) {
    defines += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device_.IsARM() && device_.IsGPU()) {
    defines += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Combines everything together into a single source string
  const auto source_string = defines + common_header + source_string_;

  // Compiles the kernel
  try {
    auto program = Program(context_, source_string);
    auto options = std::vector<std::string>();
    const auto build_status = program.Build(device_, options);

    // Checks for compiler crashes/errors/warnings
    if (build_status == BuildStatus::kError) {
      const auto message = program.GetBuildInfo(device_);
      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
      return StatusCode::kBuildProgramFailure;
    }
    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }

    // Store the compiled binary and program in the cache
    const auto binary = program.GetIR();
    StoreBinaryToCache(binary);
    StoreProgramToCache(program);
  } catch (...) { return StatusCode::kBuildProgramFailure; }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Enqueues a kernel, waits for completion, and checks for errors
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global,
|
||||
const std::vector<size_t> &local, EventPointer event,
|
||||
std::vector<Event>& waitForEvents) {
|
||||
|
||||
// Tests for validity of the local thread sizes
|
||||
if (local.size() > max_work_item_dimensions_) {
|
||||
return StatusCode::kInvalidLocalNumDimensions;
|
||||
}
|
||||
for (auto i=size_t{0}; i<local.size(); ++i) {
|
||||
if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; }
|
||||
}
|
||||
auto local_size = size_t{1};
|
||||
for (auto &item: local) { local_size *= item; }
|
||||
if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }
|
||||
|
||||
// Make sure the global thread sizes are at least equal to the local sizes
|
||||
for (auto i=size_t{0}; i<global.size(); ++i) {
|
||||
if (global[i] < local[i]) { global[i] = local[i]; }
|
||||
}
|
||||
|
||||
// Tests for local memory usage
|
||||
const auto local_mem_usage = kernel.LocalMemUsage(device_);
|
||||
if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
|
||||
|
||||
// Launches the kernel (and checks for launch errors)
|
||||
try {
|
||||
kernel.Launch(queue_, global, local, event, waitForEvents);
|
||||
} catch (...) { return StatusCode::kKernelLaunchError; }
|
||||
|
||||
// No errors, normal termination of this function
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// As above, but without an event waiting list
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global,
|
||||
const std::vector<size_t> &local, EventPointer event) {
|
||||
auto emptyWaitingList = std::vector<Event>();
|
||||
return RunKernel(kernel, global, local, event, emptyWaitingList);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
|
||||
// sufficient buffer size.
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld, const size_t data_size) {
|
||||
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
|
||||
try {
|
||||
const auto required_size = (ld*(two-1) + one + offset)*data_size;
|
||||
const auto buffer_size = buffer.GetSize();
|
||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
|
||||
} catch (...) { return StatusCode::kInvalidMatrixA; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
|
||||
// sufficient buffer size.
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld, const size_t data_size) {
|
||||
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
|
||||
try {
|
||||
const auto required_size = (ld*(two-1) + one + offset)*data_size;
|
||||
const auto buffer_size = buffer.GetSize();
|
||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
|
||||
} catch (...) { return StatusCode::kInvalidMatrixB; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
|
||||
// sufficient buffer size.
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t ld, const size_t data_size) {
|
||||
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
|
||||
try {
|
||||
const auto required_size = (ld*(two-1) + one + offset)*data_size;
|
||||
const auto buffer_size = buffer.GetSize();
|
||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
|
||||
} catch (...) { return StatusCode::kInvalidMatrixC; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
|
||||
const size_t offset, const size_t data_size) {
|
||||
try {
|
||||
const auto required_size = (((n*(n+1))/2) + offset)*data_size;
|
||||
const auto buffer_size = buffer.GetSize();
|
||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
|
||||
} catch (...) { return StatusCode::kInvalidMatrixA; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
|
||||
// sufficient buffer size.
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||
const size_t inc, const size_t data_size) {
|
||||
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
|
||||
try {
|
||||
const auto required_size = ((n-1)*inc + 1 + offset)*data_size;
|
||||
const auto buffer_size = buffer.GetSize();
|
||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
|
||||
} catch (...) { return StatusCode::kInvalidVectorX; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a
|
||||
// sufficient buffer size.
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||
const size_t inc, const size_t data_size) {
|
||||
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
|
||||
try {
|
||||
const auto required_size = ((n-1)*inc + 1 + offset)*data_size;
|
||||
const auto buffer_size = buffer.GetSize();
|
||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
|
||||
} catch (...) { return StatusCode::kInvalidVectorY; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Tests vector dot for validity: checks for a valid increment, a valid OpenCL buffer, and for a
|
||||
// sufficient buffer size.
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||
const size_t data_size) {
|
||||
try {
|
||||
const auto required_size = (n + offset)*data_size;
|
||||
const auto buffer_size = buffer.GetSize();
|
||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
|
||||
} catch (...) { return StatusCode::kInvalidVectorDot; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Tests vector index for validity: checks for a valid increment, a valid OpenCL buffer, and for a
|
||||
// sufficient buffer size.
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
|
||||
const size_t offset, const size_t data_size) {
|
||||
try {
|
||||
const auto required_size = (n + offset)*data_size;
|
||||
const auto buffer_size = buffer.GetSize();
|
||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
|
||||
} catch (...) { return StatusCode::kInvalidVectorDot; }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Copies or transposes a matrix and optionally pads/unpads it with zeros
|
||||
template <typename T>
|
||||
StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
|
||||
const size_t src_one, const size_t src_two,
|
||||
const size_t src_ld, const size_t src_offset,
|
||||
const Buffer<T> &src,
|
||||
const size_t dest_one, const size_t dest_two,
|
||||
const size_t dest_ld, const size_t dest_offset,
|
||||
const Buffer<T> &dest,
|
||||
const T alpha,
|
||||
const Program &program, const bool do_pad,
|
||||
const bool do_transpose, const bool do_conjugate,
|
||||
const bool upper, const bool lower,
|
||||
const bool diagonal_imag_zero) {
|
||||
|
||||
// Determines whether or not the fast-version could potentially be used
|
||||
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
|
||||
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
|
||||
(upper == false) && (lower == false) && (diagonal_imag_zero == false);
|
||||
|
||||
// Determines the right kernel
|
||||
auto kernel_name = std::string{};
|
||||
if (do_transpose) {
|
||||
if (use_fast_kernel &&
|
||||
IsMultiple(src_ld, db_["TRA_WPT"]) &&
|
||||
IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
|
||||
IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) {
|
||||
kernel_name = "TransposeMatrixFast";
|
||||
}
|
||||
else {
|
||||
use_fast_kernel = false;
|
||||
kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (use_fast_kernel &&
|
||||
IsMultiple(src_ld, db_["COPY_VW"]) &&
|
||||
IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
|
||||
IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) {
|
||||
kernel_name = "CopyMatrixFast";
|
||||
}
|
||||
else {
|
||||
use_fast_kernel = false;
|
||||
kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
|
||||
}
|
||||
}
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
if (use_fast_kernel) {
|
||||
kernel.SetArgument(0, static_cast<int>(src_ld));
|
||||
kernel.SetArgument(1, src());
|
||||
kernel.SetArgument(2, dest());
|
||||
kernel.SetArgument(3, alpha_buffer());
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(0, static_cast<int>(src_one));
|
||||
kernel.SetArgument(1, static_cast<int>(src_two));
|
||||
kernel.SetArgument(2, static_cast<int>(src_ld));
|
||||
kernel.SetArgument(3, static_cast<int>(src_offset));
|
||||
kernel.SetArgument(4, src());
|
||||
kernel.SetArgument(5, static_cast<int>(dest_one));
|
||||
kernel.SetArgument(6, static_cast<int>(dest_two));
|
||||
kernel.SetArgument(7, static_cast<int>(dest_ld));
|
||||
kernel.SetArgument(8, static_cast<int>(dest_offset));
|
||||
kernel.SetArgument(9, dest());
|
||||
kernel.SetArgument(10, alpha_buffer());
|
||||
if (do_pad) {
|
||||
kernel.SetArgument(11, static_cast<int>(do_conjugate));
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(11, static_cast<int>(upper));
|
||||
kernel.SetArgument(12, static_cast<int>(lower));
|
||||
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
|
||||
}
|
||||
}
|
||||
|
||||
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
|
||||
// parameters in the database.
|
||||
if (do_transpose) {
|
||||
if (use_fast_kernel) {
|
||||
const auto global = std::vector<size_t>{
|
||||
dest_one / db_["TRA_WPT"],
|
||||
dest_two / db_["TRA_WPT"]
|
||||
};
|
||||
const auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
|
||||
return RunKernel(kernel, global, local, event, waitForEvents);
|
||||
}
|
||||
else {
|
||||
const auto global = std::vector<size_t>{
|
||||
Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
|
||||
Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])
|
||||
};
|
||||
const auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
|
||||
return RunKernel(kernel, global, local, event, waitForEvents);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (use_fast_kernel) {
|
||||
const auto global = std::vector<size_t>{
|
||||
dest_one / db_["COPY_VW"],
|
||||
dest_two / db_["COPY_WPT"]
|
||||
};
|
||||
const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
|
||||
return RunKernel(kernel, global, local, event, waitForEvents);
|
||||
}
|
||||
else {
|
||||
const auto global = std::vector<size_t>{
|
||||
Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
|
||||
Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])
|
||||
};
|
||||
const auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
|
||||
return RunKernel(kernel, global, local, event, waitForEvents);
|
||||
}
|
||||
}
|
||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Routine<half>;
|
||||
template class Routine<float>;
|
||||
template class Routine<double>;
|
||||
template class Routine<float2>;
|
||||
template class Routine<double2>;
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
131
src/routine.cpp
Normal file
131
src/routine.cpp
Normal file
|
@ -0,0 +1,131 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Routine base class (see the header for information about the class).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: not much here, because no status codes can be returned
|
||||
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
|
||||
const std::vector<std::string> &routines, const Precision precision):
|
||||
precision_(precision),
|
||||
routine_name_(name),
|
||||
queue_(queue),
|
||||
event_(event),
|
||||
context_(queue_.GetContext()),
|
||||
device_(queue_.GetDevice()),
|
||||
device_name_(device_.Name()),
|
||||
db_(queue_, routines, precision_) {
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Separate set-up function to allow for status codes to be returned. Makes sure a built program
// for this routine is present in the program cache: either it is already there, or it is re-built
// from a cached binary, or it is compiled from the OpenCL source.
// Returns kSuccess when a program is available, or one of kBuildProgramFailure, kInvalidBinary,
// kNoDoublePrecision or kNoHalfPrecision otherwise.
StatusCode Routine::SetUp() {

  // Queries the cache to see whether or not the program (context-specific) is already there
  if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }

  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
  // is, a program is created from it and stored in the cache.
  if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
    try {
      auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
      auto program = Program(device_, context_, binary);
      auto options = std::vector<std::string>();
      const auto build_status = program.Build(device_, options);

      // A program that failed to build must neither be cached nor be reported as a success
      if (build_status == BuildStatus::kError) { return StatusCode::kBuildProgramFailure; }
      if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
      StoreProgramToCache(program, context_, precision_, routine_name_);
    } catch (...) { return StatusCode::kBuildProgramFailure; }
    return StatusCode::kSuccess;
  }

  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
  // program will be added to the cache.

  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
  const auto extensions = device_.Capabilities();
  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
      return StatusCode::kNoDoublePrecision;
    }
  }

  // As above, but for cl_khr_fp16 (half precision)
  if (precision_ == Precision::kHalf) {
    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
      return StatusCode::kNoHalfPrecision;
    }
  }

  // Loads the common header (typedefs and defines and such). The included file supplies the
  // initializer expression of the string, so the include has to stay on its own line.
  std::string common_header =
    #include "kernels/common.opencl"
  ;

  // Collects the parameters for this device in the form of defines, and adds the precision
  auto defines = db_.GetDefines();
  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";

  // Adds the name of the routine as a define
  defines += "#define ROUTINE_"+routine_name_+"\n";

  // For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can
  // improve performance, but might result in a reduced accuracy.
  if (device_.IsAMD() && device_.IsGPU()) {
    defines += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device_.IsAMD() && device_.IsGPU()) {
    defines += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device_.IsARM() && device_.IsGPU()) {
    defines += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Combines everything together into a single source string
  const auto source_string = defines + common_header + source_string_;

  // Compiles the kernel
  try {
    auto program = Program(context_, source_string);
    auto options = std::vector<std::string>();
    const auto build_status = program.Build(device_, options);

    // Checks for compiler crashes/errors/warnings
    if (build_status == BuildStatus::kError) {
      const auto message = program.GetBuildInfo(device_);
      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
      return StatusCode::kBuildProgramFailure;
    }
    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }

    // Store the compiled binary and program in the cache
    const auto binary = program.GetIR();
    StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
    StoreProgramToCache(program, context_, precision_, routine_name_);
  } catch (...) { return StatusCode::kBuildProgramFailure; }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
68
src/routine.hpp
Normal file
68
src/routine.hpp
Normal file
|
@ -0,0 +1,68 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements all the basic functionality for the BLAS routines. This class serves as a
|
||||
// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
|
||||
// compiling the OpenCL kernel, connecting to the database, etc.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINE_H_
|
||||
#define CLBLAST_ROUTINE_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "utilities.hpp"
|
||||
#include "cache.hpp"
|
||||
#include "buffer_test.hpp"
|
||||
#include "database/database.hpp"
|
||||
#include "routines/common.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Base class for the BLAS routines (see the comment at the top of this file). Routine classes
// such as Xamax, Xasum, Xaxpy and Xcopy derive from it.
// NOTE(review): data members are initialised in declaration order (C++ rule); the constructor is
// not defined in this header, so confirm its initialiser list before re-ordering any member.
|
||||
class Routine {
|
||||
public:
|
||||
|
||||
// Base class constructor; derived classes forward to it from their own constructors
|
||||
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
|
||||
const std::vector<std::string> &routines, const Precision precision);
|
||||
|
||||
// Set-up phase of the kernel; reports its outcome through the returned status code
|
||||
StatusCode SetUp();
|
||||
|
||||
protected:
|
||||
|
||||
// Non-static variable for the precision (derived classes pass in PrecisionValue<T>())
|
||||
const Precision precision_;
|
||||
|
||||
// The routine's name and its kernel-source in string form. The source string is not const:
|
||||
// derived-class constructors assign the OpenCL kernel source to it.
|
||||
const std::string routine_name_;
|
||||
|
||||
std::string source_string_;
|
||||
|
||||
|
||||
// The OpenCL objects, accessible only from derived classes
|
||||
Queue queue_;
|
||||
|
||||
EventPointer event_;
|
||||
|
||||
const Context context_;
|
||||
|
||||
const Device device_;
|
||||
|
||||
|
||||
// OpenCL device properties
|
||||
const std::string device_name_;
|
||||
|
||||
|
||||
// Connection to the database for all the device-specific parameters; derived classes look up
|
||||
// parameters by name, e.g. db_["WGS1"]
|
||||
const Database db_;
|
||||
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_ROUTINE_H_
|
||||
#endif
|
65
src/routines/common.cpp
Normal file
65
src/routines/common.cpp
Normal file
|
@ -0,0 +1,65 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the common routine functions (see the header for more information).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "routines/common.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Enqueues a kernel, waits for completion, and checks for errors
|
||||
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
||||
std::vector<size_t> global, const std::vector<size_t> &local,
|
||||
EventPointer event, std::vector<Event>& waitForEvents) {
|
||||
|
||||
// Tests for validity of the local thread sizes
|
||||
if (local.size() > device.MaxWorkItemDimensions()) {
|
||||
return StatusCode::kInvalidLocalNumDimensions;
|
||||
}
|
||||
const auto max_work_item_sizes = device.MaxWorkItemSizes();
|
||||
for (auto i=size_t{0}; i<local.size(); ++i) {
|
||||
if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
|
||||
}
|
||||
auto local_size = size_t{1};
|
||||
for (auto &item: local) { local_size *= item; }
|
||||
if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
|
||||
|
||||
// Make sure the global thread sizes are at least equal to the local sizes
|
||||
for (auto i=size_t{0}; i<global.size(); ++i) {
|
||||
if (global[i] < local[i]) { global[i] = local[i]; }
|
||||
}
|
||||
|
||||
// Tests for local memory usage
|
||||
const auto local_mem_usage = kernel.LocalMemUsage(device);
|
||||
if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
|
||||
|
||||
// Launches the kernel (and checks for launch errors)
|
||||
try {
|
||||
kernel.Launch(queue, global, local, event, waitForEvents);
|
||||
} catch (...) { return StatusCode::kKernelLaunchError; }
|
||||
|
||||
// No errors, normal termination of this function
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// As above, but without an event waiting list
|
||||
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
||||
std::vector<size_t> global, const std::vector<size_t> &local,
|
||||
EventPointer event) {
|
||||
auto emptyWaitingList = std::vector<Event>();
|
||||
return RunKernel(kernel, queue, device, global, local, event, emptyWaitingList);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
173
src/routines/common.hpp
Normal file
173
src/routines/common.hpp
Normal file
|
@ -0,0 +1,173 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains all the interfaces to common kernels, such as copying, padding, and
|
||||
// transposing a matrix. These functions are templated and thus header-only. This file also contains
|
||||
// other common functions to routines, such as a function to launch a kernel.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINES_COMMON_H_
|
||||
#define CLBLAST_ROUTINES_COMMON_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "clblast.h"
|
||||
#include "clpp11.hpp"
|
||||
#include "database/database.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Enqueues a kernel, waits for completion, and checks for errors
|
||||
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
||||
std::vector<size_t> global, const std::vector<size_t> &local,
|
||||
EventPointer event, std::vector<Event>& waitForEvents);
|
||||
|
||||
// As above, but without an event waiting list
|
||||
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
||||
std::vector<size_t> global, const std::vector<size_t> &local,
|
||||
EventPointer event);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
|
||||
// to write to symmetric and triangular matrices through optional arguments.
|
||||
template <typename T>
|
||||
StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
|
||||
const Database &db,
|
||||
EventPointer event, std::vector<Event>& waitForEvents,
|
||||
const size_t src_one, const size_t src_two,
|
||||
const size_t src_ld, const size_t src_offset,
|
||||
const Buffer<T> &src,
|
||||
const size_t dest_one, const size_t dest_two,
|
||||
const size_t dest_ld, const size_t dest_offset,
|
||||
const Buffer<T> &dest,
|
||||
const T alpha,
|
||||
const Program &program, const bool do_pad,
|
||||
const bool do_transpose, const bool do_conjugate,
|
||||
const bool upper = false, const bool lower = false,
|
||||
const bool diagonal_imag_zero = false) {
|
||||
|
||||
// Determines whether or not the fast-version could potentially be used
|
||||
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
|
||||
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
|
||||
(upper == false) && (lower == false) && (diagonal_imag_zero == false);
|
||||
|
||||
// Determines the right kernel
|
||||
auto kernel_name = std::string{};
|
||||
if (do_transpose) {
|
||||
if (use_fast_kernel &&
|
||||
IsMultiple(src_ld, db["TRA_WPT"]) &&
|
||||
IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) &&
|
||||
IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) {
|
||||
kernel_name = "TransposeMatrixFast";
|
||||
}
|
||||
else {
|
||||
use_fast_kernel = false;
|
||||
kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (use_fast_kernel &&
|
||||
IsMultiple(src_ld, db["COPY_VW"]) &&
|
||||
IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
|
||||
IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) {
|
||||
kernel_name = "CopyMatrixFast";
|
||||
}
|
||||
else {
|
||||
use_fast_kernel = false;
|
||||
kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
|
||||
}
|
||||
}
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context, 1);
|
||||
alpha_buffer.Write(queue, 1, &alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
if (use_fast_kernel) {
|
||||
kernel.SetArgument(0, static_cast<int>(src_ld));
|
||||
kernel.SetArgument(1, src());
|
||||
kernel.SetArgument(2, dest());
|
||||
kernel.SetArgument(3, alpha_buffer());
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(0, static_cast<int>(src_one));
|
||||
kernel.SetArgument(1, static_cast<int>(src_two));
|
||||
kernel.SetArgument(2, static_cast<int>(src_ld));
|
||||
kernel.SetArgument(3, static_cast<int>(src_offset));
|
||||
kernel.SetArgument(4, src());
|
||||
kernel.SetArgument(5, static_cast<int>(dest_one));
|
||||
kernel.SetArgument(6, static_cast<int>(dest_two));
|
||||
kernel.SetArgument(7, static_cast<int>(dest_ld));
|
||||
kernel.SetArgument(8, static_cast<int>(dest_offset));
|
||||
kernel.SetArgument(9, dest());
|
||||
kernel.SetArgument(10, alpha_buffer());
|
||||
if (do_pad) {
|
||||
kernel.SetArgument(11, static_cast<int>(do_conjugate));
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(11, static_cast<int>(upper));
|
||||
kernel.SetArgument(12, static_cast<int>(lower));
|
||||
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
|
||||
}
|
||||
}
|
||||
|
||||
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
|
||||
// parameters in the database.
|
||||
if (do_transpose) {
|
||||
if (use_fast_kernel) {
|
||||
const auto global = std::vector<size_t>{
|
||||
dest_one / db["TRA_WPT"],
|
||||
dest_two / db["TRA_WPT"]
|
||||
};
|
||||
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
|
||||
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
|
||||
}
|
||||
else {
|
||||
const auto global = std::vector<size_t>{
|
||||
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
|
||||
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
|
||||
};
|
||||
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
|
||||
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (use_fast_kernel) {
|
||||
const auto global = std::vector<size_t>{
|
||||
dest_one / db["COPY_VW"],
|
||||
dest_two / db["COPY_WPT"]
|
||||
};
|
||||
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
|
||||
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
|
||||
}
|
||||
else {
|
||||
const auto global = std::vector<size_t>{
|
||||
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
|
||||
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
|
||||
};
|
||||
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
|
||||
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
|
||||
}
|
||||
}
|
||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_ROUTINES_COMMON_H_
|
||||
#endif
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xamax.h"
|
||||
#include "routines/level1/xamax.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xamax<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xamax<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
|
||||
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level1/xamax.opencl"
|
||||
;
|
||||
|
@ -49,14 +40,14 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
|
|||
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Tests the vectors for validity
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorIndex(1, imax_buffer, imax_offset, sizeof(unsigned int));
|
||||
status = TestVectorIndex(1, imax_buffer, imax_offset);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Retrieves the Xamax kernels from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel1 = Kernel(program, "Xamax");
|
||||
auto kernel2 = Kernel(program, "XamaxEpilogue");
|
||||
|
||||
|
@ -80,7 +71,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
|
|||
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
||||
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
||||
auto kernelEvent = Event();
|
||||
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
|
||||
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
|
||||
if (ErrorIn(status)) { return status; }
|
||||
eventWaitList.push_back(kernelEvent);
|
||||
|
||||
|
@ -93,7 +84,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
|
|||
// Launches the epilogue kernel
|
||||
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
||||
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
||||
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
|
||||
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
|
@ -14,28 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XAMAX_H_
|
||||
#define CLBLAST_ROUTINES_XAMAX_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xamax: public Routine<T> {
|
||||
class Xamax: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorIndex;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
|
||||
|
||||
|
@ -43,10 +31,6 @@ class Xamax: public Routine<T> {
|
|||
StatusCode DoAmax(const size_t n,
|
||||
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xasum.h"
|
||||
#include "routines/level1/xasum.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xasum<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xasum<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
|
||||
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level1/xasum.opencl"
|
||||
;
|
||||
|
@ -49,14 +40,14 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
|
|||
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Tests the vectors for validity
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T));
|
||||
status = TestVectorScalar(1, asum_buffer, asum_offset);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Retrieves the Xasum kernels from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel1 = Kernel(program, "Xasum");
|
||||
auto kernel2 = Kernel(program, "XasumEpilogue");
|
||||
|
||||
|
@ -78,7 +69,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
|
|||
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
||||
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
||||
auto kernelEvent = Event();
|
||||
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
|
||||
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
|
||||
if (ErrorIn(status)) { return status; }
|
||||
eventWaitList.push_back(kernelEvent);
|
||||
|
||||
|
@ -90,7 +81,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
|
|||
// Launches the epilogue kernel
|
||||
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
||||
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
||||
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
|
||||
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
|
@ -14,28 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XASUM_H_
|
||||
#define CLBLAST_ROUTINES_XASUM_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xasum: public Routine<T> {
|
||||
class Xasum: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorDot;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
|
||||
|
||||
|
@ -43,10 +31,6 @@ class Xasum: public Routine<T> {
|
|||
StatusCode DoAsum(const size_t n,
|
||||
const Buffer<T> &asum_buffer, const size_t asum_offset,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xaxpy.h"
|
||||
#include "routines/level1/xaxpy.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xaxpy<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
||||
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level1/level1.opencl"
|
||||
#include "../../kernels/level1/xaxpy.opencl"
|
||||
|
@ -50,9 +41,9 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
|||
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Tests the vectors for validity
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Determines whether or not the fast-version can be used
|
||||
|
@ -65,7 +56,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
|||
|
||||
// Retrieves the Xaxpy kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
|
@ -94,13 +85,13 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
|||
if (use_fast_kernel) {
|
||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
else {
|
||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
|
@ -14,28 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XAXPY_H_
|
||||
#define CLBLAST_ROUTINES_XAXPY_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xaxpy: public Routine<T> {
|
||||
class Xaxpy: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
|
||||
|
||||
|
@ -43,10 +31,6 @@ class Xaxpy: public Routine<T> {
|
|||
StatusCode DoAxpy(const size_t n, const T alpha,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xcopy.h"
|
||||
#include "routines/level1/xcopy.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xcopy<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xcopy<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xcopy<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xcopy<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
||||
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level1/level1.opencl"
|
||||
#include "../../kernels/level1/xcopy.opencl"
|
||||
|
@ -50,9 +41,9 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
|
|||
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Tests the vectors for validity
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Determines whether or not the fast-version can be used
|
||||
|
@ -65,7 +56,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
|
|||
|
||||
// Retrieves the Xcopy kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
|
@ -88,13 +79,13 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
|
|||
if (use_fast_kernel) {
|
||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
else {
|
||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
|
@ -14,27 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XCOPY_H_
|
||||
#define CLBLAST_ROUTINES_XCOPY_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xcopy: public Routine<T> {
|
||||
class Xcopy: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
|
||||
|
||||
|
@ -42,10 +31,6 @@ class Xcopy: public Routine<T> {
|
|||
StatusCode DoCopy(const size_t n,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xdot.h"
|
||||
#include "routines/level1/xdot.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xdot<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xdot<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xdot<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xdot<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
|
||||
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level1/xdot.opencl"
|
||||
;
|
||||
|
@ -51,16 +42,16 @@ StatusCode Xdot<T>::DoDot(const size_t n,
|
|||
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Tests the vectors for validity
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorDot(1, dot_buffer, dot_offset, sizeof(T));
|
||||
status = TestVectorScalar(1, dot_buffer, dot_offset);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Retrieves the Xdot kernels from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel1 = Kernel(program, "Xdot");
|
||||
auto kernel2 = Kernel(program, "XdotEpilogue");
|
||||
|
||||
|
@ -86,7 +77,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
|
|||
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
||||
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
||||
auto kernelEvent = Event();
|
||||
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
|
||||
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
|
||||
if (ErrorIn(status)) { return status; }
|
||||
eventWaitList.push_back(kernelEvent);
|
||||
|
||||
|
@ -98,7 +89,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
|
|||
// Launches the epilogue kernel
|
||||
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
||||
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
||||
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
|
||||
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
|
@ -14,29 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XDOT_H_
|
||||
#define CLBLAST_ROUTINES_XDOT_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xdot: public Routine<T> {
|
||||
class Xdot: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
using Routine<T>::TestVectorDot;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
|
||||
|
||||
|
@ -46,10 +33,6 @@ class Xdot: public Routine<T> {
|
|||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const bool do_conjugate = false);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xdotc.h"
|
||||
#include "routines/level1/xdotc.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XDOTC_H_
|
||||
#define CLBLAST_ROUTINES_XDOTC_H_
|
||||
|
||||
#include "internal/routines/level1/xdot.h"
|
||||
#include "routines/level1/xdot.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xdotu.h"
|
||||
#include "routines/level1/xdotu.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XDOTU_H_
|
||||
#define CLBLAST_ROUTINES_XDOTU_H_
|
||||
|
||||
#include "internal/routines/level1/xdot.h"
|
||||
#include "routines/level1/xdot.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -14,8 +14,8 @@
|
|||
#ifndef CLBLAST_ROUTINES_XMAX_H_
|
||||
#define CLBLAST_ROUTINES_XMAX_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "internal/routines/level1/xamax.h"
|
||||
#include "routine.hpp"
|
||||
#include "routines/level1/xamax.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -14,8 +14,8 @@
|
|||
#ifndef CLBLAST_ROUTINES_XMIN_H_
|
||||
#define CLBLAST_ROUTINES_XMIN_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "internal/routines/level1/xamax.h"
|
||||
#include "routine.hpp"
|
||||
#include "routines/level1/xamax.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xnrm2.h"
|
||||
#include "routines/level1/xnrm2.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xnrm2<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
|
||||
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level1/xnrm2.opencl"
|
||||
;
|
||||
|
@ -49,14 +40,14 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
|
|||
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Tests the vectors for validity
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorDot(1, nrm2_buffer, nrm2_offset, sizeof(T));
|
||||
status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Retrieves the Xnrm2 kernels from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel1 = Kernel(program, "Xnrm2");
|
||||
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
|
||||
|
||||
|
@ -78,7 +69,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
|
|||
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
||||
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
||||
auto kernelEvent = Event();
|
||||
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
|
||||
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
|
||||
if (ErrorIn(status)) { return status; }
|
||||
eventWaitList.push_back(kernelEvent);
|
||||
|
||||
|
@ -90,7 +81,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
|
|||
// Launches the epilogue kernel
|
||||
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
||||
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
||||
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
|
||||
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
|
@ -14,28 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XNRM2_H_
|
||||
#define CLBLAST_ROUTINES_XNRM2_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xnrm2: public Routine<T> {
|
||||
class Xnrm2: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorDot;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
|
||||
|
||||
|
@ -43,10 +31,6 @@ class Xnrm2: public Routine<T> {
|
|||
StatusCode DoNrm2(const size_t n,
|
||||
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xscal.h"
|
||||
#include "routines/level1/xscal.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xscal<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xscal<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xscal<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xscal<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
||||
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level1/level1.opencl"
|
||||
#include "../../kernels/level1/xscal.opencl"
|
||||
|
@ -49,7 +40,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
|
|||
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Tests the vector for validity
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Determines whether or not the fast-version can be used
|
||||
|
@ -61,7 +52,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
|
|||
|
||||
// Retrieves the Xscal kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
|
@ -82,13 +73,13 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
|
|||
if (use_fast_kernel) {
|
||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
else {
|
||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
|
@ -14,36 +14,22 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSCAL_H_
|
||||
#define CLBLAST_ROUTINES_XSCAL_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xscal: public Routine<T> {
|
||||
class Xscal: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
|
||||
|
||||
// Templated-precision implementation of the routine
|
||||
StatusCode DoScal(const size_t n, const T alpha,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -14,8 +14,8 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSUM_H_
|
||||
#define CLBLAST_ROUTINES_XSUM_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "internal/routines/level1/xasum.h"
|
||||
#include "routine.hpp"
|
||||
#include "routines/level1/xasum.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level1/xswap.h"
|
||||
#include "routines/level1/xswap.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xswap<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xswap<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xswap<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xswap<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
||||
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level1/level1.opencl"
|
||||
#include "../../kernels/level1/xswap.opencl"
|
||||
|
@ -50,9 +41,9 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
|
|||
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Tests the vectors for validity
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Determines whether or not the fast-version can be used
|
||||
|
@ -65,7 +56,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
|
|||
|
||||
// Retrieves the Xswap kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
|
@ -88,13 +79,13 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
|
|||
if (use_fast_kernel) {
|
||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
else {
|
||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
|
@ -14,27 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSWAP_H_
|
||||
#define CLBLAST_ROUTINES_XSWAP_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xswap: public Routine<T> {
|
||||
class Xswap: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
|
||||
|
||||
|
@ -42,10 +31,6 @@ class Xswap: public Routine<T> {
|
|||
StatusCode DoSwap(const size_t n,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xgbmv.h"
|
||||
#include "routines/level2/xgbmv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XGBMV_H_
|
||||
#define CLBLAST_ROUTINES_XGBMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xgemv<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
|
||||
Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level2/xgemv.opencl"
|
||||
#include "../../kernels/level2/xgemv_fast.opencl"
|
||||
|
@ -101,12 +92,12 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
|||
|
||||
// Tests the matrix and the vectors for validity
|
||||
auto status = StatusCode::kSuccess;
|
||||
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
|
||||
else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
|
||||
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
|
||||
else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Determines whether or not the fast-version can be used
|
||||
|
@ -143,7 +134,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
|||
|
||||
// Retrieves the Xgemv kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
|
@ -169,7 +160,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
|||
// Launches the kernel
|
||||
auto global = std::vector<size_t>{global_size};
|
||||
auto local = std::vector<size_t>{local_size};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
|
@ -14,30 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XGEMV_H_
|
||||
#define CLBLAST_ROUTINES_XGEMV_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xgemv: public Routine<T> {
|
||||
class Xgemv: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
using Routine<T>::TestMatrixA;
|
||||
using Routine<T>::TestMatrixAP;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
|
||||
|
||||
|
@ -61,10 +47,6 @@ class Xgemv: public Routine<T> {
|
|||
bool fast_kernel, bool fast_kernel_rot,
|
||||
const size_t parameter, const bool packed,
|
||||
const size_t kl, const size_t ku);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xger.h"
|
||||
#include "routines/level2/xger.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xger<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xger<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xger<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xger"}, precision_) {
|
||||
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level2/level2.opencl"
|
||||
#include "../../kernels/level2/xger.opencl"
|
||||
|
@ -58,11 +49,11 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
|||
const auto a_two = (a_is_rowmajor) ? m : n;
|
||||
|
||||
// Tests the matrix and the vectors for validity
|
||||
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
status = TestVectorX(m, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
|
@ -71,7 +62,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
|||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, "Xger");
|
||||
|
||||
// Sets the kernel arguments
|
||||
|
@ -94,7 +85,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
|||
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
|
||||
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
|
||||
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
|
@ -14,29 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XGER_H_
|
||||
#define CLBLAST_ROUTINES_XGER_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xger: public Routine<T> {
|
||||
class Xger: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
using Routine<T>::TestMatrixA;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
|
||||
|
||||
|
@ -47,10 +34,6 @@ class Xger: public Routine<T> {
|
|||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xgerc.h"
|
||||
#include "routines/level2/xgerc.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XGERC_H_
|
||||
#define CLBLAST_ROUTINES_XGERC_H_
|
||||
|
||||
#include "internal/routines/level2/xger.h"
|
||||
#include "routines/level2/xger.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xgeru.h"
|
||||
#include "routines/level2/xgeru.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XGERU_H_
|
||||
#define CLBLAST_ROUTINES_XGERU_H_
|
||||
|
||||
#include "internal/routines/level2/xger.h"
|
||||
#include "routines/level2/xger.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xhbmv.h"
|
||||
#include "routines/level2/xhbmv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XHBMV_H_
|
||||
#define CLBLAST_ROUTINES_XHBMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xhemv.h"
|
||||
#include "routines/level2/xhemv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XHEMV_H_
|
||||
#define CLBLAST_ROUTINES_XHEMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,26 +11,17 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xher.h"
|
||||
#include "routines/level2/xher.hpp"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xher<half, half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T, typename U>
|
||||
Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xger"}, precision_) {
|
||||
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level2/level2.opencl"
|
||||
#include "../../kernels/level2/xher.opencl"
|
||||
|
@ -67,10 +58,10 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
|||
|
||||
// Tests the matrix and the vectors for validity
|
||||
auto status = StatusCode::kSuccess;
|
||||
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
|
||||
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
|
||||
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
|
||||
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// If alpha is zero an update is not required
|
||||
|
@ -85,7 +76,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
|||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, "Xher");
|
||||
|
||||
// Sets the kernel arguments
|
||||
|
@ -105,7 +96,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
|||
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
|
||||
auto global = std::vector<size_t>{global_one, global_two};
|
||||
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
|
@ -14,29 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XHER_H_
|
||||
#define CLBLAST_ROUTINES_XHER_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T, typename U>
|
||||
class Xher: public Routine<T> {
|
||||
class Xher: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestMatrixA;
|
||||
using Routine<T>::TestMatrixAP;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
|
||||
|
||||
|
@ -50,10 +37,6 @@ class Xher: public Routine<T> {
|
|||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const bool packed = false);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,26 +11,17 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xher2.h"
|
||||
#include "routines/level2/xher2.hpp"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xher2<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Xger"}, precision_) {
|
||||
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level2/level2.opencl"
|
||||
#include "../../kernels/level2/xher2.opencl"
|
||||
|
@ -59,12 +50,12 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
|
|||
|
||||
// Tests the matrix and the vectors for validity
|
||||
auto status = StatusCode::kSuccess;
|
||||
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
|
||||
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
|
||||
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
|
||||
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||
status = TestVectorX(n, x_buffer, x_offset, x_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
status = TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
|
@ -73,7 +64,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
|
|||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, "Xher2");
|
||||
|
||||
// Sets the kernel arguments
|
||||
|
@ -96,7 +87,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
|
|||
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
|
||||
auto global = std::vector<size_t>{global_one, global_two};
|
||||
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
|
||||
status = RunKernel(kernel, global, local, event_);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
|
@ -14,30 +14,16 @@
|
|||
#ifndef CLBLAST_ROUTINES_XHER2_H_
|
||||
#define CLBLAST_ROUTINES_XHER2_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xher2: public Routine<T> {
|
||||
class Xher2: public Routine {
|
||||
public:
|
||||
|
||||
// Members and methods from the base class
|
||||
using Routine<T>::db_;
|
||||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
using Routine<T>::TestMatrixA;
|
||||
using Routine<T>::TestMatrixAP;
|
||||
using Routine<T>::RunKernel;
|
||||
using Routine<T>::ErrorIn;
|
||||
|
||||
// Constructor
|
||||
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
|
||||
|
||||
|
@ -49,10 +35,6 @@ class Xher2: public Routine<T> {
|
|||
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const bool packed = false);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xhpmv.h"
|
||||
#include "routines/level2/xhpmv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XHPMV_H_
|
||||
#define CLBLAST_ROUTINES_XHPMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xhpr.h"
|
||||
#include "routines/level2/xhpr.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XHPR_H_
|
||||
#define CLBLAST_ROUTINES_XHPR_H_
|
||||
|
||||
#include "internal/routines/level2/xher.h"
|
||||
#include "routines/level2/xher.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xhpr2.h"
|
||||
#include "routines/level2/xhpr2.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XHPR2_H_
|
||||
#define CLBLAST_ROUTINES_XHPR2_H_
|
||||
|
||||
#include "internal/routines/level2/xher2.h"
|
||||
#include "routines/level2/xher2.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xsbmv.h"
|
||||
#include "routines/level2/xsbmv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSBMV_H_
|
||||
#define CLBLAST_ROUTINES_XSBMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xspmv.h"
|
||||
#include "routines/level2/xspmv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSPMV_H_
|
||||
#define CLBLAST_ROUTINES_XSPMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xspr.h"
|
||||
#include "routines/level2/xspr.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSPR_H_
|
||||
#define CLBLAST_ROUTINES_XSPR_H_
|
||||
|
||||
#include "internal/routines/level2/xher.h"
|
||||
#include "routines/level2/xher.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xspr2.h"
|
||||
#include "routines/level2/xspr2.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSPR2_H_
|
||||
#define CLBLAST_ROUTINES_XSPR2_H_
|
||||
|
||||
#include "internal/routines/level2/xher2.h"
|
||||
#include "routines/level2/xher2.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xsymv.h"
|
||||
#include "routines/level2/xsymv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSYMV_H_
|
||||
#define CLBLAST_ROUTINES_XSYMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xsyr.h"
|
||||
#include "routines/level2/xsyr.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSYR_H_
|
||||
#define CLBLAST_ROUTINES_XSYR_H_
|
||||
|
||||
#include "internal/routines/level2/xher.h"
|
||||
#include "routines/level2/xher.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xsyr2.h"
|
||||
#include "routines/level2/xsyr2.hpp"
|
||||
|
||||
#include <string>
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XSYR2_H_
|
||||
#define CLBLAST_ROUTINES_XSYR2_H_
|
||||
|
||||
#include "internal/routines/level2/xher2.h"
|
||||
#include "routines/level2/xher2.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xtbmv.h"
|
||||
#include "routines/level2/xtbmv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XTBMV_H_
|
||||
#define CLBLAST_ROUTINES_XTBMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
@ -25,12 +25,10 @@ namespace clblast {
|
|||
template <typename T>
|
||||
class Xtbmv: public Xgemv<T> {
|
||||
public:
|
||||
|
||||
// Members from the base class
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::context_;
|
||||
|
||||
// Uses the generic matrix-vector routine
|
||||
using Xgemv<T>::queue_;
|
||||
using Xgemv<T>::context_;
|
||||
using Xgemv<T>::MatVec;
|
||||
|
||||
// Constructor
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xtpmv.h"
|
||||
#include "routines/level2/xtpmv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XTPMV_H_
|
||||
#define CLBLAST_ROUTINES_XTPMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
@ -25,12 +25,10 @@ namespace clblast {
|
|||
template <typename T>
|
||||
class Xtpmv: public Xgemv<T> {
|
||||
public:
|
||||
|
||||
// Members from the base class
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::context_;
|
||||
|
||||
// Uses the generic matrix-vector routine
|
||||
using Xgemv<T>::queue_;
|
||||
using Xgemv<T>::context_;
|
||||
using Xgemv<T>::MatVec;
|
||||
|
||||
// Constructor
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level2/xtrmv.h"
|
||||
#include "routines/level2/xtrmv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef CLBLAST_ROUTINES_XTRMV_H_
|
||||
#define CLBLAST_ROUTINES_XTRMV_H_
|
||||
|
||||
#include "internal/routines/level2/xgemv.h"
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
@ -25,12 +25,10 @@ namespace clblast {
|
|||
template <typename T>
|
||||
class Xtrmv: public Xgemv<T> {
|
||||
public:
|
||||
|
||||
// Members from the base class
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::context_;
|
||||
|
||||
// Uses the generic matrix-vector routine
|
||||
using Xgemv<T>::queue_;
|
||||
using Xgemv<T>::context_;
|
||||
using Xgemv<T>::MatVec;
|
||||
|
||||
// Constructor
|
|
@ -11,7 +11,7 @@
|
|||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/level3/xgemm.h"
|
||||
#include "routines/level3/xgemm.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -19,19 +19,10 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xgemm<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
|
||||
Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
|
||||
source_string_ =
|
||||
#include "../../kernels/level3/level3.opencl"
|
||||
#include "../../kernels/level3/copy_fast.opencl"
|
||||
|
@ -96,11 +87,11 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// matrix A cannot be less than K when rotated, or less than M when not-rotated
|
||||
// matrix B cannot be less than N when rotated, or less than K when not-rotated
|
||||
// matrix C cannot be less than N when rotated, or less than M when not-rotated
|
||||
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
|
||||
status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
|
||||
status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Calculates the ceiled versions of m, n, and k
|
||||
|
@ -112,7 +103,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
try {
|
||||
|
||||
// Loads the program from the database
|
||||
const auto program = GetProgramFromCache();
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
|
||||
// Determines whether or not temporary matrices are needed
|
||||
auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
|
||||
|
@ -142,7 +133,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// case nothing has to be done, these kernels can be skipped.
|
||||
if (!a_no_temp) {
|
||||
auto eventProcessA = Event();
|
||||
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
|
||||
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
|
||||
a_one, a_two, a_ld, a_offset, a_buffer,
|
||||
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
|
||||
ConstantOne<T>(), program,
|
||||
|
@ -154,7 +145,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// As above, but now for matrix B
|
||||
if (!b_no_temp) {
|
||||
auto eventProcessB = Event();
|
||||
status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
|
||||
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
|
||||
b_one, b_two, b_ld, b_offset, b_buffer,
|
||||
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
|
||||
ConstantOne<T>(), program,
|
||||
|
@ -166,7 +157,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
|
||||
if (!c_no_temp && beta != static_cast<T>(0)) {
|
||||
auto eventProcessC = Event();
|
||||
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
|
||||
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
|
||||
c_one, c_two, c_ld, c_offset, c_buffer,
|
||||
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
|
||||
ConstantOne<T>(), program,
|
||||
|
@ -199,13 +190,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// Launches the kernel
|
||||
auto eventKernel = Event();
|
||||
auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
|
||||
status = RunKernel(kernel, global, local, eventPointer, eventWaitList);
|
||||
status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Runs the post-processing kernel if needed
|
||||
if (!c_no_temp) {
|
||||
eventWaitList.push_back(eventKernel);
|
||||
status = PadCopyTransposeMatrix(event_, eventWaitList,
|
||||
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
|
||||
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
|
||||
c_one, c_two, c_ld, c_offset, c_buffer,
|
||||
ConstantOne<T>(), program,
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue