diff --git a/.appveyor.yml b/.appveyor.yml index 8597e43e..eb7f1c97 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,8 +1,8 @@ environment: global: - CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\clblast" + CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\..\\bin\\clblast" OPENCL_REGISTRY: "https://www.khronos.org/registry/cl" - OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl" + OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\..\\bin\\opencl" platform: - x64 diff --git a/.gitignore b/.gitignore index bcb32754..8ccab476 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,6 @@ build stash .* *.pyc -*.db +database.json +database_best.json cl.hpp \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 8e1a80db..0465afa4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,49 +17,21 @@ addons: - kubuntu-backports packages: - cmake + - ocl-icd-opencl-dev env: global: - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast - - OPENCL_REGISTRY=https://www.khronos.org/registry/cl - - OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl before_install: - cmake --version; - ${CC} --version; - ${CXX} --version; -install: - # The following linux logic is necessary because of Travis's move to the GCE platform, which does not - # currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221 - # We build our own linkable .so file - - if [ ${TRAVIS_OS_NAME} == "linux" ]; then - mkdir -p ${OPENCL_ROOT}; - pushd ${OPENCL_ROOT}; - travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; - mv ./OpenCL-ICD-Loader/* .; - travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL; - pushd inc/CL; - travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp; - popd; - mkdir -p lib; - pushd lib; - cmake -G "Unix Makefiles" ..; - make; - cp ./bin/libOpenCL.so .; - popd; - pushd inc/CL; - travis_retry git fetch origin opencl12:opencl12; - git checkout opencl12; - popd; - mv inc/ include/; - popd; - fi 
- before_script: - mkdir -p ${CLBLAST_ROOT} - pushd ${CLBLAST_ROOT} - - cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR} + - cmake -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR} script: - make diff --git a/CHANGELOG b/CHANGELOG index b49424c9..1995dc84 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,19 @@ +Version 0.9.0 +- Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header +- Improved performance significantly of rotated GEMV computations +- Improved performance of unseen/un-tuned devices by a better default tuning parameter selection +- Fixed proper MSVC dllimport and dllexport declarations +- Fixed memory leaks related to events not being released +- Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems +- Fixed a bug related to the cache and retrieval of programs based on the OpenCL context +- Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels +- Fixed a bug in the OpenCL kernels: now placing __kernel before __attribute__ +- Fixed a bug in level-3 routines when beta is zero and matrix C contains NaNs +- Added an option (-warm_up) to do a warm-up run before timing in the performance clients +- Various minor fixes and enhancements +- Added tuned parameters for various devices (see README) + Version 0.8.0 - Added support for half-precision floating-point (fp16) in the library - Made it possible to compile the performance tests (clients) separately from the correctness tests diff --git a/CMakeLists.txt b/CMakeLists.txt index 6deee35d..178ac9bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ # # ================================================================================================== -cmake_minimum_required(VERSION 2.8.10) +cmake_minimum_required(VERSION 2.8.11) # Overrides for MSVC static runtime set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake) @@ -18,7 +18,7 @@ 
set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla # CMake project details project("clblast" C CXX) set(clblast_VERSION_MAJOR 0) -set(clblast_VERSION_MINOR 8) +set(clblast_VERSION_MINOR 9) set(clblast_VERSION_PATCH 0) # Options and their default values @@ -27,6 +27,13 @@ option(TUNERS "Enable compilation of the tuners" OFF) option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF) option(TESTS "Enable compilation of the correctness tests" OFF) +# Compile in verbose mode with additional diagnostic messages +option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF) +if(VERBOSE) + message("-- Building in verbose mode") + add_definitions(-DVERBOSE) +endif() + # ================================================================================================== # RPATH settings @@ -68,6 +75,12 @@ else() if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0) set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable") endif() + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0.0) + # GCC does not support attributes on template arguments + # in particular we hit this with the alignment attributes on cl_XXX types + # which are then used to instantiate various templates in CLBlast + set(FLAGS "${FLAGS} -Wno-ignored-attributes") + endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang) set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch") @@ -88,7 +101,7 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") # ================================================================================================== # Package scripts location -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/") # Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH. 
find_package(OpenCL REQUIRED) @@ -120,11 +133,6 @@ endif() # ================================================================================================== -# Includes directories: CLBlast and OpenCL -include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS}) - -# ================================================================================================== - # Sets the supported routines and the used kernels. New routines and kernels should be added here. set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) @@ -166,21 +174,36 @@ endforeach() add_library(clblast SHARED ${SOURCES}) target_link_libraries(clblast ${OPENCL_LIBRARIES}) +# Includes directories: CLBlast and OpenCL +target_include_directories(clblast PUBLIC + $ + $ + $ + ${OPENCL_INCLUDE_DIRS}) + +# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built +if(MSVC) + target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11 +endif() + # Installs the library -install(TARGETS clblast DESTINATION lib) +install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) +# Installs the config for find_package in dependent projects +install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) + # ================================================================================================== -# Sets a default platform ($DEVICEPLATFORM) and device ($DEFAULT_DEVICE) to run tuners and tests on +# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on set(DEVICEPLATFORM ) -if(DEFINED ENV{DEFAULT_DEVICE}) - set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{DEFAULT_DEVICE}) +if(DEFINED ENV{CLBLAST_DEVICE}) + 
set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE}) endif() -if(DEFINED ENV{DEFAULT_PLATFORM}) - set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{DEFAULT_PLATFORM}) +if(DEFINED ENV{CLBLAST_PLATFORM}) + set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM}) endif() # ================================================================================================== @@ -213,13 +236,17 @@ endif() # the CLTune library (not included as part of the source). if(TUNERS) - # Includes CLTune - include_directories(${CLTUNE_INCLUDE_DIRS}) + # Visual Studio requires the sources of non-exported objects/libraries + set(TUNERS_COMMON ) + if(MSVC) + set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities.cpp) + endif() # Adds tuning executables foreach(KERNEL ${KERNELS}) - add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cpp) + add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp) target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES}) + target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() @@ -263,9 +290,6 @@ if(CLIENTS OR TESTS) endif() endif() - # Sets the include directories - include_directories(${clblast_SOURCE_DIR} ${REF_INCLUDES}) - endif() # ================================================================================================== @@ -281,6 +305,11 @@ if(CLIENTS) else() # Creates the common performance-tests objects (requires CMake 2.8.8) add_library(test_performance_common OBJECT test/performance/client.cpp) + + # Adds CLBlast's interface include paths because we can't link to CLBlast here + target_include_directories(test_performance_common PRIVATE + $ + ${clblast_SOURCE_DIR}) set(CLIENTS_COMMON ${CLIENTS_COMMON} $) endif() @@ -303,6 +332,7 @@ if(CLIENTS) endforeach() foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_client_${ROUTINE} 
clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) endforeach() @@ -324,6 +354,9 @@ if(TESTS) # Creates the common correctness-tests objects (requires CMake 2.8.8) add_library(test_correctness_common OBJECT test/correctness/tester.cpp test/correctness/testblas.cpp) + target_include_directories(test_correctness_common PUBLIC + $ + ${clblast_SOURCE_DIR}) set(TESTS_COMMON ${TESTS_COMMON} $) endif() @@ -347,6 +380,7 @@ if(TESTS) foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) + target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM}) endforeach() diff --git a/README.md b/README.md index ddd841e2..b9631ea0 100644 --- a/README.md +++ b/README.md @@ -99,20 +99,26 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC * NVIDIA GPUs: - GRID K520 - GeForce GTX 480 + - GeForce GTX 670 - GeForce GTX 680 + - GeForce GTX 750 - GeForce GTX 750 Ti - GeForce GTX 980 + - GeForce GTX 1070 - GeForce GTX Titan - GeForce GTX Titan X - Tesla K20m - Tesla K40m * AMD GPUs: - - Tahiti + - AMD Radeon R9 M370X Compute Engine - Hawaii + - Oland - Pitcairn - - Radeon R9 M370X Compute Engine + - Tahiti * Intel GPUs: + - HD Graphics 530 - HD Graphics Haswell Ultrabook GT2 Mobile + - HD Graphics 5500 BroadWell U-Processor GT2 - HD Graphics Skylake ULT GT2 - Iris - Iris Pro @@ -130,7 +136,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher). 
-Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake. +Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables before running CMake. The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl). @@ -162,7 +168,7 @@ To build these tests, another BLAS library is needed to serve as a reference. 
Th Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further. -All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake. +All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables before running CMake. Compiling the performance tests/clients (optional) @@ -180,6 +186,8 @@ The folder `doc/performance` contains some PDF files with performance results on Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See above under `Using the tuners` to find out how to tune for your device. +In case performance is still sub-optimal or something else is wrong, CLBlast can be built in verbose mode for (performance) debugging by passing `-DVERBOSE=ON` to CMake. 
+ Supported routines ------------- @@ -278,6 +286,9 @@ The contributing authors (code, pull requests, testing) so far are: * [Dragan Djuric](https://github.com/blueberry) * [Marco Hutter](https://github.com/gpus) * [Hugh Perkins](https://github.com/hughperkins) +* [Gian-Carlo Pascutto](https://github.com/gcp) +* [Ivan Shapovalov](https://github.com/intelfx) +* [Dimitri Van Assche](https://github.com/dvasschemacq) Tuning and testing on a variety of OpenCL devices was made possible by: diff --git a/include/clblast.h b/include/clblast.h index c8596b39..e1d4f25b 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -25,6 +25,18 @@ #include #endif +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#ifdef _WIN32 + #ifdef COMPILING_DLL + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + namespace clblast { // ================================================================================================= @@ -576,11 +588,11 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. -StatusCode ClearCache(); +StatusCode PUBLIC_API ClearCache(); // The cache can also be pre-initialized for a specific device with all possible CLBLast kernels. // Further CLBlast routine calls will then run at maximum speed. 
-StatusCode FillCache(const cl_device_id device); +StatusCode PUBLIC_API FillCache(const cl_device_id device); // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index b92febac..a13b8e64 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -25,7 +25,11 @@ // Exports library functions under Windows when building a DLL. See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx #ifdef _WIN32 - #define PUBLIC_API __declspec(dllexport) + #ifdef COMPILING_DLL + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif #else #define PUBLIC_API #endif diff --git a/samples/cache.c b/samples/cache.c index 7f876be1..a592824d 100644 --- a/samples/cache.c +++ b/samples/cache.c @@ -113,6 +113,7 @@ void run_example_routine(const cl_device_id device) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Retrieves the execution time clock_t diff = clock() - start; diff --git a/samples/dgemv.c b/samples/dgemv.c index 6ea0deb0..c22c9f37 100644 --- a/samples/dgemv.c +++ b/samples/dgemv.c @@ -85,6 +85,7 @@ int main(void) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Example completed. See "clblast_c.h" for status codes (0 -> success). 
printf("Completed DGEMV with status %d\n", status); diff --git a/samples/haxpy.c b/samples/haxpy.c index 3c7bb33a..d5b98e12 100644 --- a/samples/haxpy.c +++ b/samples/haxpy.c @@ -78,6 +78,7 @@ int main(void) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Copies the result back to the host clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL); diff --git a/samples/sasum.c b/samples/sasum.c index 3fdbb0eb..1518cc13 100644 --- a/samples/sasum.c +++ b/samples/sasum.c @@ -74,6 +74,7 @@ int main(void) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Copies the result back to the host clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL); diff --git a/samples/sgemm.c b/samples/sgemm.c index 79f30c83..b4827777 100644 --- a/samples/sgemm.c +++ b/samples/sgemm.c @@ -88,6 +88,7 @@ int main(void) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Example completed. See "clblast_c.h" for status codes (0 -> success). printf("Completed SGEMM with status %d\n", status); diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp index 5fe7490a..a4b89968 100644 --- a/samples/sgemm.cpp +++ b/samples/sgemm.cpp @@ -96,6 +96,7 @@ int main() { // Record the execution time clWaitForEvents(1, &event); + clReleaseEvent(event); auto elapsed_time = std::chrono::steady_clock::now() - start_time; auto time_ms = std::chrono::duration(elapsed_time).count(); diff --git a/scripts/database/database.py b/scripts/database/database.py old mode 100644 new mode 100755 index 49bc1801..f758a2b7 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -1,325 +1,104 @@ #!/usr/bin/env python -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren -# -# ================================================================================================== -# System modules import sys import os.path import glob -import re -import json -try: - from urllib.request import urlopen # Python 3 -except ImportError: - from urllib2 import urlopen # Python 2 +import argparse -# Additional modules -import pandas as pd +import database.io as io +import database.db as db +import database.clblast as clblast +import database.bests as bests +import database.defaults as defaults # Server storing a copy of the database -DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db" - -# Constants -VENDOR_DEFAULT = "default" -DEVICETYPE_DEFAULT = "All" -DEVICENAME_DEFAULT = "default" - -# Attributes -DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"] -DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"] -KERNEL_ATTRIBUTES = ["precision", "kernel_family"] -ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] -ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES +DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json" # OpenCL vendor names and their short name -VENDOR_NAMES = { "device_vendor": { +VENDOR_TRANSLATION_TABLE = { "GenuineIntel": "Intel", "Intel(R) Corporation": "Intel", "Advanced Micro Devices, Inc.": "AMD", "NVIDIA Corporation": "NVIDIA", -}} +} -# Pandas options -pd.set_option('display.width', 1000) -# ================================================================================================== -# Database operations -# 
================================================================================================== +def main(argv): -# Downloads the database and save it to disk -def DownloadDatabase(filename): - print("## Downloading database from '"+DATABASE_SERVER_URL+"'...") - df = urlopen(DATABASE_SERVER_URL) - output = open(file_db,'wb') - output.write(df.read()) - output.close() + # Parses the command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument("source_folder", help="The folder with JSON files to parse to add to the database") + parser.add_argument("clblast_root", help="Root of the CLBlast sources") + parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") + cl_args = parser.parse_args(argv) -# Loads the database from disk -def LoadDatabase(filename): - return pd.read_pickle(filename) + # Parses the path arguments + database_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database.json") + database_best_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database_best.json") + json_files = os.path.join(cl_args.source_folder, "*.json") + cpp_database_path = os.path.join(cl_args.clblast_root, "src", "database", "kernels") -# Saves the database to disk -def SaveDatabase(df, filename): - df.to_pickle(filename) + # Checks whether the command-line arguments are valid + clblast_header = os.path.join(cl_args.clblast_root, "include", "clblast.h") # Not used but just for validation + if not os.path.isfile(clblast_header): + raise RuntimeError("The path '" + cl_args.clblast_root + "' does not point to the root of the CLBlast library") + if len(glob.glob(json_files)) < 1: + print("[database] The path '" + cl_args.source_folder + "' does not contain any JSON files") -# Loads JSON data from file -def ImportDataFromFile(filename): - with open(filename) as f: - data = json.load(f) - json_data = pd.DataFrame(data) - df = pd.io.json.json_normalize(json_data["results"]) - for 
attribute in ATTRIBUTES: - if attribute == "kernel_family": - df[attribute] = re.sub(r'_\d+', '', data[attribute]) - elif attribute in data: - df[attribute] = data[attribute] - else: - df[attribute] = 0 - return df + # Downloads the database if a local copy is not present + if not os.path.isfile(database_filename): + io.download_database(database_filename, DATABASE_SERVER_URL) -# Returns the row-wise concatenation of two dataframes -def ConcatenateData(df1, df2): - return pd.concat([df1, df2]) + # Loads the database from disk + database = io.load_database(database_filename) -# Removes duplicates from a dataframe -def RemoveDuplicates(df): - return df.drop_duplicates() + # Loops over all JSON files in the supplied folder + for file_json in glob.glob(json_files): -# database = database[(database["device"] != "AMD Radeon R9 M370X Compute Engine") | (database["kernel_family"] != "xgemm") | (database["precision"] != "32")] -def RemoveEntriesByDevice(df, devicename): - return df[df["device"] != devicename] + # Loads the newly imported data + sys.stdout.write("[database] Processing '" + file_json + "' ") # No newline printed + imported_data = io.load_tuning_results(file_json) -def RemoveEntriesByKernelFamily(df, familyname): - return df[df["kernel_family"] != familyname] + # Fixes the problem that some vendors use multiple different names + for target in VENDOR_TRANSLATION_TABLE: + if imported_data["device_vendor"] == target: + imported_data["device_vendor"] = VENDOR_TRANSLATION_TABLE[target] -def GetEntriesByField(df, field, value): - return df[df[field] == value] + # Adds the new data to the database + old_size = db.length(database) + database = db.add_section(database, imported_data) + new_size = db.length(database) + print("with " + str(new_size - old_size) + " new items") # Newline printed here -# Example usage: -# df = UpdateDatabase(df, (df["kernel_family"] == "xdot") & (df["arg_n"] == "67108864"), "arg_n", "2097152") -def UpdateDatabase(df, condition, field, 
value): - df.loc[condition, field] = value - return df + # Stores the modified database back to disk + if len(glob.glob(json_files)) >= 1: + io.save_database(database, database_filename) -# Fixes the problem that some vendors use multiple different names -def SanitizeVendorNames(df): - df = df.replace(VENDOR_NAMES) - return df + # Retrieves the best performing results + print("[database] Calculating the best results per device/kernel...") + database_best_results = bests.get_best_results(database) -# Retrieves the results with the lowest execution times -def GetBestResults(df): - dfbest = pd.DataFrame() - grouped = df.groupby(ATTRIBUTES+["kernel"]) - for name, dfgroup in grouped: - besttime = dfgroup["time"].min() - bestcase = dfgroup[dfgroup["time"] == besttime].iloc[0] - dfbest = dfbest.append(bestcase, ignore_index=True) - return dfbest + # Determines the defaults for other vendors and per vendor + print("[database] Calculating the default values...") + database_defaults = defaults.calculate_defaults(database, cl_args.verbose) + database_best_results["sections"].extend(database_defaults["sections"]) -# Sets defaults for devices of the same type/vendor based on the smallest values of all know -# entries. The average might be better for performance but some parameters might not be supported -# on other devices. 
-def CalculateDefaults(df): - dfdefault = pd.DataFrame() + # Optionally outputs the database to disk + if cl_args.verbose: + io.save_database(database_best_results, database_best_filename) - # Defaults per type/vendor - groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"]) - for name, dfgroup in groups: - default_values = dfgroup.min(axis=0) - default_values["device"] = DEVICENAME_DEFAULT - default_values["device_compute_units"] = 0 - default_values["device_core_clock"] = 0 - default_values["time"] = 0.0 - dfdefault = dfdefault.append(default_values, ignore_index=True) - - # Checks for mis-matched arguments - groups = dfdefault.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"]) - for name, dfgroup in groups: - if len(dfgroup) != 1: - description = dfgroup["kernel"].min() + " " + dfgroup["device_vendor"].min() - print("[WARNING] Entries for a single kernel with multiple argument values: " + description) - - # Defaults in general - groups = df.groupby(KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"]) - for name, dfgroup in groups: - default_values = dfgroup.min(axis=0) - default_values["device_vendor"] = VENDOR_DEFAULT - default_values["device_type"] = DEVICETYPE_DEFAULT - default_values["device"] = DEVICENAME_DEFAULT - default_values["device_compute_units"] = 0 - default_values["device_core_clock"] = 0 - default_values["time"] = 0.0 - dfdefault = dfdefault.append(default_values, ignore_index=True) - - # Database with both types of defaults only - return dfdefault + # Outputs the database as a C++ database + print("[database] Producing a C++ database in '" + cpp_database_path + "'...") + clblast.print_cpp_database(database_best_results, cpp_database_path) -# ================================================================================================== -# C++ header generation -# ================================================================================================== + print("[database] All done") -# The C++ 
header -def GetHeader(family): - return(""" -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator -// -// This file populates the database with best-found tuning parameters for the '%s' kernels. -// -// ================================================================================================= -namespace clblast { -// =================================================================================================""" - % family.title()) - -# The C++ footer -def GetFooter(): - return("\n} // namespace clblast\n") - -# The start of a new C++ precision entry -def GetPrecision(family, precision): - precisionstring = "" - if precision == "16": - precisionstring = "Half" - elif precision == "32": - precisionstring = "Single" - elif precision == "64": - precisionstring = "Double" - elif precision == "3232": - precisionstring = "ComplexSingle" - elif precision == "6464": - precisionstring = "ComplexDouble" - else: - print("[ERROR] Unknown precision") - sys.exit() - return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n" - % (family.title(), precisionstring, family.title(), precisionstring)) - -# The C++ device type and vendor -def GetDeviceVendor(vendor, devtype): - if vendor == VENDOR_DEFAULT and devtype == DEVICETYPE_DEFAULT: - return(" { // Default\n kDeviceType%s, \"%s\", {\n" % (devtype, vendor)) - return(" { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, devtype, devtype[0].upper() + devtype[1:], vendor)) - -# Prints the data to a C++ database -def PrintData(df, outputdir): - - # Iterates over the kernel families: creates a new file per family - for family, dffamily in df.groupby(["kernel_family"]): 
- dffamily = dffamily.dropna(axis=1, how='all') - f = open(os.path.join(outputdir, family+'.hpp'), 'w+') - f.write(GetHeader(family)) - - # Loops over the different entries for this family and prints their headers - for precision, dfprecision in dffamily.groupby(["precision"]): - f.write(GetPrecision(family, precision)) - for vendor, dfvendor in dfprecision.groupby(["device_vendor"]): - for devtype, dfdevtype in dfvendor.groupby(["device_type"]): - f.write(GetDeviceVendor(vendor, devtype)) - for device, dfdevice in dfdevtype.groupby(["device"]): - devicename = "\"%s\"," % device - f.write(" { %-50s { " % devicename) - - # Collects the paramaters for this case and prints them - parameters = [] - for kernel, dfkernel in dfdevice.groupby(["kernel"]): - dfkernel = dfkernel.dropna(axis=1) - col_names = [col for col in list(dfkernel) if col.startswith('parameters.') and col != "parameters.PRECISION"] - parameters += ["{\"%s\",%d}" % (p.replace("parameters.",""), dfkernel[p].iloc[0]) for p in col_names] - f.write(", ".join(parameters)) - f.write(" } },\n") - - # Prints the footers - f.write(" }\n },\n") - f.write(" }\n};\n\n// =================================================================================================") - f.write(GetFooter()) - -# ================================================================================================== -# Command-line arguments parsing and verification -# ================================================================================================== - -# Checks for the number of command-line arguments -if len(sys.argv) != 3: - print("[ERROR] Usage: database.py ") - sys.exit() - -# Parses the command-line arguments -path_json = sys.argv[1] -path_clblast = sys.argv[2] -file_db = os.path.join(path_clblast, "scripts", "database", "database.db") -glob_json = os.path.join(path_json, "*.json") - -# Checks whether the command-line arguments are valid; exists otherwise -clblast_h = os.path.join(path_clblast, "include", 
"clblast.h") # Not used but just for validation -if not os.path.isfile(clblast_h): - print("[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library") - sys.exit() -if len(glob.glob(glob_json)) < 1: - print("## The path '"+path_json+"' does not contain any JSON files") - -# ================================================================================================== -# The main body of the script -# ================================================================================================== - -# Downloads the database if a local copy is not present -db_exists = os.path.isfile(file_db) -if not db_exists: - DownloadDatabase(file_db) - -# Loads the database from disk -print("## Loading the database from disk...") -database = LoadDatabase(file_db) - -# Loops over all JSON files in the supplied folder -for file_json in glob.glob(glob_json): - - # Loads the newly imported data - sys.stdout.write("## Processing '"+file_json+"' ") - imported_data = ImportDataFromFile(file_json) - imported_data = SanitizeVendorNames(imported_data) - - # Adds the new data to the database - old_size = len(database.index) - database = ConcatenateData(database, imported_data) - database = RemoveDuplicates(database) - new_size = len(database.index) - print("with "+str(new_size-old_size)+" new items") - -# Stores the modified database back to disk -if len(glob.glob(glob_json)) >= 1: - print("## Storing the database to disk...") - SaveDatabase(database, file_db) - -# Optional: update the database here. 
Default is disabled, code below is just an example -if False: - database = UpdateDatabase(database, ((database["kernel"] == "CopyMatrixFast") & (database["precision"] == "3232")), "arg_alpha", "2+0.5i") - SaveDatabase(database, file_db) - -# Retrieves the best performing results -print("## Calculating the best results per device/kernel...") -bests = GetBestResults(database) - -# Determines the defaults for other vendors and per vendor -defaults = CalculateDefaults(bests) -bests = ConcatenateData(bests, defaults) - -# Outputs the data as a C++ database -path_cpp_database = os.path.join(path_clblast, "src", "database", "kernels") -print("## Producing a C++ database in '"+path_cpp_database+"'...") -PrintData(bests, path_cpp_database) - -print("## All done") - -# ================================================================================================== +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/scripts/database/database/__init__.py b/scripts/database/database/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/database/database/bests.py b/scripts/database/database/bests.py new file mode 100644 index 00000000..c924efde --- /dev/null +++ b/scripts/database/database/bests.py @@ -0,0 +1,58 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. 
+# +# Author(s): +# Cedric Nugteren + +import sys + + +def get_best_results(database): + """Retrieves the results with the lowest execution times""" + sections_best = [] + for section in database["sections"]: + section_best = {} + + # Stores all the section's meta data + for attribute in section.keys(): + if attribute != "results": + section_best[attribute] = section[attribute] + + # Find the best result + parameters_best = None + time_best = sys.float_info.max + for result in section["results"]: + if result["time"] < time_best: + time_best = result["time"] + parameters_best = result["parameters"] + + # Stores the best result + section_best["results"] = [{"time": time_best, "parameters": parameters_best}] + sections_best.append(section_best) + + return {"sections": sections_best} + + +def get_relative_bests(name, common_results, common_parameters, verbose=False): + """Retrieves the parameters with the relative best execution time over different devices""" + + # Helper function + def argmax(iterable): + return max(enumerate(iterable), key=lambda x: x[1])[0] + + # Computes the sum of the execution times over the different devices + performance_sums = [] + for parameters in common_parameters: + performance_sum = sum([r["relative_performance"] for r in common_results if r["parameters"] == parameters]) + performance_sums.append(performance_sum) + + # Retrieves the entry with the highest performance + best_index = argmax(performance_sums) + best_performance = performance_sums[best_index] + best_parameters = common_parameters[best_index] + + # Completed, report and return the results + if verbose: + print("[database] " + str(name) + " with performance " + str(best_performance)) + return best_parameters diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py new file mode 100644 index 00000000..8190f225 --- /dev/null +++ b/scripts/database/database/clblast.py @@ -0,0 +1,155 @@ + +# This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import os + +# Constants from the C++ code +VENDOR_DEFAULT = "default" +DEVICE_TYPE_DEFAULT = "All" +DEVICE_NAME_DEFAULT = "default" + +# List of attributes +DEVICE_TYPE_ATTRIBUTES = ["device_vendor", "device_type"] +DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"] +KERNEL_ATTRIBUTES = ["precision", "kernel_family"] +ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] +ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES +GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES + + +def precision_to_string(precision): + """Translates a precision number (represented as Python string) into a descriptive string""" + if precision == "16": + return "Half" + elif precision == "32": + return "Single" + elif precision == "64": + return "Double" + elif precision == "3232": + return "ComplexSingle" + elif precision == "6464": + return "ComplexDouble" + else: + raise("Unknown precision: " + precision) + + +def get_cpp_separator(): + """Retrieves a C++ comment separator""" + return "// =================================================================================================" + + +def get_cpp_header(family): + """Retrieves the C++ header""" + return ("\n" + get_cpp_separator() + """ +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the '%s' kernels. 
+//\n""" + % family.title() + get_cpp_separator() + "\n\nnamespace clblast {\n" + get_cpp_separator()) + + +def get_cpp_footer(): + """Retrieves the C++ footer""" + return "\n} // namespace clblast\n" + + +def get_cpp_precision(family, precision): + """Retrieves the C++ code for the start of a new precision""" + precision_string = precision_to_string(precision) + camelcase_name = family.title().replace("_", "") + return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n" + % (camelcase_name, precision_string, camelcase_name, precision_string)) + + +def get_cpp_device_vendor(vendor, device_type): + """Retrieves the C++ code for the (default) vendor and device type""" + if vendor == VENDOR_DEFAULT and device_type == DEVICE_TYPE_DEFAULT: + return " { // Default\n kDeviceType%s, \"%s\", {\n" % (device_type, vendor) + device_type_caps = device_type[0].upper() + device_type[1:] + return " { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, device_type, device_type_caps, vendor) + + +def print_cpp_database(database, output_dir): + """Outputs the database as C++ code""" + + # Iterates over the kernel families + kernel_families = sorted(set([s["kernel_family"] for s in database["sections"]])) + for family_name in kernel_families: + family_database = [s for s in database["sections"] if s["kernel_family"] == family_name] + + # Opens a new file for each kernel family + full_path = os.path.join(output_dir, family_name + ".hpp") + with open(full_path, 'w+') as f: + f.write(get_cpp_header(family_name)) + + # Loops over the different precision (e.g. 16, 32, 3232, 64, 6464) + precisions = sorted(set([s["precision"] for s in database["sections"]])) # Based on full database + for precision in precisions: + precision_database = [s for s in family_database if s["precision"] == precision] + f.write(get_cpp_precision(family_name, precision)) + + # In case there is nothing found at all (e.g. 
16-bit): continue as if this was a precision of 32 but + # with the defaults only + if len(precision_database) == 0: + print("[database] No results found for %s:%s, retrieving defaults from %s:32" % + (family_name, precision, family_name)) + precision_database = [s for s in family_database if s["precision"] == "32" + and s["device_vendor"] == VENDOR_DEFAULT + and s["device_type"] == DEVICE_TYPE_DEFAULT + and s["device"] == DEVICE_NAME_DEFAULT] + + # Loops over device vendors (e.g. AMD) + device_vendors = sorted(set([s["device_vendor"] for s in precision_database])) + for vendor in device_vendors: + vendor_database = [s for s in precision_database if s["device_vendor"] == vendor] + + # Loops over device types (e.g. GPU) + device_types = sorted(set([s["device_type"] for s in vendor_database])) + for device_type in device_types: + type_database = [s for s in vendor_database if s["device_type"] == device_type] + f.write(get_cpp_device_vendor(vendor, device_type)) + + # Loops over every device of this vendor-type combination + devices = sorted(set([s["device"] for s in type_database])) + for device_name in devices: + device_database = [s for s in type_database if s["device"] == device_name] + device_name_quoted = "\"%s\"," % device_name + device_name_cpp = " { %-50s { " % device_name_quoted + f.write(device_name_cpp) + + # Collects the parameters for this entry + parameters = [] + kernels = sorted(set([s["kernel"] for s in device_database])) + for kernel in kernels: + kernel_database = [s for s in device_database if s["kernel"] == kernel] + + assert len(kernel_database) == 1 + results = kernel_database[0]["results"] + + assert len(results) == 1 + new_parameters = results[0]["parameters"] + for parameter_name in sorted(new_parameters): + parameter_value = new_parameters[parameter_name] + parameters.append("{\"" + parameter_name + "\"," + str(parameter_value) + "}") + + # Prints the entry + f.write(", ".join(parameters)) + f.write(" } },\n") + + # Prints the vendor-type 
combination footer + f.write(" }\n },\n") + + # Prints the precision footer + f.write(" }\n};\n\n" + get_cpp_separator()) + + # Prints the file footer + f.write(get_cpp_footer()) diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py new file mode 100644 index 00000000..94948b1a --- /dev/null +++ b/scripts/database/database/db.py @@ -0,0 +1,64 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import clblast + + +def length(database): + """Computes the total number of tuning entries""" + num_tuning_entries = 0 + for section in database["sections"]: + num_tuning_entries += len(section["results"]) + return num_tuning_entries + + +def add_section(database, new_section): + """Adds a new section to the database""" + for old_section in database["sections"]: + + # Verify whether the sections match + equal = True + for attribute in new_section.keys(): + if attribute != "results": + if attribute not in old_section or new_section[attribute] != old_section[attribute]: + equal = False + break + + # They match: append the new section's results to the corresponding entry in the database and return + if equal: + old_section["results"] = combine_results(old_section["results"], new_section["results"]) + return database + + # No match found: append the whole new section to the database + database["sections"].append(new_section) + return database + + +def combine_results(old_results, new_results): + """Adds new results to the results JSON list""" + for new_result in new_results: + old_results = combine_result(old_results, new_result) + return old_results + + +def combine_result(old_results, new_result): + """Adds a new result to the results JSON list; filters for duplicate entries and saves the best performing one""" + + # Loops over all existing results to test for already 
existing entries with these parameters + for old_result in old_results: + + # Verify whether the results match + equal = new_result["parameters"] == old_result["parameters"] + + # They match: keep only the one with the minimum execution time + if equal: + old_result["time"] = min(old_result["time"], new_result["time"]) + return old_results + + # No match found: append a new result + old_results.append(new_result) + return old_results diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py new file mode 100644 index 00000000..00405908 --- /dev/null +++ b/scripts/database/database/defaults.py @@ -0,0 +1,180 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + + +import clblast +import bests + + +def set_default_device(section): + """Sets the device name and parameters to some default values""" + section["device"] = clblast.DEVICE_NAME_DEFAULT + section["device_compute_units"] = 0 + section["device_core_clock"] = 0 + return section + + +def set_identifiers(database, group_by_attributes, identifier_name): + """Sets a group-identifier based on a given set of attributes. 
Modifies the database but also returns a list of + unique identifiers.""" + identifiers = [] + for section in database["sections"]: + identifier = [] + for attribute in group_by_attributes: + if attribute in section: + identifier.append(section[attribute]) + section[identifier_name] = ";".join(identifier) + identifiers.append(section[identifier_name]) + return sorted(set(identifiers)) + + +def remove_identifiers(database, identifier_name): + """Removes an identifier from all sections in the database""" + for section in database["sections"]: + section.pop(identifier_name, None) + + +def get_groups_by_identifier(database, group_identifiers, identifier_name): + """Returns a list of (group, group_identifier) tuples based a previously made grouping""" + groups = [] + for group_identifier in group_identifiers: + + # Get all sections in this group + group = [] + for section in database["sections"]: + if section[identifier_name] == group_identifier: + group.append(section) + + groups.append((group, group_identifier)) + return groups + + +def calculate_defaults(database, verbose): + """Sets defaults for devices of the same type/vendor""" + + # Groups the database by kernel, vendor and device type (e.g. 
AMD GPU) + group_identifiers = set_identifiers(database, clblast.GROUP_ATTRIBUTES, "group_identifier") + groups = get_groups_by_identifier(database, group_identifiers, "group_identifier") + + # Loops over all groups + default_sections = {"sections": []} + for group, group_identifier in groups: + + # Computes the best parameters + default_parameters = get_common_best_parameters(group, group_identifier, verbose) + + # Stores all the section's data + assert len(group) > 0 + default_section = {} + for attribute in group[0].keys(): + if attribute != "results" and attribute != "group_identifier": + default_section[attribute] = group[0][attribute] + default_section = set_default_device(default_section) + default_section["results"] = [{"time": 0.0, "parameters": default_parameters}] + default_sections["sections"].append(default_section) + + # Groups the database by kernel, vendor and device type (e.g. AMD GPU) - but not by arguments! This is to check for + # mis-matched arguments. + attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + group_identifiers = set_identifiers(default_sections, attributes, "temp_identifier") + groups = get_groups_by_identifier(default_sections, group_identifiers, "temp_identifier") + for group, group_identifier in groups: + if len(group) != 1: + print("[ERROR] Entries for a single kernel with multiple argument values: " + str(group_identifier)) + assert len(group) == 1 + remove_identifiers(default_sections, "temp_identifier") + + # Groups the database by kernel only + group_identifiers = set_identifiers(database, clblast.KERNEL_ATTRIBUTES + ["kernel"], "group_identifier") + groups = get_groups_by_identifier(database, group_identifiers, "group_identifier") + + # Loops over all groups + for group, group_identifier in groups: + + # Computes the best parameters + default_parameters = get_common_best_parameters(group, group_identifier, verbose) + + # Stores all the section's data + assert len(group) > 0 + 
default_section = {} + for attribute in group[0].keys(): + if attribute != "results" and attribute != "group_identifier": + default_section[attribute] = group[0][attribute] + default_section = set_default_device(default_section) + default_section["device_vendor"] = clblast.VENDOR_DEFAULT + default_section["device_type"] = clblast.DEVICE_TYPE_DEFAULT + default_section["results"] = [{"time": 0.0, "parameters": default_parameters}] + default_sections["sections"].append(default_section) + + # Database with both types of defaults only + return default_sections + + +def get_smallest_best_parameters(group): + """Sets defaults based on the smallest values of all known entries. The average might be better for performance but + some parameters might not be supported on other devices.""" + + # Counts the number of devices in this group + assert len(group) > 0 + + # Find the smallest values of the parameters + min_parameters = {} + for section in group: + assert len(section["results"]) > 0 + minimum_time = min([result["time"] for result in section["results"]]) + for result in section["results"]: + if result["time"] == minimum_time: + for parameter in result["parameters"]: + if parameter in min_parameters: + min_parameters[parameter] = min(min_parameters[parameter], result["parameters"][parameter]) + else: + min_parameters[parameter] = result["parameters"][parameter] + + return min_parameters + + +def get_common_best_parameters(group, group_identifier, verbose): + """Sets defaults based on the best values of entries supported by all devices. This might cause a problem in case + not every device was tuned with the same parameters. 
In that case it falls back to the above method to retrieve + the smallest best execution time""" + + # Counts the number of devices in this group + num_devices = len(group) + assert num_devices > 0 + + # Inserts the relative execution times into the database + for section in group: + assert len(section["results"]) > 0 + minimum_time = min([result["time"] for result in section["results"]]) + for result in section["results"]: + result["relative_performance"] = minimum_time / result["time"] + + # Determine which parameters are available for all devices + common_parameters = [result["parameters"] for result in group[0]["results"]] # Parameters of the first section + for i in range(1, num_devices): + section_parameters = [result["parameters"] for result in group[i]["results"]] + common_parameters = [p for p in section_parameters if p in common_parameters] # Intersection of the parameters + + # Fall back to another method in case there are no shared entries at all across devices + if len(common_parameters) == 0: + if verbose: + print("[database] No common kernels for: " + str(group_identifier) + " with devices: %d " % num_devices) + smallest_best_parameters = get_smallest_best_parameters(group) + if verbose: + print("[database] " + str(group_identifier)) + return smallest_best_parameters + + # Removes entries with parameters which are not common + common_results = [] + for section in group: + for result in section["results"]: + if result["parameters"] in common_parameters: + common_results.append(result) + + # Retrieves the entries with the highest relative performance + relative_best_parameters = bests.get_relative_bests(group_identifier, common_results, common_parameters, verbose) + return relative_best_parameters diff --git a/scripts/database/database/io.py b/scripts/database/database/io.py new file mode 100644 index 00000000..d14f1297 --- /dev/null +++ b/scripts/database/database/io.py @@ -0,0 +1,60 @@ + +# This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import re +import json + +try: + from urllib.request import urlopen # Python 3 +except ImportError: + from urllib2 import urlopen # Python 2 + + +def download_database(filename, database_url): + """Downloads a database and saves it to disk""" + print("[database] Downloading database from '" + database_url + "'...") + database = urlopen(database_url) + with open(filename, "wb") as f: + f.write(database.read()) + + +def load_database(filename): + """Loads a database from disk""" + print("[database] Loading database from '" + filename + "'") + with open(filename) as f: + return json.load(f) + + +def save_database(database, filename): + """Saves a database to disk""" + print("[database] Saving database to '" + filename + "'") + with open(filename, "wb") as f: + json.dump(database, f, sort_keys=True, indent=4) + + +def load_tuning_results(filename): + """Loads JSON data from file and pre-processes it""" + with open(filename) as f: + json_data = json.load(f) + + # Removes the numbering following the kernel family name + json_data["kernel_family"] = re.sub(r'_\d+', '', json_data["kernel_family"]) + + # Adds the kernel name to the section instead of to the individual results + assert len(json_data["results"]) > 0 + json_data["kernel"] = json_data["results"][0]["kernel"] + for result in json_data["results"]: + assert json_data["kernel"] == result["kernel"] + result.pop("kernel", None) + + # Removes the 'PRECISION' parameter from the individual results: it is redundant + for result in json_data["results"]: + assert json_data["precision"] == str(result["parameters"]["PRECISION"]) + result["parameters"].pop("PRECISION", None) + + # All done + return json_data diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py deleted file mode 100644 index 5bff95d1..00000000 --- 
a/scripts/generator/datatype.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. -# -# Author(s): -# Cedric Nugteren -# -# This file contains the 'DataType' class, used in the generator script to generate the CLBlast API -# interface and implementation. -# -# ================================================================================================== - -# Short-hands for data-types -HLF = "half" -FLT = "float" -DBL = "double" -FLT2 = "float2" -DBL2 = "double2" - -HCL = "cl_half" -F2CL = "cl_float2" -D2CL = "cl_double2" - -# Structure holding data-type and precision information -class DataType(): - def __init__(self, precision_name, name, template, scalars, buffertype): - self.precision_name = precision_name - self.name = name - self.template = template - self.alpha_cpp = scalars[0] - self.beta_cpp = scalars[1] - self.alpha_cl = scalars[2] - self.beta_cl = scalars[3] - self.buffertype = buffertype - - # Outputs the name of the data-type (alpha/beta), possibly transforming into the right type - def UseAlpha(self): - if self.alpha_cpp in [FLT2, DBL2]: - return self.alpha_cpp+"{alpha.s[0], alpha.s[1]}" - return "alpha" - def UseBeta(self): - if self.beta_cpp in [FLT2, DBL2]: - return self.beta_cpp+"{beta.s[0], beta.s[1]}" - return "beta" - - # As above, but the transformation is in the opposite direction - def UseAlphaCL(self): - if self.alpha_cpp in [FLT2, DBL2]: - return self.alpha_cl+"{{alpha.real(), alpha.imag()}}" - return "alpha" - def UseBetaCL(self): - if self.beta_cpp in [FLT2, DBL2]: - return self.beta_cl+"{{beta.real(), beta.imag()}}" - return "beta" - - # Returns the template as used in the correctness/performance tests - def TestTemplate(self): 
- if self.buffertype != self.beta_cpp: - return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp - return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp - - # Current scalar is complex - def IsComplex(self, scalar): - return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or - (scalar == "beta" and self.beta_cpp in [FLT2, DBL2])) - - -# ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index cf01f79e..d82b13a6 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -1,14 +1,13 @@ #!/usr/bin/env python -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. 
# # Author(s): # Cedric Nugteren # -# This script automatically generates the bodies of the following files, creating the full CLBlast -# API interface and implementation (C, C++, and reference BLAS wrappers): +# This script automatically generates the bodies of the following files, creating the full CLBlast API interface and +# implementation (C, C++, and reference BLAS wrappers): # clblast.h # clblast.cpp # clblast_c.h @@ -19,45 +18,20 @@ # test/correctness/routines/levelX/xYYYY.cpp # test/performance/routines/levelX/xYYYY.cpp # It also produces the API documentation found in doc/clblast.md -# -# ================================================================================================== -# System modules + import sys import os.path +import argparse -# Local files -from routine import Routine -from datatype import DataType, HLF, FLT, DBL, FLT2, DBL2, HCL, F2CL, D2CL +import generator.cpp as cpp +import generator.doc as doc +from generator.routine import Routine +from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU -# ================================================================================================== -# Regular data-types -H = DataType("H", "H", HLF, [HLF, HLF, HCL, HCL], HLF ) # half (16) -S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32) -D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64) -C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232) -Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464) - -# Special cases -Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output -Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output -iH = DataType("H", "iH", HLF, [HLF, HLF, HLF, HLF], HLF ) # As H, but with integer output -iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output -iD = 
DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output -iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output -iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output -Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S -Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D -Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S -Zzd = DataType("Z", "Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D - -# C++ template data-types -T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine -Tc = DataType("Tc", "typename T", "std::complex,T", ["T", "T", "T", "T"], "std::complex") # for herk -TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k - -# ================================================================================================== +HEADER_LINES = [96, 73, 97, 22, 29, 41] +FOOTER_LINES = [17, 75, 19, 14, 6, 6] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." @@ -77,472 +51,162 @@ cld_n = "The value of `c_ld` must be at least `n`." 
# ================================================================================================== # Populates a list of routines -routines = [ -[ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. 
The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), +ROUTINES = [ +[ # Level 1: vector-vector + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. 
The sum is stored in the _dot_ buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], -[ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "symv", 
T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], 
["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), +[ # Level 2: matrix-vector + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], 
["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the 
operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", 
[]), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x 
is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same 
operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], -[ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - 
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), +[ # Level 3: matrix-matrix + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + 
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), ], -[ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), +[ # Level X: extra routines (not part of BLAS) + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. 
The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] -# ================================================================================================== -# Translates an option name to a CLBlast data-type -def PrecisionToFullName(x): - return { - 'H': "Half", - 'S': "Single", - 'D': "Double", - 'C': "ComplexSingle", - 'Z': "ComplexDouble", - }[x] -# ================================================================================================== +def main(argv): -# Separators for the BLAS levels -separators = [""" -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// =================================================================================================""", -""" -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// =================================================================================================""", -""" -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// =================================================================================================""", -""" -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================="""] + # Parses the command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument("clblast_root", help="Root of the CLBlast sources") + parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") + cl_args = parser.parse_args(argv) + library_root = cl_args.clblast_root -# Names of the level sub-folders -levelnames = ["1", "2", "3", "x"] + # 
Sets all the files the output + files = [ + library_root + "/include/clblast.h", + library_root + "/src/clblast.cpp", + library_root + "/include/clblast_c.h", + library_root + "/src/clblast_c.cpp", + library_root + "/test/wrapper_clblas.hpp", + library_root + "/test/wrapper_cblas.hpp", + ] -# Main header/footer for source files -header = """ -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= -""" -footer = """ -// ================================================================================================= -""" + # Checks whether the command-line arguments are valid; exists otherwise + for f in files: + if not os.path.isfile(f): + print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library") + sys.exit() -# ================================================================================================== + # Iterates over all regular files to output + for i in range(0, len(files)): -# The C++ API header (.h) -def clblast_h(routines): - result = "" - for routine in routines: - result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n" - return result + # Stores the header and the footer of the original file + with open(files[i]) as f: + original = f.readlines() + file_header = original[:HEADER_LINES[i]] + file_footer = original[-FOOTER_LINES[i]:] -# The C++ API implementation (.cpp) -def clblast_cc(routines): - result = "" - for routine in routines: - indent1 = " "*(20 + routine.Length()) - result += "\n// "+routine.description+": 
"+routine.ShortNames()+"\n" - if routine.implemented: - result += routine.RoutineHeaderCPP(12, "")+" {\n" - result += " auto queue_cpp = Queue(*queue);\n" - result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n" - result += " auto status = routine.SetUp();\n" - result += " if (status != StatusCode::kSuccess) { return status; }\n" - result += " return routine.Do"+routine.name.capitalize()+"(" - result += (",\n"+indent1).join([a for a in routine.ArgumentsCladuc(routine.template, indent1)]) - result += ");\n" - else: - result += routine.RoutineHeaderTypeCPP(12)+" {\n" - result += " return StatusCode::kNotImplemented;\n" - result += "}\n" - for flavour in routine.flavours: - indent2 = " "*(34 + routine.Length() + len(flavour.template)) - result += "template StatusCode PUBLIC_API "+routine.name.capitalize()+"<"+flavour.template+">(" - result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)]) - result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n" - return result + # Re-writes the body of the file + with open(files[i], "w") as f: + body = "" + levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4] + for level in levels: + body += cpp.LEVEL_SEPARATORS[level - 1] + "\n" + for routine in ROUTINES[level - 1]: + if i == 0: + body += cpp.clblast_h(routine) + if i == 1: + body += cpp.clblast_cc(routine) + if i == 2: + body += cpp.clblast_c_h(routine) + if i == 3: + body += cpp.clblast_c_cc(routine) + if i == 4: + body += cpp.wrapper_clblas(routine) + if i == 5: + body += cpp.wrapper_cblas(routine) + f.write("".join(file_header)) + f.write(body) + f.write("".join(file_footer)) -# ================================================================================================== + # Outputs all the test implementations + for level in [1, 2, 3, 4]: + for routine in ROUTINES[level - 1]: + if routine.has_tests: + level_string = cpp.LEVEL_NAMES[level - 1] + routine_suffix = "level" + level_string + "/x" + 
routine.name + ".cpp" -# The C API header (.h) -def clblast_c_h(routines): - result = "" - for routine in routines: - result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - for flavour in routine.flavours: - result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n" - return result + # Correctness tests + filename = library_root + "/test/correctness/routines/" + routine_suffix + with open(filename, "w") as f: + f.write(cpp.HEADER + "\n") + f.write(cpp.correctness_test(routine, level_string)) + f.write(cpp.FOOTER) -# The C API implementation (.cpp) -def clblast_c_cc(routines): - result = "" - for routine in routines: - result += "\n// "+routine.name.upper()+"\n" - for flavour in routine.flavours: - template = "<"+flavour.template+">" if routine.NoScalars() else "" - indent = " "*(26 + routine.Length() + len(template)) - result += routine.RoutineHeaderC(flavour, 20, "")+" {\n" - result += " auto status = clblast::"+routine.name.capitalize()+template+"(" - result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)]) - result += ",\n"+indent+"queue, event);" - result += "\n return static_cast(status);\n}\n" - return result + # Performance tests + filename = library_root + "/test/performance/routines/" + routine_suffix + with open(filename, "w") as f: + f.write(cpp.HEADER + "\n") + f.write(cpp.performance_test(routine, level_string)) + f.write(cpp.FOOTER) -# ================================================================================================== + # Outputs the API documentation + filename = cl_args.clblast_root + "/doc/clblast.md" + with open(filename, "w") as f: -# The wrapper to the reference clBLAS routines (for performance/correctness testing) -def wrapper_clblas(routines): - result = "" - for routine in routines: - if routine.has_tests: - result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNamesTested()) - if routine.NoScalars(): - result += routine.RoutineHeaderWrapperCL(routine.template, 
True, 21)+";\n" - for flavour in routine.flavours: - result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" + # Outputs the header + doc_header = doc.header() + f.write(doc_header) - # There is a version available in clBLAS - if flavour.precision_name in ["S","D","C","Z"]: - indent = " "*(17 + routine.Length()) - arguments = routine.ArgumentsWrapperCL(flavour) - if routine.scratch: - result += " auto queue = Queue(queues[0]);\n" - result += " auto context = queue.GetContext();\n" - result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n" - arguments += ["scratch_buffer()"] - result += " return clblas"+flavour.name+routine.name+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" + # Generates the documentation for each routine + for level in [1, 2, 3, 4]: + for routine in ROUTINES[level - 1]: + if routine.implemented: + doc_routine = doc.generate(routine) + f.write(doc_routine) - # There is no clBLAS available, forward the call to one of the available functions - else: # Half-precision - indent = " "*(24 + routine.Length()) - - # Convert to float (note: also integer buffers are stored as half/float) - for buf in routine.inputs + routine.outputs: - result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n" - - # Call the float routine - result += " auto status = clblasX"+routine.name+"(" - result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) - result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" - result += "\n" - - # Convert back to half - for buf in routine.outputs: - result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n" - result += " return status;" - - # Complete - result += "\n}\n" - return result - -# The wrapper to the reference CBLAS routines (for performance/correctness testing) -def wrapper_cblas(routines): - 
result = "" - for routine in routines: - if routine.has_tests: - result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNamesTested()) - for flavour in routine.flavours: - result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" - - # There is a version available in CBLAS - if flavour.precision_name in ["S","D","C","Z"]: - indent = " "*(10 + routine.Length()) - arguments = routine.ArgumentsWrapperC(flavour) - - # Complex scalars - for scalar in routine.scalars: - if flavour.IsComplex(scalar): - result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" - - # Special case for scalar outputs - assignment = "" - postfix = "" - endofline = "" - extra_argument = "" - for output_buffer in routine.outputs: - if output_buffer in routine.ScalarBuffersFirst(): - if flavour in [C,Z]: - postfix += "_sub" - indent += " " - extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" - elif output_buffer in routine.IndexBuffers(): - assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = " - indent += " "*len(assignment) - else: - assignment = output_buffer+"_buffer["+output_buffer+"_offset]" - if (flavour.name in ["Sc","Dz"]): - assignment = assignment+".real(" - endofline += ")" - else: - assignment = assignment+" = " - indent += " "*len(assignment) - - result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += extra_argument+endofline+");\n" - - # There is no CBLAS available, forward the call to one of the available functions - else: # Half-precision - indent = " "*(9 + routine.Length()) - - # Convert to float (note: also integer buffers are stored as half/float) - for buf in routine.inputs + routine.outputs: - result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer);\n" - - # Call the float routine - result += " 
cblasX"+routine.name+"(" - result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) - result += ");\n" - - # Convert back to half - for buf in routine.outputs: - result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis);\n" - - # Complete - result += "}\n" - return result - -# ================================================================================================== - -# Checks for the number of command-line arguments -if len(sys.argv) != 2: - print "[ERROR] Usage: generator.py " - sys.exit() - -# Parses the command-line arguments -path_clblast = sys.argv[1] -files = [ - path_clblast+"/include/clblast.h", - path_clblast+"/src/clblast.cpp", - path_clblast+"/include/clblast_c.h", - path_clblast+"/src/clblast_c.cpp", - path_clblast+"/test/wrapper_clblas.hpp", - path_clblast+"/test/wrapper_cblas.hpp", -] -header_lines = [84, 74, 93, 22, 29, 41] -footer_lines = [17, 75, 19, 14, 6, 6] - -# Checks whether the command-line arguments are valid; exists otherwise -for f in files: - if not os.path.isfile(f): - print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library" - sys.exit() - -# ================================================================================================== - -# Iterates over all files to output -for i in xrange(0,len(files)): - - # Stores the header and the footer of the original file - with open(files[i]) as f: - original = f.readlines() - file_header = original[:header_lines[i]] - file_footer = original[-footer_lines[i]:] - - # Re-writes the body of the file - with open(files[i], "w") as f: - body = "" - levels = [1,2,3] if (i == 4 or i == 5) else [1,2,3,4] - for level in levels: - body += separators[level-1]+"\n" - if i == 0: - body += clblast_h(routines[level-1]) - if i == 1: - body += clblast_cc(routines[level-1]) - if i == 2: - body += clblast_c_h(routines[level-1]) - if i == 3: - body += clblast_c_cc(routines[level-1]) - if i == 4: - body += wrapper_clblas(routines[level-1]) - if i 
== 5: - body += wrapper_cblas(routines[level-1]) - f.write("".join(file_header)) - f.write(body) - f.write("".join(file_footer)) - -# ================================================================================================== - -# Outputs all the correctness-test implementations -for level in [1,2,3,4]: - for routine in routines[level-1]: - if routine.has_tests: - filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp" - with open(filename, "w") as f: - body = "" - body += "#include \"test/correctness/testblas.hpp\"\n" - body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n" - body += "// Shortcuts to the clblast namespace\n" - body += "using float2 = clblast::float2;\n" - body += "using double2 = clblast::double2;\n\n" - body += "// Main function (not within the clblast namespace)\n" - body += "int main(int argc, char *argv[]) {\n" - body += " auto errors = size_t{0};\n" - not_first = "false" - for flavour in routine.flavours: - body += " errors += clblast::RunTests 0: - f.write("Requirements for "+routine.name.upper()+":\n") - f.write("\n") - for requirement in routine.RequirementsDoc(): - f.write("* "+requirement+"\n") - f.write("\n") - - # Routine footer - f.write("\n\n") - - -# ================================================================================================== +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/scripts/generator/generator/__init__.py b/scripts/generator/generator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/generator/generator/convert.py b/scripts/generator/generator/convert.py new file mode 100644 index 00000000..c0309ec3 --- /dev/null +++ b/scripts/generator/generator/convert.py @@ -0,0 +1,69 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. 
+# +# Author(s): +# Cedric Nugteren + + +def precision_to_full_name(x): + """Translates an option name to a CLBlast data-type""" + return { + 'H': "Half", + 'S': "Single", + 'D': "Double", + 'C': "ComplexSingle", + 'Z': "ComplexDouble", + }[x] + + +def option_to_clblast(x): + """Translates an option name to a CLBlast data-type""" + return { + 'layout': "Layout", + 'a_transpose': "Transpose", + 'b_transpose': "Transpose", + 'ab_transpose': "Transpose", + 'side': "Side", + 'triangle': "Triangle", + 'diagonal': "Diagonal", + }[x] + + +def option_to_clblas(x): + """As above, but for clBLAS data-types""" + return { + 'layout': "clblasOrder", + 'a_transpose': "clblasTranspose", + 'b_transpose': "clblasTranspose", + 'ab_transpose': "clblasTranspose", + 'side': "clblasSide", + 'triangle': "clblasUplo", + 'diagonal': "clblasDiag", + }[x] + + +def option_to_cblas(x): + """As above, but for CBLAS data-types""" + return { + 'layout': "CBLAS_ORDER", + 'a_transpose': "CBLAS_TRANSPOSE", + 'b_transpose': "CBLAS_TRANSPOSE", + 'ab_transpose': "CBLAS_TRANSPOSE", + 'side': "CBLAS_SIDE", + 'triangle': "CBLAS_UPLO", + 'diagonal': "CBLAS_DIAG", + }[x] + + +def option_to_documentation(x): + """Translates an option name to a documentation string""" + return { + 'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.", + 'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'side': "The position of the triangular matrix 
in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).", + 'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).", + 'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.", + }[x] diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py new file mode 100644 index 00000000..427eb180 --- /dev/null +++ b/scripts/generator/generator/cpp.py @@ -0,0 +1,257 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import generator.datatype as datatype +import generator.convert as convert + + +NL = "\n" +SEPARATOR = "// =================================================================================================" + +# Separators for the BLAS levels +LEVEL_SEPARATORS = [ + NL + SEPARATOR + NL + "// BLAS level-1 (vector-vector) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// BLAS level-2 (matrix-vector) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// BLAS level-3 (matrix-matrix) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// Extra non-BLAS routines (level-X)" + NL + SEPARATOR +] + +# Names of the level sub-folders +LEVEL_NAMES = ["1", "2", "3", "x"] + +# Main header/footer for source files +FOOTER = NL + SEPARATOR + NL +HEADER = NL + SEPARATOR + """ +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +""" + SEPARATOR + NL + + +def clblast_h(routine): + """The C++ API header (.h)""" + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL + return result + + +def clblast_cc(routine): + """The C++ API implementation (.cpp)""" + indent1 = " " * (20 + routine.length()) + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + if routine.implemented: + result += routine.routine_header_cpp(12, "") + " {" + NL + result += " auto queue_cpp = Queue(*queue);" + NL + result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL + result += " auto status = routine.SetUp();" + NL + result += " if (status != StatusCode::kSuccess) { return status; }" + NL + result += " return routine.Do" + routine.name.capitalize() + "(" + result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()]) + result += ");" + NL + else: + result += routine.routine_header_type_cpp(12) + " {" + NL + result += " return StatusCode::kNotImplemented;" + NL + result += "}" + NL + for flavour in routine.flavours: + indent2 = " " * (34 + routine.length() + len(flavour.template)) + result += "template StatusCode PUBLIC_API " + routine.name.capitalize() + "<" + flavour.template + ">(" + result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)]) + result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL + return result + + +def clblast_c_h(routine): + """The C API header (.h)""" + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + for flavour in routine.flavours: + result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL + return result + + +def clblast_c_cc(routine): + """The C API implementation (.cpp)""" + result = NL + "// " + routine.name.upper() + NL + for flavour in routine.flavours: + template = "<" + 
flavour.template + ">" if routine.no_scalars() else "" + indent = " " * (26 + routine.length() + len(template)) + result += routine.routine_header_c(flavour, 20, "") + " {" + NL + result += " auto status = clblast::" + routine.name.capitalize() + template + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) + result += "," + NL + indent + "queue, event);" + result += NL + " return static_cast(status);" + NL + "}" + NL + return result + + +def wrapper_clblas(routine): + """The wrapper to the reference clBLAS routines (for performance/correctness testing)""" + result = "" + if routine.has_tests: + result += NL + "// Forwards the clBLAS calls for %s" % routine.short_names_tested() + NL + if routine.no_scalars(): + result += routine.routine_header_wrapper_clblas(routine.template, True, 21) + ";" + NL + for flavour in routine.flavours: + result += routine.routine_header_wrapper_clblas(flavour, False, 21) + " {" + NL + + # There is a version available in clBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + indent = " " * (17 + routine.length()) + arguments = routine.arguments_wrapper_clblas(flavour) + if routine.scratch: + result += " auto queue = Queue(queues[0]);" + NL + result += " auto context = queue.GetContext();" + NL + result += " auto scratch_buffer = Buffer<" + flavour.template + ">" + result += "(context, " + routine.scratch + ");" + NL + arguments += ["scratch_buffer()"] + result += " return clblas" + flavour.name + routine.name + "(" + result += ("," + NL + indent).join([a for a in arguments]) + result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);" + + # There is no clBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " " * (24 + routine.length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto " + buf + "_buffer_bis = 
HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL + + # Call the float routine + result += " auto status = clblasX" + routine.name + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);" + result += NL + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL + result += " return status;" + + # Complete + result += NL + "}" + NL + return result + + +def wrapper_cblas(routine): + """The wrapper to the reference CBLAS routines (for performance/correctness testing)""" + result = "" + if routine.has_tests: + result += NL + "// Forwards the Netlib BLAS calls for %s" % routine.short_names_tested() + NL + for flavour in routine.flavours: + result += routine.routine_header_wrapper_cblas(flavour, 12) + " {" + NL + + # There is a version available in CBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + indent = " " * (10 + routine.length()) + arguments = routine.arguments_wrapper_cblas(flavour) + + # Complex scalars + for scalar in routine.scalars: + if flavour.is_complex(scalar): + result += " const auto " + scalar + "_array = std::vector<" + flavour.buffer_type[:-1] + ">" + result += "{" + scalar + ".real(), " + scalar + ".imag()};" + NL + + # Special case for scalar outputs + assignment = "" + postfix = "" + end_of_line = "" + extra_argument = "" + for output_buffer in routine.outputs: + if output_buffer in routine.scalar_buffers_first(): + if flavour in [datatype.C, datatype.Z]: + postfix += "_sub" + indent += " " + extra_argument += "," + NL + indent + extra_argument += "reinterpret_cast" + extra_argument += "(&" + output_buffer + "_buffer[" + output_buffer + "_offset])" + elif output_buffer in routine.index_buffers(): + assignment = "((int*)&" + output_buffer + "_buffer[0])[" + output_buffer + "_offset] = " + indent += " " * len(assignment) + 
else: + assignment = output_buffer + "_buffer[" + output_buffer + "_offset]" + if flavour.name in ["Sc", "Dz"]: + assignment += ".real(" + end_of_line += ")" + else: + assignment += " = " + indent += " " * len(assignment) + + result += " " + assignment + "cblas_" + flavour.name.lower() + routine.name + postfix + "(" + result += ("," + NL + indent).join([a for a in arguments]) + result += extra_argument + end_of_line + ");" + NL + + # There is no CBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " " * (9 + routine.length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL + + # Call the float routine + result += " cblasX" + routine.name + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + result += ");" + NL + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL + + # Complete + result += "}" + NL + return result + + +def performance_test(routine, level_string): + """Generates the body of a performance test for a specific routine""" + result = "" + result += "#include \"test/performance/client.hpp\"" + NL + result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL + result += "// Shortcuts to the clblast namespace" + NL + result += "using float2 = clblast::float2;" + NL + result += "using double2 = clblast::double2;" + NL + NL + result += "// Main function (not within the clblast namespace)" + NL + result += "int main(int argc, char *argv[]) {" + NL + default = convert.precision_to_full_name(routine.flavours[0].precision_name) + result += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k" + default + ")) {" + NL + for precision in ["H", "S", "D", "C", "Z"]: + result += " case 
clblast::Precision::k" + convert.precision_to_full_name(precision) + ":" + found = False + for flavour in routine.flavours: + if flavour.precision_name == precision: + result += NL + " clblast::RunClient + + +# Short-hands for data-types +D_HALF = "half" +D_FLOAT = "float" +D_DOUBLE = "double" +D_FLOAT2 = "float2" +D_DOUBLE2 = "double2" +D_HALF_OPENCL = "cl_half" +D_FLOAT2_OPENCL = "cl_float2" +D_DOUBLE2_OPENCL = "cl_double2" + + +class DataType: + """Class holding data-type and precision information""" + + def __init__(self, precision_name, name, template, scalars, buffer_type): + self.precision_name = precision_name + self.name = name + self.template = template + self.alpha_cpp = scalars[0] + self.beta_cpp = scalars[1] + self.alpha_cl = scalars[2] + self.beta_cl = scalars[3] + self.buffer_type = buffer_type + + def use_alpha(self): + """Outputs the name of the data-type (alpha/beta), possibly transforming into the right type""" + if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.alpha_cpp + "{alpha.s[0], alpha.s[1]}" + return "alpha" + + def use_beta(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.beta_cpp + "{beta.s[0], beta.s[1]}" + return "beta" + + def use_alpha_opencl(self): + """As above, but the transformation is in the opposite direction""" + if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.alpha_cl + "{{alpha.real(), alpha.imag()}}" + return "alpha" + + def use_beta_opencl(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.beta_cl + "{{beta.real(), beta.imag()}}" + return "beta" + + def test_template(self): + """Returns the template as used in the correctness/performance tests""" + if self.buffer_type != self.beta_cpp: + return "<" + self.buffer_type + "," + self.beta_cpp + ">, " + self.buffer_type + ", " + self.beta_cpp + return "<" + self.buffer_type + ">, " + self.buffer_type + ", " + self.beta_cpp + + def 
is_complex(self, scalar): + """Current scalar is complex""" + return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or + (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2])) + + +# Regular data-types +H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16) +S = DataType("S", "S", D_FLOAT, [D_FLOAT] * 4, D_FLOAT) # single (32) +D = DataType("D", "D", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE) # double (64) +C = DataType("C", "C", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2) # single-complex (3232) +Z = DataType("Z", "Z", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2) # double-complex (6464) + +# Special cases +Sc = DataType("C", "Sc", D_FLOAT2, [D_FLOAT2] * 4, D_FLOAT2) # As C, but with real output +Dz = DataType("Z", "Dz", D_DOUBLE2, [D_DOUBLE2] * 4, D_DOUBLE2) # As Z, but with real output +iH = DataType("H", "iH", D_HALF, [D_HALF] * 4, D_HALF) # As H, but with integer output +iS = DataType("S", "iS", D_FLOAT, [D_FLOAT] * 4, D_FLOAT) # As S, but with integer output +iD = DataType("D", "iD", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE) # As D, but with integer output +iC = DataType("C", "iC", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2) # As C, but with integer output +iZ = DataType("Z", "iZ", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2) # As Z, but with int output +Css = DataType("C", "C", D_FLOAT, [D_FLOAT, D_FLOAT, D_FLOAT, D_FLOAT], D_FLOAT2) # As C, but with constants from S +Zdd = DataType("Z", "Z", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE2) # As Z, but with constants from D +Ccs = DataType("C", "C", D_FLOAT2 + "," + D_FLOAT, [D_FLOAT2, D_FLOAT, D_FLOAT2_OPENCL, D_FLOAT], D_FLOAT2) # As C, but with one constant from S +Zzd = DataType("Z", "Z", D_DOUBLE2 + "," + D_DOUBLE, [D_DOUBLE2, D_DOUBLE, D_DOUBLE2_OPENCL, D_DOUBLE], D_DOUBLE2) # As Z, but with one constant from D + +# C++ template data-types +T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], 
"T") # regular routine +Tc = DataType("Tc", "typename T", "std::complex,T", ["T", "T", "T", "T"], "std::complex") # for herk +TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k diff --git a/scripts/generator/generator/doc.py b/scripts/generator/generator/doc.py new file mode 100644 index 00000000..8657ed0d --- /dev/null +++ b/scripts/generator/generator/doc.py @@ -0,0 +1,57 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +NL = "\n" + + +def header(): + """Generates the header for the API documentation""" + result = "CLBlast: API reference" + NL + result += "================" + NL + NL + NL + return result + + +def generate(routine): + """Generates the API documentation for a given routine""" + result = "" + + # Routine header + result += "x" + routine.name.upper() + ": " + routine.description + NL + result += "-------------" + NL + NL + result += routine.details + NL + NL + + # Routine API + result += "C++ API:" + NL + result += "```" + NL + result += routine.routine_header_cpp(12, "") + NL + result += "```" + NL + NL + result += "C API:" + NL + result += "```" + NL + for flavour in routine.flavours: + result += routine.routine_header_c(flavour, 20, "") + NL + result += "```" + NL + NL + + # Routine arguments + result += "Arguments to " + routine.name.upper() + ":" + NL + NL + for argument in routine.arguments_doc(): + result += "* " + argument + NL + result += "* `cl_command_queue* queue`: " + result += "Pointer to an OpenCL command queue associated with a context and device to execute the routine on." + NL + result += "* `cl_event* event`: " + result += "Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). " + result += "This is an optional argument." 
+ NL + NL + + # Routine requirements + if len(routine.requirements_doc()) > 0: + result += "Requirements for " + routine.name.upper() + ":" + NL + NL + for requirement in routine.requirements_doc(): + result += "* " + requirement + NL + result += NL + + # Routine footer + result += NL + NL + return result diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py new file mode 100644 index 00000000..a4e682c2 --- /dev/null +++ b/scripts/generator/generator/routine.py @@ -0,0 +1,552 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +from itertools import chain + +import generator.convert as convert + + +class Routine: + """Class holding routine-specific information (e.g. name, which arguments, which precisions)""" + def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, + inputs, outputs, scalars, scratch, description, details, requirements): + self.implemented = implemented + self.has_tests = has_tests + self.level = level + self.name = name + self.template = template + self.flavours = flavours + self.sizes = sizes + self.options = options + self.inputs = inputs + self.outputs = outputs + self.scalars = scalars + self.scratch = scratch # Scratch buffer (e.g. 
for xDOT) + self.description = description + self.details = details + self.requirements = requirements + + @staticmethod + def scalar_buffers_first(): + """List of scalar buffers""" + return ["dot", "nrm2", "asum", "sum", "imax", "imin"] + + @staticmethod + def scalar_buffers_second(): + """List of scalar buffers""" + return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"] + + @staticmethod + def other_scalars(): + """List of scalars other than alpha and beta""" + return ["cos", "sin"] + + @staticmethod + def index_buffers(): + """List of buffers with unsigned int type""" + return ["imax", "imin"] + + @staticmethod + def postfix(name): + """Retrieves the postfix for a buffer""" + return "inc" if (name in ["x", "y"]) else "ld" + + @staticmethod + def buffers_vector(): + """Distinguish between vectors and matrices""" + return ["x", "y"] + + @staticmethod + def buffers_matrix(): + """Distinguish between vectors and matrices""" + return ["a", "b", "c", "ap"] + + def non_index_inputs(self): + """Lists of input/output buffers not index (integer)""" + buffers = self.inputs[:] # make a copy + for i in self.index_buffers(): + if i in buffers: + buffers.remove(i) + return buffers + + def non_index_outputs(self): + """Lists of input/output buffers not index (integer)""" + buffers = self.outputs[:] # make a copy + for i in self.index_buffers(): + if i in buffers: + buffers.remove(i) + return buffers + + def buffers_without_ld_inc(self): + """List of buffers without 'inc' or 'ld'""" + return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"] + + def length(self): + """Retrieves the number of characters in the routine's name""" + return len(self.name) + + def no_scalars(self): + """Determines whether or not this routine has scalar arguments (alpha/beta)""" + return self.scalars == [] + + def short_names(self): + """Returns the upper-case names of these routines (all flavours)""" + return "/".join([f.name + self.name.upper() for f in self.flavours]) 
+ + def short_names_tested(self): + """As above, but excludes some""" + names = [f.name + self.name.upper() for f in self.flavours] + if "H" + self.name.upper() in names: + names.remove("H" + self.name.upper()) + return "/".join(names) + + def buffers_first(self): + """Determines which buffers go first (between alpha and beta) and which ones go after""" + if self.level == "2b": + return ["x", "y"] + return ["ap", "a", "b", "x"] + + def buffers_second(self): + if self.level == "2b": + return ["ap", "a", "b", "c"] + return ["y", "c"] + + def buffer(self, name): + """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_bis(self, name): + """As above but with a '_bis' suffix for the buffer name""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer_bis"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def(self, name): + """As above but with data-types""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "cl_mem " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def_wrapper_cl(self, name, flavour): + """As above but with data-types""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + 
self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def_vector(self, name, flavour): + """As above but as vectors""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_clcudaapi(self, name): + """As above but with CLCudaAPI buffers""" + if name in self.inputs or name in self.outputs: + buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type + a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_wrapper_clblas(self, name): + """As above but with a static cast for clBLAS wrapper""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer()"] + b = [name + "_offset"] + c = [] + if name in ["x", "y"]: + c = ["static_cast(" + name + "_" + self.postfix(name) + ")"] + elif name in ["a", "b", "c"]: + c = [name + "_" + self.postfix(name)] + return [", ".join(a + b + c)] + return [] + + def buffer_wrapper_cblas(self, name, flavour): + """As above but with a static cast for CBLAS wrapper""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + if name == "sy1": + a = [name + "_buffer[" + name + "_offset]"] + elif flavour.precision_name in ["C", "Z"]: + a = ["reinterpret_cast<" + prefix + flavour.buffer_type[:-1] + "*>" + + "(&" + name + "_buffer[" + name + "_offset])"] + else: + a = ["&" + name + "_buffer[" + name + "_offset]"] + c = [] + if name in ["x", "y"]: + c = 
["static_cast(" + name + "_" + self.postfix(name) + ")"] + elif name in ["a", "b", "c"]: + c = [name + "_" + self.postfix(name)] + return [", ".join(a + c)] + return [] + + def buffer_type(self, name): + """As above, but only data-types""" + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix + "cl_mem"] + b = ["const size_t"] + c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_doc(self, name): + """Retrieves the documentation of the buffers""" + prefix = "const " if (name in self.inputs) else "" + inout = "input" if (name in self.inputs) else "output" + if (name in self.inputs) or (name in self.outputs): + math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector" + inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " + a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] + b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."] + if name not in self.buffers_without_ld_inc(): + c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " + + inc_ld_description + "of the " + inout + " " + math_name + ". 
This value must be greater than 0."] + else: + c = [] + return a + b + c + return [] + + def scalar(self, name): + """Retrieves the name of a scalar (alpha/beta)""" + if name in self.scalars: + return [name] + return [] + + def scalar_half_to_float(self, name): + """As above, but converts from float to half""" + if name in self.scalars: + return ["HalfToFloat(" + name + ")"] + return [] + + def scalar_use(self, name, flavour): + """Retrieves the use of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return [flavour.use_alpha()] + elif name == "beta": + return [flavour.use_beta()] + return [name] + return [] + + def scalar_use_wrapper(self, name, flavour): + """As above, but for the clBLAS wrapper""" + if name in self.scalars: + if name == "alpha": + return [flavour.use_alpha_opencl()] + elif name == "beta": + return [flavour.use_beta_opencl()] + return [name] + return [] + + def scalar_use_wrapper_cblas(self, name, flavour): + """As above, but for the CBLAS wrapper""" + if name in self.scalars: + if flavour.is_complex(name): + return [name + "_array.data()"] + return [name] + return [] + + def scalar_def(self, name, flavour): + """Retrieves the definition of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cl + " " + name] + return ["const " + flavour.beta_cl + " " + name] + return [] + + def scalar_def_plain(self, name, flavour): + """As above, but without 'cl_' prefix""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cpp + " " + name] + return ["const " + flavour.beta_cpp + " " + name] + return [] + + def scalar_type(self, name, flavour): + """Retrieves the type of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cpp] + return ["const " + flavour.beta_cpp] + return [] + + def scalar_doc(self, name): + """Retrieves the documentation of a scalar""" + if name in self.scalars: + if 
name == "alpha": + return ["`const " + self.template.alpha_cpp + " " + name + "`: Input scalar constant."] + return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."] + return [] + + def sizes_list(self): + """Retrieves a list of comma-separated sizes (m, n, k)""" + if self.sizes: + return [", ".join([s for s in self.sizes])] + return [] + + def sizes_def(self): + """Retrieves the definition of the sizes (m,n,k)""" + if self.sizes: + return [", ".join(["const size_t " + s for s in self.sizes])] + return [] + + def sizes_type(self): + """Retrieves the types of the sizes (m,n,k)""" + if self.sizes: + return [", ".join(["const size_t" for s in self.sizes])] + return [] + + def sizes_doc(self): + """# Retrieves the documentation of the sizes""" + if self.sizes: + definitions = ["`const size_t " + s + "`: Integer size argument. This value must be positive." for s in self.sizes] + return definitions + return [] + + def options_list(self): + """Retrieves a list of options""" + if self.options: + return [", ".join(self.options)] + return [] + + def options_cast(self, indent): + """As above, but now casted to CLBlast data-types""" + if self.options: + options = ["static_cast(" + o + ")" for o in self.options] + return [(",\n" + indent).join(options)] + return [] + + def options_def(self): + """Retrieves the definitions of the options (layout, transpose, side, etc.)""" + if self.options: + definitions = ["const " + convert.option_to_clblast(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + + def options_def_wrapper_clblas(self): + """As above, but now using clBLAS data-types""" + if self.options: + definitions = ["const " + convert.option_to_clblas(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + + def options_def_wrapper_cblas(self): + """As above, but now using CBLAS data-types""" + if self.options: + definitions = ["const " + convert.option_to_cblas(o) + " " + o for o in 
self.options] + return [", ".join(definitions)] + return [] + + def options_type(self): + """Retrieves the types of the options (layout, transpose, side, etc.)""" + if self.options: + definitions = ["const " + convert.option_to_clblast(o) for o in self.options] + return [", ".join(definitions)] + return [] + + def options_doc(self): + """Retrieves the documentation of the options""" + if self.options: + definitions = ["`const " + convert.option_to_clblast(o) + " " + o + "`: " + convert.option_to_documentation(o) for o in self.options] + return definitions + return [] + + def arguments(self): + """Retrieves a combination of all the argument names (no types)""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) + + self.scalar("alpha") + + list(chain(*[self.buffer(b) for b in self.buffers_first()])) + + self.scalar("beta") + + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_half(self): + """As above, but with conversions from half to float""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_first()])) + + self.scalar_half_to_float("alpha") + + list(chain(*[self.buffer_bis(b) for b in self.buffers_first()])) + + self.scalar_half_to_float("beta") + + list(chain(*[self.buffer_bis(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_clcudaapi(self): + """Retrieves a combination of all the argument names, with CLCudaAPI casts""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_first()])) + + self.scalar("alpha") + + list(chain(*[self.buffer_clcudaapi(b) for 
b in self.buffers_first()])) + + self.scalar("beta") + + list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_cast(self, flavour, indent): + """As above, but with CLBlast casts""" + return (self.options_cast(indent) + self.sizes_list() + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) + + self.scalar_use("alpha", flavour) + + list(chain(*[self.buffer(b) for b in self.buffers_first()])) + + self.scalar_use("beta", flavour) + + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()]))) + + def arguments_wrapper_clblas(self, flavour): + """As above, but for the clBLAS wrapper""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_first()])) + + self.scalar_use_wrapper("alpha", flavour) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_first()])) + + self.scalar_use_wrapper("beta", flavour) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use_wrapper(s, flavour) for s in self.other_scalars()]))) + + def arguments_wrapper_cblas(self, flavour): + """As above, but for the CBLAS wrapper""" + return (self.options_list() + self.sizes_list() + + self.scalar_use_wrapper_cblas("alpha", flavour) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_first()])) + + self.scalar_use_wrapper_cblas("beta", flavour) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in 
self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use_wrapper_cblas(s, flavour) for s in self.other_scalars()]))) + + def arguments_def(self, flavour): + """Retrieves a combination of all the argument definitions""" + return (self.options_def() + self.sizes_def() + + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) + + self.scalar_def("alpha", flavour) + + list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) + + self.scalar_def("beta", flavour) + + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + + def arguments_def_wrapper_clblas(self, flavour): + """As above, but clBLAS wrapper plain data-types""" + return (self.options_def_wrapper_clblas() + self.sizes_def() + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_plain("alpha", flavour) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_plain("beta", flavour) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) + + def arguments_def_wrapper_cblas(self, flavour): + """As above, but CBLAS wrapper plain data-types""" + return (self.options_def_wrapper_cblas() + self.sizes_def() + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_plain("alpha", flavour) + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_plain("beta", flavour) + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_vector(b, 
flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) + + def arguments_type(self, flavour): + """Retrieves a combination of all the argument types""" + return (self.options_type() + self.sizes_type() + + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_first()])) + + self.scalar_type("alpha", flavour) + + list(chain(*[self.buffer_type(b) for b in self.buffers_first()])) + + self.scalar_type("beta", flavour) + + list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()]))) + + def arguments_doc(self): + """Retrieves a combination of all the argument types""" + return (self.options_doc() + self.sizes_doc() + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_first()])) + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_first()])) + + self.scalar_doc("alpha") + + list(chain(*[self.buffer_doc(b) for b in self.buffers_first()])) + + self.scalar_doc("beta") + + list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_doc(s) for s in self.other_scalars()]))) + + def requirements_doc(self): + """Retrieves a list of routine requirements for documentation""" + return self.requirements + + def routine_header_cpp(self, spaces, default_event): + """Retrieves the C++ templated definition for a routine""" + indent = " " * (spaces + self.length()) + result = "template <" + self.template.name + ">\n" + result += "StatusCode " + self.name.capitalize() + "(" + result += (",\n" + indent).join([a for a in self.arguments_def(self.template)]) + result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")" + return result + + def routine_header_type_cpp(self, spaces): 
+ """As above, but now without variable names""" + indent = " " * (spaces + self.length()) + result = "template <" + self.template.name + ">\n" + result += "StatusCode " + self.name.capitalize() + "(" + result += (",\n" + indent).join([a for a in self.arguments_type(self.template)]) + result += ",\n" + indent + "cl_command_queue*, cl_event*)" + return result + + def routine_header_c(self, flavour, spaces, extra_qualifier): + """As above, but now for C""" + indent = " " * (spaces + self.length()) + result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "(" + result += (",\n" + indent).join([a for a in self.arguments_def(flavour)]) + result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)" + return result + + def routine_header_wrapper_clblas(self, flavour, def_only, spaces): + """As above, but now for the clBLAS wrapper""" + template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else "" + indent = " " * (spaces + self.length() + len(template)) + result = "" + if self.no_scalars(): + result += "template <" + if def_only: + result += flavour.name + result += ">\n" + result += "clblasStatus clblasX" + self.name + template + "(" + result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_clblas(flavour)]) + result += ",\n" + indent + "cl_uint num_queues, cl_command_queue *queues" + result += ",\n" + indent + "cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" + return result + + def routine_header_wrapper_cblas(self, flavour, spaces): + """As above, but now for the CBLAS wrapper""" + indent = " " * (spaces + self.length()) + result = "void cblasX" + self.name + "(" + result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cblas(flavour)]) + ")" + return result diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py deleted file mode 100644 index 00883776..00000000 --- a/scripts/generator/routine.py +++ /dev/null @@ -1,603 +0,0 @@ 
-#!/usr/bin/env python - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. -# -# Author(s): -# Cedric Nugteren -# -# This file contains the 'Routine' class, used in the generator script to generate the CLBlast API -# interface and implementation. -# -# ================================================================================================== - -# System modules -from itertools import chain - -# Translates an option name to a CLBlast data-type -def OptionToCLBlast(x): - return { - 'layout': "Layout", - 'a_transpose': "Transpose", - 'b_transpose': "Transpose", - 'ab_transpose': "Transpose", - 'side': "Side", - 'triangle': "Triangle", - 'diagonal': "Diagonal", - }[x] - -# As above, but for clBLAS data-types -def OptionToWrapperCL(x): - return { - 'layout': "clblasOrder", - 'a_transpose': "clblasTranspose", - 'b_transpose': "clblasTranspose", - 'ab_transpose': "clblasTranspose", - 'side': "clblasSide", - 'triangle': "clblasUplo", - 'diagonal': "clblasDiag", - }[x] - -# As above, but for CBLAS data-types -def OptionToWrapperC(x): - return { - 'layout': "CBLAS_ORDER", - 'a_transpose': "CBLAS_TRANSPOSE", - 'b_transpose': "CBLAS_TRANSPOSE", - 'ab_transpose': "CBLAS_TRANSPOSE", - 'side': "CBLAS_SIDE", - 'triangle': "CBLAS_UPLO", - 'diagonal': "CBLAS_DIAG", - }[x] - -# Translates an option name to a documentation string -def OptionToDoc(x): - return { - 'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.", - 'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'b_transpose': "Transposing the input 
matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).", - 'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).", - 'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.", - }[x] - -# ================================================================================================== - -# Class holding routine-specific information (e.g. name, which arguments, which precisions) -class Routine(): - def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, - inputs, outputs, scalars, scratch, description, details, requirements): - self.implemented = implemented - self.has_tests = has_tests - self.level = level - self.name = name - self.template = template - self.flavours = flavours - self.sizes = sizes - self.options = options - self.inputs = inputs - self.outputs = outputs - self.scalars = scalars - self.scratch = scratch # Scratch buffer (e.g. 
for xDOT) - self.description = description - self.details = details - self.requirements = requirements - - # List of scalar buffers - def ScalarBuffersFirst(self): - return ["dot","nrm2","asum","sum","imax","imin"] - def ScalarBuffersSecond(self): - return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] - - # List of scalars other than alpha and beta - def OtherScalars(self): - return ["cos","sin"] - - # List of buffers with unsigned int type - def IndexBuffers(self): - return ["imax","imin"] - - # Lists of input/output buffers not index (integer) - def NonIndexInputs(self): - buffers = self.inputs[:] # make a copy - for i in self.IndexBuffers(): - if i in buffers: buffers.remove(i) - return buffers - def NonIndexOutputs(self): - buffers = self.outputs[:] # make a copy - for i in self.IndexBuffers(): - if i in buffers: buffers.remove(i) - return buffers - - # List of buffers without 'inc' or 'ld' - def BuffersWithoutLdInc(self): - return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"] - - # Retrieves the number of characters in the routine's name - def Length(self): - return len(self.name) - - # Retrieves the postfix for a buffer - def Postfix(self, name): - return "inc" if (name in ["x","y"]) else "ld" - - # Determines whether or not this routine has scalar arguments (alpha/beta) - def NoScalars(self): - return self.scalars == [] - - # Returns the upper-case names of these routines (all flavours) - def ShortNames(self): - return "/".join([f.name+self.name.upper() for f in self.flavours]) - - # As above, but excludes some - def ShortNamesTested(self): - names = [f.name+self.name.upper() for f in self.flavours] - if "H"+self.name.upper() in names: names.remove("H"+self.name.upper()) - return "/".join(names) - - # Determines which buffers go first (between alpha and beta) and which ones go after - def BuffersFirst(self): - if self.level == "2b": - return ["x","y"] - return ["ap","a","b","x"] - def BuffersSecond(self): - if self.level == "2b": - 
return ["ap","a","b","c"] - return ["y","c"] - - # Distinguish between vectors and matrices - def BuffersVector(self): - return ["x","y"] - def BuffersMatrix(self): - return ["a","b","c","ap"] - - # ============================================================================================== - - # Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x') - def Buffer(self, name): - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with a '_bis' suffix for the buffer name - def BufferBis(self, name): - #if (name in self.IndexBuffers()): - # return self.Buffer(name) - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer_bis"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with data-types - def BufferDef(self, name): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"cl_mem "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with data-types - def BufferDefWrapperCL(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"Buffer<"+flavour.buffertype+">& "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but as vectors - def BufferDefVector(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in 
self.inputs) or (name in self.outputs): - a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with Claduc buffers - def BufferCladuc(self, name): - if (name in self.inputs) or (name in self.outputs): - buffertype = "unsigned int" if (name in self.IndexBuffers()) else self.template.buffertype - a = ["Buffer<"+buffertype+">("+name+"_buffer)"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with a static cast for clBLAS wrapper - def BufferWrapperCL(self, name): - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer()"] - b = [name+"_offset"] - c = [] - if (name in ["x","y"]): - c = ["static_cast("+name+"_"+self.Postfix(name)+")"] - elif (name in ["a","b","c"]): - c = [name+"_"+self.Postfix(name)] - return [", ".join(a+b+c)] - return [] - - # As above but with a static cast for CBLAS wrapper - def BufferWrapperC(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - if name == "sy1": - a = [name+"_buffer["+name+"_offset]"] - elif flavour.precision_name in ["C","Z"]: - a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"] - else: - a = ["&"+name+"_buffer["+name+"_offset]"] - c = [] - if (name in ["x","y"]): - c = ["static_cast("+name+"_"+self.Postfix(name)+")"] - elif (name in ["a","b","c"]): - c = [name+"_"+self.Postfix(name)] - return [", ".join(a+c)] - return [] - - # As above, but only data-types - def BufferType(self, name): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"cl_mem"] - b = ["const size_t"] - c = ["const 
size_t"] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # Retrieves the documentation of the buffers - def BufferDoc(self, name): - prefix = "const " if (name in self.inputs) else "" - inout = "input" if (name in self.inputs) else "output" - if (name in self.inputs) or (name in self.outputs): - math_name = name.upper()+" matrix" if (name in self.BuffersMatrix()) else name+" vector" - incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment " - a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."] - b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."] - c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+". This value must be greater than 0."] if (name not in self.BuffersWithoutLdInc()) else [] - return a+b+c - return [] - - # ============================================================================================== - - # Retrieves the name of a scalar (alpha/beta) - def Scalar(self, name): - if (name in self.scalars): - return [name] - return [] - - # As above, but converts from float to half - def ScalarHalfToFloat(self, name): - if name in self.scalars: - return ["HalfToFloat("+name+")"] - return [] - - # Retrieves the use of a scalar (alpha/beta) - def ScalarUse(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return [flavour.UseAlpha()] - elif name == "beta": - return [flavour.UseBeta()] - return [name] - return [] - - # As above, but for the clBLAS wrapper - def ScalarUseWrapper(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return [flavour.UseAlphaCL()] - elif name == "beta": - return [flavour.UseBetaCL()] - return [name] - return [] - - # As above, but for the CBLAS wrapper - def ScalarUseWrapperC(self, name, flavour): - if name in self.scalars: - if 
flavour.IsComplex(name): - return [name+"_array.data()"] - return [name] - return [] - - # Retrieves the definition of a scalar (alpha/beta) - def ScalarDef(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cl+" "+name] - return ["const "+flavour.beta_cl+" "+name] - return [] - - # As above, but without 'cl_' prefix - def ScalarDefPlain(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cpp+" "+name] - return ["const "+flavour.beta_cpp+" "+name] - return [] - - # Retrieves the type of a scalar (alpha/beta) - def ScalarType(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cpp] - return ["const "+flavour.beta_cpp] - return [] - - # Retrieves the documentation of a scalar - def ScalarDoc(self, name): - if name in self.scalars: - if name == "alpha": - return ["`const "+self.template.alpha_cpp+" "+name+"`: Input scalar constant."] - return ["`const "+self.template.beta_cpp+" "+name+"`: Input scalar constant."] - return [] - - # ============================================================================================== - - # Retrieves a list of comma-separated sizes (m, n, k) - def Sizes(self): - if self.sizes: - return [", ".join([s for s in self.sizes])] - return [] - - # Retrieves the definition of the sizes (m,n,k) - def SizesDef(self): - if self.sizes: - return [", ".join(["const size_t "+s for s in self.sizes])] - return [] - - # Retrieves the types of the sizes (m,n,k) - def SizesType(self): - if self.sizes: - return [", ".join(["const size_t" for s in self.sizes])] - return [] - - # Retrieves the documentation of the sizes - def SizesDoc(self): - if self.sizes: - definitions = ["`const size_t "+s+"`: Integer size argument. This value must be positive." 
for s in self.sizes] - return definitions - return [] - - # ============================================================================================== - - # Retrieves a list of options - def Options(self): - if self.options: - return [", ".join(self.options)] - return [] - - # As above, but now casted to CLBlast data-types - def OptionsCast(self, indent): - if self.options: - options = ["static_cast("+o+")" for o in self.options] - return [(",\n"+indent).join(options)] - return [] - - # Retrieves the definitions of the options (layout, transpose, side, etc.) - def OptionsDef(self): - if self.options: - definitions = ["const "+OptionToCLBlast(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # As above, but now using clBLAS data-types - def OptionsDefWrapperCL(self): - if self.options: - definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # As above, but now using CBLAS data-types - def OptionsDefWrapperC(self): - if self.options: - definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # Retrieves the types of the options (layout, transpose, side, etc.) 
- def OptionsType(self): - if self.options: - definitions = ["const "+OptionToCLBlast(o) for o in self.options] - return [", ".join(definitions)] - return [] - - # Retrieves the documentation of the options - def OptionsDoc(self): - if self.options: - definitions = ["`const "+OptionToCLBlast(o)+" "+o+"`: "+OptionToDoc(o) for o in self.options] - return definitions - return [] - - # ============================================================================================== - - # Retrieves a combination of all the argument names (no types) - def Arguments(self): - return (self.Options() + self.Sizes() + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + - self.Scalar("alpha") + - list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + - self.Scalar("beta") + - list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # As above, but with conversions from half to float - def ArgumentsHalf(self): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarHalfToFloat("alpha") + - list(chain(*[self.BufferBis(b) for b in self.BuffersFirst()])) + - self.ScalarHalfToFloat("beta") + - list(chain(*[self.BufferBis(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument names, with Claduc casts - def ArgumentsCladuc(self, flavour, indent): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersFirst()])) + - self.Scalar("alpha") + - list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) + - self.Scalar("beta") + - list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferCladuc(b) 
for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # As above, but with CLBlast casts - def ArgumentsCast(self, flavour, indent): - return (self.OptionsCast(indent) + self.Sizes() + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarUse("alpha", flavour) + - list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + - self.ScalarUse("beta", flavour) + - list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()]))) - - # As above, but for the clBLAS wrapper - def ArgumentsWrapperCL(self, flavour): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarUseWrapper("alpha", flavour) + - list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) + - self.ScalarUseWrapper("beta", flavour) + - list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()]))) - - # As above, but for the CBLAS wrapper - def ArgumentsWrapperC(self, flavour): - return (self.Options() + self.Sizes() + - self.ScalarUseWrapperC("alpha", flavour) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarUseWrapperC("beta", flavour) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument definitions - def ArgumentsDef(self, flavour): - return (self.OptionsDef() + self.SizesDef() + - list(chain(*[self.BufferDef(b) for b in 
self.ScalarBuffersFirst()])) + - self.ScalarDef("alpha", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + - self.ScalarDef("beta", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()]))) - - # As above, but clBLAS wrapper plain datatypes - def ArgumentsDefWrapperCL(self, flavour): - return (self.OptionsDefWrapperCL() + self.SizesDef() + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersFirst()])) + - self.ScalarDefPlain("alpha", flavour) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarDefPlain("beta", flavour) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) - - # As above, but CBLAS wrapper plain datatypes - def ArgumentsDefWrapperC(self, flavour): - return (self.OptionsDefWrapperC() + self.SizesDef() + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) + - self.ScalarDefPlain("alpha", flavour) + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarDefPlain("beta", flavour) + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument types - def ArgumentsType(self, flavour): - return (self.OptionsType() + self.SizesType() + - list(chain(*[self.BufferType(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarType("alpha", flavour) + - list(chain(*[self.BufferType(b) 
for b in self.BuffersFirst()])) + - self.ScalarType("beta", flavour) + - list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferType(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarType(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument types - def ArgumentsDoc(self): - return (self.OptionsDoc() + self.SizesDoc() + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarDoc("alpha") + - list(chain(*[self.BufferDoc(b) for b in self.BuffersFirst()])) + - self.ScalarDoc("beta") + - list(chain(*[self.BufferDoc(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDoc(s) for s in self.OtherScalars()]))) - - # ============================================================================================== - - # Retrieves a list of routine requirements for documentation - def RequirementsDoc(self): - return self.requirements - - # ============================================================================================== - - # Retrieves the C++ templated definition for a routine - def RoutineHeaderCPP(self, spaces, default_event): - indent = " "*(spaces + self.Length()) - result = "template <"+self.template.name+">\n" - result += "StatusCode "+self.name.capitalize()+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDef(self.template)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event"+default_event+")" - return result - - # As above, but now without variable names - def RoutineHeaderTypeCPP(self, spaces): - indent = " "*(spaces + self.Length()) - result = "template <"+self.template.name+">\n" - result += "StatusCode "+self.name.capitalize()+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsType(self.template)]) - result += 
",\n"+indent+"cl_command_queue*, cl_event*)" - return result - - # As above, but now for C - def RoutineHeaderC(self, flavour, spaces, extra_qualifier): - indent = " "*(spaces + self.Length()) - result = "StatusCode"+extra_qualifier+" CLBlast"+flavour.name+self.name+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDef(flavour)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" - return result - - # As above, but now for the clBLAS wrapper - def RoutineHeaderWrapperCL(self, flavour, def_only, spaces): - template = "<"+flavour.template+">" if self.NoScalars() and not def_only else "" - indent = " "*(spaces + self.Length() + len(template)) - result = "" - if self.NoScalars(): - result += "template <" - if def_only: - result += flavour.name - result += ">\n" - result += "clblasStatus clblasX"+self.name+template+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)]) - result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues" - result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" - return result - - # As above, but now for the CBLAS wrapper - def RoutineHeaderWrapperC(self, flavour, def_only, spaces): - indent = " "*(spaces + self.Length()) - result = "void cblasX"+self.name+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")" - return result - -# ================================================================================================== diff --git a/src/cache.cpp b/src/cache.cpp index cd9055d0..6080f082 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -23,6 +23,9 @@ namespace clblast { // Stores the compiled binary or IR in the cache void StoreBinaryToCache(const std::string &binary, const std::string &device_name, const Precision &precision, const std::string &routine_name) { + #ifdef VERBOSE + printf("[DEBUG] Storing binary in cache\n"); + #endif binary_cache_mutex_.lock(); 
binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name}); binary_cache_mutex_.unlock(); @@ -31,8 +34,11 @@ void StoreBinaryToCache(const std::string &binary, const std::string &device_nam // Stores the compiled program in the cache void StoreProgramToCache(const Program &program, const Context &context, const Precision &precision, const std::string &routine_name) { + #ifdef VERBOSE + printf("[DEBUG] Storing program in cache\n"); + #endif program_cache_mutex_.lock(); - program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name}); + program_cache_.push_back(ProgramCache{program, context(), precision, routine_name}); program_cache_mutex_.unlock(); } @@ -40,6 +46,9 @@ void StoreProgramToCache(const Program &program, const Context &context, // otherwise. const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, const std::string &routine_name) { + #ifdef VERBOSE + printf("[DEBUG] Retrieving binary from cache\n"); + #endif binary_cache_mutex_.lock(); for (auto &cached_binary: binary_cache_) { if (cached_binary.MatchInCache(device_name, precision, routine_name)) { @@ -55,9 +64,12 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec // otherwise. 
const Program& GetProgramFromCache(const Context &context, const Precision &precision, const std::string &routine_name) { + #ifdef VERBOSE + printf("[DEBUG] Retrieving program from cache\n"); + #endif program_cache_mutex_.lock(); for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { + if (cached_program.MatchInCache(context(), precision, routine_name)) { program_cache_mutex_.unlock(); return cached_program.program; } @@ -85,7 +97,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision, const std::string &routine_name) { program_cache_mutex_.lock(); for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { + if (cached_program.MatchInCache(context(), precision, routine_name)) { program_cache_mutex_.unlock(); return true; } diff --git a/src/cache.hpp b/src/cache.hpp index 0d74d7bc..9075da0d 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -48,14 +48,14 @@ static std::mutex binary_cache_mutex_; // The cache of compiled OpenCL programs, along with some meta-data struct ProgramCache { Program program; - ContextPointer context_ptr; + cl_context context; Precision precision; std::string routine_name_; // Finds out whether the properties match - bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision, + bool MatchInCache(const cl_context ref_context, const Precision &ref_precision, const std::string &ref_routine) { - return (context_ptr == ref_context && + return (context == ref_context && precision == ref_precision && routine_name_ == ref_routine); } diff --git a/src/clblast.cpp b/src/clblast.cpp index 88d60772..79c30ca4 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -16,7 +16,6 @@ #include #include "clblast.h" -#include "public_api.hpp" #include "cache.hpp" // BLAS level-1 includes diff --git a/src/clpp11.hpp b/src/clpp11.hpp index b834d8b4..d57223dd 100644 --- a/src/clpp11.hpp 
+++ b/src/clpp11.hpp @@ -72,15 +72,24 @@ inline void CheckError(const cl_int status) { class Event { public: - // Constructor based on the regular OpenCL data-type - explicit Event(const cl_event event): event_(event) { } + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere + explicit Event(const cl_event event): + event_(new cl_event) { + *event_ = event; + } - // Regular constructor - explicit Event(): event_(nullptr) { } + // Regular constructor with memory management + explicit Event(): + event_(new cl_event, [](cl_event* e) { + if (*e) { CheckError(clReleaseEvent(*e)); } + delete e; + }) { + *event_ = nullptr; + } // Waits for completion of this event void WaitForCompletion() const { - CheckError(clWaitForEvents(1, &event_)); + CheckError(clWaitForEvents(1, &(*event_))); } // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on @@ -89,20 +98,22 @@ class Event { float GetElapsedTime() const { WaitForCompletion(); auto bytes = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); auto time_start = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); auto time_end = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); return (time_end - time_start) * 1.0e-6f; } // Accessor to the private data-member - cl_event& operator()() { return event_; } - cl_event* pointer() { return &event_; } + cl_event& operator()() { return *event_; } 
+ const cl_event& operator()() const { return *event_; } + cl_event* pointer() { return &(*event_); } + const cl_event* pointer() const { return &(*event_); } private: - cl_event event_; + std::shared_ptr event_; }; // Pointer to an OpenCL event @@ -163,6 +174,15 @@ class Device { // Methods to retrieve device information std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); } + size_t VersionNumber() const + { + std::string version_string = Version().substr(7); + // Space separates the end of the OpenCL version number from the beginning of the + // vendor-specific information. + size_t next_whitespace = version_string.find(' '); + size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace))); + return version; + } std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); } std::string Name() const { return GetInfoString(CL_DEVICE_NAME); } std::string Type() const { @@ -176,24 +196,32 @@ class Device { } size_t MaxWorkGroupSize() const { return GetInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE); } size_t MaxWorkItemDimensions() const { - return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); + return static_cast(GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)); } std::vector MaxWorkItemSizes() const { return GetInfoVector(CL_DEVICE_MAX_WORK_ITEM_SIZES); } - size_t LocalMemSize() const { - return static_cast(GetInfo(CL_DEVICE_LOCAL_MEM_SIZE)); + cl_ulong LocalMemSize() const { + return GetInfo(CL_DEVICE_LOCAL_MEM_SIZE); } std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } - size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); } - size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); } - size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); } - size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); } + size_t CoreClock() const { + return static_cast(GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY)); + } + size_t ComputeUnits() const 
{ + return static_cast(GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS)); + } + unsigned long MemorySize() const { + return static_cast(GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE)); + } + unsigned long MaxAllocSize() const { + return static_cast(GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE)); + } size_t MemoryClock() const { return 0; } // Not exposed in OpenCL size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL // Configuration-validity checks - bool IsLocalMemoryValid(const size_t local_mem_usage) const { + bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const { return (local_mem_usage <= LocalMemSize()); } bool IsThreadConfigValid(const std::vector &local) const { @@ -211,6 +239,8 @@ class Device { bool IsCPU() const { return Type() == "CPU"; } bool IsGPU() const { return Type() == "GPU"; } bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; } + bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; } + bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; } bool IsARM() const { return Vendor() == "ARM"; } // Accessor to the private data-member @@ -227,13 +257,6 @@ class Device { CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); return result; } - size_t GetInfo(const cl_device_info info) const { - auto bytes = size_t{0}; - CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = cl_uint(0); - CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); - return static_cast(result); - } template std::vector GetInfoVector(const cl_device_info info) const { auto bytes = size_t{0}; @@ -386,8 +409,16 @@ class Queue { delete s; }) { auto status = CL_SUCCESS; #ifdef CL_VERSION_2_0 - cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; - *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status); + size_t ocl_version = device.VersionNumber(); + if (ocl_version 
>= 200) + { + cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status); + } + else + { + *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); + } #else *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status); #endif @@ -627,15 +658,25 @@ class Kernel { } // Retrieves the amount of local memory used per work-group for this kernel - size_t LocalMemUsage(const Device &device) const { + cl_ulong LocalMemUsage(const Device &device) const { auto bytes = size_t{0}; auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes)); - auto result = size_t{0}; + auto result = cl_ulong{0}; CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); return result; } + // Retrieves the name of the kernel + std::string GetFunctionName() { + auto bytes = size_t{0}; + CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr)); + return std::string{result.c_str()}; // Removes any trailing '\0'-characters + } + // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event) { @@ -647,30 +688,21 @@ class Kernel { // As above, but with an event waiting list void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event, - std::vector& waitForEvents) { - if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } - + const std::vector &waitForEvents) { // Builds a plain version of the events waiting list auto waitForEventsPlain = std::vector(); for (auto &waitEvent : 
waitForEvents) { - waitForEventsPlain.push_back(waitEvent()); + if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); } } // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), - nullptr, global.data(), local.data(), + nullptr, global.data(), !local.empty() ? local.data() : nullptr, static_cast(waitForEventsPlain.size()), - waitForEventsPlain.data(), + !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr, event)); } - // As above, but with the default local workgroup size - void Launch(const Queue &queue, const std::vector &global, EventPointer event) { - CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), - nullptr, global.data(), nullptr, - 0, nullptr, event)); - } - // Accessor to the private data-member const cl_kernel& operator()() const { return *kernel_; } private: diff --git a/src/database/database.cpp b/src/database/database.cpp index 6ec93731..34c44a29 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -17,6 +17,8 @@ #include "database/kernels/xaxpy.hpp" #include "database/kernels/xdot.hpp" #include "database/kernels/xgemv.hpp" +#include "database/kernels/xgemv_fast.hpp" +#include "database/kernels/xgemv_fast_rot.hpp" #include "database/kernels/xger.hpp" #include "database/kernels/xgemm.hpp" #include "database/kernels/copy.hpp" @@ -32,6 +34,8 @@ const std::vector Database::database = { XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, + XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble, + XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, XgemmHalf, 
XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, @@ -42,9 +46,10 @@ const std::vector Database::database = { // ================================================================================================= -// Constructor, computing device properties and populating the parameter-vector from the database +// Constructor, computing device properties and populating the parameter-vector from the database. +// This takes an optional overlay database in case of custom tuning or custom kernels. Database::Database(const Queue &queue, const std::vector &kernels, - const Precision precision): + const Precision precision, const std::vector &overlay): parameters_{} { // Finds information of the current device @@ -53,10 +58,26 @@ Database::Database(const Queue &queue, const std::vector &kernels, auto device_vendor = device.Vendor(); auto device_name = device.Name(); + // Set the short vendor name + for (auto &combination : kVendorNames) { + if (device_vendor == combination.first) { + device_vendor = combination.second; + } + } + // Iterates over all kernels to include, and retrieves the parameters for each of them for (auto &kernel: kernels) { - auto search_result = Search(kernel, device_type, device_vendor, device_name, precision); - parameters_.insert(search_result.begin(), search_result.end()); + auto search_result = ParametersPtr{}; + + for (auto db: { &overlay, &database }) { + search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db); + if (search_result) { + parameters_.insert(search_result->begin(), search_result->end()); + break; + } + } + + if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); } } } @@ -73,28 +94,22 @@ std::string Database::GetDefines() const { // ================================================================================================= -// Searches the database for the right 
kernel and precision -Database::Parameters Database::Search(const std::string &this_kernel, - const std::string &this_type, - const std::string &this_vendor, - const std::string &this_device, - const Precision this_precision) const { - // Set the short vendor name - auto this_short_vendor = this_vendor; - for (auto &combination : kVendorNames) { - if (this_vendor == combination.first) { - this_short_vendor = combination.second; - } - } +// Searches a particular database for the right kernel and precision +Database::ParametersPtr Database::Search(const std::string &this_kernel, + const std::string &this_type, + const std::string &this_vendor, + const std::string &this_device, + const Precision this_precision, + const std::vector &this_database) const { // Selects the right kernel - for (auto &db: database) { + for (auto &db: this_database) { if (db.kernel == this_kernel && db.precision == this_precision) { // Searches for the right vendor and device type, or selects the default if unavailable. This // assumes that the default vendor / device type is last in the database. for (auto &vendor: db.vendors) { - if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) && + if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) && (vendor.type == this_type || vendor.type == kDeviceTypeAll)) { // Searches for the right device. 
If the current device is unavailable, selects the vendor @@ -104,7 +119,7 @@ Database::Parameters Database::Search(const std::string &this_kernel, if (device.name == this_device || device.name == "default") { // Sets the parameters accordingly - return device.parameters; + return &device.parameters; } } } @@ -112,8 +127,8 @@ Database::Parameters Database::Search(const std::string &this_kernel, } } - // If we reached this point, something is wrong - throw std::runtime_error("Database error, could not find a suitable entry"); + // If we reached this point, the entry was not found in this database + return nullptr; } // ================================================================================================= diff --git a/src/database/database.hpp b/src/database/database.hpp index 0987cbed..a6ab49c5 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -32,6 +32,7 @@ class Database { // Type alias for the database parameters using Parameters = std::unordered_map; + using ParametersPtr = const Parameters*; // Structures for content inside the database struct DatabaseDevice { @@ -70,6 +71,8 @@ class Database { static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; + static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble; + static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, 
CopyComplexSingle, CopyComplexDouble; @@ -78,9 +81,9 @@ class Database { static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; static const std::vector database; - // The constructor + // The constructor with a user-provided database overlay (potentially an empty vector) explicit Database(const Queue &queue, const std::vector &routines, - const Precision precision); + const Precision precision, const std::vector &overlay); // Accessor of values by key size_t operator[](const std::string key) const { return parameters_.find(key)->second; } @@ -89,9 +92,10 @@ class Database { std::string GetDefines() const; private: - Parameters Search(const std::string &this_kernel, const std::string &this_type, - const std::string &this_vendor, const std::string &this_device, - const Precision this_precision) const; + // Search method for a specified database, returning pointer (possibly a nullptr) + ParametersPtr Search(const std::string &this_kernel, const std::string &this_type, + const std::string &this_vendor, const std::string &this_device, + const Precision this_precision, const std::vector &db) const; // Found parameters suitable for this device/kernel Parameters parameters_; diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index 14946af4..a6b7dfe8 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::CopyHalf = { "Copy", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } @@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::CopySingle = { 
kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, } }, { // ARM GPUs @@ -59,11 +61,13 @@ const Database::DatabaseEntry Database::CopySingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::CopySingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 1070", { 
{"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, } }, } @@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { 
{"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Intel CPUs @@ -112,16 +120,18 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, } }, { // Intel accelerators @@ -133,18 +143,21 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce 
GTX 1070", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, } @@ -158,9 +171,10 @@ const Database::DatabaseEntry Database::CopyDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, } }, { // ARM GPUs @@ -174,7 +188,7 @@ const 
Database::DatabaseEntry Database::CopyDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -186,20 +200,23 @@ const Database::DatabaseEntry Database::CopyDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 750", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, 
{"COPY_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, } @@ -213,9 +230,10 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, + { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // ARM GPUs @@ -229,7 +247,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -241,8 +259,11 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "GeForce GTX 480", { {"COPY_DIMX",16}, 
{"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, @@ -254,7 +275,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, } diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index db4df9f0..3cfabaf4 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadHalf = { "Pad", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } @@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::PadSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, 
{"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -54,16 +56,18 @@ const Database::DatabaseEntry Database::PadSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::PadSingle = { { 
// NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1070", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, } @@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::PadComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, 
{"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -118,16 +126,18 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, } }, { // Intel accelerators @@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } 
}, + { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -166,9 +179,10 @@ const Database::DatabaseEntry Database::PadDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, 
{"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::PadDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -194,20 +208,23 @@ const Database::DatabaseEntry Database::PadDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, 
{"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::PadComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -249,20 +267,23 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, 
{"PAD_WPTY",1} } }, + { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index 7fedd15a..88bd4ea7 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = { "Padtranspose", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, 
{"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, } @@ -38,6 +39,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, @@ -54,11 +56,13 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, 
{"PADTRA_WPT",2} } }, + { "GeForce GTX 1070", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -123,11 
+131,13 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, { // Intel accelerators @@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, 
{"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -166,9 +179,10 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // Intel accelerators @@ -194,20 +208,23 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { 
{"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { 
{"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // Intel accelerators @@ -249,20 +267,23 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, } }, } diff --git a/src/database/kernels/transpose.hpp 
b/src/database/kernels/transpose.hpp index 4229e39f..0e1b608e 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::TransposeHalf = { "Transpose", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } @@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::TransposeSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, { // ARM GPUs @@ -59,11 +61,13 @@ const Database::DatabaseEntry Database::TransposeSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) HD 
Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // Intel accelerators @@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::TransposeSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 750", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { 
"default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, } @@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // ARM GPUs @@ -118,35 +126,40 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, 
{"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 1070", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, } @@ -160,9 +173,10 @@ const Database::DatabaseEntry Database::TransposeDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { 
{"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, { // ARM GPUs @@ -176,7 +190,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // Intel accelerators @@ -188,20 +202,23 @@ const Database::DatabaseEntry Database::TransposeDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce 
GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, } @@ -215,9 +232,10 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, { // ARM GPUs @@ -231,26 +249,29 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, 
{"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, } diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index d8088ca2..9c1bcd99 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -18,13 +18,14 @@ const Database::DatabaseEntry 
Database::XaxpyHalf = { "Xaxpy", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, - { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, } }, } @@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::XaxpySingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, + { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, { // ARM GPUs @@ -54,12 +56,14 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",8}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } 
}, { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, @@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, + { "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, + { "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, } }, } @@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } }, { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, + { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // ARM GPUs @@ -118,16 +126,18 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 
2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, } }, { // Intel accelerators @@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX 750", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", 
{ {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, } @@ -166,6 +179,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, + { "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, } }, { // Intel accelerators @@ -194,15 +208,18 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, + { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { 
{"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // Default @@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // ARM GPUs @@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, } }, { // Intel accelerators @@ -249,8 +267,11 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, @@ -262,7 +283,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, } diff --git 
a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 48288f95..987a990d 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XdotHalf = { "Xdot", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, { "default", { {"WGS1",32}, {"WGS2",32} } }, } @@ -37,7 +38,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, { "default", { {"WGS1",128}, {"WGS2",32} } }, @@ -51,26 +52,31 @@ const Database::DatabaseEntry Database::XdotSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, + { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",1024} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } }, + { "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce 
GTX 980", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",256} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } @@ -83,10 +89,10 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Oland", { {"WGS1",128}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -97,6 +103,8 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } }, @@ -106,17 +114,20 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { 
"default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",512}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } @@ -129,10 +140,10 @@ const Database::DatabaseEntry Database::XdotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -144,17 +155,20 @@ const Database::DatabaseEntry Database::XdotDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",512} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } }, { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",64} } }, } }, } @@ -167,10 +181,10 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, 
{"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -182,17 +196,20 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } }, + { "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",64} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, } diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 27cebc8a..d19c55b5 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -18,7 +18,7 @@ const Database::DatabaseEntry Database::XgemmHalf = { "Xgemm", Precision::kHalf, { { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } @@ -32,6 +32,7 @@ const Database::DatabaseEntry Database::XgemmSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, 
{"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, + { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -53,11 +54,13 @@ const Database::DatabaseEntry Database::XgemmSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, 
{"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Intel accelerators @@ -69,20 +72,23 @@ const Database::DatabaseEntry Database::XgemmSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "GeForce GTX 750", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 750 
Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } }, { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } @@ -96,6 +102,7 @@ const Database::DatabaseEntry 
Database::XgemmComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -117,11 +124,13 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { 
"Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Intel accelerators @@ -133,8 +142,11 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, 
{"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 750", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -158,11 +170,12 @@ const Database::DatabaseEntry Database::XgemmDouble = { "Xgemm", Precision::kDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, 
{"VWM",1}, {"VWN",2} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs @@ -188,8 +201,11 @@ const Database::DatabaseEntry Database::XgemmDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX 750", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, 
{"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, @@ -215,6 +231,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -243,8 +260,11 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, 
{"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 1070", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index ce258f2f..e5e8845e 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -18,13 +18,14 @@ const Database::DatabaseEntry Database::XgemvHalf = { "Xgemv", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT 
GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } @@ -36,52 +37,58 @@ const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1} } }, + { "Hawaii", { {"WGS1",128}, {"WPT1",1} } }, + { "Oland", { {"WGS1",128}, {"WPT1",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, + { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, 
{"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } }, - { "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, + { "Iris", { {"WGS1",64}, {"WPT1",2} } }, + { "Iris Pro", { {"WGS1",256}, {"WPT1",2} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} 
} }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, - { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GRID K520", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } }, + { "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } }, + { "Tesla K40m", { 
{"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",256}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } @@ -93,48 +100,54 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { "Xgemv", Precision::kComplexSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Hawaii", { {"WGS1",64}, {"WPT1",1} } }, + { "Oland", { {"WGS1",64}, {"WPT1",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WPT1",1} } }, + { "Tahiti", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } 
}, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, + { "Iris", { {"WGS1",256}, {"WPT1",1} } }, + { "Iris Pro", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce 
GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GRID K520", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } @@ -146,43 +159,47 @@ const Database::DatabaseEntry Database::XgemvDouble = { "Xgemv", Precision::kDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Hawaii", { {"WGS1",128}, {"WPT1",1} } }, + { "Oland", { {"WGS1",256}, {"WPT1",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } 
}, + { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",256}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, - { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, 
{"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GRID K520", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } }, + { "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } }, + { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, } @@ -194,36 +211,38 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { "Xgemv", Precision::kComplexDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, 
{"WGS3",64}, {"WPT3",1} } }, - { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Hawaii", { {"WGS1",64}, {"WPT1",1} } }, + { "Oland", { {"WGS1",256}, {"WPT1",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, + { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, 
{"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GRID K520", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp new file mode 100644 index 00000000..52af628c --- /dev/null +++ b/src/database/kernels/xgemv_fast.hpp @@ -0,0 +1,250 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xgemv_Fast' kernels. 
+// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastHalf = { + "XgemvFast", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastSingle = { + "XgemvFast", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { 
"Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, + { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, + { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 480", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 750", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastComplexSingle = { + "XgemvFast", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Pitcairn", { {"VW2",1}, 
{"WGS2",64}, {"WPT2",1} } }, + { "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 680", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const 
Database::DatabaseEntry Database::XgemvFastDouble = { + "XgemvFast", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 750", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { 
{"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastComplexDouble = { + "XgemvFast", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp new file mode 100644 index 00000000..328094e1 --- /dev/null +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -0,0 +1,154 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotHalf = { + "XgemvFastRot", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotSingle = { + "XgemvFastRot", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, + { "default", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, + { "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } 
}, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { + "XgemvFastRot", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, + { "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotDouble = { + "XgemvFastRot", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) 
i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = { + "XgemvFastRot", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 3727cc57..3e9c25c1 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XgerHalf = { "Xger", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, } @@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::XgerSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X 
Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, { "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, + { "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, } }, { // ARM GPUs @@ -53,29 +55,34 @@ const Database::DatabaseEntry Database::XgerSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, + { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, + { "GeForce GTX 1070", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, + { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", 
{ {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, } }, } @@ -89,9 +96,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, + { "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs @@ -104,29 +112,34 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } }, + { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, 
+ { "GeForce GTX 1070", { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } }, { "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, + { "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, } }, } @@ -140,9 +153,10 @@ const Database::DatabaseEntry Database::XgerDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, } }, { // ARM GPUs @@ -155,21 +169,24 @@ const Database::DatabaseEntry Database::XgerDouble = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } }, + { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } }, + { "GeForce GTX 1070", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 750", 
{ {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, } }, } @@ -183,9 +200,10 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, + { "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, } }, { // ARM GPUs @@ -204,15 +222,18 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, + { "GeForce GTX 1070", { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, + { "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, + { "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, } diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 08c47d87..223501fd 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -109,6 +109,16 @@ R"( typedef real 
singlereal; #endif +// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no +// conversion, but half-precision is not supported as kernel argument so it is converted from float. +#if PRECISION == 16 + typedef float real_arg; + #define GetRealArg(x) (half)x +#else + typedef real real_arg; + #define GetRealArg(x) x +#endif + // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific @@ -138,6 +148,13 @@ R"( #define SetToOne(a) a = ONE #endif +// Determines whether a variable is zero +#if PRECISION == 3232 || PRECISION == 6464 + #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO)) +#else + #define IsZero(a) (a == ZERO) +#endif + // The absolute value (component-wise) #if PRECISION == 3232 || PRECISION == 6464 #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y) diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 48d0eb5c..48ad2e75 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -30,10 +30,10 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xamax(const int n, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global singlereal* maxgm, __global unsigned int* imaxgm) { +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xamax(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global singlereal* maxgm, __global unsigned int* imaxgm) { __local singlereal maxlm[WGS1]; __local unsigned int imaxlm[WGS1]; const int lid = get_local_id(0); @@ -95,10 +95,10 @@ __kernel void Xamax(const int n, // The epilogue reduction 
kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm, - const __global unsigned int* restrict imaxgm, - __global unsigned int* imax, const int imax_offset) { +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XamaxEpilogue(const __global singlereal* restrict maxgm, + const __global unsigned int* restrict imaxgm, + __global unsigned int* imax, const int imax_offset) { __local singlereal maxlm[WGS2]; __local unsigned int imaxlm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 58d0f11b..1fc91be8 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -30,10 +30,10 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xasum(const int n, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* output) { +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xasum(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* output) { __local real lm[WGS1]; const int lid = get_local_id(0); const int wgid = get_group_id(0); @@ -74,9 +74,9 @@ __kernel void Xasum(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. 
-__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XasumEpilogue(const __global real* restrict input, - __global real* asum, const int asum_offset) { +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XasumEpilogue(const __global real* restrict input, + __global real* asum, const int asum_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index e0efadc1..ece8476e 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -22,11 +22,11 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc) { - const real alpha = arg_alpha[0]; +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xaxpy(const int n, const real_arg arg_alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + const real alpha = GetRealArg(arg_alpha); // Loops over the work that needs to be done (allows for an arbitrary number of threads) #pragma unroll @@ -40,11 +40,11 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. 
-__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha, - const __global realV* restrict xgm, - __global realV* ygm) { - const real alpha = arg_alpha[0]; +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyFast(const int n, const real_arg arg_alpha, + const __global realV* restrict xgm, + __global realV* ygm) { + const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int w=0; w 'a_ld' is a multiple of VW2 // --> 'a_rotated' is 0 // --> 'do_conjugate' is 0 -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XgemvFast(const int m, const int n, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, - const int a_rotated, - const __global realVF* restrict agm, const int a_offset, const int a_ld, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate, const int parameter, - const int kl, const int ku) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XgemvFast(const int m, const int n, + const real_arg arg_alpha, + const real_arg arg_beta, + const int a_rotated, + const __global realVF* restrict agm, const int a_offset, const int a_ld, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc, + const int do_conjugate, const int parameter, + const int kl_unused, const int ku_unused) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Local memory for the vector X __local real xlm[WGS2]; - // Initializes the accumulation register + // Initializes the accumulation registers real acc[WPT2]; #pragma unroll for (int w=0; w 'a_ld' is a multiple of VW3 // --> 'a_rotated' is 1 // --> 'do_conjugate' is 0 
-__attribute__((reqd_work_group_size(WGS3, 1, 1))) -__kernel void XgemvFastRot(const int m, const int n, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, - const int a_rotated, - const __global realVFR* restrict agm, const int a_offset, const int a_ld, - const __global real* restrict xgm, const int x_offset, const int x_inc, - __global real* ygm, const int y_offset, const int y_inc, - const int do_conjugate, const int parameter, - const int kl, const int ku) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; +__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1))) +void XgemvFastRot(const int m, const int n, + const real_arg arg_alpha, + const real_arg arg_beta, + const int a_rotated, + const __global realVFR* restrict agm, const int a_offset, const int a_ld, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc, + const int do_conjugate, const int parameter, + const int kl_unused, const int ku_unused) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Local memory to store a tile of the matrix (for coalescing) + __local real tile[WPT3][WGS3]; + const int lid = get_local_id(0); + const int lid_mod = lid % (WPT3/VW3); + const int lid_div = lid / (WPT3/VW3); // Local memory for the vector X - __local real xlm[WGS3]; + __local real xlm[WPT3]; // Initializes the accumulation register - real acc[WPT3]; - #pragma unroll - for (int w=0; w // -// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. +// This is part 2 of 3 of the GEMM kernel. See part 1 for more information. 
// // ================================================================================================= @@ -133,49 +133,93 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int #endif int idm = mg + GetGroupID0() * (MWG/VWM); int idn = ng + GetGroupID1() * NWG; - - // The final multiplication with alpha and the addition with beta*C int index = idn*(kSizeM/VWM) + idm; + realM result; realM xval = cpm[ni][mi]; - realM yval = cgm[index]; - #if VWM == 1 - AXPBY(result, alpha, xval, beta, yval); - #elif VWM == 2 - AXPBY(result.x, alpha, xval.x, beta, yval.x); - AXPBY(result.y, alpha, xval.y, beta, yval.y); - #elif VWM == 4 - AXPBY(result.x, alpha, xval.x, beta, yval.x); - AXPBY(result.y, alpha, xval.y, beta, yval.y); - AXPBY(result.z, alpha, xval.z, beta, yval.z); - AXPBY(result.w, alpha, xval.w, beta, yval.w); - #elif VWM == 8 - AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); - AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); - AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); - AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); - AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); - AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); - AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); - AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); - #elif VWM == 16 - AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); - AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); - AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); - AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); - AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); - AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); - AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); - AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); - AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); - AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); - AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); - AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); - AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); - AXPBY(result.sD, alpha, xval.sD, beta, 
yval.sD); - AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); - AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); - #endif + + // The final multiplication with alpha (in case beta == 0) + if (IsZero(beta)) { + #if VWM == 1 + Multiply(result, alpha, xval); + #elif VWM == 2 + Multiply(result.x, alpha, xval.x); + Multiply(result.y, alpha, xval.y); + #elif VWM == 4 + Multiply(result.x, alpha, xval.x); + Multiply(result.y, alpha, xval.y); + Multiply(result.z, alpha, xval.z); + Multiply(result.w, alpha, xval.w); + #elif VWM == 8 + Multiply(result.s0, alpha, xval.s0); + Multiply(result.s1, alpha, xval.s1); + Multiply(result.s2, alpha, xval.s2); + Multiply(result.s3, alpha, xval.s3); + Multiply(result.s4, alpha, xval.s4); + Multiply(result.s5, alpha, xval.s5); + Multiply(result.s6, alpha, xval.s6); + Multiply(result.s7, alpha, xval.s7); + #elif VWM == 16 + Multiply(result.s0, alpha, xval.s0); + Multiply(result.s1, alpha, xval.s1); + Multiply(result.s2, alpha, xval.s2); + Multiply(result.s3, alpha, xval.s3); + Multiply(result.s4, alpha, xval.s4); + Multiply(result.s5, alpha, xval.s5); + Multiply(result.s6, alpha, xval.s6); + Multiply(result.s7, alpha, xval.s7); + Multiply(result.s8, alpha, xval.s8); + Multiply(result.s9, alpha, xval.s9); + Multiply(result.sA, alpha, xval.sA); + Multiply(result.sB, alpha, xval.sB); + Multiply(result.sC, alpha, xval.sC); + Multiply(result.sD, alpha, xval.sD); + Multiply(result.sE, alpha, xval.sE); + Multiply(result.sF, alpha, xval.sF); + #endif + } + + // The final multiplication with alpha and the addition with beta*C + else { + realM yval = cgm[index]; + #if VWM == 1 + AXPBY(result, alpha, xval, beta, yval); + #elif VWM == 2 + AXPBY(result.x, alpha, xval.x, beta, yval.x); + AXPBY(result.y, alpha, xval.y, beta, yval.y); + #elif VWM == 4 + AXPBY(result.x, alpha, xval.x, beta, yval.x); + AXPBY(result.y, alpha, xval.y, beta, yval.y); + AXPBY(result.z, alpha, xval.z, beta, yval.z); + AXPBY(result.w, alpha, xval.w, beta, yval.w); + #elif VWM == 8 + 
AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); + AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); + AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); + AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); + AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); + AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); + AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); + AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); + #elif VWM == 16 + AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); + AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); + AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); + AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); + AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); + AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); + AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); + AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); + AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); + AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); + AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); + AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); + AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); + AXPBY(result.sD, alpha, xval.sD, beta, yval.sD); + AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); + AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); + #endif + } cgm[index] = result; } } @@ -183,212 +227,6 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int // ================================================================================================= -// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above. 
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, - const __global realM* restrict agm, const __global realN* restrict bgm, - __global realM* cgm, realM cpm[NWI][MWI/VWM] - #if SA == 1 && SB == 1 - , __local realM* alm, __local realN* blm - #elif SA == 1 - , __local realM* alm - #elif SB == 1 - , __local realN* blm - #endif - ) { - - // Allocates workitem-private memory (registers) - realM apm[MWI/VWM]; - realN bpm[NWI/VWN]; - - // Combined thread identifier (volatile to disable caching) - #if SA == 1 || SB == 1 - volatile int tid = get_local_id(0) + MDIMC*get_local_id(1); - #endif - - // Initializes the accumulation registers - InitAccRegisters(cpm); - - // Loops over all workgroup tiles - for (int kwg=0; kwg local (matrix A) - #if SA == 1 - GlobalToLocalA(agm, alm, kSizeM, tid, kwg); - #endif - // Loads data: off-chip --> local (matrix B) - #if SB == 1 - GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); - #endif - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - - // Loops over all workitem tiles, unrolled by a factor KWI - for (int pwi=0; pwi private (matrix A) - #if SA == 1 - LocalToPrivateA(alm, apm, kg); - // Loads data: off-chip --> private (matrix A) - #else - GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); - #endif - - // Loads data: local --> private (matrix B) - #if SB == 1 - LocalToPrivateB(blm, bpm, kg); - // Loads data: off-chip --> private (matrix B) - #else - GlobalToPrivateB(bgm, bpm, kSizeN, idk); - #endif - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulate(cpm, apm, bpm); - } - } - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - } - #if GLOBAL_MEM_FENCE == 1 - barrier(CLK_GLOBAL_MEM_FENCE); - #endif -} - -// ================================================================================================= -// The upper-triangular and lower-triangular kernels are only used in special cases -#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || 
defined(ROUTINE_HER2K) - -// Main entry point of the kernel. This is the upper-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmUpper(const int kSizeN, const int kSizeK, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; - - // Skip these threads if they do not contain threads contributing to the upper-triangle - if (GetGroupID1()*NWG < GetGroupID0()*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// Main entry point of the kernel. This is the lower-triangular version. 
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmLower(const int kSizeN, const int kSizeK, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; - - // Skip these threads if they do not contain threads contributing to the lower-triangle - if (GetGroupID1()*NWG > GetGroupID0()*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// ================================================================================================= -// If not using a triangular version, include the regular kernel -#else - -// Main entry point of the kernel. This is the regular full version. 
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeM, alpha, beta); -} - -#endif -// ================================================================================================= - // End of the C++11 raw string literal )" diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl new file mode 100644 index 00000000..a5faef5a --- /dev/null +++ b/src/kernels/level3/xgemm_part3.opencl @@ -0,0 +1,229 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This is part 3 of 3 of the GEMM kernel. See part 1 for more information. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above. +inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, + const __global realM* restrict agm, const __global realN* restrict bgm, + __global realM* cgm, realM cpm[NWI][MWI/VWM] + #if SA == 1 && SB == 1 + , __local realM* alm, __local realN* blm + #elif SA == 1 + , __local realM* alm + #elif SB == 1 + , __local realN* blm + #endif + ) { + + // Allocates workitem-private memory (registers) + realM apm[MWI/VWM]; + realN bpm[NWI/VWN]; + + // Combined thread identifier (volatile to disable caching) + #if SA == 1 || SB == 1 + volatile int tid = get_local_id(0) + MDIMC*get_local_id(1); + #endif + + // Initializes the accumulation registers + InitAccRegisters(cpm); + + // Loops over all workgroup tiles + for (int kwg=0; kwg local (matrix A) + #if SA == 1 + GlobalToLocalA(agm, alm, kSizeM, tid, kwg); + #endif + // Loads data: off-chip --> local (matrix B) + #if SB == 1 + GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); + #endif + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + + // Loops over all workitem tiles, unrolled by a factor KWI + for (int pwi=0; pwi private (matrix A) + #if SA == 1 + LocalToPrivateA(alm, apm, kg); + // Loads data: off-chip --> private (matrix A) + #else + GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); + #endif + + // Loads data: local --> private (matrix B) + #if SB == 1 + LocalToPrivateB(blm, bpm, kg); + // Loads data: off-chip --> private (matrix B) + #else + GlobalToPrivateB(bgm, bpm, kSizeN, idk); + #endif + + // Performs the 
accumulation (Cpm += Apm * Bpm) + MultiplyAccumulate(cpm, apm, bpm); + } + } + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + } + #if GLOBAL_MEM_FENCE == 1 + barrier(CLK_GLOBAL_MEM_FENCE); + #endif +} + +// ================================================================================================= +// The upper-triangular and lower-triangular kernels are only used in special cases +#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) + +// Main entry point of the kernel. This is the upper-triangular version. +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmUpper(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Skip these threads if they do not contain threads contributing to the upper-triangle + if (GetGroupID1()*NWG < GetGroupID0()*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// Main entry point of the kernel. This is the lower-triangular version. 
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmLower(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Skip these threads if they do not contain threads contributing to the lower-triangle + if (GetGroupID1()*NWG > GetGroupID0()*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// ================================================================================================= +// If not using a triangular version, include the regular kernel +#else + +// Main entry point of the kernel. This is the regular full version. 
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeM, alpha, beta); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/public_api.hpp b/src/public_api.hpp deleted file mode 100644 index d0732297..00000000 --- a/src/public_api.hpp +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file provides macro's to define the public API. 
This is needed when building a Windows DLL. -// Note: this is only used for the C++ interface, the C interface has its own definition included in -// the header file itself. -// -// ================================================================================================= - -#ifndef CLBLAST_PUBLIC_API_H_ -#define CLBLAST_PUBLIC_API_H_ - -namespace clblast { -// ================================================================================================= - -// Exports library functions under Windows when building a DLL. See also: -// https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#ifdef _WIN32 - #define PUBLIC_API __declspec(dllexport) -#else - #define PUBLIC_API -#endif - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_PUBLIC_API_H_ -#endif diff --git a/src/routine.cpp b/src/routine.cpp index d3590896..189ae190 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -13,6 +13,7 @@ #include #include +#include #include "routine.hpp" @@ -21,7 +22,8 @@ namespace clblast { // Constructor: not much here, because no status codes can be returned Routine::Routine(Queue &queue, EventPointer event, const std::string &name, - const std::vector &routines, const Precision precision): + const std::vector &routines, const Precision precision, + const std::vector &userDatabase): precision_(precision), routine_name_(name), queue_(queue), @@ -29,7 +31,7 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, context_(queue_.GetContext()), device_(queue_.GetDevice()), device_name_(device_.Name()), - db_(queue_, routines, precision_) { + db_(queue_, routines, precision_, userDatabase) { } // ================================================================================================= @@ -103,6 +105,13 @@ StatusCode Routine::SetUp() { // Combines everything together into a single source string const auto source_string = defines + common_header + 
source_string_; + // Prints details of the routine to compile in case of debugging in verbose mode + #ifdef VERBOSE + printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n", + routine_name_.c_str(), ToString(precision_).c_str(), device_name_.c_str()); + const auto start_time = std::chrono::steady_clock::now(); + #endif + // Compiles the kernel try { auto program = Program(context_, source_string); @@ -123,6 +132,13 @@ StatusCode Routine::SetUp() { StoreProgramToCache(program, context_, precision_, routine_name_); } catch (...) { return StatusCode::kBuildProgramFailure; } + // Prints the elapsed compilation time in case of debugging in verbose mode + #ifdef VERBOSE + const auto elapsed_time = std::chrono::steady_clock::now() - start_time; + const auto timing = std::chrono::duration(elapsed_time).count(); + printf("[DEBUG] Completed compilation in %.2lf ms\n", timing); + #endif + // No errors, normal termination of this function return StatusCode::kSuccess; } diff --git a/src/routine.hpp b/src/routine.hpp index 54b5779f..f5c607af 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -32,9 +32,11 @@ namespace clblast { class Routine { public: - // Base class constructor + // Base class constructor. The user database is an optional extra database to override the + // built-in database. 
explicit Routine(Queue &queue, EventPointer event, const std::string &name, - const std::vector &routines, const Precision precision); + const std::vector &routines, const Precision precision, + const std::vector &userDatabase = {}); // Set-up phase of the kernel StatusCode SetUp(); diff --git a/src/routines/common.cpp b/src/routines/common.cpp index c378df28..3969cf9f 100644 --- a/src/routines/common.cpp +++ b/src/routines/common.cpp @@ -12,6 +12,7 @@ // ================================================================================================= #include +#include #include "routines/common.hpp" @@ -21,45 +22,54 @@ namespace clblast { // Enqueues a kernel, waits for completion, and checks for errors StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local, - EventPointer event, std::vector& waitForEvents) { + EventPointer event, const std::vector &waitForEvents) { - // Tests for validity of the local thread sizes - if (local.size() > device.MaxWorkItemDimensions()) { - return StatusCode::kInvalidLocalNumDimensions; - } - const auto max_work_item_sizes = device.MaxWorkItemSizes(); - for (auto i=size_t{0}; i max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } - } - auto local_size = size_t{1}; - for (auto &item: local) { local_size *= item; } - if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } + if (!local.empty()) { + // Tests for validity of the local thread sizes + if (local.size() > device.MaxWorkItemDimensions()) { + return StatusCode::kInvalidLocalNumDimensions; + } + const auto max_work_item_sizes = device.MaxWorkItemSizes(); + for (auto i=size_t{0}; i max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } + } + auto local_size = size_t{1}; + for (auto &item: local) { local_size *= item; } + if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } - // Make sure the global thread 
sizes are at least equal to the local sizes - for (auto i=size_t{0}; i(elapsed_time).count(); + printf("[DEBUG] Completed kernel in %.2lf ms\n", timing); + #endif + // No errors, normal termination of this function return StatusCode::kSuccess; } -// As above, but without an event waiting list -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector global, const std::vector &local, - EventPointer event) { - auto emptyWaitingList = std::vector(); - return RunKernel(kernel, queue, device, global, local, event, emptyWaitingList); -} - // ================================================================================================= } // namespace clblast diff --git a/src/routines/common.hpp b/src/routines/common.hpp index c99cd39d..9d8849c3 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -29,21 +29,16 @@ namespace clblast { // Enqueues a kernel, waits for completion, and checks for errors StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local, - EventPointer event, std::vector& waitForEvents); - -// As above, but without an event waiting list -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector global, const std::vector &local, - EventPointer event); + EventPointer event, const std::vector &waitForEvents = {}); // ================================================================================================= // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. 
template -StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context, +StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Database &db, - EventPointer event, std::vector& waitForEvents, + EventPointer event, const std::vector &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer &src, @@ -88,10 +83,6 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont } } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context, 1); - alpha_buffer.Write(queue, 1, &alpha); - // Retrieves the kernel from the compiled binary try { auto kernel = Kernel(program, kernel_name); @@ -101,7 +92,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont kernel.SetArgument(0, static_cast(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); - kernel.SetArgument(3, alpha_buffer()); + kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast(src_one)); @@ -114,7 +105,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont kernel.SetArgument(7, static_cast(dest_ld)); kernel.SetArgument(8, static_cast(dest_offset)); kernel.SetArgument(9, dest()); - kernel.SetArgument(10, alpha_buffer()); + kernel.SetArgument(10, GetRealArg(alpha)); if (do_pad) { kernel.SetArgument(11, static_cast(do_conjugate)); } diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 5b6c9e77..3445e2b5 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -59,20 +59,16 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); auto kernel = Kernel(program, kernel_name); - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - 
auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, y_buffer()); } else { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 21fb397c..4e32ba41 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,7 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue()) { + Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue()) { source_string_ = #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" @@ -122,16 +122,10 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, } if (fast_kernel_rot) { kernel_name = "XgemvFastRot"; - global_size = m_real / db_["WPT3"]; + global_size = m_real; local_size = db_["WGS3"]; } - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Retrieves the Xgemv kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -140,8 +134,8 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, // Sets the kernel arguments 
kernel.SetArgument(0, static_cast(m_real)); kernel.SetArgument(1, static_cast(n_real)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, static_cast(a_rotated)); kernel.SetArgument(5, a_buffer()); kernel.SetArgument(6, static_cast(a_offset)); diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index 353047d2..29cffe0c 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -56,10 +56,6 @@ StatusCode Xger::DoGer(const Layout layout, status = TestVectorY(n, y_buffer, y_offset, y_inc); if (ErrorIn(status)) { return status; } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -68,7 +64,7 @@ StatusCode Xger::DoGer(const Layout layout, // Sets the kernel arguments kernel.SetArgument(0, static_cast(a_one)); kernel.SetArgument(1, static_cast(a_two)); - kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); kernel.SetArgument(3, x_buffer()); kernel.SetArgument(4, static_cast(x_offset)); kernel.SetArgument(5, static_cast(x_inc)); diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index ed8ba9e9..6dd95938 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -70,10 +70,6 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // Creates a matching version of alpha const auto matching_alpha = GetAlpha(alpha); - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &matching_alpha); - // Retrieves the kernel from the compiled 
binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -81,7 +77,7 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(matching_alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 50572cea..3d57a9b9 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -58,10 +58,6 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, status = TestVectorY(n, y_buffer, y_offset, y_inc); if (ErrorIn(status)) { return status; } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -69,7 +65,7 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 9ea5559c..0b8e768f 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -34,6 +34,7 @@ Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/convert_hermitian.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include 
"../../kernels/level3/xgemm_part3.opencl" ; } @@ -63,9 +64,12 @@ StatusCode Xgemm::DoGemm(const Layout layout, const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); const auto c_rotated = (layout == Layout::kRowMajor); - const auto a_do_transpose = a_rotated; - const auto b_do_transpose = !b_rotated; - const auto c_do_transpose = c_rotated; + static const auto a_want_rotated = false; + static const auto b_want_rotated = true; + static const auto c_want_rotated = false; + const auto a_do_transpose = a_rotated != a_want_rotated; + const auto b_do_transpose = b_rotated != b_want_rotated; + const auto c_do_transpose = c_rotated != c_want_rotated; // In case of complex data-types, the transpose can also become a conjugate transpose const auto a_conjugate = (a_transpose == Transpose::kConjugate); @@ -99,6 +103,15 @@ StatusCode Xgemm::DoGemm(const Layout layout, const auto n_ceiled = Ceil(n, db_["NWG"]); const auto k_ceiled = Ceil(k, db_["KWG"]); + // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account + // whether the matrices need to be rotated or not for the kernel. + const auto a_one_i = (a_want_rotated) ? k_ceiled : m_ceiled; + const auto a_two_i = (a_want_rotated) ? m_ceiled : k_ceiled; + const auto b_one_i = (b_want_rotated) ? n_ceiled : k_ceiled; + const auto b_two_i = (b_want_rotated) ? k_ceiled : n_ceiled; + const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; + const auto c_two_i = (c_want_rotated) ? 
m_ceiled : n_ceiled; + // The padded/transposed input/output matrices: if memory allocation fails, throw an exception try { @@ -106,23 +119,17 @@ StatusCode Xgemm::DoGemm(const Layout layout, const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 && + auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && a_do_transpose == false && a_conjugate == false; - auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 && b_do_transpose == false && b_conjugate == false; - auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 && + auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 && c_do_transpose == false; // Creates the temporary matrices - const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*m_ceiled); - const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); - const auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, m_ceiled*n_ceiled); - - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); + const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, a_one_i*a_two_i); + const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, b_one_i*b_two_i); + const auto c_temp = (c_no_temp) ? 
c_buffer : Buffer(context_, c_one_i*c_two_i); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); @@ -133,9 +140,9 @@ StatusCode Xgemm::DoGemm(const Layout layout, // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, - m_ceiled, k_ceiled, m_ceiled, 0, a_temp, + a_one_i, a_two_i, a_one_i, 0, a_temp, ConstantOne(), program, true, a_do_transpose, a_conjugate); if (ErrorIn(status)) { return status; } @@ -145,9 +152,9 @@ StatusCode Xgemm::DoGemm(const Layout layout, // As above, but now for matrix B if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + b_one_i, b_two_i, b_one_i, 0, b_temp, ConstantOne(), program, true, b_do_transpose, b_conjugate); if (ErrorIn(status)) { return status; } @@ -157,9 +164,9 @@ StatusCode Xgemm::DoGemm(const Layout layout, // As above, but now for matrix C. This is only necessary if C is used both as input and output. 
if (!c_no_temp && beta != static_cast(0)) { auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, - m_ceiled, n_ceiled, m_ceiled, 0, c_temp, + c_one_i, c_two_i, c_one_i, 0, c_temp, ConstantOne(), program, true, c_do_transpose, false); if (ErrorIn(status)) { return status; } @@ -174,16 +181,16 @@ StatusCode Xgemm::DoGemm(const Layout layout, kernel.SetArgument(0, static_cast(m_ceiled)); kernel.SetArgument(1, static_cast(n_ceiled)); kernel.SetArgument(2, static_cast(k_ceiled)); - kernel.SetArgument(3, alpha_buffer()); - kernel.SetArgument(4, beta_buffer()); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, b_temp()); kernel.SetArgument(7, c_temp()); // Computes the global and local thread sizes const auto global = std::vector{ - (m_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] + (c_one_i * db_["MDIMC"]) / db_["MWG"], + (c_two_i * db_["NDIMC"]) / db_["NWG"] }; const auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; @@ -196,8 +203,8 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Runs the post-processing kernel if needed if (!c_no_temp) { eventWaitList.push_back(eventKernel); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, - m_ceiled, n_ceiled, m_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, + c_one_i, c_two_i, c_one_i, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, ConstantOne(), program, false, c_do_transpose, false); diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index bd7a053e..ba770065 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -31,6 +31,7 @@ 
Xher2k::Xher2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" ; } @@ -107,12 +108,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + // Convert the arguments to complex versions auto complex_beta = T{beta, static_cast(0.0)}; - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &complex_beta); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); @@ -123,7 +120,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // case nothing has to be done, these kernels can be skipped. 
if (!a1_no_temp) { auto eventProcessA1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, ConstantOne(), program, @@ -133,7 +130,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!a2_no_temp) { auto eventProcessA2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, ConstantOne(), program, @@ -143,7 +140,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!b1_no_temp) { auto eventProcessB1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, ConstantOne(), program, @@ -153,7 +150,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!b2_no_temp) { auto eventProcessB2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, ConstantOne(), program, @@ -165,7 +162,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. 
auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -180,8 +177,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a1_temp()); kernel.SetArgument(5, b2_temp()); kernel.SetArgument(6, c_temp()); @@ -202,10 +199,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; auto complex_one = T{static_cast(1.0), static_cast(0.0)}; - alpha_buffer.Write(queue_, 1, &conjugate_alpha); - beta_buffer.Write(queue_, 1, &complex_one); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(conjugate_alpha)); + kernel.SetArgument(3, GetRealArg(complex_one)); kernel.SetArgument(4, b1_temp()); kernel.SetArgument(5, a2_temp()); @@ -218,7 +213,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xherk.cpp 
b/src/routines/level3/xherk.cpp index 6ef7f21f..3063f3bc 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -31,6 +31,7 @@ Xherk::Xherk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" ; } @@ -98,13 +99,9 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + // Convert the arguments to complex versions auto complex_alpha = T{alpha, static_cast(0.0)}; auto complex_beta = T{beta, static_cast(0.0)}; - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &complex_alpha); - beta_buffer.Write(queue_, 1, &complex_beta); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); @@ -115,7 +112,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // case nothing has to be done, these kernels can be skipped. Two copies are created. 
if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -125,7 +122,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons } if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne(), program, @@ -137,7 +134,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. 
auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -152,8 +149,8 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(complex_alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); @@ -174,7 +171,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 424d4d2d..158cd9e5 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -31,6 +31,7 @@ Xsyr2k::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" ; } @@ -97,12 +98,6 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? 
b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -112,7 +107,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -122,7 +117,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons } if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne(), program, @@ -134,7 +129,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. 
auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -149,8 +144,8 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); @@ -170,8 +165,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Swaps the arguments for matrices A and B, and sets 'beta' to 1 auto one = static_cast(1); - beta_buffer.Write(queue_, 1, &one); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(3, GetRealArg(one)); kernel.SetArgument(4, b_temp()); kernel.SetArgument(5, a_temp()); @@ -184,7 +178,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index f56c232b..e1a72ef6 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -31,6 +31,7 @@ Xsyrk::Xsyrk(Queue &queue, EventPointer event, const std::string &name): #include 
"../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_part3.opencl" ; } @@ -90,12 +91,6 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -105,7 +100,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -117,7 +112,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. 
auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -132,8 +127,8 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, c_temp()); @@ -154,7 +149,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp index e8593301..af9080af 100644 --- a/src/routines/levelx/xomatcopy.cpp +++ b/src/routines/levelx/xomatcopy.cpp @@ -72,7 +72,7 @@ StatusCode Xomatcopy::DoOmatcopy(const Layout layout, const Transpose a_trans const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); auto emptyEventList = std::vector(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, b_one, b_two, b_ld, b_offset, b_buffer, alpha, program, false, transpose, 
conjugate); diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 34269bc7..78ded56e 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -86,11 +86,10 @@ class TuneCopy { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); } // Describes how to compute the performance metrics diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 1e0dccd3..90f5ea82 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -86,7 +86,6 @@ class TunePad { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.m)); @@ -97,7 +96,7 @@ class TunePad { tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(0); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 7ac19cb6..10fa80cb 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -91,11 +91,10 @@ class TuneTranspose { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + 
tuner.AddArgumentScalar(GetRealArg(args.alpha)); } // Describes how to compute the performance metrics diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 63274415..507718eb 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -90,7 +90,6 @@ class TunePadTranspose { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.m)); @@ -101,7 +100,7 @@ class TunePadTranspose { tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(0); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 88d12c1f..0033b3c6 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -89,9 +89,8 @@ class TuneXaxpy { std::vector &x_vec, std::vector &y_vec, std::vector &, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentOutput(y_vec); } diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 4b1efdef..4cb7fd00 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -7,7 +7,9 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. +// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations: +// - V==1: This tests some limited set of tuning parameters exhaustively. 
+// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. // // ================================================================================================= @@ -21,18 +23,19 @@ namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class -template +template class TuneXgemm { public: // The representative kernel and the source code - static std::string KernelFamily() { return "xgemm"; } + static std::string KernelFamily() { return (V==1) ? "xgemm_1" : "xgemm_2"; } static std::string KernelName() { return "Xgemm"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" + #include "../src/kernels/level3/xgemm_part3.opencl" ; } @@ -48,7 +51,7 @@ class TuneXgemm { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } - static double DefaultFraction() { return 2048.0; } + static double DefaultFraction() { return (V==1) ? 
1.0 : 512.0; } // test all or sample randomly // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel @@ -60,20 +63,38 @@ class TuneXgemm { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "KWG", {16, 32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2, 8}); - tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); - tuner.AddParameter(id, "STRM", {0, 1}); - tuner.AddParameter(id, "STRN", {0, 1}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); + if (V==1) { // limited subset of tuning parameters - but explorable exhaustively + tuner.AddParameter(id, "MWG", {16, 32, 64}); + tuner.AddParameter(id, "NWG", {16, 32, 64}); + tuner.AddParameter(id, "KWG", {32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4}); + tuner.AddParameter(id, "VWN", {1, 2, 4}); + tuner.AddParameter(id, "STRM", {0}); + tuner.AddParameter(id, "STRN", {0}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } // a lot more tuning parameters - has to be sampled randomly, too much to test all + else { + tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "KWG", {16, 32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 
16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); + tuner.AddParameter(id, "STRM", {0, 1}); + tuner.AddParameter(id, "STRN", {0, 1}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } } // Sets the constraints @@ -92,6 +113,14 @@ class TuneXgemm { // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + + // Extra constraints for variation 1 to limit the set of options significantly + if (V==1) { + auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; + tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"}); + tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"}); + tuner.AddConstraint(id, IsEqual, {"SA", "SB"}); + } } // Sets the local memory size @@ -121,13 +150,11 @@ class TuneXgemm { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.k)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); + tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentInput(b_mat); tuner.AddArgumentOutput(c_mat); @@ -147,15 +174,22 @@ class TuneXgemm { using float2 = clblast::float2; using double2 = clblast::double2; +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { + 
switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } +} + // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); return 0; } diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index d42155ae..7229602d 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -29,7 +29,7 @@ class TuneXgemv { public: // The representative kernel and the source code - static std::string KernelFamily() { return "xgemv_"+std::to_string(V); } + static std::string KernelFamily() { return (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); } static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? 
"XgemvFast" : "XgemvFastRot"); } static std::string GetSources() { return @@ -61,21 +61,42 @@ class TuneXgemv { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); - if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } + if (V==1) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256}); + tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + } + if (V==2) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128, 256}); + tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); + } + if (V==3) { + tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128}); + tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); + tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); + } } // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &tuner, const size_t id) { - auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; if (V==2 || V==3) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); } + if (V==3) { + auto LargerOrEqual = [] (std::vector v) { return v[0] >= v[1]; }; + tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + } } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); + if (V==1 || V==2) { + auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; + 
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); + } + else { + auto LocalMemorySize = [args] (std::vector v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + } } // Sets the base thread configuration @@ -89,20 +110,21 @@ class TuneXgemv { static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } static TransformVector DivLocal() { return {}; } static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; } + static TransformVector DivGlobal() { + if (V==1 || V==2) return {{"WPT"+std::to_string(V)}}; + return {}; + } // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &x_vec, std::vector &y_vec, std::vector &a_mat, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; auto a_rotated = (V==3) ? 
1 : 0; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); + tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentScalar(static_cast(a_rotated)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index d2590c53..1fb5c531 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -85,10 +85,9 @@ class TuneXger { std::vector &x_vec, std::vector &y_vec, std::vector &a_mat, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentScalar(0); // x_offset tuner.AddArgumentScalar(1); // x_increment diff --git a/src/utilities.cpp b/src/utilities.cpp index 68e480c5..77bc72d7 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -161,6 +161,8 @@ template T ConvertArgument(const char* value) { return static_cast(std::stoi(value)); } +template size_t ConvertArgument(const char* value); + template <> half ConvertArgument(const char* value) { return FloatToHalf(static_cast(std::stod(value))); } @@ -179,6 +181,15 @@ template <> double2 ConvertArgument(const char* value) { return double2{val, val}; } +// Variant of "ConvertArgument" with default values +template +T ConvertArgument(const char* value, T default_value) { + + if (value) { return ConvertArgument(value); } + return default_value; +} +template size_t ConvertArgument(const char* value, size_t default_value); + // This function matches patterns in the form of "-option value" or "--option value". It returns a // default value in case the option is not found in the argument string. 
template @@ -332,6 +343,14 @@ void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_com result.Write(queue, size, result_cpu); } +// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is +// no conversion, but half-precision is not supported as kernel argument so it is converted to float. +template <> typename RealArg::Type GetRealArg(const half value) { return HalfToFloat(value); } +template <> typename RealArg::Type GetRealArg(const float value) { return value; } +template <> typename RealArg::Type GetRealArg(const double value) { return value; } +template <> typename RealArg::Type GetRealArg(const float2 value) { return value; } +template <> typename RealArg::Type GetRealArg(const double2 value) { return value; } + // ================================================================================================= // Rounding functions performing ceiling and division operations diff --git a/src/utilities.hpp b/src/utilities.hpp index 5a4eef0f..75bd5a69 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -80,8 +80,9 @@ constexpr auto kArgComparecblas = "cblas"; constexpr auto kArgStepSize = "step"; constexpr auto kArgNumSteps = "num_steps"; constexpr auto kArgNumRuns = "runs"; +constexpr auto kArgWarmUp = "warm_up"; -// The client-specific arguments in string form +// The test-specific arguments in string form constexpr auto kArgFullTest = "full_test"; constexpr auto kArgVerbose = "verbose"; @@ -186,6 +187,10 @@ std::string ToString(T value); template T ConvertArgument(const char* value); +// Variant of "ConvertArgument" with default values +template +T ConvertArgument(const char* value, T default_value); + // Basic argument parser, matching patterns in the form of "-option value" and "--option value" template T GetArgument(const int argc, char **argv, std::string &help, @@ -226,6 +231,12 @@ void FloatToHalfBuffer(std::vector& result, const std::vector& sour Buffer HalfToFloatBuffer(const Buffer& source, 
cl_command_queue queue_raw); void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw); +// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is +// no conversion, but half-precision is not supported as kernel argument so it is converted to float. +template struct RealArg { using Type = T; }; +template <> struct RealArg { using Type = float; }; +template typename RealArg::Type GetRealArg(const T value); + // ================================================================================================= // Rounding functions diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index 92e2c1b8..362c5c2c 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "test/correctness/tester.hpp" @@ -27,8 +28,8 @@ template Tester::Tester(int argc, char *argv[], const bool silent, const std::string &name, const std::vector &options): help_("Options given/available:\n"), - platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))), - device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))), + platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})))), + device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})))), context_(Context(device_)), queue_(Queue(context_, device_)), full_test_(CheckArgument(argc, argv, help_, kArgFullTest)), diff --git a/test/performance/client.cpp b/test/performance/client.cpp index d0068f8b..aaaab22e 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -113,6 +113,7 @@ Arguments Client::ParseArguments(int argc, char *argv[], const size_t le args.print_help = CheckArgument(argc, argv, help, kArgHelp); args.silent = CheckArgument(argc, argv, help, kArgQuiet); args.no_abbrv = 
CheckArgument(argc, argv, help, kArgNoAbbreviations); + warm_up_ = CheckArgument(argc, argv, help, kArgWarmUp); // Prints the chosen (or defaulted) arguments to screen. This also serves as the help message, // which is thus always displayed (unless silence is specified). @@ -244,12 +245,24 @@ template double Client::TimedExecution(const size_t num_runs, const Arguments &args, Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name) { + auto status = StatusCode::kSuccess; + + // Do an optional warm-up to omit compilation times and initialisations from the measurements + if (warm_up_) { + try { + status = run_blas(args, buffers, queue); + } catch (...) { status = static_cast(kUnknownError); } + if (status != StatusCode::kSuccess) { + throw std::runtime_error(library_name+" error: "+ToString(static_cast(status))); + } + } + + // Start the timed part auto timings = std::vector(num_runs); for (auto &timing: timings) { auto start_time = std::chrono::steady_clock::now(); // Executes the main computation - auto status = StatusCode::kSuccess; try { status = run_blas(args, buffers, queue); } catch (...) { status = static_cast(kUnknownError); } diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 5ff2aec7..6d35fced 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -82,6 +82,9 @@ class Client { const std::vector options_; const GetMetric get_flops_; const GetMetric get_bytes_; + + // Extra arguments + bool warm_up_; // if enabled, do a warm-up run first before measuring execution time }; // =================================================================================================