Merge pull request #99 from CNugteren/development

Update to version 0.9.0
Cedric Nugteren 2016-09-13 21:14:51 +02:00 committed by GitHub
commit f07ac22f5b
100 changed files with 3732 additions and 2577 deletions

appveyor.yml

@@ -1,8 +1,8 @@
environment:
global:
CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\clblast"
CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\..\\bin\\clblast"
OPENCL_REGISTRY: "https://www.khronos.org/registry/cl"
OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl"
OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\..\\bin\\opencl"
platform:
- x64

.gitignore vendored

@@ -2,5 +2,6 @@ build
stash
.*
*.pyc
*.db
database.json
database_best.json
cl.hpp

.travis.yml

@@ -17,49 +17,21 @@ addons:
- kubuntu-backports
packages:
- cmake
- ocl-icd-opencl-dev
env:
global:
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
- OPENCL_REGISTRY=https://www.khronos.org/registry/cl
- OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl
before_install:
- cmake --version;
- ${CC} --version;
- ${CXX} --version;
install:
# The following linux logic is necessary because of Travis's move to the GCE platform, which does not
# currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221
# We build our own linkable .so file
- if [ ${TRAVIS_OS_NAME} == "linux" ]; then
mkdir -p ${OPENCL_ROOT};
pushd ${OPENCL_ROOT};
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git;
mv ./OpenCL-ICD-Loader/* .;
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL;
pushd inc/CL;
travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp;
popd;
mkdir -p lib;
pushd lib;
cmake -G "Unix Makefiles" ..;
make;
cp ./bin/libOpenCL.so .;
popd;
pushd inc/CL;
travis_retry git fetch origin opencl12:opencl12;
git checkout opencl12;
popd;
mv inc/ include/;
popd;
fi
before_script:
- mkdir -p ${CLBLAST_ROOT}
- pushd ${CLBLAST_ROOT}
- cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR}
- cmake -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR}
script:
- make

CHANGELOG

@@ -1,4 +1,19 @@
Version 0.9.0
- Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
- Significantly improved the performance of rotated GEMV computations
- Improved performance on unseen/un-tuned devices through better selection of default tuning parameters
- Fixed the MSVC dllimport and dllexport declarations
- Fixed memory leaks related to events not being released
- Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems
- Fixed a bug related to the cache and retrieval of programs based on the OpenCL context
- Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels
- Fixed a bug in the OpenCL kernels: now placing __kernel before __attribute__
- Fixed a bug in level-3 routines when beta is zero and matrix C contains NaNs
- Added an option (-warm_up) to do a warm-up run before timing in the performance clients
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
Version 0.8.0
- Added support for half-precision floating-point (fp16) in the library
- Made it possible to compile the performance tests (clients) separately from the correctness tests

CMakeLists.txt

@@ -9,7 +9,7 @@
#
# ==================================================================================================
cmake_minimum_required(VERSION 2.8.10)
cmake_minimum_required(VERSION 2.8.11)
# Overrides for MSVC static runtime
set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake)
@@ -18,7 +18,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 8)
set(clblast_VERSION_MINOR 9)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@@ -27,6 +27,13 @@ option(TUNERS "Enable compilation of the tuners" OFF)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
if(VERBOSE)
message("-- Building in verbose mode")
add_definitions(-DVERBOSE)
endif()
# ==================================================================================================
# RPATH settings
@@ -68,6 +75,12 @@ else()
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
endif()
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0.0)
# GCC does not support attributes on template arguments
# in particular we hit this with the alignment attributes on cl_XXX types
# which are then used to instantiate various templates in CLBlast
set(FLAGS "${FLAGS} -Wno-ignored-attributes")
endif()
elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang)
set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
@@ -88,7 +101,7 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")
# ==================================================================================================
# Package scripts location
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/")
# Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH.
find_package(OpenCL REQUIRED)
@@ -120,11 +133,6 @@ endif()
# ==================================================================================================
# Includes directories: CLBlast and OpenCL
include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS})
# ==================================================================================================
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
@@ -166,21 +174,36 @@ endforeach()
add_library(clblast SHARED ${SOURCES})
target_link_libraries(clblast ${OPENCL_LIBRARIES})
# Includes directories: CLBlast and OpenCL
target_include_directories(clblast PUBLIC
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
$<INSTALL_INTERFACE:include>
${OPENCL_INCLUDE_DIRS})
# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built
if(MSVC)
target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
endif()
# Installs the library
install(TARGETS clblast DESTINATION lib)
install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
install(FILES include/clblast.h DESTINATION include)
install(FILES include/clblast_c.h DESTINATION include)
install(FILES include/clblast_half.h DESTINATION include)
# Installs the config for find_package in dependent projects
install(EXPORT CLBlast DESTINATION lib/cmake/CLBlast FILE CLBlastConfig.cmake)
# ==================================================================================================
# Sets a default platform ($DEVICEPLATFORM) and device ($DEFAULT_DEVICE) to run tuners and tests on
# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on
set(DEVICEPLATFORM )
if(DEFINED ENV{DEFAULT_DEVICE})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{DEFAULT_DEVICE})
if(DEFINED ENV{CLBLAST_DEVICE})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE})
endif()
if(DEFINED ENV{DEFAULT_PLATFORM})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{DEFAULT_PLATFORM})
if(DEFINED ENV{CLBLAST_PLATFORM})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM})
endif()
# ==================================================================================================
@@ -213,13 +236,17 @@ endif()
# the CLTune library (not included as part of the source).
if(TUNERS)
# Includes CLTune
include_directories(${CLTUNE_INCLUDE_DIRS})
# Visual Studio requires the sources of non-exported objects/libraries
set(TUNERS_COMMON )
if(MSVC)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities.cpp)
endif()
# Adds tuning executables
foreach(KERNEL ${KERNELS})
add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cpp)
add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
@@ -263,9 +290,6 @@ if(CLIENTS OR TESTS)
endif()
endif()
# Sets the include directories
include_directories(${clblast_SOURCE_DIR} ${REF_INCLUDES})
endif()
# ==================================================================================================
@@ -281,6 +305,11 @@ if(CLIENTS)
else()
# Creates the common performance-tests objects (requires CMake 2.8.8)
add_library(test_performance_common OBJECT test/performance/client.cpp)
# Adds CLBlast's interface include paths because we can't link to CLBlast here
target_include_directories(test_performance_common PRIVATE
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR})
set(CLIENTS_COMMON ${CLIENTS_COMMON} $<TARGET_OBJECTS:test_performance_common>)
endif()
@@ -303,6 +332,7 @@ if(CLIENTS)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
endforeach()
@@ -324,6 +354,9 @@ if(TESTS)
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
test/correctness/tester.cpp test/correctness/testblas.cpp)
target_include_directories(test_correctness_common PUBLIC
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR})
set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>)
endif()
@@ -347,6 +380,7 @@ if(TESTS)
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM})
endforeach()

README.md

@@ -99,20 +99,26 @@ The CLBlast library will be tuned in the future for the most commonly used OpenCL devices
* NVIDIA GPUs:
- GRID K520
- GeForce GTX 480
- GeForce GTX 670
- GeForce GTX 680
- GeForce GTX 750
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX 1070
- GeForce GTX Titan
- GeForce GTX Titan X
- Tesla K20m
- Tesla K40m
* AMD GPUs:
- Tahiti
- AMD Radeon R9 M370X Compute Engine
- Hawaii
- Oland
- Pitcairn
- Radeon R9 M370X Compute Engine
- Tahiti
* Intel GPUs:
- HD Graphics 530
- HD Graphics Haswell Ultrabook GT2 Mobile
- HD Graphics 5500 BroadWell U-Processor GT2
- HD Graphics Skylake ULT GT2
- Iris
- Iris Pro
@@ -130,7 +136,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables before running CMake.
The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
@@ -162,7 +168,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to select a non-zero default device and platform by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to select a non-zero default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables before running CMake.
Compiling the performance tests/clients (optional)
@@ -180,6 +186,8 @@ The folder `doc/performance` contains some PDF files with performance results on
Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See above under `Using the tuners` to find out how to tune for your device.
In case performance is still sub-optimal or something else is wrong, CLBlast can be built in verbose mode for (performance) debugging by specifying `-DVERBOSE=ON` to CMake.
Supported routines
-------------
@@ -278,6 +286,9 @@ The contributing authors (code, pull requests, testing) so far are:
* [Dragan Djuric](https://github.com/blueberry)
* [Marco Hutter](https://github.com/gpus)
* [Hugh Perkins](https://github.com/hughperkins)
* [Gian-Carlo Pascutto](https://github.com/gcp)
* [Ivan Shapovalov](https://github.com/intelfx)
* [Dimitri Van Assche](https://github.com/dvasschemacq)
Tuning and testing on a variety of OpenCL devices was made possible by:

include/clblast.h

@@ -25,6 +25,18 @@
#include <CL/opencl.h>
#endif
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#ifdef _WIN32
#ifdef COMPILING_DLL
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
#endif
#else
#define PUBLIC_API
#endif
namespace clblast {
// =================================================================================================
@@ -576,11 +588,11 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
StatusCode ClearCache();
StatusCode PUBLIC_API ClearCache();
// The cache can also be pre-initialized for a specific device with all possible CLBLast kernels.
// Further CLBlast routine calls will then run at maximum speed.
StatusCode FillCache(const cl_device_id device);
StatusCode PUBLIC_API FillCache(const cl_device_id device);
// =================================================================================================

include/clblast_c.h

@@ -25,7 +25,11 @@
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#ifdef _WIN32
#define PUBLIC_API __declspec(dllexport)
#ifdef COMPILING_DLL
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
#endif
#else
#define PUBLIC_API
#endif

samples/cache.c

@@ -113,6 +113,7 @@ void run_example_routine(const cl_device_id device) {
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
// Retrieves the execution time
clock_t diff = clock() - start;

samples/dgemv.c

@@ -85,6 +85,7 @@ int main(void) {
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed DGEMV with status %d\n", status);

samples/haxpy.c

@@ -78,6 +78,7 @@ int main(void) {
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
// Copies the result back to the host
clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);

samples/sasum.c

@@ -74,6 +74,7 @@ int main(void) {
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
// Copies the result back to the host
clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

samples/sgemm.c

@@ -88,6 +88,7 @@ int main(void) {
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed SGEMM with status %d\n", status);

samples/sgemm.cpp

@@ -96,6 +96,7 @@ int main() {
// Record the execution time
clWaitForEvents(1, &event);
clReleaseEvent(event);
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();

scripts/database/database.py Normal file → Executable file

@@ -1,325 +1,104 @@
#!/usr/bin/env python
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line.
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
# System modules
import sys
import os.path
import glob
import re
import json
try:
from urllib.request import urlopen # Python 3
except ImportError:
from urllib2 import urlopen # Python 2
import argparse
# Additional modules
import pandas as pd
import database.io as io
import database.db as db
import database.clblast as clblast
import database.bests as bests
import database.defaults as defaults
# Server storing a copy of the database
DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db"
# Constants
VENDOR_DEFAULT = "default"
DEVICETYPE_DEFAULT = "All"
DEVICENAME_DEFAULT = "default"
# Attributes
DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json"
# OpenCL vendor names and their short name
VENDOR_NAMES = { "device_vendor": {
VENDOR_TRANSLATION_TABLE = {
"GenuineIntel": "Intel",
"Intel(R) Corporation": "Intel",
"Advanced Micro Devices, Inc.": "AMD",
"NVIDIA Corporation": "NVIDIA",
}}
}
# Pandas options
pd.set_option('display.width', 1000)
# ==================================================================================================
# Database operations
# ==================================================================================================
def main(argv):
# Downloads the database and save it to disk
def DownloadDatabase(filename):
print("## Downloading database from '"+DATABASE_SERVER_URL+"'...")
df = urlopen(DATABASE_SERVER_URL)
output = open(file_db,'wb')
output.write(df.read())
output.close()
# Parses the command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("source_folder", help="The folder with JSON files to parse to add to the database")
parser.add_argument("clblast_root", help="Root of the CLBlast sources")
parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
cl_args = parser.parse_args(argv)
# Loads the database from disk
def LoadDatabase(filename):
return pd.read_pickle(filename)
# Parses the path arguments
database_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database.json")
database_best_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database_best.json")
json_files = os.path.join(cl_args.source_folder, "*.json")
cpp_database_path = os.path.join(cl_args.clblast_root, "src", "database", "kernels")
# Saves the database to disk
def SaveDatabase(df, filename):
df.to_pickle(filename)
# Checks whether the command-line arguments are valid
clblast_header = os.path.join(cl_args.clblast_root, "include", "clblast.h") # Not used but just for validation
if not os.path.isfile(clblast_header):
raise RuntimeError("The path '" + cl_args.clblast_root + "' does not point to the root of the CLBlast library")
if len(glob.glob(json_files)) < 1:
print("[database] The path '" + cl_args.source_folder + "' does not contain any JSON files")
# Loads JSON data from file
def ImportDataFromFile(filename):
with open(filename) as f:
data = json.load(f)
json_data = pd.DataFrame(data)
df = pd.io.json.json_normalize(json_data["results"])
for attribute in ATTRIBUTES:
if attribute == "kernel_family":
df[attribute] = re.sub(r'_\d+', '', data[attribute])
elif attribute in data:
df[attribute] = data[attribute]
else:
df[attribute] = 0
return df
# Downloads the database if a local copy is not present
if not os.path.isfile(database_filename):
io.download_database(database_filename, DATABASE_SERVER_URL)
# Returns the row-wise concatenation of two dataframes
def ConcatenateData(df1, df2):
return pd.concat([df1, df2])
# Loads the database from disk
database = io.load_database(database_filename)
# Removes duplicates from a dataframe
def RemoveDuplicates(df):
return df.drop_duplicates()
# Loops over all JSON files in the supplied folder
for file_json in glob.glob(json_files):
# database = database[(database["device"] != "AMD Radeon R9 M370X Compute Engine") | (database["kernel_family"] != "xgemm") | (database["precision"] != "32")]
def RemoveEntriesByDevice(df, devicename):
return df[df["device"] != devicename]
# Loads the newly imported data
sys.stdout.write("[database] Processing '" + file_json + "' ") # No newline printed
imported_data = io.load_tuning_results(file_json)
def RemoveEntriesByKernelFamily(df, familyname):
return df[df["kernel_family"] != familyname]
# Fixes the problem that some vendors use multiple different names
for target in VENDOR_TRANSLATION_TABLE:
if imported_data["device_vendor"] == target:
imported_data["device_vendor"] = VENDOR_TRANSLATION_TABLE[target]
def GetEntriesByField(df, field, value):
return df[df[field] == value]
# Adds the new data to the database
old_size = db.length(database)
database = db.add_section(database, imported_data)
new_size = db.length(database)
print("with " + str(new_size - old_size) + " new items") # Newline printed here
# Example usage:
# df = UpdateDatabase(df, (df["kernel_family"] == "xdot") & (df["arg_n"] == "67108864"), "arg_n", "2097152")
def UpdateDatabase(df, condition, field, value):
df.loc[condition, field] = value
return df
# Stores the modified database back to disk
if len(glob.glob(json_files)) >= 1:
io.save_database(database, database_filename)
# Fixes the problem that some vendors use multiple different names
def SanitizeVendorNames(df):
df = df.replace(VENDOR_NAMES)
return df
# Retrieves the best performing results
print("[database] Calculating the best results per device/kernel...")
database_best_results = bests.get_best_results(database)
# Retrieves the results with the lowest execution times
def GetBestResults(df):
dfbest = pd.DataFrame()
grouped = df.groupby(ATTRIBUTES+["kernel"])
for name, dfgroup in grouped:
besttime = dfgroup["time"].min()
bestcase = dfgroup[dfgroup["time"] == besttime].iloc[0]
dfbest = dfbest.append(bestcase, ignore_index=True)
return dfbest
# Determines the defaults for other vendors and per vendor
print("[database] Calculating the default values...")
database_defaults = defaults.calculate_defaults(database, cl_args.verbose)
database_best_results["sections"].extend(database_defaults["sections"])
# Sets defaults for devices of the same type/vendor based on the smallest values of all known
# entries. The average might be better for performance but some parameters might not be supported
# on other devices.
def CalculateDefaults(df):
dfdefault = pd.DataFrame()
# Optionally outputs the database to disk
if cl_args.verbose:
io.save_database(database_best_results, database_best_filename)
# Defaults per type/vendor
groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
default_values = dfgroup.min(axis=0)
default_values["device"] = DEVICENAME_DEFAULT
default_values["device_compute_units"] = 0
default_values["device_core_clock"] = 0
default_values["time"] = 0.0
dfdefault = dfdefault.append(default_values, ignore_index=True)
# Checks for mis-matched arguments
groups = dfdefault.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
if len(dfgroup) != 1:
description = dfgroup["kernel"].min() + " " + dfgroup["device_vendor"].min()
print("[WARNING] Entries for a single kernel with multiple argument values: " + description)
# Defaults in general
groups = df.groupby(KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
default_values = dfgroup.min(axis=0)
default_values["device_vendor"] = VENDOR_DEFAULT
default_values["device_type"] = DEVICETYPE_DEFAULT
default_values["device"] = DEVICENAME_DEFAULT
default_values["device_compute_units"] = 0
default_values["device_core_clock"] = 0
default_values["time"] = 0.0
dfdefault = dfdefault.append(default_values, ignore_index=True)
# Database with both types of defaults only
return dfdefault
# Outputs the database as a C++ database
print("[database] Producing a C++ database in '" + cpp_database_path + "'...")
clblast.print_cpp_database(database_best_results, cpp_database_path)
# ==================================================================================================
# C++ header generation
# ==================================================================================================
print("[database] All done")
# The C++ header
def GetHeader(family):
return("""
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the '%s' kernels.
//
// =================================================================================================
namespace clblast {
// ================================================================================================="""
% family.title())
# The C++ footer
def GetFooter():
return("\n} // namespace clblast\n")
# The start of a new C++ precision entry
def GetPrecision(family, precision):
precisionstring = ""
if precision == "16":
precisionstring = "Half"
elif precision == "32":
precisionstring = "Single"
elif precision == "64":
precisionstring = "Double"
elif precision == "3232":
precisionstring = "ComplexSingle"
elif precision == "6464":
precisionstring = "ComplexDouble"
else:
print("[ERROR] Unknown precision")
sys.exit()
return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n"
% (family.title(), precisionstring, family.title(), precisionstring))
# The C++ device type and vendor
def GetDeviceVendor(vendor, devtype):
if vendor == VENDOR_DEFAULT and devtype == DEVICETYPE_DEFAULT:
return(" { // Default\n kDeviceType%s, \"%s\", {\n" % (devtype, vendor))
return(" { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, devtype, devtype[0].upper() + devtype[1:], vendor))
# Prints the data to a C++ database
def PrintData(df, outputdir):
# Iterates over the kernel families: creates a new file per family
for family, dffamily in df.groupby(["kernel_family"]):
dffamily = dffamily.dropna(axis=1, how='all')
f = open(os.path.join(outputdir, family+'.hpp'), 'w+')
f.write(GetHeader(family))
# Loops over the different entries for this family and prints their headers
for precision, dfprecision in dffamily.groupby(["precision"]):
f.write(GetPrecision(family, precision))
for vendor, dfvendor in dfprecision.groupby(["device_vendor"]):
for devtype, dfdevtype in dfvendor.groupby(["device_type"]):
f.write(GetDeviceVendor(vendor, devtype))
for device, dfdevice in dfdevtype.groupby(["device"]):
devicename = "\"%s\"," % device
f.write(" { %-50s { " % devicename)
# Collects the parameters for this case and prints them
parameters = []
for kernel, dfkernel in dfdevice.groupby(["kernel"]):
dfkernel = dfkernel.dropna(axis=1)
col_names = [col for col in list(dfkernel) if col.startswith('parameters.') and col != "parameters.PRECISION"]
parameters += ["{\"%s\",%d}" % (p.replace("parameters.",""), dfkernel[p].iloc[0]) for p in col_names]
f.write(", ".join(parameters))
f.write(" } },\n")
# Prints the footers
f.write(" }\n },\n")
f.write(" }\n};\n\n// =================================================================================================")
f.write(GetFooter())
# ==================================================================================================
# Command-line arguments parsing and verification
# ==================================================================================================
# Checks for the number of command-line arguments
if len(sys.argv) != 3:
print("[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>")
sys.exit()
# Parses the command-line arguments
path_json = sys.argv[1]
path_clblast = sys.argv[2]
file_db = os.path.join(path_clblast, "scripts", "database", "database.db")
glob_json = os.path.join(path_json, "*.json")
# Checks whether the command-line arguments are valid; exits otherwise
clblast_h = os.path.join(path_clblast, "include", "clblast.h") # Not used but just for validation
if not os.path.isfile(clblast_h):
print("[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library")
sys.exit()
if len(glob.glob(glob_json)) < 1:
print("## The path '"+path_json+"' does not contain any JSON files")
# ==================================================================================================
# The main body of the script
# ==================================================================================================
# Downloads the database if a local copy is not present
db_exists = os.path.isfile(file_db)
if not db_exists:
DownloadDatabase(file_db)
# Loads the database from disk
print("## Loading the database from disk...")
database = LoadDatabase(file_db)
# Loops over all JSON files in the supplied folder
for file_json in glob.glob(glob_json):
# Loads the newly imported data
sys.stdout.write("## Processing '"+file_json+"' ")
imported_data = ImportDataFromFile(file_json)
imported_data = SanitizeVendorNames(imported_data)
# Adds the new data to the database
old_size = len(database.index)
database = ConcatenateData(database, imported_data)
database = RemoveDuplicates(database)
new_size = len(database.index)
print("with "+str(new_size-old_size)+" new items")
# Stores the modified database back to disk
if len(glob.glob(glob_json)) >= 1:
print("## Storing the database to disk...")
SaveDatabase(database, file_db)
# Optional: update the database here. Default is disabled, code below is just an example
if False:
database = UpdateDatabase(database, ((database["kernel"] == "CopyMatrixFast") & (database["precision"] == "3232")), "arg_alpha", "2+0.5i")
SaveDatabase(database, file_db)
# Retrieves the best performing results
print("## Calculating the best results per device/kernel...")
bests = GetBestResults(database)
# Determines the defaults for other vendors and per vendor
defaults = CalculateDefaults(bests)
bests = ConcatenateData(bests, defaults)
# Outputs the data as a C++ database
path_cpp_database = os.path.join(path_clblast, "src", "database", "kernels")
print("## Producing a C++ database in '"+path_cpp_database+"'...")
PrintData(bests, path_cpp_database)
print("## All done")
# ==================================================================================================
if __name__ == '__main__':
main(sys.argv[1:])

scripts/database/database/__init__.py

scripts/database/database/bests.py

@@ -0,0 +1,58 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import sys
def get_best_results(database):
"""Retrieves the results with the lowest execution times"""
sections_best = []
for section in database["sections"]:
section_best = {}
# Stores all the section's meta data
for attribute in section.keys():
if attribute != "results":
section_best[attribute] = section[attribute]
# Find the best result
parameters_best = None
time_best = sys.float_info.max
for result in section["results"]:
if result["time"] < time_best:
time_best = result["time"]
parameters_best = result["parameters"]
# Stores the best result
section_best["results"] = [{"time": time_best, "parameters": parameters_best}]
sections_best.append(section_best)
return {"sections": sections_best}
def get_relative_bests(name, common_results, common_parameters, verbose=False):
"""Retrieves the parameters with the relative best execution time over different devices"""
# Helper function
def argmax(iterable):
return max(enumerate(iterable), key=lambda x: x[1])[0]
# Computes the sum of the execution times over the different devices
performance_sums = []
for parameters in common_parameters:
performance_sum = sum([r["relative_performance"] for r in common_results if r["parameters"] == parameters])
performance_sums.append(performance_sum)
# Retrieves the entry with the highest performance
best_index = argmax(performance_sums)
best_performance = performance_sums[best_index]
best_parameters = common_parameters[best_index]
# Completed, report and return the results
if verbose:
print("[database] " + str(name) + " with performance " + str(best_performance))
return best_parameters

scripts/database/database/clblast.py

@@ -0,0 +1,155 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import os
# Constants from the C++ code
VENDOR_DEFAULT = "default"
DEVICE_TYPE_DEFAULT = "All"
DEVICE_NAME_DEFAULT = "default"
# List of attributes
DEVICE_TYPE_ATTRIBUTES = ["device_vendor", "device_type"]
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES
def precision_to_string(precision):
"""Translates a precision number (represented as Python string) into a descriptive string"""
if precision == "16":
return "Half"
elif precision == "32":
return "Single"
elif precision == "64":
return "Double"
elif precision == "3232":
return "ComplexSingle"
elif precision == "6464":
return "ComplexDouble"
else:
raise("Unknown precision: " + precision)
def get_cpp_separator():
"""Retrieves a C++ comment separator"""
return "// ================================================================================================="
def get_cpp_header(family):
"""Retrieves the C++ header"""
return ("\n" + get_cpp_separator() + """
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the '%s' kernels.
//\n"""
% family.title() + get_cpp_separator() + "\n\nnamespace clblast {\n" + get_cpp_separator())
def get_cpp_footer():
"""Retrieves the C++ footer"""
return "\n} // namespace clblast\n"
def get_cpp_precision(family, precision):
"""Retrieves the C++ code for the start of a new precision"""
precision_string = precision_to_string(precision)
camelcase_name = family.title().replace("_", "")
return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n"
% (camelcase_name, precision_string, camelcase_name, precision_string))
def get_cpp_device_vendor(vendor, device_type):
"""Retrieves the C++ code for the (default) vendor and device type"""
if vendor == VENDOR_DEFAULT and device_type == DEVICE_TYPE_DEFAULT:
return " { // Default\n kDeviceType%s, \"%s\", {\n" % (device_type, vendor)
device_type_caps = device_type[0].upper() + device_type[1:]
return " { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, device_type, device_type_caps, vendor)
def print_cpp_database(database, output_dir):
"""Outputs the database as C++ code"""
# Iterates over the kernel families
kernel_families = sorted(set([s["kernel_family"] for s in database["sections"]]))
for family_name in kernel_families:
family_database = [s for s in database["sections"] if s["kernel_family"] == family_name]
# Opens a new file for each kernel family
full_path = os.path.join(output_dir, family_name + ".hpp")
with open(full_path, 'w+') as f:
f.write(get_cpp_header(family_name))
# Loops over the different precisions (e.g. 16, 32, 3232, 64, 6464)
precisions = sorted(set([s["precision"] for s in database["sections"]])) # Based on full database
for precision in precisions:
precision_database = [s for s in family_database if s["precision"] == precision]
f.write(get_cpp_precision(family_name, precision))
# In case there is nothing found at all (e.g. 16-bit): continue as if this was a precision of 32 but
# with the defaults only
if len(precision_database) == 0:
print("[database] No results found for %s:%s, retrieving defaults from %s:32" %
(family_name, precision, family_name))
precision_database = [s for s in family_database if s["precision"] == "32"
and s["device_vendor"] == VENDOR_DEFAULT
and s["device_type"] == DEVICE_TYPE_DEFAULT
and s["device"] == DEVICE_NAME_DEFAULT]
# Loops over device vendors (e.g. AMD)
device_vendors = sorted(set([s["device_vendor"] for s in precision_database]))
for vendor in device_vendors:
vendor_database = [s for s in precision_database if s["device_vendor"] == vendor]
# Loops over device types (e.g. GPU)
device_types = sorted(set([s["device_type"] for s in vendor_database]))
for device_type in device_types:
type_database = [s for s in vendor_database if s["device_type"] == device_type]
f.write(get_cpp_device_vendor(vendor, device_type))
# Loops over every device of this vendor-type combination
devices = sorted(set([s["device"] for s in type_database]))
for device_name in devices:
device_database = [s for s in type_database if s["device"] == device_name]
device_name_quoted = "\"%s\"," % device_name
device_name_cpp = " { %-50s { " % device_name_quoted
f.write(device_name_cpp)
# Collects the parameters for this entry
parameters = []
kernels = sorted(set([s["kernel"] for s in device_database]))
for kernel in kernels:
kernel_database = [s for s in device_database if s["kernel"] == kernel]
assert len(kernel_database) == 1
results = kernel_database[0]["results"]
assert len(results) == 1
new_parameters = results[0]["parameters"]
for parameter_name in sorted(new_parameters):
parameter_value = new_parameters[parameter_name]
parameters.append("{\"" + parameter_name + "\"," + str(parameter_value) + "}")
# Prints the entry
f.write(", ".join(parameters))
f.write(" } },\n")
# Prints the vendor-type combination footer
f.write(" }\n },\n")
# Prints the precision footer
f.write(" }\n};\n\n" + get_cpp_separator())
# Prints the file footer
f.write(get_cpp_footer())

scripts/database/database/db.py

@@ -0,0 +1,64 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import clblast
def length(database):
"""Computes the total number of tuning entries"""
num_tuning_entries = 0
for section in database["sections"]:
num_tuning_entries += len(section["results"])
return num_tuning_entries
def add_section(database, new_section):
"""Adds a new section to the database"""
for old_section in database["sections"]:
# Verify whether the sections match
equal = True
for attribute in new_section.keys():
if attribute != "results":
if attribute not in old_section or new_section[attribute] != old_section[attribute]:
equal = False
break
# They match: append the new section's results to the corresponding entry in the database and return
if equal:
old_section["results"] = combine_results(old_section["results"], new_section["results"])
return database
# No match found: append the whole new section to the database
database["sections"].append(new_section)
return database
def combine_results(old_results, new_results):
"""Adds new results to the results JSON list"""
for new_result in new_results:
old_results = combine_result(old_results, new_result)
return old_results
def combine_result(old_results, new_result):
"""Adds a new result to the results JSON list; filters for duplicate entries and saves the best performing one"""
# Loops over all existing results to test for already existing entries with these parameters
for old_result in old_results:
# Verify whether the results match
equal = new_result["parameters"] == old_result["parameters"]
# They match: keep only the one with the minimum execution time
if equal:
old_result["time"] = min(old_result["time"], new_result["time"])
return old_results
# No match found: append a new result
old_results.append(new_result)
return old_results

scripts/database/database/defaults.py

@@ -0,0 +1,180 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import clblast
import bests
def set_default_device(section):
"""Sets the device name and parameters to some default values"""
section["device"] = clblast.DEVICE_NAME_DEFAULT
section["device_compute_units"] = 0
section["device_core_clock"] = 0
return section
def set_identifiers(database, group_by_attributes, identifier_name):
"""Sets a group-identifier based on a given set of attributes. Modifies the database but also returns a list of
unique identifiers."""
identifiers = []
for section in database["sections"]:
identifier = []
for attribute in group_by_attributes:
if attribute in section:
identifier.append(section[attribute])
section[identifier_name] = ";".join(identifier)
identifiers.append(section[identifier_name])
return sorted(set(identifiers))
def remove_identifiers(database, identifier_name):
"""Removes an identifier from all sections in the database"""
for section in database["sections"]:
section.pop(identifier_name, None)
def get_groups_by_identifier(database, group_identifiers, identifier_name):
"""Returns a list of (group, group_identifier) tuples based a previously made grouping"""
groups = []
for group_identifier in group_identifiers:
# Get all sections in this group
group = []
for section in database["sections"]:
if section[identifier_name] == group_identifier:
group.append(section)
groups.append((group, group_identifier))
return groups
def calculate_defaults(database, verbose):
"""Sets defaults for devices of the same type/vendor"""
# Groups the database by kernel, vendor and device type (e.g. AMD GPU)
group_identifiers = set_identifiers(database, clblast.GROUP_ATTRIBUTES, "group_identifier")
groups = get_groups_by_identifier(database, group_identifiers, "group_identifier")
# Loops over all groups
default_sections = {"sections": []}
for group, group_identifier in groups:
# Computes the best parameters
default_parameters = get_common_best_parameters(group, group_identifier, verbose)
# Stores all the section's data
assert len(group) > 0
default_section = {}
for attribute in group[0].keys():
if attribute != "results" and attribute != "group_identifier":
default_section[attribute] = group[0][attribute]
default_section = set_default_device(default_section)
default_section["results"] = [{"time": 0.0, "parameters": default_parameters}]
default_sections["sections"].append(default_section)
# Groups the database by kernel, vendor and device type (e.g. AMD GPU) - but not by arguments! This is to check for
# mis-matched arguments.
attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"]
group_identifiers = set_identifiers(default_sections, attributes, "temp_identifier")
groups = get_groups_by_identifier(default_sections, group_identifiers, "temp_identifier")
for group, group_identifier in groups:
if len(group) != 1:
print("[ERROR] Entries for a single kernel with multiple argument values: " + str(group_identifier))
assert len(group) == 1
remove_identifiers(default_sections, "temp_identifier")
# Groups the database by kernel only
group_identifiers = set_identifiers(database, clblast.KERNEL_ATTRIBUTES + ["kernel"], "group_identifier")
groups = get_groups_by_identifier(database, group_identifiers, "group_identifier")
# Loops over all groups
for group, group_identifier in groups:
# Computes the best parameters
default_parameters = get_common_best_parameters(group, group_identifier, verbose)
# Stores all the section's data
assert len(group) > 0
default_section = {}
for attribute in group[0].keys():
if attribute != "results" and attribute != "group_identifier":
default_section[attribute] = group[0][attribute]
default_section = set_default_device(default_section)
default_section["device_vendor"] = clblast.VENDOR_DEFAULT
default_section["device_type"] = clblast.DEVICE_TYPE_DEFAULT
default_section["results"] = [{"time": 0.0, "parameters": default_parameters}]
default_sections["sections"].append(default_section)
# Database with both types of defaults only
return default_sections
def get_smallest_best_parameters(group):
"""Sets defaults based on the smallest values of all known entries. The average might be better for performance but
some parameters might not be supported on other devices."""
# Counts the number of devices in this group
assert len(group) > 0
# Find the smallest values of the parameters
min_parameters = {}
for section in group:
assert len(section["results"]) > 0
minimum_time = min([result["time"] for result in section["results"]])
for result in section["results"]:
if result["time"] == minimum_time:
for parameter in result["parameters"]:
if parameter in min_parameters:
min_parameters[parameter] = min(min_parameters[parameter], result["parameters"][parameter])
else:
min_parameters[parameter] = result["parameters"][parameter]
return min_parameters
def get_common_best_parameters(group, group_identifier, verbose):
"""Sets defaults based on the best values of entries supported by all devices. This might cause a problem in case
not every device was tuned with the same parameters. In that case it falls back to the above method to retrieve
the smallest best execution time"""
# Counts the number of devices in this group
num_devices = len(group)
assert num_devices > 0
# Inserts the relative execution times into the database
for section in group:
assert len(section["results"]) > 0
minimum_time = min([result["time"] for result in section["results"]])
for result in section["results"]:
result["relative_performance"] = minimum_time / result["time"]
# Determine which parameters are available for all devices
common_parameters = [result["parameters"] for result in group[0]["results"]] # Parameters of the first section
for i in range(1, num_devices):
section_parameters = [result["parameters"] for result in group[i]["results"]]
common_parameters = [p for p in section_parameters if p in common_parameters] # Intersection of the parameters
# Fall back to another method in case there are no shared entries at all across devices
if len(common_parameters) == 0:
if verbose:
print("[database] No common kernels for: " + str(group_identifier) + " with devices: %d " % num_devices)
smallest_best_parameters = get_smallest_best_parameters(group)
if verbose:
print("[database] " + str(group_identifier))
return smallest_best_parameters
# Removes entries with parameters which are not common
common_results = []
for section in group:
for result in section["results"]:
if result["parameters"] in common_parameters:
common_results.append(result)
# Retrieves the entries with the highest relative performance
relative_best_parameters = bests.get_relative_bests(group_identifier, common_results, common_parameters, verbose)
return relative_best_parameters

scripts/database/database/io.py

@@ -0,0 +1,60 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import re
import json
try:
from urllib.request import urlopen # Python 3
except ImportError:
from urllib2 import urlopen # Python 2
def download_database(filename, database_url):
"""Downloads a database and saves it to disk"""
print("[database] Downloading database from '" + database_url + "'...")
database = urlopen(database_url)
with open(filename, "wb") as f:
f.write(database.read())
def load_database(filename):
"""Loads a database from disk"""
print("[database] Loading database from '" + filename + "'")
with open(filename) as f:
return json.load(f)
def save_database(database, filename):
"""Saves a database to disk"""
print("[database] Saving database to '" + filename + "'")
with open(filename, "wb") as f:
json.dump(database, f, sort_keys=True, indent=4)
def load_tuning_results(filename):
"""Loads JSON data from file and pre-processes it"""
with open(filename) as f:
json_data = json.load(f)
# Removes the numbering following the kernel family name
json_data["kernel_family"] = re.sub(r'_\d+', '', json_data["kernel_family"])
# Adds the kernel name to the section instead of to the individual results
assert len(json_data["results"]) > 0
json_data["kernel"] = json_data["results"][0]["kernel"]
for result in json_data["results"]:
assert json_data["kernel"] == result["kernel"]
result.pop("kernel", None)
# Removes the 'PRECISION' parameter from the individual results: it is redundant
for result in json_data["results"]:
assert json_data["precision"] == str(result["parameters"]["PRECISION"])
result["parameters"].pop("PRECISION", None)
# All done
return json_data

scripts/generator/datatype.py

@@ -1,70 +0,0 @@
#!/usr/bin/env python
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file contains the 'DataType' class, used in the generator script to generate the CLBlast API
# interface and implementation.
#
# ==================================================================================================
# Short-hands for data-types
HLF = "half"
FLT = "float"
DBL = "double"
FLT2 = "float2"
DBL2 = "double2"
HCL = "cl_half"
F2CL = "cl_float2"
D2CL = "cl_double2"
# Structure holding data-type and precision information
class DataType():
def __init__(self, precision_name, name, template, scalars, buffertype):
self.precision_name = precision_name
self.name = name
self.template = template
self.alpha_cpp = scalars[0]
self.beta_cpp = scalars[1]
self.alpha_cl = scalars[2]
self.beta_cl = scalars[3]
self.buffertype = buffertype
# Outputs the name of the data-type (alpha/beta), possibly transforming into the right type
def UseAlpha(self):
if self.alpha_cpp in [FLT2, DBL2]:
return self.alpha_cpp+"{alpha.s[0], alpha.s[1]}"
return "alpha"
def UseBeta(self):
if self.beta_cpp in [FLT2, DBL2]:
return self.beta_cpp+"{beta.s[0], beta.s[1]}"
return "beta"
# As above, but the transformation is in the opposite direction
def UseAlphaCL(self):
if self.alpha_cpp in [FLT2, DBL2]:
return self.alpha_cl+"{{alpha.real(), alpha.imag()}}"
return "alpha"
def UseBetaCL(self):
if self.beta_cpp in [FLT2, DBL2]:
return self.beta_cl+"{{beta.real(), beta.imag()}}"
return "beta"
# Returns the template as used in the correctness/performance tests
def TestTemplate(self):
if self.buffertype != self.beta_cpp:
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
# Current scalar is complex
def IsComplex(self, scalar):
return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or
(scalar == "beta" and self.beta_cpp in [FLT2, DBL2]))
# ==================================================================================================

scripts/generator/generator.py

@@ -1,14 +1,13 @@
#!/usr/bin/env python
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line.
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This script automatically generates the bodies of the following files, creating the full CLBlast
# API interface and implementation (C, C++, and reference BLAS wrappers):
# This script automatically generates the bodies of the following files, creating the full CLBlast API interface and
# implementation (C, C++, and reference BLAS wrappers):
# clblast.h
# clblast.cpp
# clblast_c.h
@@ -19,45 +18,20 @@
# test/correctness/routines/levelX/xYYYY.cpp
# test/performance/routines/levelX/xYYYY.cpp
# It also produces the API documentation found in doc/clblast.md
#
# ==================================================================================================
# System modules
import sys
import os.path
import argparse
# Local files
from routine import Routine
from datatype import DataType, HLF, FLT, DBL, FLT2, DBL2, HCL, F2CL, D2CL
import generator.cpp as cpp
import generator.doc as doc
from generator.routine import Routine
from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU
# ==================================================================================================
# Regular data-types
H = DataType("H", "H", HLF, [HLF, HLF, HCL, HCL], HLF ) # half (16)
S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464)
# Special cases
Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output
Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output
iH = DataType("H", "iH", HLF, [HLF, HLF, HLF, HLF], HLF ) # As H, but with integer output
iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output
iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output
iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output
iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output
Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S
Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D
Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S
Zzd = DataType("Z", "Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D
# C++ template data-types
T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine
Tc = DataType("Tc", "typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk
TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k
# ==================================================================================================
HEADER_LINES = [96, 73, 97, 22, 29, 41]
FOOTER_LINES = [17, 75, 19, 14, 6, 6]
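# A sketch of the splice these constants drive (see the file loop below): for
# file i, keep the first HEADER_LINES[i] and last FOOTER_LINES[i] lines of the
# existing file and regenerate everything in between:
#   original = open(files[i]).readlines()
#   new_text = original[:HEADER_LINES[i]] + [body] + original[-FOOTER_LINES[i]:]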
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@ -77,472 +51,162 @@ cld_n = "The value of `c_ld` must be at least `n`."
# ==================================================================================================
# Populates a list of routines
routines = [
[ # Level 1: vector-vector
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
ROUTINES = [
[ # Level 1: vector-vector
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
[ # Level 2: matrix-vector
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[ # Level 3: matrix-matrix
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
[ # Level 3: matrix-matrix
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
],
[ # Level X: extra routines (not part of BLAS)
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
[ # Level X: extra routines (not part of BLAS)
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
]]
# ==================================================================================================
# Translates a precision character into its full CLBlast name
def PrecisionToFullName(x):
return {
'H': "Half",
'S': "Single",
'D': "Double",
'C': "ComplexSingle",
'Z': "ComplexDouble",
}[x]
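# For instance (illustrative): PrecisionToFullName("C") yields "ComplexSingle",
# which the test generators below expand to clblast::Precision::kComplexSingle.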
# ==================================================================================================
def main(argv):
# Separators for the BLAS levels
separators = ["""
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================""",
"""
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================""",
"""
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================""",
"""
// =================================================================================================
// Extra non-BLAS routines (level-X)
// ================================================================================================="""]
# Parses the command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("clblast_root", help="Root of the CLBlast sources")
parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
cl_args = parser.parse_args(argv)
library_root = cl_args.clblast_root
# Names of the level sub-folders
levelnames = ["1", "2", "3", "x"]
# Sets all the files to output
files = [
library_root + "/include/clblast.h",
library_root + "/src/clblast.cpp",
library_root + "/include/clblast_c.h",
library_root + "/src/clblast_c.cpp",
library_root + "/test/wrapper_clblas.hpp",
library_root + "/test/wrapper_cblas.hpp",
]
# Main header/footer for source files
header = """
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
"""
footer = """
// =================================================================================================
"""
# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
if not os.path.isfile(f):
print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library")
sys.exit()
# ==================================================================================================
# Iterates over all regular files to output
for i in range(0, len(files)):
# The C++ API header (.h)
def clblast_h(routines):
result = ""
for routine in routines:
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n"
return result
# Stores the header and the footer of the original file
with open(files[i]) as f:
original = f.readlines()
file_header = original[:HEADER_LINES[i]]
file_footer = original[-FOOTER_LINES[i]:]
# The C++ API implementation (.cpp)
def clblast_cc(routines):
result = ""
for routine in routines:
indent1 = " "*(20 + routine.Length())
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
if routine.implemented:
result += routine.RoutineHeaderCPP(12, "")+" {\n"
result += " auto queue_cpp = Queue(*queue);\n"
result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n"
result += " auto status = routine.SetUp();\n"
result += " if (status != StatusCode::kSuccess) { return status; }\n"
result += " return routine.Do"+routine.name.capitalize()+"("
result += (",\n"+indent1).join([a for a in routine.ArgumentsCladuc(routine.template, indent1)])
result += ");\n"
else:
result += routine.RoutineHeaderTypeCPP(12)+" {\n"
result += " return StatusCode::kNotImplemented;\n"
result += "}\n"
for flavour in routine.flavours:
indent2 = " "*(34 + routine.Length() + len(flavour.template))
result += "template StatusCode PUBLIC_API "+routine.name.capitalize()+"<"+flavour.template+">("
result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)])
result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n"
return result
# Re-writes the body of the file
with open(files[i], "w") as f:
body = ""
levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4]
for level in levels:
body += cpp.LEVEL_SEPARATORS[level - 1] + "\n"
for routine in ROUTINES[level - 1]:
if i == 0:
body += cpp.clblast_h(routine)
if i == 1:
body += cpp.clblast_cc(routine)
if i == 2:
body += cpp.clblast_c_h(routine)
if i == 3:
body += cpp.clblast_c_cc(routine)
if i == 4:
body += cpp.wrapper_clblas(routine)
if i == 5:
body += cpp.wrapper_cblas(routine)
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))
# ==================================================================================================
# Outputs all the test implementations
for level in [1, 2, 3, 4]:
for routine in ROUTINES[level - 1]:
if routine.has_tests:
level_string = cpp.LEVEL_NAMES[level - 1]
routine_suffix = "level" + level_string + "/x" + routine.name + ".cpp"
# The C API header (.h)
def clblast_c_h(routines):
result = ""
for routine in routines:
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
for flavour in routine.flavours:
result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n"
return result
# Correctness tests
filename = library_root + "/test/correctness/routines/" + routine_suffix
with open(filename, "w") as f:
f.write(cpp.HEADER + "\n")
f.write(cpp.correctness_test(routine, level_string))
f.write(cpp.FOOTER)
# The C API implementation (.cpp)
def clblast_c_cc(routines):
result = ""
for routine in routines:
result += "\n// "+routine.name.upper()+"\n"
for flavour in routine.flavours:
template = "<"+flavour.template+">" if routine.NoScalars() else ""
indent = " "*(26 + routine.Length() + len(template))
result += routine.RoutineHeaderC(flavour, 20, "")+" {\n"
result += " auto status = clblast::"+routine.name.capitalize()+template+"("
result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)])
result += ",\n"+indent+"queue, event);"
result += "\n return static_cast<StatusCode>(status);\n}\n"
return result
# Performance tests
filename = library_root + "/test/performance/routines/" + routine_suffix
with open(filename, "w") as f:
f.write(cpp.HEADER + "\n")
f.write(cpp.performance_test(routine, level_string))
f.write(cpp.FOOTER)
# ==================================================================================================
# Outputs the API documentation
filename = cl_args.clblast_root + "/doc/clblast.md"
with open(filename, "w") as f:
# The wrapper to the reference clBLAS routines (for performance/correctness testing)
def wrapper_clblas(routines):
result = ""
for routine in routines:
if routine.has_tests:
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNamesTested())
if routine.NoScalars():
result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
for flavour in routine.flavours:
result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
# Outputs the header
doc_header = doc.header()
f.write(doc_header)
# There is a version available in clBLAS
if flavour.precision_name in ["S","D","C","Z"]:
indent = " "*(17 + routine.Length())
arguments = routine.ArgumentsWrapperCL(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
# Generates the documentation for each routine
for level in [1, 2, 3, 4]:
for routine in ROUTINES[level - 1]:
if routine.implemented:
doc_routine = doc.generate(routine)
f.write(doc_routine)
# There is no clBLAS available, forward the call to one of the available functions
else: # Half-precision
indent = " "*(24 + routine.Length())
# Convert to float (note: integer buffers are also stored as half/float)
for buf in routine.inputs + routine.outputs:
result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n"
# Call the float routine
result += " auto status = clblasX"+routine.name+"("
result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
result += "\n"
# Convert back to half
for buf in routine.outputs:
result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n"
result += " return status;"
# Complete
result += "\n}\n"
return result
# The wrapper to the reference CBLAS routines (for performance/correctness testing)
def wrapper_cblas(routines):
result = ""
for routine in routines:
if routine.has_tests:
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNamesTested())
for flavour in routine.flavours:
result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
# There is a version available in CBLAS
if flavour.precision_name in ["S","D","C","Z"]:
indent = " "*(10 + routine.Length())
arguments = routine.ArgumentsWrapperC(flavour)
# Complex scalars
for scalar in routine.scalars:
if flavour.IsComplex(scalar):
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
# Special case for scalar outputs
assignment = ""
postfix = ""
endofline = ""
extra_argument = ""
for output_buffer in routine.outputs:
if output_buffer in routine.ScalarBuffersFirst():
if flavour in [C,Z]:
postfix += "_sub"
indent += " "
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
elif output_buffer in routine.IndexBuffers():
assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
indent += " "*len(assignment)
else:
assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
if (flavour.name in ["Sc","Dz"]):
assignment = assignment+".real("
endofline += ")"
else:
assignment = assignment+" = "
indent += " "*len(assignment)
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
result += (",\n"+indent).join([a for a in arguments])
result += extra_argument+endofline+");\n"
# There is no CBLAS available, forward the call to one of the available functions
else: # Half-precision
indent = " "*(9 + routine.Length())
# Convert to float (note: integer buffers are also stored as half/float)
for buf in routine.inputs + routine.outputs:
result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer);\n"
# Call the float routine
result += " cblasX"+routine.name+"("
result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()])
result += ");\n"
# Convert back to half
for buf in routine.outputs:
result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis);\n"
# Complete
result += "}\n"
return result
# ==================================================================================================
# Checks for the number of command-line arguments
if len(sys.argv) != 2:
print "[ERROR] Usage: generator.py <root_of_clblast>"
sys.exit()
# Parses the command-line arguments
path_clblast = sys.argv[1]
files = [
path_clblast+"/include/clblast.h",
path_clblast+"/src/clblast.cpp",
path_clblast+"/include/clblast_c.h",
path_clblast+"/src/clblast_c.cpp",
path_clblast+"/test/wrapper_clblas.hpp",
path_clblast+"/test/wrapper_cblas.hpp",
]
header_lines = [84, 74, 93, 22, 29, 41]
footer_lines = [17, 75, 19, 14, 6, 6]
# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
if not os.path.isfile(f):
print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library"
sys.exit()
# ==================================================================================================
# Iterates over all files to output
for i in xrange(0,len(files)):
# Stores the header and the footer of the original file
with open(files[i]) as f:
original = f.readlines()
file_header = original[:header_lines[i]]
file_footer = original[-footer_lines[i]:]
# Re-writes the body of the file
with open(files[i], "w") as f:
body = ""
levels = [1,2,3] if (i == 4 or i == 5) else [1,2,3,4]
for level in levels:
body += separators[level-1]+"\n"
if i == 0:
body += clblast_h(routines[level-1])
if i == 1:
body += clblast_cc(routines[level-1])
if i == 2:
body += clblast_c_h(routines[level-1])
if i == 3:
body += clblast_c_cc(routines[level-1])
if i == 4:
body += wrapper_clblas(routines[level-1])
if i == 5:
body += wrapper_cblas(routines[level-1])
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))
# ==================================================================================================
# Outputs all the correctness-test implementations
for level in [1,2,3,4]:
for routine in routines[level-1]:
if routine.has_tests:
filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
with open(filename, "w") as f:
body = ""
body += "#include \"test/correctness/testblas.hpp\"\n"
body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
body += " auto errors = size_t{0};\n"
not_first = "false"
for flavour in routine.flavours:
body += " errors += clblast::RunTests<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv, "+not_first+", \""+flavour.name+routine.name.upper()+"\");\n"
not_first = "true"
body += " if (errors > 0) { return 1; } else { return 0; }\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
# Outputs all the performance-test implementations
for level in [1,2,3,4]:
for routine in routines[level-1]:
if routine.has_tests:
filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
with open(filename, "w") as f:
body = ""
body += "#include \"test/performance/client.hpp\"\n"
body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
default = PrecisionToFullName(routine.flavours[0].precision_name)
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
for precision in ["H","S","D","C","Z"]:
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
found = False
for flavour in routine.flavours:
if flavour.precision_name == precision:
body += "\n clblast::RunClient<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv); break;\n"
found = True
if not found:
body += " throw std::runtime_error(\"Unsupported precision mode\");\n"
body += " }\n"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
# ==================================================================================================
# Outputs the API documentation
filename = path_clblast+"/doc/clblast.md"
with open(filename, "w") as f:
# Outputs the header
f.write("CLBlast: API reference\n")
f.write("================\n")
f.write("\n\n")
# Loops over the routines
for level in [1,2,3,4]:
for routine in routines[level-1]:
if routine.implemented:
# Routine header
f.write("x"+routine.name.upper()+": "+routine.description+"\n")
f.write("-------------\n")
f.write("\n")
f.write(routine.details+"\n")
f.write("\n")
# Routine API
f.write("C++ API:\n")
f.write("```\n")
f.write(routine.RoutineHeaderCPP(12, "")+"\n")
f.write("```\n")
f.write("\n")
f.write("C API:\n")
f.write("```\n")
for flavour in routine.flavours:
f.write(routine.RoutineHeaderC(flavour, 20, "")+"\n")
f.write("```\n")
f.write("\n")
# Routine arguments
f.write("Arguments to "+routine.name.upper()+":\n")
f.write("\n")
for argument in routine.ArgumentsDoc():
f.write("* "+argument+"\n")
f.write("* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.\n")
f.write("* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.\n")
f.write("\n")
# Routine requirements
if len(routine.RequirementsDoc()) > 0:
f.write("Requirements for "+routine.name.upper()+":\n")
f.write("\n")
for requirement in routine.RequirementsDoc():
f.write("* "+requirement+"\n")
f.write("\n")
# Routine footer
f.write("\n\n")
# ==================================================================================================
if __name__ == '__main__':
main(sys.argv[1:])
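# Typical invocation (illustrative; assumes the working directory contains the
# 'generator' package so the imports above resolve):
#   python generator.py /path/to/CLBlast --verbose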

View file

View file

@ -0,0 +1,69 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
def precision_to_full_name(x):
"""Translates an option name to a CLBlast data-type"""
return {
'H': "Half",
'S': "Single",
'D': "Double",
'C': "ComplexSingle",
'Z': "ComplexDouble",
}[x]
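# Example (illustrative):
#   precision_to_full_name('Z')  # -> "ComplexDouble"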
def option_to_clblast(x):
"""Translates an option name to a CLBlast data-type"""
return {
'layout': "Layout",
'a_transpose': "Transpose",
'b_transpose': "Transpose",
'ab_transpose': "Transpose",
'side': "Side",
'triangle': "Triangle",
'diagonal': "Diagonal",
}[x]
def option_to_clblas(x):
"""As above, but for clBLAS data-types"""
return {
'layout': "clblasOrder",
'a_transpose': "clblasTranspose",
'b_transpose': "clblasTranspose",
'ab_transpose': "clblasTranspose",
'side': "clblasSide",
'triangle': "clblasUplo",
'diagonal': "clblasDiag",
}[x]
def option_to_cblas(x):
"""As above, but for CBLAS data-types"""
return {
'layout': "CBLAS_ORDER",
'a_transpose': "CBLAS_TRANSPOSE",
'b_transpose': "CBLAS_TRANSPOSE",
'ab_transpose': "CBLAS_TRANSPOSE",
'side': "CBLAS_SIDE",
'triangle': "CBLAS_UPLO",
'diagonal': "CBLAS_DIAG",
}[x]
def option_to_documentation(x):
"""Translates an option name to a documentation string"""
return {
'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.",
'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).",
'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).",
'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.",
}[x]
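# Taken together, these tables let the generator map one CLBlast option to
# each back-end's enum type, e.g. (illustrative):
#   option_to_clblast("triangle")  # -> "Triangle"
#   option_to_clblas("triangle")   # -> "clblasUplo"
#   option_to_cblas("triangle")    # -> "CBLAS_UPLO"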

View file

@ -0,0 +1,257 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import generator.datatype as datatype
import generator.convert as convert
NL = "\n"
SEPARATOR = "// ================================================================================================="
# Separators for the BLAS levels
LEVEL_SEPARATORS = [
NL + SEPARATOR + NL + "// BLAS level-1 (vector-vector) routines" + NL + SEPARATOR,
NL + SEPARATOR + NL + "// BLAS level-2 (matrix-vector) routines" + NL + SEPARATOR,
NL + SEPARATOR + NL + "// BLAS level-3 (matrix-matrix) routines" + NL + SEPARATOR,
NL + SEPARATOR + NL + "// Extra non-BLAS routines (level-X)" + NL + SEPARATOR
]
# Names of the level sub-folders
LEVEL_NAMES = ["1", "2", "3", "x"]
# Main header/footer for source files
FOOTER = NL + SEPARATOR + NL
HEADER = NL + SEPARATOR + """
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
""" + SEPARATOR + NL
def clblast_h(routine):
"""The C++ API header (.h)"""
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL
return result
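# The fragment emitted per routine looks roughly like this (illustrative; the
# full argument list comes from routine_header_cpp and is elided here):
#   // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
#   template <typename T>
#   StatusCode Scal(const size_t n, const T alpha,
#                   ..., cl_command_queue* queue, cl_event* event = nullptr);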
def clblast_cc(routine):
"""The C++ API implementation (.cpp)"""
indent1 = " " * (20 + routine.length())
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
if routine.implemented:
result += routine.routine_header_cpp(12, "") + " {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " auto status = routine.SetUp();" + NL
result += " if (status != StatusCode::kSuccess) { return status; }" + NL
result += " return routine.Do" + routine.name.capitalize() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL
else:
result += routine.routine_header_type_cpp(12) + " {" + NL
result += " return StatusCode::kNotImplemented;" + NL
result += "}" + NL
for flavour in routine.flavours:
indent2 = " " * (34 + routine.length() + len(flavour.template))
result += "template StatusCode PUBLIC_API " + routine.name.capitalize() + "<" + flavour.template + ">("
result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)])
result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL
return result
def clblast_c_h(routine):
"""The C API header (.h)"""
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
for flavour in routine.flavours:
result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL
return result
def clblast_c_cc(routine):
"""The C API implementation (.cpp)"""
result = NL + "// " + routine.name.upper() + NL
for flavour in routine.flavours:
template = "<" + flavour.template + ">" if routine.no_scalars() else ""
indent = " " * (26 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 20, "") + " {" + NL
result += " auto status = clblast::" + routine.name.capitalize() + template + "("
result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
result += "," + NL + indent + "queue, event);"
result += NL + " return static_cast<StatusCode>(status);" + NL + "}" + NL
return result
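# Each C binding generated here is a thin cast-and-forward shim, roughly
# (illustrative, with the argument list elided):
#   StatusCode CLBlastSscal(const size_t n, const float alpha, ...) {
#     auto status = clblast::Scal(n, alpha, ..., queue, event);
#     return static_cast<StatusCode>(status);
#   }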
def wrapper_clblas(routine):
"""The wrapper to the reference clBLAS routines (for performance/correctness testing)"""
result = ""
if routine.has_tests:
result += NL + "// Forwards the clBLAS calls for %s" % routine.short_names_tested() + NL
if routine.no_scalars():
result += routine.routine_header_wrapper_clblas(routine.template, True, 21) + ";" + NL
for flavour in routine.flavours:
result += routine.routine_header_wrapper_clblas(flavour, False, 21) + " {" + NL
# There is a version available in clBLAS
if flavour.precision_name in ["S", "D", "C", "Z"]:
indent = " " * (17 + routine.length())
arguments = routine.arguments_wrapper_clblas(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);" + NL
result += " auto context = queue.GetContext();" + NL
result += " auto scratch_buffer = Buffer<" + flavour.template + ">"
result += "(context, " + routine.scratch + ");" + NL
arguments += ["scratch_buffer()"]
result += " return clblas" + flavour.name + routine.name + "("
result += ("," + NL + indent).join([a for a in arguments])
result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);"
# There is no clBLAS available, forward the call to one of the available functions
else: # Half-precision
indent = " " * (24 + routine.length())
# Convert to float (note: integer buffers are also stored as half/float)
for buf in routine.inputs + routine.outputs:
result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL
# Call the float routine
result += " auto status = clblasX" + routine.name + "("
result += ("," + NL + indent).join([a for a in routine.arguments_half()])
result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);"
result += NL
# Convert back to half
for buf in routine.outputs:
result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL
result += " return status;"
# Complete
result += NL + "}" + NL
return result
def wrapper_cblas(routine):
"""The wrapper to the reference CBLAS routines (for performance/correctness testing)"""
result = ""
if routine.has_tests:
result += NL + "// Forwards the Netlib BLAS calls for %s" % routine.short_names_tested() + NL
for flavour in routine.flavours:
result += routine.routine_header_wrapper_cblas(flavour, 12) + " {" + NL
# There is a version available in CBLAS
if flavour.precision_name in ["S", "D", "C", "Z"]:
indent = " " * (10 + routine.length())
arguments = routine.arguments_wrapper_cblas(flavour)
# Complex scalars
for scalar in routine.scalars:
if flavour.is_complex(scalar):
result += " const auto " + scalar + "_array = std::vector<" + flavour.buffer_type[:-1] + ">"
result += "{" + scalar + ".real(), " + scalar + ".imag()};" + NL
# Special case for scalar outputs
assignment = ""
postfix = ""
end_of_line = ""
extra_argument = ""
for output_buffer in routine.outputs:
if output_buffer in routine.scalar_buffers_first():
if flavour in [datatype.C, datatype.Z]:
postfix += "_sub"
indent += " "
extra_argument += "," + NL + indent
extra_argument += "reinterpret_cast<return_pointer_" + flavour.buffer_type[:-1] + ">"
extra_argument += "(&" + output_buffer + "_buffer[" + output_buffer + "_offset])"
elif output_buffer in routine.index_buffers():
assignment = "((int*)&" + output_buffer + "_buffer[0])[" + output_buffer + "_offset] = "
indent += " " * len(assignment)
else:
assignment = output_buffer + "_buffer[" + output_buffer + "_offset]"
if flavour.name in ["Sc", "Dz"]:
assignment += ".real("
end_of_line += ")"
else:
assignment += " = "
indent += " " * len(assignment)
result += " " + assignment + "cblas_" + flavour.name.lower() + routine.name + postfix + "("
result += ("," + NL + indent).join([a for a in arguments])
result += extra_argument + end_of_line + ");" + NL
# There is no CBLAS version available: convert the data to float and forward the call to the float routine
else: # Half-precision
indent = " " * (9 + routine.length())
# Convert to float (note: integer buffers are also stored as half/float)
for buf in routine.inputs + routine.outputs:
result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL
# Call the float routine
result += " cblasX" + routine.name + "("
result += ("," + NL + indent).join([a for a in routine.arguments_half()])
result += ");" + NL
# Convert back to half
for buf in routine.outputs:
result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL
# Complete
result += "}" + NL
return result
def performance_test(routine, level_string):
"""Generates the body of a performance test for a specific routine"""
result = ""
result += "#include \"test/performance/client.hpp\"" + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL
result += "// Shortcuts to the clblast namespace" + NL
result += "using float2 = clblast::float2;" + NL
result += "using double2 = clblast::double2;" + NL + NL
result += "// Main function (not within the clblast namespace)" + NL
result += "int main(int argc, char *argv[]) {" + NL
default = convert.precision_to_full_name(routine.flavours[0].precision_name)
result += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k" + default + ")) {" + NL
for precision in ["H", "S", "D", "C", "Z"]:
result += " case clblast::Precision::k" + convert.precision_to_full_name(precision) + ":"
found = False
for flavour in routine.flavours:
if flavour.precision_name == precision:
result += NL + " clblast::RunClient<clblast::TestX" + routine.name + flavour.test_template()
result += ">(argc, argv); break;" + NL
found = True
if not found:
result += " throw std::runtime_error(\"Unsupported precision mode\");" + NL
result += " }" + NL
result += " return 0;" + NL
result += "}" + NL
return result
def correctness_test(routine, level_string):
"""Generates the body of a correctness test for a specific routine"""
result = ""
result += "#include \"test/correctness/testblas.hpp\"" + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL
result += "// Shortcuts to the clblast namespace" + NL
result += "using float2 = clblast::float2;" + NL
result += "using double2 = clblast::double2;" + NL + NL
result += "// Main function (not within the clblast namespace)" + NL
result += "int main(int argc, char *argv[]) {" + NL
result += " auto errors = size_t{0};" + NL
not_first = "false"
for flavour in routine.flavours:
result += " errors += clblast::RunTests<clblast::TestX" + routine.name + flavour.test_template()
result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.name.upper() + "\");" + NL
not_first = "true"
result += " if (errors > 0) { return 1; } else { return 0; }" + NL
result += "}" + NL
return result
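Both emitters above share the same shape: include the test harness, alias the complex types, and generate a `main` that dispatches per precision. A hedged usage sketch (the `axpy` Routine instance and the file names are assumptions):

```python
# Hypothetical usage: emit both test harnesses for a level-1 routine.
with open("xaxpy_performance.cpp", "w") as f:
    f.write(performance_test(axpy, "1"))  # client main() switching on precision
with open("xaxpy_correctness.cpp", "w") as f:
    f.write(correctness_test(axpy, "1"))  # one RunTests<> call per flavour
```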

View file

@ -0,0 +1,92 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
# Short-hands for data-types
D_HALF = "half"
D_FLOAT = "float"
D_DOUBLE = "double"
D_FLOAT2 = "float2"
D_DOUBLE2 = "double2"
D_HALF_OPENCL = "cl_half"
D_FLOAT2_OPENCL = "cl_float2"
D_DOUBLE2_OPENCL = "cl_double2"
class DataType:
"""Class holding data-type and precision information"""
def __init__(self, precision_name, name, template, scalars, buffer_type):
self.precision_name = precision_name
self.name = name
self.template = template
self.alpha_cpp = scalars[0]
self.beta_cpp = scalars[1]
self.alpha_cl = scalars[2]
self.beta_cl = scalars[3]
self.buffer_type = buffer_type
def use_alpha(self):
"""Outputs the name of the data-type (alpha/beta), possibly transforming into the right type"""
if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]:
return self.alpha_cpp + "{alpha.s[0], alpha.s[1]}"
return "alpha"
def use_beta(self):
"""As above, but for beta instead of alpha"""
if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]:
return self.beta_cpp + "{beta.s[0], beta.s[1]}"
return "beta"
def use_alpha_opencl(self):
"""As above, but the transformation is in the opposite direction"""
if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]:
return self.alpha_cl + "{{alpha.real(), alpha.imag()}}"
return "alpha"
def use_beta_opencl(self):
"""As above, but for beta instead of alpha"""
if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]:
return self.beta_cl + "{{beta.real(), beta.imag()}}"
return "beta"
def test_template(self):
"""Returns the template as used in the correctness/performance tests"""
if self.buffer_type != self.beta_cpp:
return "<" + self.buffer_type + "," + self.beta_cpp + ">, " + self.buffer_type + ", " + self.beta_cpp
return "<" + self.buffer_type + ">, " + self.buffer_type + ", " + self.beta_cpp
def is_complex(self, scalar):
"""Current scalar is complex"""
return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or
(scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2]))
# Regular data-types
H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16)
S = DataType("S", "S", D_FLOAT, [D_FLOAT] * 4, D_FLOAT) # single (32)
D = DataType("D", "D", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE) # double (64)
C = DataType("C", "C", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2) # single-complex (3232)
Z = DataType("Z", "Z", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2) # double-complex (6464)
# Special cases
Sc = DataType("C", "Sc", D_FLOAT2, [D_FLOAT2] * 4, D_FLOAT2) # As C, but with real output
Dz = DataType("Z", "Dz", D_DOUBLE2, [D_DOUBLE2] * 4, D_DOUBLE2) # As Z, but with real output
iH = DataType("H", "iH", D_HALF, [D_HALF] * 4, D_HALF) # As H, but with integer output
iS = DataType("S", "iS", D_FLOAT, [D_FLOAT] * 4, D_FLOAT) # As S, but with integer output
iD = DataType("D", "iD", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE) # As D, but with integer output
iC = DataType("C", "iC", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2) # As C, but with integer output
iZ = DataType("Z", "iZ", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2) # As Z, but with int output
Css = DataType("C", "C", D_FLOAT, [D_FLOAT, D_FLOAT, D_FLOAT, D_FLOAT], D_FLOAT2) # As C, but with constants from S
Zdd = DataType("Z", "Z", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE2) # As Z, but with constants from D
Ccs = DataType("C", "C", D_FLOAT2 + "," + D_FLOAT, [D_FLOAT2, D_FLOAT, D_FLOAT2_OPENCL, D_FLOAT], D_FLOAT2) # As C, but with one constant from S
Zzd = DataType("Z", "Z", D_DOUBLE2 + "," + D_DOUBLE, [D_DOUBLE2, D_DOUBLE, D_DOUBLE2_OPENCL, D_DOUBLE], D_DOUBLE2) # As Z, but with one constant from D
# C++ template data-types
T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine
Tc = DataType("Tc", "typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk
TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k

View file

@ -0,0 +1,57 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
NL = "\n"
def header():
"""Generates the header for the API documentation"""
result = "CLBlast: API reference" + NL
result += "================" + NL + NL + NL
return result
def generate(routine):
"""Generates the API documentation for a given routine"""
result = ""
# Routine header
result += "x" + routine.name.upper() + ": " + routine.description + NL
result += "-------------" + NL + NL
result += routine.details + NL + NL
# Routine API
result += "C++ API:" + NL
result += "```" + NL
result += routine.routine_header_cpp(12, "") + NL
result += "```" + NL + NL
result += "C API:" + NL
result += "```" + NL
for flavour in routine.flavours:
result += routine.routine_header_c(flavour, 20, "") + NL
result += "```" + NL + NL
# Routine arguments
result += "Arguments to " + routine.name.upper() + ":" + NL + NL
for argument in routine.arguments_doc():
result += "* " + argument + NL
result += "* `cl_command_queue* queue`: "
result += "Pointer to an OpenCL command queue associated with a context and device to execute the routine on." + NL
result += "* `cl_event* event`: "
result += "Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). "
result += "This is an optional argument." + NL + NL
# Routine requirements
if len(routine.requirements_doc()) > 0:
result += "Requirements for " + routine.name.upper() + ":" + NL + NL
for requirement in routine.requirements_doc():
result += "* " + requirement + NL
result += NL
# Routine footer
result += NL + NL
return result
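The documentation generator is driven the same way as the API generators. A sketch of the top-level loop (output path and `routines` list are assumptions):

```python
# Hypothetical driver: regenerate the markdown API reference in one pass.
with open("api.md", "w") as f:
    f.write(header())
    for routine in routines:
        f.write(generate(routine))
```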

View file

@ -0,0 +1,552 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
from itertools import chain
import generator.convert as convert
class Routine:
"""Class holding routine-specific information (e.g. name, which arguments, which precisions)"""
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
inputs, outputs, scalars, scratch, description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.level = level
self.name = name
self.template = template
self.flavours = flavours
self.sizes = sizes
self.options = options
self.inputs = inputs
self.outputs = outputs
self.scalars = scalars
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
self.description = description
self.details = details
self.requirements = requirements
@staticmethod
def scalar_buffers_first():
"""List of scalar buffers"""
return ["dot", "nrm2", "asum", "sum", "imax", "imin"]
@staticmethod
def scalar_buffers_second():
"""List of scalar buffers"""
return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"]
@staticmethod
def other_scalars():
"""List of scalars other than alpha and beta"""
return ["cos", "sin"]
@staticmethod
def index_buffers():
"""List of buffers with unsigned int type"""
return ["imax", "imin"]
@staticmethod
def postfix(name):
"""Retrieves the postfix for a buffer"""
return "inc" if (name in ["x", "y"]) else "ld"
@staticmethod
def buffers_vector():
"""Distinguish between vectors and matrices"""
return ["x", "y"]
@staticmethod
def buffers_matrix():
"""Distinguish between vectors and matrices"""
return ["a", "b", "c", "ap"]
def non_index_inputs(self):
"""Lists of input/output buffers not index (integer)"""
buffers = self.inputs[:] # make a copy
for i in self.index_buffers():
if i in buffers:
buffers.remove(i)
return buffers
def non_index_outputs(self):
"""Lists of input/output buffers not index (integer)"""
buffers = self.outputs[:] # make a copy
for i in self.index_buffers():
if i in buffers:
buffers.remove(i)
return buffers
def buffers_without_ld_inc(self):
"""List of buffers without 'inc' or 'ld'"""
return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"]
def length(self):
"""Retrieves the number of characters in the routine's name"""
return len(self.name)
def no_scalars(self):
"""Determines whether or not this routine has scalar arguments (alpha/beta)"""
return self.scalars == []
def short_names(self):
"""Returns the upper-case names of these routines (all flavours)"""
return "/".join([f.name + self.name.upper() for f in self.flavours])
def short_names_tested(self):
"""As above, but excludes some"""
names = [f.name + self.name.upper() for f in self.flavours]
if "H" + self.name.upper() in names:
names.remove("H" + self.name.upper())
return "/".join(names)
def buffers_first(self):
"""Determines which buffers go first (between alpha and beta) and which ones go after"""
if self.level == "2b":
return ["x", "y"]
return ["ap", "a", "b", "x"]
def buffers_second(self):
if self.level == "2b":
return ["ap", "a", "b", "c"]
return ["y", "c"]
def buffer(self, name):
"""Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""
if name in self.inputs or name in self.outputs:
a = [name + "_buffer"]
b = [name + "_offset"]
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
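To see what these list-builders produce, consider a minimal AXPY-like routine; all constructor values here are assumptions for illustration, and `T`, `S`, `D` are the data-types defined in the datatype file above:

```python
# Hypothetical AXPY-like routine: one size, one scalar, vector x (input) and y (output)
r = Routine(True, True, "1", "axpy", T, [S, D], ["n"], [],
            ["x"], ["y"], ["alpha"], "", "description", "details", [])
assert r.buffer("x") == ["x_buffer, x_offset, x_inc"]  # vectors get an 'inc' postfix
assert r.buffer("a") == []  # 'a' is not an input/output of this routine
```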
def buffer_bis(self, name):
"""As above but with a '_bis' suffix for the buffer name"""
if name in self.inputs or name in self.outputs:
a = [name + "_buffer_bis"]
b = [name + "_offset"]
c = [name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
def buffer_def(self, name):
"""As above but with data-types"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + "cl_mem " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
def buffer_def_wrapper_cl(self, name, flavour):
"""As above but with data-types"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
def buffer_def_vector(self, name, flavour):
"""As above but as vectors"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
def buffer_clcudaapi(self, name):
"""As above but with CLCudaAPI buffers"""
if name in self.inputs or name in self.outputs:
buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type
a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"]
b = [name + "_offset"]
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
def buffer_wrapper_clblas(self, name):
"""As above but with a static cast for clBLAS wrapper"""
if name in self.inputs or name in self.outputs:
a = [name + "_buffer()"]
b = [name + "_offset"]
c = []
if name in ["x", "y"]:
c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"]
elif name in ["a", "b", "c"]:
c = [name + "_" + self.postfix(name)]
return [", ".join(a + b + c)]
return []
def buffer_wrapper_cblas(self, name, flavour):
"""As above but with a static cast for CBLAS wrapper"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
if name == "sy1":
a = [name + "_buffer[" + name + "_offset]"]
elif flavour.precision_name in ["C", "Z"]:
a = ["reinterpret_cast<" + prefix + flavour.buffer_type[:-1] + "*>" +
"(&" + name + "_buffer[" + name + "_offset])"]
else:
a = ["&" + name + "_buffer[" + name + "_offset]"]
c = []
if name in ["x", "y"]:
c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"]
elif name in ["a", "b", "c"]:
c = [name + "_" + self.postfix(name)]
return [", ".join(a + c)]
return []
def buffer_type(self, name):
"""As above, but only data-types"""
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix + "cl_mem"]
b = ["const size_t"]
c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
def buffer_doc(self, name):
"""Retrieves the documentation of the buffers"""
prefix = "const " if (name in self.inputs) else ""
inout = "input" if (name in self.inputs) else "output"
if (name in self.inputs) or (name in self.outputs):
math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector"
inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment "
a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."]
b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."]
if name not in self.buffers_without_ld_inc():
c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " +
inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."]
else:
c = []
return a + b + c
return []
def scalar(self, name):
"""Retrieves the name of a scalar (alpha/beta)"""
if name in self.scalars:
return [name]
return []
def scalar_half_to_float(self, name):
"""As above, but converts from float to half"""
if name in self.scalars:
return ["HalfToFloat(" + name + ")"]
return []
def scalar_use(self, name, flavour):
"""Retrieves the use of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
return [flavour.use_alpha()]
elif name == "beta":
return [flavour.use_beta()]
return [name]
return []
def scalar_use_wrapper(self, name, flavour):
"""As above, but for the clBLAS wrapper"""
if name in self.scalars:
if name == "alpha":
return [flavour.use_alpha_opencl()]
elif name == "beta":
return [flavour.use_beta_opencl()]
return [name]
return []
def scalar_use_wrapper_cblas(self, name, flavour):
"""As above, but for the CBLAS wrapper"""
if name in self.scalars:
if flavour.is_complex(name):
return [name + "_array.data()"]
return [name]
return []
def scalar_def(self, name, flavour):
"""Retrieves the definition of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cl + " " + name]
return ["const " + flavour.beta_cl + " " + name]
return []
def scalar_def_plain(self, name, flavour):
"""As above, but without 'cl_' prefix"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cpp + " " + name]
return ["const " + flavour.beta_cpp + " " + name]
return []
def scalar_type(self, name, flavour):
"""Retrieves the type of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cpp]
return ["const " + flavour.beta_cpp]
return []
def scalar_doc(self, name):
"""Retrieves the documentation of a scalar"""
if name in self.scalars:
if name == "alpha":
return ["`const " + self.template.alpha_cpp + " " + name + "`: Input scalar constant."]
return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."]
return []
def sizes_list(self):
"""Retrieves a list of comma-separated sizes (m, n, k)"""
if self.sizes:
return [", ".join([s for s in self.sizes])]
return []
def sizes_def(self):
"""Retrieves the definition of the sizes (m,n,k)"""
if self.sizes:
return [", ".join(["const size_t " + s for s in self.sizes])]
return []
def sizes_type(self):
"""Retrieves the types of the sizes (m,n,k)"""
if self.sizes:
return [", ".join(["const size_t" for s in self.sizes])]
return []
def sizes_doc(self):
"""# Retrieves the documentation of the sizes"""
if self.sizes:
definitions = ["`const size_t " + s + "`: Integer size argument. This value must be positive." for s in self.sizes]
return definitions
return []
def options_list(self):
"""Retrieves a list of options"""
if self.options:
return [", ".join(self.options)]
return []
def options_cast(self, indent):
"""As above, but now casted to CLBlast data-types"""
if self.options:
options = ["static_cast<clblast::" + convert.option_to_clblast(o) + ">(" + o + ")" for o in self.options]
return [(",\n" + indent).join(options)]
return []
def options_def(self):
"""Retrieves the definitions of the options (layout, transpose, side, etc.)"""
if self.options:
definitions = ["const " + convert.option_to_clblast(o) + " " + o for o in self.options]
return [", ".join(definitions)]
return []
def options_def_wrapper_clblas(self):
"""As above, but now using clBLAS data-types"""
if self.options:
definitions = ["const " + convert.option_to_clblas(o) + " " + o for o in self.options]
return [", ".join(definitions)]
return []
def options_def_wrapper_cblas(self):
"""As above, but now using CBLAS data-types"""
if self.options:
definitions = ["const " + convert.option_to_cblas(o) + " " + o for o in self.options]
return [", ".join(definitions)]
return []
def options_type(self):
"""Retrieves the types of the options (layout, transpose, side, etc.)"""
if self.options:
definitions = ["const " + convert.option_to_clblast(o) for o in self.options]
return [", ".join(definitions)]
return []
def options_doc(self):
"""Retrieves the documentation of the options"""
if self.options:
definitions = ["`const " + convert.option_to_clblast(o) + " " + o + "`: " + convert.option_to_documentation(o) for o in self.options]
return definitions
return []
def arguments(self):
"""Retrieves a combination of all the argument names (no types)"""
return (self.options_list() + self.sizes_list() +
list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) +
self.scalar("alpha") +
list(chain(*[self.buffer(b) for b in self.buffers_first()])) +
self.scalar("beta") +
list(chain(*[self.buffer(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar(s) for s in self.other_scalars()])))
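Continuing the AXPY-like sketch from above, the assembled argument list interleaves sizes, scalars, and buffers in BLAS order:

```python
# For the assumed routine 'r' defined earlier:
assert r.arguments() == ["n", "alpha",
                         "x_buffer, x_offset, x_inc",
                         "y_buffer, y_offset, y_inc"]
```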
def arguments_half(self):
"""As above, but with conversions from half to float"""
return (self.options_list() + self.sizes_list() +
list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_first()])) +
self.scalar_half_to_float("alpha") +
list(chain(*[self.buffer_bis(b) for b in self.buffers_first()])) +
self.scalar_half_to_float("beta") +
list(chain(*[self.buffer_bis(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar(s) for s in self.other_scalars()])))
def arguments_clcudaapi(self):
"""Retrieves a combination of all the argument names, with CLCudaAPI casts"""
return (self.options_list() + self.sizes_list() +
list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_first()])) +
self.scalar("alpha") +
list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_first()])) +
self.scalar("beta") +
list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar(s) for s in self.other_scalars()])))
def arguments_cast(self, flavour, indent):
"""As above, but with CLBlast casts"""
return (self.options_cast(indent) + self.sizes_list() +
list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) +
self.scalar_use("alpha", flavour) +
list(chain(*[self.buffer(b) for b in self.buffers_first()])) +
self.scalar_use("beta", flavour) +
list(chain(*[self.buffer(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])))
def arguments_wrapper_clblas(self, flavour):
"""As above, but for the clBLAS wrapper"""
return (self.options_list() + self.sizes_list() +
list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_first()])) +
self.scalar_use_wrapper("alpha", flavour) +
list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_first()])) +
self.scalar_use_wrapper("beta", flavour) +
list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use_wrapper(s, flavour) for s in self.other_scalars()])))
def arguments_wrapper_cblas(self, flavour):
"""As above, but for the CBLAS wrapper"""
return (self.options_list() + self.sizes_list() +
self.scalar_use_wrapper_cblas("alpha", flavour) +
list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_first()])) +
self.scalar_use_wrapper_cblas("beta", flavour) +
list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use_wrapper_cblas(s, flavour) for s in self.other_scalars()])))
def arguments_def(self, flavour):
"""Retrieves a combination of all the argument definitions"""
return (self.options_def() + self.sizes_def() +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) +
self.scalar_def("alpha", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) +
self.scalar_def("beta", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
def arguments_def_wrapper_clblas(self, flavour):
"""As above, but clBLAS wrapper plain data-types"""
return (self.options_def_wrapper_clblas() + self.sizes_def() +
list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_first()])) +
self.scalar_def_plain("alpha", flavour) +
list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_first()])) +
self.scalar_def_plain("beta", flavour) +
list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()])))
def arguments_def_wrapper_cblas(self, flavour):
"""As above, but CBLAS wrapper plain data-types"""
return (self.options_def_wrapper_cblas() + self.sizes_def() +
list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_first()])) +
self.scalar_def_plain("alpha", flavour) +
list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_first()])) +
self.scalar_def_plain("beta", flavour) +
list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()])))
def arguments_type(self, flavour):
"""Retrieves a combination of all the argument types"""
return (self.options_type() + self.sizes_type() +
list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_first()])) +
self.scalar_type("alpha", flavour) +
list(chain(*[self.buffer_type(b) for b in self.buffers_first()])) +
self.scalar_type("beta", flavour) +
list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()])))
def arguments_doc(self):
"""Retrieves a combination of all the argument types"""
return (self.options_doc() + self.sizes_doc() +
list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_first()])) +
self.scalar_doc("alpha") +
list(chain(*[self.buffer_doc(b) for b in self.buffers_first()])) +
self.scalar_doc("beta") +
list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_doc(s) for s in self.other_scalars()])))
def requirements_doc(self):
"""Retrieves a list of routine requirements for documentation"""
return self.requirements
def routine_header_cpp(self, spaces, default_event):
"""Retrieves the C++ templated definition for a routine"""
indent = " " * (spaces + self.length())
result = "template <" + self.template.name + ">\n"
result += "StatusCode " + self.name.capitalize() + "("
result += (",\n" + indent).join([a for a in self.arguments_def(self.template)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")"
return result
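For the same assumed routine, the emitted templated C++ signature (twelve spaces of alignment, no default event argument) would come out as:

```python
print(r.routine_header_cpp(12, ""))
# template <typename T>
# StatusCode Axpy(const size_t n,
#                 const T alpha,
#                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
#                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
#                 cl_command_queue* queue, cl_event* event)
```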
def routine_header_type_cpp(self, spaces):
"""As above, but now without variable names"""
indent = " " * (spaces + self.length())
result = "template <" + self.template.name + ">\n"
result += "StatusCode " + self.name.capitalize() + "("
result += (",\n" + indent).join([a for a in self.arguments_type(self.template)])
result += ",\n" + indent + "cl_command_queue*, cl_event*)"
return result
def routine_header_c(self, flavour, spaces, extra_qualifier):
"""As above, but now for C"""
indent = " " * (spaces + self.length())
result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def(flavour)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
return result
def routine_header_wrapper_clblas(self, flavour, def_only, spaces):
"""As above, but now for the clBLAS wrapper"""
template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else ""
indent = " " * (spaces + self.length() + len(template))
result = ""
if self.no_scalars():
result += "template <"
if def_only:
result += flavour.name
result += ">\n"
result += "clblasStatus clblasX" + self.name + template + "("
result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_clblas(flavour)])
result += ",\n" + indent + "cl_uint num_queues, cl_command_queue *queues"
result += ",\n" + indent + "cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
return result
def routine_header_wrapper_cblas(self, flavour, spaces):
"""As above, but now for the CBLAS wrapper"""
indent = " " * (spaces + self.length())
result = "void cblasX" + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cblas(flavour)]) + ")"
return result

View file

@ -1,603 +0,0 @@
#!/usr/bin/env python
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file contains the 'Routine' class, used in the generator script to generate the CLBlast API
# interface and implementation.
#
# ==================================================================================================
# System modules
from itertools import chain
# Translates an option name to a CLBlast data-type
def OptionToCLBlast(x):
return {
'layout': "Layout",
'a_transpose': "Transpose",
'b_transpose': "Transpose",
'ab_transpose': "Transpose",
'side': "Side",
'triangle': "Triangle",
'diagonal': "Diagonal",
}[x]
# As above, but for clBLAS data-types
def OptionToWrapperCL(x):
return {
'layout': "clblasOrder",
'a_transpose': "clblasTranspose",
'b_transpose': "clblasTranspose",
'ab_transpose': "clblasTranspose",
'side': "clblasSide",
'triangle': "clblasUplo",
'diagonal': "clblasDiag",
}[x]
# As above, but for CBLAS data-types
def OptionToWrapperC(x):
return {
'layout': "CBLAS_ORDER",
'a_transpose': "CBLAS_TRANSPOSE",
'b_transpose': "CBLAS_TRANSPOSE",
'ab_transpose': "CBLAS_TRANSPOSE",
'side': "CBLAS_SIDE",
'triangle': "CBLAS_UPLO",
'diagonal': "CBLAS_DIAG",
}[x]
# Translates an option name to a documentation string
def OptionToDoc(x):
return {
'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.",
'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).",
'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).",
'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.",
}[x]
# ==================================================================================================
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
class Routine():
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
inputs, outputs, scalars, scratch, description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.level = level
self.name = name
self.template = template
self.flavours = flavours
self.sizes = sizes
self.options = options
self.inputs = inputs
self.outputs = outputs
self.scalars = scalars
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
self.description = description
self.details = details
self.requirements = requirements
# List of scalar buffers
def ScalarBuffersFirst(self):
return ["dot","nrm2","asum","sum","imax","imin"]
def ScalarBuffersSecond(self):
return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"]
# List of scalars other than alpha and beta
def OtherScalars(self):
return ["cos","sin"]
# List of buffers with unsigned int type
def IndexBuffers(self):
return ["imax","imin"]
# Lists of input/output buffers not index (integer)
def NonIndexInputs(self):
buffers = self.inputs[:] # make a copy
for i in self.IndexBuffers():
if i in buffers: buffers.remove(i)
return buffers
def NonIndexOutputs(self):
buffers = self.outputs[:] # make a copy
for i in self.IndexBuffers():
if i in buffers: buffers.remove(i)
return buffers
# List of buffers without 'inc' or 'ld'
def BuffersWithoutLdInc(self):
return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"]
# Retrieves the number of characters in the routine's name
def Length(self):
return len(self.name)
# Retrieves the postfix for a buffer
def Postfix(self, name):
return "inc" if (name in ["x","y"]) else "ld"
# Determines whether or not this routine has scalar arguments (alpha/beta)
def NoScalars(self):
return self.scalars == []
# Returns the upper-case names of these routines (all flavours)
def ShortNames(self):
return "/".join([f.name+self.name.upper() for f in self.flavours])
# As above, but excludes some
def ShortNamesTested(self):
names = [f.name+self.name.upper() for f in self.flavours]
if "H"+self.name.upper() in names: names.remove("H"+self.name.upper())
return "/".join(names)
# Determines which buffers go first (between alpha and beta) and which ones go after
def BuffersFirst(self):
if self.level == "2b":
return ["x","y"]
return ["ap","a","b","x"]
def BuffersSecond(self):
if self.level == "2b":
return ["ap","a","b","c"]
return ["y","c"]
# Distinguish between vectors and matrices
def BuffersVector(self):
return ["x","y"]
def BuffersMatrix(self):
return ["a","b","c","ap"]
# ==============================================================================================
# Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')
def Buffer(self, name):
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer"]
b = [name+"_offset"]
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with a '_bis' suffix for the buffer name
def BufferBis(self, name):
#if (name in self.IndexBuffers()):
# return self.Buffer(name)
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer_bis"]
b = [name+"_offset"]
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with data-types
def BufferDef(self, name):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"cl_mem "+name+"_buffer"]
b = ["const size_t "+name+"_offset"]
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with data-types
def BufferDefWrapperCL(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"Buffer<"+flavour.buffertype+">& "+name+"_buffer"]
b = ["const size_t "+name+"_offset"]
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but as vectors
def BufferDefVector(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"]
b = ["const size_t "+name+"_offset"]
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with Claduc buffers
def BufferCladuc(self, name):
if (name in self.inputs) or (name in self.outputs):
buffertype = "unsigned int" if (name in self.IndexBuffers()) else self.template.buffertype
a = ["Buffer<"+buffertype+">("+name+"_buffer)"]
b = [name+"_offset"]
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with a static cast for clBLAS wrapper
def BufferWrapperCL(self, name):
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer()"]
b = [name+"_offset"]
c = []
if (name in ["x","y"]):
c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
elif (name in ["a","b","c"]):
c = [name+"_"+self.Postfix(name)]
return [", ".join(a+b+c)]
return []
# As above but with a static cast for CBLAS wrapper
def BufferWrapperC(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
if name == "sy1":
a = [name+"_buffer["+name+"_offset]"]
elif flavour.precision_name in ["C","Z"]:
a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"]
else:
a = ["&"+name+"_buffer["+name+"_offset]"]
c = []
if (name in ["x","y"]):
c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
elif (name in ["a","b","c"]):
c = [name+"_"+self.Postfix(name)]
return [", ".join(a+c)]
return []
# As above, but only data-types
def BufferType(self, name):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"cl_mem"]
b = ["const size_t"]
c = ["const size_t"] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# Retrieves the documentation of the buffers
def BufferDoc(self, name):
prefix = "const " if (name in self.inputs) else ""
inout = "input" if (name in self.inputs) else "output"
if (name in self.inputs) or (name in self.outputs):
math_name = name.upper()+" matrix" if (name in self.BuffersMatrix()) else name+" vector"
incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment "
a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."]
b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."]
c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+". This value must be greater than 0."] if (name not in self.BuffersWithoutLdInc()) else []
return a+b+c
return []
# ==============================================================================================
# Retrieves the name of a scalar (alpha/beta)
def Scalar(self, name):
if (name in self.scalars):
return [name]
return []
# As above, but converts from float to half
def ScalarHalfToFloat(self, name):
if name in self.scalars:
return ["HalfToFloat("+name+")"]
return []
# Retrieves the use of a scalar (alpha/beta)
def ScalarUse(self, name, flavour):
if name in self.scalars:
if name == "alpha":
return [flavour.UseAlpha()]
elif name == "beta":
return [flavour.UseBeta()]
return [name]
return []
# As above, but for the clBLAS wrapper
def ScalarUseWrapper(self, name, flavour):
if name in self.scalars:
if name == "alpha":
return [flavour.UseAlphaCL()]
elif name == "beta":
return [flavour.UseBetaCL()]
return [name]
return []
# As above, but for the CBLAS wrapper
def ScalarUseWrapperC(self, name, flavour):
if name in self.scalars:
if flavour.IsComplex(name):
return [name+"_array.data()"]
return [name]
return []
# Retrieves the definition of a scalar (alpha/beta)
def ScalarDef(self, name, flavour):
if name in self.scalars:
if name == "alpha":
return ["const "+flavour.alpha_cl+" "+name]
return ["const "+flavour.beta_cl+" "+name]
return []
# As above, but without 'cl_' prefix
def ScalarDefPlain(self, name, flavour):
if name in self.scalars:
if name == "alpha":
return ["const "+flavour.alpha_cpp+" "+name]
return ["const "+flavour.beta_cpp+" "+name]
return []
# Retrieves the type of a scalar (alpha/beta)
def ScalarType(self, name, flavour):
if name in self.scalars:
if name == "alpha":
return ["const "+flavour.alpha_cpp]
return ["const "+flavour.beta_cpp]
return []
# Retrieves the documentation of a scalar
def ScalarDoc(self, name):
if name in self.scalars:
if name == "alpha":
return ["`const "+self.template.alpha_cpp+" "+name+"`: Input scalar constant."]
return ["`const "+self.template.beta_cpp+" "+name+"`: Input scalar constant."]
return []
# ==============================================================================================
# Retrieves a list of comma-separated sizes (m, n, k)
def Sizes(self):
if self.sizes:
return [", ".join([s for s in self.sizes])]
return []
# Retrieves the definition of the sizes (m,n,k)
def SizesDef(self):
if self.sizes:
return [", ".join(["const size_t "+s for s in self.sizes])]
return []
# Retrieves the types of the sizes (m,n,k)
def SizesType(self):
if self.sizes:
return [", ".join(["const size_t" for s in self.sizes])]
return []
# Retrieves the documentation of the sizes
def SizesDoc(self):
if self.sizes:
definitions = ["`const size_t "+s+"`: Integer size argument. This value must be positive." for s in self.sizes]
return definitions
return []
# ==============================================================================================
# Retrieves a list of options
def Options(self):
if self.options:
return [", ".join(self.options)]
return []
# As above, but now casted to CLBlast data-types
def OptionsCast(self, indent):
if self.options:
options = ["static_cast<clblast::"+OptionToCLBlast(o)+">("+o+")" for o in self.options]
return [(",\n"+indent).join(options)]
return []
# Retrieves the definitions of the options (layout, transpose, side, etc.)
def OptionsDef(self):
if self.options:
definitions = ["const "+OptionToCLBlast(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
# As above, but now using clBLAS data-types
def OptionsDefWrapperCL(self):
if self.options:
definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
# As above, but now using CBLAS data-types
def OptionsDefWrapperC(self):
if self.options:
definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
# Retrieves the types of the options (layout, transpose, side, etc.)
def OptionsType(self):
if self.options:
definitions = ["const "+OptionToCLBlast(o) for o in self.options]
return [", ".join(definitions)]
return []
# Retrieves the documentation of the options
def OptionsDoc(self):
if self.options:
definitions = ["`const "+OptionToCLBlast(o)+" "+o+"`: "+OptionToDoc(o) for o in self.options]
return definitions
return []
# ==============================================================================================
# Retrieves a combination of all the argument names (no types)
def Arguments(self):
return (self.Options() + self.Sizes() +
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
self.Scalar("alpha") +
list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) +
self.Scalar("beta") +
list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) +
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
# As above, but with conversions from half to float
def ArgumentsHalf(self):
return (self.Options() + self.Sizes() +
list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarHalfToFloat("alpha") +
list(chain(*[self.BufferBis(b) for b in self.BuffersFirst()])) +
self.ScalarHalfToFloat("beta") +
list(chain(*[self.BufferBis(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument names, with Claduc casts
def ArgumentsCladuc(self, flavour, indent):
return (self.Options() + self.Sizes() +
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersFirst()])) +
self.Scalar("alpha") +
list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) +
self.Scalar("beta") +
list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
# As above, but with CLBlast casts
def ArgumentsCast(self, flavour, indent):
return (self.OptionsCast(indent) + self.Sizes() +
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarUse("alpha", flavour) +
list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) +
self.ScalarUse("beta", flavour) +
list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) +
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
# As above, but for the clBLAS wrapper
def ArgumentsWrapperCL(self, flavour):
return (self.Options() + self.Sizes() +
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarUseWrapper("alpha", flavour) +
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) +
self.ScalarUseWrapper("beta", flavour) +
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
# As above, but for the CBLAS wrapper
def ArgumentsWrapperC(self, flavour):
return (self.Options() + self.Sizes() +
self.ScalarUseWrapperC("alpha", flavour) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarUseWrapperC("beta", flavour) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument definitions
def ArgumentsDef(self, flavour):
return (self.OptionsDef() + self.SizesDef() +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDef("alpha", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
self.ScalarDef("beta", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
# As above, but clBLAS wrapper plain datatypes
def ArgumentsDefWrapperCL(self, flavour):
return (self.OptionsDefWrapperCL() + self.SizesDef() +
list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarDefPlain("beta", flavour) +
list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# As above, but CBLAS wrapper plain datatypes
def ArgumentsDefWrapperC(self, flavour):
return (self.OptionsDefWrapperC() + self.SizesDef() +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarDefPlain("beta", flavour) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument types
def ArgumentsType(self, flavour):
return (self.OptionsType() + self.SizesType() +
list(chain(*[self.BufferType(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarType("alpha", flavour) +
list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) +
self.ScalarType("beta", flavour) +
list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferType(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarType(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument types
def ArgumentsDoc(self):
return (self.OptionsDoc() + self.SizesDoc() +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDoc("alpha") +
list(chain(*[self.BufferDoc(b) for b in self.BuffersFirst()])) +
self.ScalarDoc("beta") +
list(chain(*[self.BufferDoc(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDoc(s) for s in self.OtherScalars()])))
# ==============================================================================================
# Retrieves a list of routine requirements for documentation
def RequirementsDoc(self):
return self.requirements
# ==============================================================================================
# Retrieves the C++ templated definition for a routine
def RoutineHeaderCPP(self, spaces, default_event):
indent = " "*(spaces + self.Length())
result = "template <"+self.template.name+">\n"
result += "StatusCode "+self.name.capitalize()+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDef(self.template)])
result += ",\n"+indent+"cl_command_queue* queue, cl_event* event"+default_event+")"
return result
# As above, but now without variable names
def RoutineHeaderTypeCPP(self, spaces):
indent = " "*(spaces + self.Length())
result = "template <"+self.template.name+">\n"
result += "StatusCode "+self.name.capitalize()+"("
result += (",\n"+indent).join([a for a in self.ArgumentsType(self.template)])
result += ",\n"+indent+"cl_command_queue*, cl_event*)"
return result
# As above, but now for C
def RoutineHeaderC(self, flavour, spaces, extra_qualifier):
indent = " "*(spaces + self.Length())
result = "StatusCode"+extra_qualifier+" CLBlast"+flavour.name+self.name+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDef(flavour)])
result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)"
return result
# As above, but now for the clBLAS wrapper
def RoutineHeaderWrapperCL(self, flavour, def_only, spaces):
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
indent = " "*(spaces + self.Length() + len(template))
result = ""
if self.NoScalars():
result += "template <"
if def_only:
result += flavour.name
result += ">\n"
result += "clblasStatus clblasX"+self.name+template+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)])
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
return result
# As above, but now for the CBLAS wrapper
def RoutineHeaderWrapperC(self, flavour, def_only, spaces):
indent = " "*(spaces + self.Length())
result = "void cblasX"+self.name+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")"
return result
# ==================================================================================================

View file

@ -23,6 +23,9 @@ namespace clblast {
// Stores the compiled binary or IR in the cache
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
const Precision &precision, const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Storing binary in cache\n");
#endif
binary_cache_mutex_.lock();
binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
binary_cache_mutex_.unlock();
@ -31,8 +34,11 @@ void StoreBinaryToCache(const std::string &binary, const std::string &device_nam
// Stores the compiled program in the cache
void StoreProgramToCache(const Program &program, const Context &context,
const Precision &precision, const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Storing program in cache\n");
#endif
program_cache_mutex_.lock();
program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name});
program_cache_.push_back(ProgramCache{program, context(), precision, routine_name});
program_cache_mutex_.unlock();
}
@ -40,6 +46,9 @@ void StoreProgramToCache(const Program &program, const Context &context,
// otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Retrieving binary from cache\n");
#endif
binary_cache_mutex_.lock();
for (auto &cached_binary: binary_cache_) {
if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
@ -55,9 +64,12 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
// otherwise.
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Retrieving program from cache\n");
#endif
program_cache_mutex_.lock();
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
if (cached_program.MatchInCache(context(), precision, routine_name)) {
program_cache_mutex_.unlock();
return cached_program.program;
}
@ -85,7 +97,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
const std::string &routine_name) {
program_cache_mutex_.lock();
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
if (cached_program.MatchInCache(context(), precision, routine_name)) {
program_cache_mutex_.unlock();
return true;
}
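The manual lock()/unlock() pairs in this file are correct because every return
path unlocks first; the same invariant can also be expressed with RAII. A minimal
sketch reusing the names above (assumes <mutex> is included; StoreBinaryToCacheSketch
is a hypothetical name, the library keeps the explicit calls shown):

void StoreBinaryToCacheSketch(const std::string &binary, const std::string &device_name,
                              const Precision &precision, const std::string &routine_name) {
  std::lock_guard<std::mutex> guard(binary_cache_mutex_);  // locks now ...
  binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
}  // ... and releases the mutex here, on any exit path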

View file

@ -48,14 +48,14 @@ static std::mutex binary_cache_mutex_;
// The cache of compiled OpenCL programs, along with some meta-data
struct ProgramCache {
Program program;
ContextPointer context_ptr;
cl_context context;
Precision precision;
std::string routine_name_;
// Finds out whether the properties match
bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision,
bool MatchInCache(const cl_context ref_context, const Precision &ref_precision,
const std::string &ref_routine) {
return (context_ptr == ref_context &&
return (context == ref_context &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
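(Design note: using the raw cl_context handle as the cache key, rather than a
pointer to the Context wrapper object, keeps look-ups working when the wrapper
is copied or recreated around the same underlying OpenCL context, which is the
cache-retrieval bug mentioned in the changelog.)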

View file

@ -16,7 +16,6 @@
#include <string>
#include "clblast.h"
#include "public_api.hpp"
#include "cache.hpp"
// BLAS level-1 includes

View file

@ -72,15 +72,24 @@ inline void CheckError(const cl_int status) {
class Event {
public:
// Constructor based on the regular OpenCL data-type
explicit Event(const cl_event event): event_(event) { }
// Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
explicit Event(const cl_event event):
event_(new cl_event) {
*event_ = event;
}
// Regular constructor
explicit Event(): event_(nullptr) { }
// Regular constructor with memory management
explicit Event():
event_(new cl_event, [](cl_event* e) {
if (*e) { CheckError(clReleaseEvent(*e)); }
delete e;
}) {
*event_ = nullptr;
}
// Waits for completion of this event
void WaitForCompletion() const {
CheckError(clWaitForEvents(1, &event_));
CheckError(clWaitForEvents(1, &(*event_)));
}
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' calls below.
@ -89,20 +98,22 @@ class Event {
float GetElapsedTime() const {
WaitForCompletion();
auto bytes = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
auto time_start = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
auto time_end = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
return (time_end - time_start) * 1.0e-6f;
}
// Accessor to the private data-member
cl_event& operator()() { return event_; }
cl_event* pointer() { return &event_; }
cl_event& operator()() { return *event_; }
const cl_event& operator()() const { return *event_; }
cl_event* pointer() { return &(*event_); }
const cl_event* pointer() const { return &(*event_); }
private:
cl_event event_;
std::shared_ptr<cl_event> event_;
};
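The shared_ptr above is what fixes the event leaks mentioned in the changelog:
all copies of an Event share one heap slot, and the deleter installed by the
default constructor releases the OpenCL event exactly once. A sketch of the
resulting semantics (EnqueueSomething is a hypothetical call that fills in the
event):

{
  auto event = Event();               // allocates the shared slot, installs the deleter
  auto copy = event;                  // shares ownership; no clRetainEvent needed
  EnqueueSomething(event.pointer());  // writes a live cl_event into the shared slot
  copy.WaitForCompletion();           // both copies refer to the same cl_event
}                                     // last owner destroyed: clReleaseEvent runs once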
// Pointer to an OpenCL event
@ -163,6 +174,15 @@ class Device {
// Methods to retrieve device information
std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
size_t VersionNumber() const {
std::string version_string = Version().substr(7);  // skips the "OpenCL " prefix
// Space separates the end of the OpenCL version number from the beginning of the
// vendor-specific information.
size_t next_whitespace = version_string.find(' ');
size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace)));
return version;
}
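// Worked example for the parsing above: CL_DEVICE_VERSION has the form
// "OpenCL <major>.<minor> <vendor-specific info>", so for "OpenCL 1.2 CUDA 8.0.0"
// substr(7) yields "1.2 CUDA 8.0.0", the token before the first space is "1.2",
// and 100.0 * 1.2 gives a VersionNumber() of 120.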
std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
std::string Type() const {
@ -176,24 +196,32 @@ class Device {
}
size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
size_t MaxWorkItemDimensions() const {
return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
}
std::vector<size_t> MaxWorkItemSizes() const {
return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
}
size_t LocalMemSize() const {
return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
cl_ulong LocalMemSize() const {
return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
}
std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
size_t CoreClock() const {
return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY));
}
size_t ComputeUnits() const {
return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_COMPUTE_UNITS));
}
unsigned long MemorySize() const {
return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE));
}
unsigned long MaxAllocSize() const {
return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE));
}
size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
// Configuration-validity checks
bool IsLocalMemoryValid(const size_t local_mem_usage) const {
bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const {
return (local_mem_usage <= LocalMemSize());
}
bool IsThreadConfigValid(const std::vector<size_t> &local) const {
@ -211,6 +239,8 @@ class Device {
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
bool IsARM() const { return Vendor() == "ARM"; }
// Accessor to the private data-member
@ -227,13 +257,6 @@ class Device {
CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
return result;
}
size_t GetInfo(const cl_device_info info) const {
auto bytes = size_t{0};
CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
auto result = cl_uint(0);
CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
return static_cast<size_t>(result);
}
template <typename T>
std::vector<T> GetInfoVector(const cl_device_info info) const {
auto bytes = size_t{0};
@ -386,8 +409,16 @@ class Queue {
delete s; }) {
auto status = CL_SUCCESS;
#ifdef CL_VERSION_2_0
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
size_t ocl_version = device.VersionNumber();
if (ocl_version >= 200) {
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
} else {
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
}
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#endif
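Compiling against OpenCL 2.0 headers only guarantees that
clCreateCommandQueueWithProperties exists at compile time, not that the target
device's driver implements it, hence the additional run-time gate on the
reported version. A usage sketch (context and device assumed to be valid
wrappers):

auto queue = Queue(context, device);  // picks the matching create call at run time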
@ -627,15 +658,25 @@ class Kernel {
}
// Retrieves the amount of local memory used per work-group for this kernel
size_t LocalMemUsage(const Device &device) const {
cl_ulong LocalMemUsage(const Device &device) const {
auto bytes = size_t{0};
auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
auto result = size_t{0};
auto result = cl_ulong{0};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
return result;
}
// Retrieves the name of the kernel
std::string GetFunctionName() {
auto bytes = size_t{0};
CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr));
return std::string{result.c_str()}; // Removes any trailing '\0'-characters
}
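// Hypothetical usage of GetFunctionName(), e.g. for diagnostics (assumes a
// compiled 'program' that contains an "Xgemm" kernel):
//   auto kernel = Kernel(program, "Xgemm");
//   printf("Running kernel '%s'\n", kernel.GetFunctionName().c_str());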
// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event) {
@ -647,30 +688,21 @@ class Kernel {
// As above, but with an event waiting list
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
const std::vector<Event> &waitForEvents) {
// Builds a plain (cl_event) version of the event wait list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {
waitForEventsPlain.push_back(waitEvent());
if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); }
}
// Launches the kernel while waiting for other events
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
nullptr, global.data(), !local.empty() ? local.data() : nullptr,
static_cast<cl_uint>(waitForEventsPlain.size()),
waitForEventsPlain.data(),
!waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
event));
}
// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
0, nullptr, event));
}
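// Note on the guards above: an empty 'local' vector now passes nullptr as the
// local work size, letting the runtime choose the work-group size, which is
// what the removed overload used to do. A hypothetical equivalent call:
//   kernel.Launch(queue, global, std::vector<size_t>{}, event, std::vector<Event>{});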
// Accessor to the private data-member
const cl_kernel& operator()() const { return *kernel_; }
private:

View file

@ -17,6 +17,8 @@
#include "database/kernels/xaxpy.hpp"
#include "database/kernels/xdot.hpp"
#include "database/kernels/xgemv.hpp"
#include "database/kernels/xgemv_fast.hpp"
#include "database/kernels/xgemv_fast_rot.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xgemm.hpp"
#include "database/kernels/copy.hpp"
@ -32,6 +34,8 @@ const std::vector<Database::DatabaseEntry> Database::database = {
XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble,
XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
@ -42,9 +46,10 @@ const std::vector<Database::DatabaseEntry> Database::database = {
// =================================================================================================
// Constructor, computing device properties and populating the parameter-vector from the database
// Constructor, computing device properties and populating the parameter-vector from the database.
// This takes an optional overlay database to support custom tuning parameters or custom kernels.
Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
const Precision precision):
const Precision precision, const std::vector<DatabaseEntry> &overlay):
parameters_{} {
// Finds information about the current device
@ -53,10 +58,26 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
auto device_vendor = device.Vendor();
auto device_name = device.Name();
// Set the short vendor name
for (auto &combination : kVendorNames) {
if (device_vendor == combination.first) {
device_vendor = combination.second;
}
}
// Iterates over all kernels to include, and retrieves the parameters for each of them
for (auto &kernel: kernels) {
auto search_result = Search(kernel, device_type, device_vendor, device_name, precision);
parameters_.insert(search_result.begin(), search_result.end());
auto search_result = ParametersPtr{};
for (auto db: { &overlay, &database }) {
search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db);
if (search_result) {
parameters_.insert(search_result->begin(), search_result->end());
break;
}
}
if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
}
}
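The overlay is searched before the built-in database, so user-supplied entries
take precedence over the shipped tuning results. A sketch of a custom overlay
for the Copy kernel (vendor and parameter values are invented; the nesting
mirrors the DatabaseEntry initializers in the kernel files below):

const auto overlay = std::vector<Database::DatabaseEntry>{
  { "Copy", Precision::kSingle, {
    { kDeviceTypeGPU, "SomeVendor", {
      { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
    } },
  } },
};
auto database = Database(queue, {"Copy"}, Precision::kSingle, overlay);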
@ -73,28 +94,22 @@ std::string Database::GetDefines() const {
// =================================================================================================
// Searches the database for the right kernel and precision
Database::Parameters Database::Search(const std::string &this_kernel,
const std::string &this_type,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision) const {
// Set the short vendor name
auto this_short_vendor = this_vendor;
for (auto &combination : kVendorNames) {
if (this_vendor == combination.first) {
this_short_vendor = combination.second;
}
}
// Searches a particular database for the right kernel and precision
Database::ParametersPtr Database::Search(const std::string &this_kernel,
const std::string &this_type,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision,
const std::vector<DatabaseEntry> &this_database) const {
// Selects the right kernel
for (auto &db: database) {
for (auto &db: this_database) {
if (db.kernel == this_kernel && db.precision == this_precision) {
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
for (auto &vendor: db.vendors) {
if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) &&
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
// Searches for the right device. If the current device is unavailable, selects the vendor default parameters.
@ -104,7 +119,7 @@ Database::Parameters Database::Search(const std::string &this_kernel,
if (device.name == this_device || device.name == "default") {
// Sets the parameters accordingly
return device.parameters;
return &device.parameters;
}
}
}
@ -112,8 +127,8 @@ Database::Parameters Database::Search(const std::string &this_kernel,
}
}
// If we reached this point, something is wrong
throw std::runtime_error("Database error, could not find a suitable entry");
// If we reached this point, the entry was not found in this database
return nullptr;
}
// =================================================================================================

View file

@ -32,6 +32,7 @@ class Database {
// Type alias for the database parameters
using Parameters = std::unordered_map<std::string,size_t>;
using ParametersPtr = const Parameters*;
// Structures for content inside the database
struct DatabaseDevice {
@ -70,6 +71,8 @@ class Database {
static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble;
static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
@ -78,9 +81,9 @@ class Database {
static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
static const std::vector<DatabaseEntry> database;
// The constructor
// The constructor with a user-provided database overlay (potentially an empty vector)
explicit Database(const Queue &queue, const std::vector<std::string> &routines,
const Precision precision);
const Precision precision, const std::vector<DatabaseEntry> &overlay);
// Accessor of values by key
size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
@ -89,9 +92,10 @@ class Database {
std::string GetDefines() const;
private:
Parameters Search(const std::string &this_kernel, const std::string &this_type,
const std::string &this_vendor, const std::string &this_device,
const Precision this_precision) const;
// Search method for a specified database, returning a pointer (possibly nullptr)
ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
const std::string &this_vendor, const std::string &this_device,
const Precision this_precision, const std::vector<DatabaseEntry> &db) const;
// Found parameters suitable for this device/kernel
Parameters parameters_;

View file

@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::CopyHalf = {
"Copy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::CopySingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
}
},
{ // ARM GPUs
@ -59,11 +61,13 @@ const Database::DatabaseEntry Database::CopySingle = {
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::CopySingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
}
},
}
@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Intel CPUs
@ -112,16 +120,18 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
{ // Intel accelerators
@ -133,18 +143,21 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}
@ -158,9 +171,10 @@ const Database::DatabaseEntry Database::CopyDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
{ // ARM GPUs
@ -174,7 +188,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -186,20 +200,23 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
}
@ -213,9 +230,10 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs
@ -229,7 +247,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -241,8 +259,11 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -254,7 +275,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}
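The resolution order visible in these tables is: exact device name, then the
vendor/type "default" row, then the kDeviceTypeAll block, which exists for every
kernel and keeps the constructor's search from coming up empty. For a
hypothetical unlisted device such as a "GeForce GTX 1080", single-precision Copy
therefore resolves to the NVIDIA GPU default row above:

// vendor "NVIDIA" + type GPU match, the device name does not -> "default" row:
//   COPY_DIMX=8, COPY_DIMY=16, COPY_VW=4, COPY_WPT=1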

View file

@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadHalf = {
"Pad", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::PadSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // ARM GPUs
@ -54,16 +56,18 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::PadSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
}
@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -118,16 +126,18 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // Intel accelerators
@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
@ -166,9 +179,10 @@ const Database::DatabaseEntry Database::PadDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // ARM GPUs
@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::PadDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
@ -194,20 +208,23 @@ const Database::DatabaseEntry Database::PadDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
@ -249,20 +267,23 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}

View file

@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = {
"Padtranspose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
@ -38,6 +39,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
@ -54,11 +56,13 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
}
@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // ARM GPUs
@ -123,11 +131,13 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel accelerators
@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
}
@ -166,9 +179,10 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
{ // ARM GPUs
@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // Intel accelerators
@ -194,20 +208,23 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
}
@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // ARM GPUs
@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // Intel accelerators
@ -249,20 +267,23 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
}
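
A minimal sketch of what the PADTRA_* values above imply, assuming PADTRA_TILE is the work-group tile dimension of the padded-transpose kernel, PADTRA_WPT the work per thread, and PADTRA_PAD the extra local-memory columns inserted to avoid bank conflicts (the helper below is illustrative, not CLBlast code):

#include <cstddef>

// Local-memory bytes for one pad-transpose tile, assuming (an assumption, not
// CLBlast's kernel source) a square block of side PADTRA_WPT * PADTRA_TILE
// with PADTRA_PAD padding columns per row.
constexpr std::size_t PadtransposeLocalBytes(std::size_t tile, std::size_t wpt,
                                             std::size_t pad,
                                             std::size_t bytes_per_element) {
  return (tile * wpt) * (tile * wpt + pad) * bytes_per_element;
}

// Example: the NVIDIA default above ({PADTRA_PAD=1, PADTRA_TILE=16,
// PADTRA_WPT=1}) on complex doubles (16 bytes) gives 16 * 17 * 16 = 4352.
static_assert(PadtransposeLocalBytes(16, 1, 1, 16) == 4352, "footprint check");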

View file

@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::TransposeHalf = {
"Transpose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::TransposeSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
{ // ARM GPUs
@ -59,11 +61,13 @@ const Database::DatabaseEntry Database::TransposeSingle = {
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Intel accelerators
@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
}
@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -118,35 +126,40 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1070", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
@ -160,9 +173,10 @@ const Database::DatabaseEntry Database::TransposeDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
{ // ARM GPUs
@ -176,7 +190,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Intel accelerators
@ -188,20 +202,23 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
}
@ -215,9 +232,10 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -231,26 +249,29 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
}
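
The TRA_* parameters play the analogous role for the plain transpose kernel: TRA_DIM appears to set a square TRA_DIM x TRA_DIM work-group, TRA_WPT the work per thread, TRA_PAD the local-memory padding, and TRA_SHUFFLE toggles in-register shuffling. A minimal divisibility sketch under that assumption (the helper is illustrative, not CLBlast code):

#include <cstddef>

// One work-group then covers a square of side TRA_DIM * TRA_WPT, so a matrix
// side n tiles evenly only when divisible by that product (assumed scheme).
constexpr bool TransposeTilesEvenly(std::size_t n, std::size_t tra_dim,
                                    std::size_t tra_wpt) {
  return n % (tra_dim * tra_wpt) == 0;
}

static_assert(TransposeTilesEvenly(1024, 16, 2), "1024 tiles into 32x32 blocks");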

View file

@ -18,13 +18,14 @@ const Database::DatabaseEntry Database::XaxpyHalf = {
"Xaxpy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
{ "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
{ "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
{ "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
}
},
}
@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::XaxpySingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -54,12 +56,14 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",8}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } },
{ "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
}
},
}
@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
{ "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -118,16 +126,18 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
}
},
{ // Intel accelerators
@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
}
@ -166,6 +179,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
}
},
{ // Intel accelerators
@ -194,15 +208,18 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Default
@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // Intel accelerators
@ -249,8 +267,11 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
@ -262,7 +283,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
}
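
In the Xaxpy entries, VW is the vector width, WGS the work-group size, and WPT the work per thread, so one work-group covers VW * WGS * WPT elements. A sketch of the fast-path condition this implies (the predicate is an assumption about the host code, not a quote of it):

#include <cstddef>

// The vectorized kernel fits only when n splits into whole work-groups of
// VW * WGS * WPT elements; otherwise a slower element-wise path is needed
// (assumed dispatch rule).
constexpr bool XaxpyFastPathFits(std::size_t n, std::size_t vw,
                                 std::size_t wgs, std::size_t wpt) {
  return n % (vw * wgs * wpt) == 0;
}

// With the GeForce GTX TITAN single-precision entry above ({VW=4, WGS=256,
// WPT=1}), n must be a multiple of 4 * 256 * 1 = 1024.
static_assert(XaxpyFastPathFits(4096, 4, 256, 1), "4096 is a multiple of 1024");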

View file

@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XdotHalf = {
"Xdot", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
@ -37,7 +38,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
@ -51,26 +52,31 @@ const Database::DatabaseEntry Database::XdotSingle = {
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } },
{ "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",256} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
}
@ -83,10 +89,10 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Oland", { {"WGS1",128}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel CPUs
@ -97,6 +103,8 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
@ -106,17 +114,20 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
}
@ -129,10 +140,10 @@ const Database::DatabaseEntry Database::XdotDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel CPUs
@ -144,17 +155,20 @@ const Database::DatabaseEntry Database::XdotDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",512} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
}
@ -167,10 +181,10 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
{ // Intel CPUs
@ -182,17 +196,20 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
}
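
The Xdot entries carry two work-group sizes because a dot product reduces in two stages: work-groups of WGS1 threads each emit a partial sum, and a single work-group of WGS2 threads folds the partials into the final scalar. One plausible stage-1 sizing under that scheme (an assumption about the host-side launch, not CLBlast's exact code):

#include <cstddef>

// Launch WGS2 work-groups in stage 1 so stage 2's single work-group reduces
// exactly one partial per thread (assumed sizing rule).
constexpr std::size_t XdotStage1GlobalSize(std::size_t wgs1, std::size_t wgs2) {
  return wgs1 * wgs2;  // total stage-1 threads
}

// E.g. the NVIDIA single-precision default above ({WGS1=256, WGS2=256}) would
// run 65536 stage-1 threads producing 256 partial sums.
static_assert(XdotStage1GlobalSize(256, 256) == 65536, "sizing check");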

View file

@ -18,7 +18,7 @@ const Database::DatabaseEntry Database::XgemmHalf = {
"Xgemm", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -32,6 +32,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -53,11 +54,13 @@ const Database::DatabaseEntry Database::XgemmSingle = {
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Intel accelerators
@ -69,20 +72,23 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
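
Xgemm has the richest parameter set: MWG/NWG/KWG read as the per-work-group tile sizes in the m/n/k dimensions, MDIMC/NDIMC as the work-group shape, MDIMA/NDIMB as re-shaped thread layouts for loading A and B, KWI as the k-loop unroll factor, VWM/VWN as vector widths, SA/SB as flags for staging A/B through local memory, and STRM/STRN as strided-access toggles. Such a tiling implies divisibility constraints; a small illustrative subset (the checker is a sketch, not the tuner's full rule set):

#include <cstddef>

// Three sanity constraints under the assumed tiling: the k-loop unrolls
// evenly, and each m/n tile splits evenly over threads times vector width.
constexpr bool XgemmParamsConsistent(std::size_t mwg, std::size_t nwg,
                                     std::size_t kwg, std::size_t mdimc,
                                     std::size_t ndimc, std::size_t kwi,
                                     std::size_t vwm, std::size_t vwn) {
  return kwg % kwi == 0 &&
         mwg % (mdimc * vwm) == 0 &&
         nwg % (ndimc * vwn) == 0;
}

// The GeForce GTX 1070 single-precision entry above satisfies all three:
static_assert(XgemmParamsConsistent(128, 128, 16, 16, 8, 2, 4, 1),
              "GTX 1070 Xgemm single parameters");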
@ -96,6 +102,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -117,11 +124,13 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Intel accelerators
@ -133,8 +142,11 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -158,11 +170,12 @@ const Database::DatabaseEntry Database::XgemmDouble = {
"Xgemm", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -188,8 +201,11 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
@ -215,6 +231,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -243,8 +260,11 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 1070", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },

View file

@ -18,13 +18,14 @@ const Database::DatabaseEntry Database::XgemvHalf = {
"Xgemv", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
}
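
Note the shape change in the Xgemv entries: each device now stores only WGS1 and WPT1, where the old eight-parameter rows also carried VW2/WGS2/WPT2 and VW3/WGS3/WPT3; those presumably now live in the separate databases for the fast and fast-rotated GEMV kernel variants. A launch-size sketch using just the two remaining parameters (the helper and rounding scheme are assumptions):

#include <cstddef>

// Each thread produces WPT1 results; round the required thread count up to
// whole WGS1-sized work-groups (assumed host-side computation).
constexpr std::size_t XgemvGlobalSize(std::size_t m, std::size_t wgs1,
                                      std::size_t wpt1) {
  return ((m + wgs1 * wpt1 - 1) / (wgs1 * wpt1)) * wgs1;
}

// E.g. m = 1000 with the default above ({WGS1=64, WPT1=1}) launches 16
// work-groups of 64 threads, i.e. 1024 threads in total.
static_assert(XgemvGlobalSize(1000, 64, 1) == 1024, "launch-size check");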
@ -36,52 +37,58 @@ const Database::DatabaseEntry Database::XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1} } },
{ "Oland", { {"WGS1",128}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
{ "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Iris", { {"WGS1",64}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GRID K520", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
}
@ -93,48 +100,54 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1} } },
{ "Oland", { {"WGS1",64}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
{ "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Iris", { {"WGS1",256}, {"WPT1",1} } },
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GRID K520", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
}
@ -146,43 +159,47 @@ const Database::DatabaseEntry Database::XgemvDouble = {
"Xgemv", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1} } },
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GRID K520", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
}
@ -194,36 +211,38 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
"Xgemv", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1} } },
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GRID K520", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
}

View file

@ -0,0 +1,250 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the 'Xgemv_Fast' kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastHalf = {
"XgemvFast", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
}
},
}
};
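// Usage sketch (assumed accessor; the exact API may differ): at run time a routine picks the
// entry matching its device name, falling back to the vendor "default" row and finally to the
// kDeviceTypeAll "default" entry, then reads the parameters by name when compiling the kernel:
//   const size_t vw2  = db_["VW2"];   // vector width for the XgemvFast kernel
//   const size_t wgs2 = db_["WGS2"];  // work-group size
//   const size_t wpt2 = db_["WPT2"];  // work-per-thread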
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastSingle = {
"XgemvFast", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
"XgemvFast", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastDouble = {
"XgemvFast", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
"XgemvFast", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
}
};
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,154 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot' kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotHalf = {
"XgemvFastRot", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotSingle = {
"XgemvFastRot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
"XgemvFastRot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotDouble = {
"XgemvFastRot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
"XgemvFastRot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
}
},
}
};
// =================================================================================================
} // namespace clblast

View file

@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XgerHalf = {
"Xger", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::XgerSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -53,29 +55,34 @@ const Database::DatabaseEntry Database::XgerSingle = {
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
{ "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
}
},
}
@ -89,9 +96,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -104,29 +112,34 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } },
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } },
{ "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
}
},
}
@ -140,9 +153,10 @@ const Database::DatabaseEntry Database::XgerDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -155,21 +169,24 @@ const Database::DatabaseEntry Database::XgerDouble = {
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } },
{ "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
}
@ -183,9 +200,10 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
{ "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -204,15 +222,18 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
}

View file

@ -109,6 +109,16 @@ R"(
typedef real singlereal;
#endif
// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no
// conversion, but half-precision is not supported as a kernel argument, so it is converted from float.
#if PRECISION == 16
typedef float real_arg;
#define GetRealArg(x) (half)x
#else
typedef real real_arg;
#define GetRealArg(x) x
#endif
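// A minimal sketch of how this crosses the host/device boundary (the host side is hypothetical;
// names differ in the actual clients): for PRECISION == 16 the host sets the argument as a
// 32-bit float, and the kernel narrows it to half:
//   clSetKernelArg(kernel, 1, sizeof(cl_float), &alpha_float);  // host: pass a cl_float
//   const real alpha = GetRealArg(arg_alpha);                   // device: (half)arg_alpha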
// =================================================================================================
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction by default. For specific
@ -138,6 +148,13 @@ R"(
#define SetToOne(a) a = ONE
#endif
// Determines whether a variable is zero
#if PRECISION == 3232 || PRECISION == 6464
#define IsZero(a) ((a.x == ZERO) && (a.y == ZERO))
#else
#define IsZero(a) (a == ZERO)
#endif
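// A sketch of the intended use (placeholder names): testing beta with IsZero lets a kernel skip
// reading the output matrix entirely, so uninitialized (possibly NaN) values in C can never
// propagate into the result:
//   if (IsZero(beta)) { SetToZero(cval); }      // don't touch C when beta == 0
//   else { Multiply(cval, beta, cgm[index]); }  // otherwise scale the existing value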
// The absolute value (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)

View file

@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xamax(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global singlereal* maxgm, __global unsigned int* imaxgm) {
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
void Xamax(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global singlereal* maxgm, __global unsigned int* imaxgm) {
__local singlereal maxlm[WGS1];
__local unsigned int imaxlm[WGS1];
const int lid = get_local_id(0);
@ -95,10 +95,10 @@ __kernel void Xamax(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
const __global unsigned int* restrict imaxgm,
__global unsigned int* imax, const int imax_offset) {
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
void XamaxEpilogue(const __global singlereal* restrict maxgm,
const __global unsigned int* restrict imaxgm,
__global unsigned int* imax, const int imax_offset) {
__local singlereal maxlm[WGS2];
__local unsigned int imaxlm[WGS2];
const int lid = get_local_id(0);
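// Launch sketch (host side, hypothetical sizes): the main kernel runs with several work-groups,
// each writing one partial result, after which the epilogue combines them using exactly one
// work-group, as required above:
//   main:     global = WGS1 * num_groups, local = WGS1
//   epilogue: global = WGS2,              local = WGS2   // a single work-group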

View file

@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xasum(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
void Xasum(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@ -74,9 +74,9 @@ __kernel void Xasum(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XasumEpilogue(const __global real* restrict input,
__global real* asum, const int asum_offset) {
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
void XasumEpilogue(const __global real* restrict input,
__global real* asum, const int asum_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);

View file

@ -22,11 +22,11 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void Xaxpy(const int n, const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
const real alpha = GetRealArg(arg_alpha);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@ -40,11 +40,11 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XaxpyFast(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
for (int w=0; w<WPT; ++w) {

View file

@ -22,10 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void Xcopy(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void Xcopy(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@ -38,10 +38,10 @@ __kernel void Xcopy(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void XcopyFast(const int n,
const __global realV* restrict xgm,
__global realV* ygm) {
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XcopyFast(const int n,
const __global realV* restrict xgm,
__global realV* ygm) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);

View file

@ -30,11 +30,11 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the sum operation
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xdot(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* output, const int do_conjugate) {
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
void Xdot(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* output, const int do_conjugate) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@ -73,9 +73,9 @@ __kernel void Xdot(const int n,
// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
// be launched with a single workgroup only.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XdotEpilogue(const __global real* restrict input,
__global real* dot, const int dot_offset) {
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
void XdotEpilogue(const __global real* restrict input,
__global real* dot, const int dot_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);

View file

@ -30,10 +30,10 @@ R"(
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the operation
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xnrm2(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
void Xnrm2(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
@ -72,9 +72,9 @@ __kernel void Xnrm2(const int n,
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void Xnrm2Epilogue(const __global real* restrict input,
__global real* nrm2, const int nrm2_offset) {
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
void Xnrm2Epilogue(const __global real* restrict input,
__global real* nrm2, const int nrm2_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);

View file

@ -22,9 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void Xscal(const int n, const real alpha,
__global real* xgm, const int x_offset, const int x_inc) {
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void Xscal(const int n, const real_arg arg_alpha,
__global real* xgm, const int x_offset, const int x_inc) {
const real alpha = GetRealArg(arg_alpha);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@ -40,9 +41,11 @@ __kernel void Xscal(const int n, const real alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void XscalFast(const int n, const real alpha,
__global realV* xgm) {
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XscalFast(const int n, const real_arg arg_alpha,
__global realV* xgm) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);

View file

@ -22,10 +22,10 @@ R"(
// =================================================================================================
// Full version of the kernel with offsets and strided accesses
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void Xswap(const int n,
__global real* xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void Xswap(const int n,
__global real* xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@ -40,10 +40,10 @@ __kernel void Xswap(const int n,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void XswapFast(const int n,
__global realV* xgm,
__global realV* ygm) {
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XswapFast(const int n,
__global realV* xgm,
__global realV* ygm) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);

View file

@ -210,18 +210,18 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
// =================================================================================================
// Full version of the kernel
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xgemv(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
void Xgemv(const int m, const int n,
const real_arg arg_alpha,
const real_arg arg_beta,
const int a_rotated,
const __global real* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
// Local memory for the vector X
__local real xlm[WGS1];

View file

@ -38,7 +38,7 @@ R"(
#define WGS3 64 // The local work-group size
#endif
#ifndef WPT3
#define WPT3 1 // The amount of work-per-thread
#define WPT3 1 // The tile-size
#endif
#ifndef VW3
#define VW3 1 // Vector width of matrix A loads
@ -74,18 +74,12 @@ R"(
// =================================================================================================
// Loads a vector input value (1/2)
// Loads a vector input value
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
const int a_ld) {
return agm[a_ld*y + x];
}
// Loads a vector input value (2/2): as before, but different data-type
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
const int a_ld) {
return agm[a_ld*y + x];
}
// =================================================================================================
// Faster version of the kernel, assuming that:
@ -94,23 +88,23 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const int a_rotated,
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
void XgemvFast(const int m, const int n,
const real_arg arg_alpha,
const real_arg arg_beta,
const int a_rotated,
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl_unused, const int ku_unused) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
// Local memory for the vector X
__local real xlm[WGS2];
// Initializes the accumulation register
// Initializes the accumulation registers
real acc[WPT2];
#pragma unroll
for (int w=0; w<WPT2; ++w) {
@ -134,7 +128,7 @@ __kernel void XgemvFast(const int m, const int n,
#pragma unroll
for (int w=0; w<WPT2/VW2; ++w) {
const int gid = (WPT2/VW2)*get_global_id(0) + w;
realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
realVF avec = agm[(a_ld/VW2)*k + gid];
#if VW2 == 1
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec);
#elif VW2 == 2
@ -196,84 +190,96 @@ __kernel void XgemvFast(const int m, const int n,
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
__kernel void XgemvFastRot(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const int a_rotated,
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
void XgemvFastRot(const int m, const int n,
const real_arg arg_alpha,
const real_arg arg_beta,
const int a_rotated,
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl_unused, const int ku_unused) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[WPT3][WGS3];
const int lid = get_local_id(0);
const int lid_mod = lid % (WPT3/VW3);
const int lid_div = lid / (WPT3/VW3);
// Local memory for the vector X
__local real xlm[WGS3];
__local real xlm[WPT3];
// Initializes the accumulation register
real acc[WPT3];
#pragma unroll
for (int w=0; w<WPT3; ++w) {
SetToZero(acc[w]);
}
real acc;
SetToZero(acc);
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS3) {
// Loops over tile-sized portions of the work
for (int kwg=0; kwg<n; kwg+=WPT3) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
if (lid < WPT3) {
xlm[lid] = xgm[(kwg + lid) * x_inc + x_offset];
}
// Loads the matrix A into local memory
#pragma unroll
for (int kl=0; kl<WPT3/VW3; ++kl) {
const int x = (kwg/VW3) + lid_mod;
const int y = get_group_id(0) * WGS3 + lid_div * (WPT3/VW3) + kl;
realVFR avec = agm[(a_ld/VW3) * y + x];
#if VW3 == 1
tile[kl*VW3 + 0][lid] = avec;
#elif VW3 == 2
tile[kl*VW3 + 0][lid] = avec.x;
tile[kl*VW3 + 1][lid] = avec.y;
#elif VW3 == 4
tile[kl*VW3 + 0][lid] = avec.x;
tile[kl*VW3 + 1][lid] = avec.y;
tile[kl*VW3 + 2][lid] = avec.z;
tile[kl*VW3 + 3][lid] = avec.w;
#elif VW3 == 8
tile[kl*VW3 + 0][lid] = avec.s0;
tile[kl*VW3 + 1][lid] = avec.s1;
tile[kl*VW3 + 2][lid] = avec.s2;
tile[kl*VW3 + 3][lid] = avec.s3;
tile[kl*VW3 + 4][lid] = avec.s4;
tile[kl*VW3 + 5][lid] = avec.s5;
tile[kl*VW3 + 6][lid] = avec.s6;
tile[kl*VW3 + 7][lid] = avec.s7;
#elif VW3 == 16
tile[kl*VW3 + 0][lid] = avec.s0;
tile[kl*VW3 + 1][lid] = avec.s1;
tile[kl*VW3 + 2][lid] = avec.s2;
tile[kl*VW3 + 3][lid] = avec.s3;
tile[kl*VW3 + 4][lid] = avec.s4;
tile[kl*VW3 + 5][lid] = avec.s5;
tile[kl*VW3 + 6][lid] = avec.s6;
tile[kl*VW3 + 7][lid] = avec.s7;
tile[kl*VW3 + 8][lid] = avec.s8;
tile[kl*VW3 + 9][lid] = avec.s9;
tile[kl*VW3 + 10][lid] = avec.sA;
tile[kl*VW3 + 11][lid] = avec.sB;
tile[kl*VW3 + 12][lid] = avec.sC;
tile[kl*VW3 + 13][lid] = avec.sD;
tile[kl*VW3 + 14][lid] = avec.sE;
tile[kl*VW3 + 15][lid] = avec.sF;
#endif
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (rotated)
#pragma unroll
for (int kl=0; kl<WGS3/VW3; ++kl) {
const int k = (kwg/VW3) + kl;
for (int kl=0; kl<WPT3/VW3; ++kl) {
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
#if VW3 == 1
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
#elif VW3 == 2
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
#elif VW3 == 4
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
#elif VW3 == 8
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
#elif VW3 == 16
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
#endif
for (int v=0; v<VW3; ++v) {
real aval = tile[lid_mod*VW3 + v][lid_div * (WPT3/VW3) + kl];
real xval = xlm[kl*VW3 + v];
MultiplyAdd(acc, xval, aval);
}
}
@ -282,12 +288,9 @@ __kernel void XgemvFastRot(const int m, const int n,
}
// Stores the final result
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
const int gid = get_global_id(0);
real yval = ygm[gid * y_inc + y_offset];
AXPBY(ygm[gid * y_inc + y_offset], alpha, acc, beta, yval);
}
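// A note on the index math above (derived from the code, stated here as a sketch): each thread
// produces one output row, gid = get_global_id(0), and its local id decomposes as
//   lid = lid_div * (WPT3/VW3) + lid_mod
// During the load phase thread 'lid' fetches vector-wide, coalesced chunks of A (vector column
// kwg/VW3 + lid_mod of row get_group_id(0)*WGS3 + lid_div*(WPT3/VW3) + kl); during the
// multiply-add phase the same tile is read back transposed against the WPT3 cached elements of
// X held in 'xlm'.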
// =================================================================================================

View file

@ -18,14 +18,14 @@ R"(
// =================================================================================================
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xger(const int max1, const int max2,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_rowmajor) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
void Xger(const int max1, const int max2,
const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_rowmajor) {
const real alpha = GetRealArg(arg_alpha);
// Register storage for X and Y
real xvalues[WPT];

View file

@ -18,13 +18,13 @@ R"(
// =================================================================================================
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher(const int n,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
void Xher(const int n,
const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
const real alpha = GetRealArg(arg_alpha);
// Register storage for X and XT
real xvalues[WPT];

View file

@ -18,14 +18,14 @@ R"(
// =================================================================================================
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher2(const int n,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
void Xher2(const int n,
const real_arg arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
const real alpha = GetRealArg(arg_alpha);
// Register storage for X and Y
real xvalues[WPT];

View file

@ -20,13 +20,13 @@ R"(
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void HermLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void HermLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -59,13 +59,13 @@ __kernel void HermLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix's data is stored in the upper-triangle
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void HermUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void HermUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll

View file

@ -20,13 +20,13 @@ R"(
// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void SymmLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void SymmLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -53,13 +53,13 @@ __kernel void SymmLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix's data is stored in the upper-triangle
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void SymmUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void SymmUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll

View file

@ -20,14 +20,14 @@ R"(
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void TriaLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest,
const int unit_diagonal) {
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void TriaLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest,
const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -55,14 +55,14 @@ __kernel void TriaLowerToSquared(const int src_dim,
}
// Same as above, but now the matrix's data is stored in the upper-triangle
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void TriaUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest,
const int unit_diagonal) {
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void TriaUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest,
const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll

View file

@ -35,12 +35,12 @@ R"(
// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
__kernel void CopyMatrixFast(const int ld,
__global const realC* restrict src,
__global realC* dest,
const __constant real* restrict arg_alpha) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
void CopyMatrixFast(const int ld,
__global const realC* restrict src,
__global realC* dest,
const real_arg arg_alpha) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
for (int w_one=0; w_one<COPY_WPT; ++w_one) {
const int id_one = get_global_id(0);

View file

@ -24,16 +24,16 @@ R"(
// Copies a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
// value and offset can be different.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const __constant real* restrict arg_alpha,
const int do_conjugate) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
// Loops over the work per thread in both dimensions
#pragma unroll
@ -65,17 +65,17 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
// be different.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const __constant real* restrict arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
// Loops over the work per thread in both dimensions
#pragma unroll
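A self-contained host-side illustration of the pad semantics described above, assuming the kernel's two-dimensional id_one/id_two indexing with a leading dimension (the sizes are hypothetical):

#include <cstdio>
#include <vector>

int main() {
  const int src_one = 2, src_two = 2, src_ld = 2;     // 2x2 source matrix
  const int dest_one = 4, dest_two = 3, dest_ld = 4;  // larger destination
  const std::vector<float> src = {1, 2, 3, 4};
  std::vector<float> dest(dest_ld * dest_two);
  for (int id_two = 0; id_two < dest_two; ++id_two) {
    for (int id_one = 0; id_one < dest_one; ++id_one) {
      const bool in_src = id_one < src_one && id_two < src_two;
      // Copies the source value inside the original bounds, pads with zero outside
      dest[id_two * dest_ld + id_one] = in_src ? src[id_two * src_ld + id_one] : 0.0f;
    }
  }
  for (int one = 0; one < dest_one; ++one) {  // prints the 4x3 padded result
    for (int two = 0; two < dest_two; ++two) { printf("%4.0f", dest[two * dest_ld + one]); }
    printf("\n");
  }
}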

View file

@ -36,12 +36,12 @@ R"(
// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
// offset. A more general version is available in 'transpose_pad.opencl'.
__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
__kernel void TransposeMatrixFast(const int ld,
__global const realT* restrict src,
__global realT* dest,
const __constant real* restrict arg_alpha) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
void TransposeMatrixFast(const int ld,
__global const realT* restrict src,
__global realT* dest,
const real_arg arg_alpha) {
const real alpha = GetRealArg(arg_alpha);
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
// way over workgroups, breaking memory-bank dependencies.

View file

@ -24,16 +24,16 @@ R"(
// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
__kernel void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const __constant real* restrict arg_alpha,
const int do_conjugate) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -88,17 +88,17 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
// padded source matrix, but only the actual data is written back to the transposed destination
// matrix. This kernel optionally checks for upper/lower triangular matrices.
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
__kernel void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const __constant real* restrict arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = arg_alpha[0];
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];

View file

@ -31,7 +31,7 @@
// o-------o o-----o
//
//
// This kernel is separated into two files. This is part 1 out of 2.
// This kernel is separated into three files. This is part 1 out of 3.
//
// =================================================================================================

View file

@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
@ -133,49 +133,93 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
#endif
int idm = mg + GetGroupID0() * (MWG/VWM);
int idn = ng + GetGroupID1() * NWG;
// The final multiplication with alpha and the addition with beta*C
int index = idn*(kSizeM/VWM) + idm;
realM result;
realM xval = cpm[ni][mi];
realM yval = cgm[index];
#if VWM == 1
AXPBY(result, alpha, xval, beta, yval);
#elif VWM == 2
AXPBY(result.x, alpha, xval.x, beta, yval.x);
AXPBY(result.y, alpha, xval.y, beta, yval.y);
#elif VWM == 4
AXPBY(result.x, alpha, xval.x, beta, yval.x);
AXPBY(result.y, alpha, xval.y, beta, yval.y);
AXPBY(result.z, alpha, xval.z, beta, yval.z);
AXPBY(result.w, alpha, xval.w, beta, yval.w);
#elif VWM == 8
AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
#elif VWM == 16
AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
#endif
// The final multiplication with alpha (in case beta == 0)
if (IsZero(beta)) {
#if VWM == 1
Multiply(result, alpha, xval);
#elif VWM == 2
Multiply(result.x, alpha, xval.x);
Multiply(result.y, alpha, xval.y);
#elif VWM == 4
Multiply(result.x, alpha, xval.x);
Multiply(result.y, alpha, xval.y);
Multiply(result.z, alpha, xval.z);
Multiply(result.w, alpha, xval.w);
#elif VWM == 8
Multiply(result.s0, alpha, xval.s0);
Multiply(result.s1, alpha, xval.s1);
Multiply(result.s2, alpha, xval.s2);
Multiply(result.s3, alpha, xval.s3);
Multiply(result.s4, alpha, xval.s4);
Multiply(result.s5, alpha, xval.s5);
Multiply(result.s6, alpha, xval.s6);
Multiply(result.s7, alpha, xval.s7);
#elif VWM == 16
Multiply(result.s0, alpha, xval.s0);
Multiply(result.s1, alpha, xval.s1);
Multiply(result.s2, alpha, xval.s2);
Multiply(result.s3, alpha, xval.s3);
Multiply(result.s4, alpha, xval.s4);
Multiply(result.s5, alpha, xval.s5);
Multiply(result.s6, alpha, xval.s6);
Multiply(result.s7, alpha, xval.s7);
Multiply(result.s8, alpha, xval.s8);
Multiply(result.s9, alpha, xval.s9);
Multiply(result.sA, alpha, xval.sA);
Multiply(result.sB, alpha, xval.sB);
Multiply(result.sC, alpha, xval.sC);
Multiply(result.sD, alpha, xval.sD);
Multiply(result.sE, alpha, xval.sE);
Multiply(result.sF, alpha, xval.sF);
#endif
}
// The final multiplication with alpha and the addition with beta*C
else {
realM yval = cgm[index];
#if VWM == 1
AXPBY(result, alpha, xval, beta, yval);
#elif VWM == 2
AXPBY(result.x, alpha, xval.x, beta, yval.x);
AXPBY(result.y, alpha, xval.y, beta, yval.y);
#elif VWM == 4
AXPBY(result.x, alpha, xval.x, beta, yval.x);
AXPBY(result.y, alpha, xval.y, beta, yval.y);
AXPBY(result.z, alpha, xval.z, beta, yval.z);
AXPBY(result.w, alpha, xval.w, beta, yval.w);
#elif VWM == 8
AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
#elif VWM == 16
AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
#endif
}
cgm[index] = result;
}
}
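This if/else split is the changelog's fix for "level-3 routines when beta is zero and matrix C contains NaNs": the old unconditional AXPBY evaluated beta * C even for beta == 0, and IEEE 754 arithmetic gives 0 * NaN = NaN, so garbage in an uninitialized C leaked into the result. A standalone illustration (hypothetical values):

#include <cmath>
#include <cstdio>

int main() {
  const float c_old = NAN;  // C was never initialized by the caller
  const float alpha = 2.0f, x = 3.0f, beta = 0.0f;
  const float unfixed = alpha * x + beta * c_old;  // 6 + 0*NaN -> NaN
  const float fixed = alpha * x;                   // beta == 0: never read C
  printf("%f %f\n", unfixed, fixed);               // typically prints: nan 6.000000
}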
@ -183,212 +227,6 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
// =================================================================================================
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
__global realM* cgm, realM cpm[NWI][MWI/VWM]
#if SA == 1 && SB == 1
, __local realM* alm, __local realN* blm
#elif SA == 1
, __local realM* alm
#elif SB == 1
, __local realN* blm
#endif
) {
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
realN bpm[NWI/VWN];
// Combined thread identifier (volatile to disable caching)
#if SA == 1 || SB == 1
volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
#endif
// Initializes the accumulation registers
InitAccRegisters(cpm);
// Loops over all workgroup tiles
for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
// Loads data: off-chip --> local (matrix A)
#if SA == 1
GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
#endif
// Loads data: off-chip --> local (matrix B)
#if SB == 1
GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
#endif
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
// Loops over all workitem tiles, unrolled by a factor KWI
for (int pwi=0; pwi<KWG; pwi+=KWI) {
#pragma unroll
for (int pit=0; pit<KWI; ++pit) {
#if SA == 0 || SB == 0
int idk = kwg + pwi + pit;
#endif
#if SA == 1 || SB == 1
int kg = pwi+pit;
#endif
// Loads data: local --> private (matrix A)
#if SA == 1
LocalToPrivateA(alm, apm, kg);
// Loads data: off-chip --> private (matrix A)
#else
GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
#endif
// Loads data: local --> private (matrix B)
#if SB == 1
LocalToPrivateB(blm, bpm, kg);
// Loads data: off-chip --> private (matrix B)
#else
GlobalToPrivateB(bgm, bpm, kSizeN, idk);
#endif
// Performs the accumulation (Cpm += Apm * Bpm)
MultiplyAccumulate(cpm, apm, bpm);
}
}
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
}
#if GLOBAL_MEM_FENCE == 1
barrier(CLK_GLOBAL_MEM_FENCE);
#endif
}
// =================================================================================================
// The upper-triangular and lower-triangular kernels are only used in special cases
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
// Main entry point of the kernel. This is the upper-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Skip this work-group if it does not contain threads contributing to the upper-triangle
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
// Main entry point of the kernel. This is the lower-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Skip this work-group if it does not contain threads contributing to the lower-triangle
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
// =================================================================================================
// If not using a triangular version, include the regular kernel
#else
// Main entry point of the kernel. This is the regular full version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"

View file

@ -0,0 +1,229 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
__global realM* cgm, realM cpm[NWI][MWI/VWM]
#if SA == 1 && SB == 1
, __local realM* alm, __local realN* blm
#elif SA == 1
, __local realM* alm
#elif SB == 1
, __local realN* blm
#endif
) {
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
realN bpm[NWI/VWN];
// Combined thread identifier (volatile to disable caching)
#if SA == 1 || SB == 1
volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
#endif
// Initializes the accumulation registers
InitAccRegisters(cpm);
// Loops over all workgroup tiles
for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
// Loads data: off-chip --> local (matrix A)
#if SA == 1
GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
#endif
// Loads data: off-chip --> local (matrix B)
#if SB == 1
GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
#endif
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
// Loops over all workitem tiles, unrolled by a factor KWI
for (int pwi=0; pwi<KWG; pwi+=KWI) {
#pragma unroll
for (int pit=0; pit<KWI; ++pit) {
#if SA == 0 || SB == 0
int idk = kwg + pwi + pit;
#endif
#if SA == 1 || SB == 1
int kg = pwi+pit;
#endif
// Loads data: local --> private (matrix A)
#if SA == 1
LocalToPrivateA(alm, apm, kg);
// Loads data: off-chip --> private (matrix A)
#else
GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
#endif
// Loads data: local --> private (matrix B)
#if SB == 1
LocalToPrivateB(blm, bpm, kg);
// Loads data: off-chip --> private (matrix B)
#else
GlobalToPrivateB(bgm, bpm, kSizeN, idk);
#endif
// Performs the accumulation (Cpm += Apm * Bpm)
MultiplyAccumulate(cpm, apm, bpm);
}
}
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
}
#if GLOBAL_MEM_FENCE == 1
barrier(CLK_GLOBAL_MEM_FENCE);
#endif
}
// =================================================================================================
// The upper-triangular and lower-triangular kernels are only used in special cases
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
// Main entry point of the kernel. This is the upper-triangular version.
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
void XgemmUpper(const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
// Skip this work-group if it does not contain threads contributing to the upper-triangle
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
// Main entry point of the kernel. This is the lower-triangular version.
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
void XgemmLower(const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
// Skip this work-group if it does not contain threads contributing to the lower-triangle
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
// =================================================================================================
// If not using a triangular version, include the regular kernel
#else
// Main entry point of the kernel. This is the regular full version.
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real_arg arg_alpha,
const real_arg arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -1,34 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides macros to define the public API. This is needed when building a Windows DLL.
// Note: this is only used for the C++ interface; the C interface has its own definition included in
// the header file itself.
//
// =================================================================================================
#ifndef CLBLAST_PUBLIC_API_H_
#define CLBLAST_PUBLIC_API_H_
namespace clblast {
// =================================================================================================
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#ifdef _WIN32
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API
#endif
// =================================================================================================
} // namespace clblast
// CLBLAST_PUBLIC_API_H_
#endif
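This header is deleted in 0.9.0. Unconditional dllexport is only correct while building the DLL itself; client code linking against it needs dllimport, which the changelog's "proper MSVC dllimport and dllexport declarations" item addresses. The replacement is outside this excerpt; a hypothetical sketch of the standard pattern (COMPILING_DLL is an illustrative name, not necessarily CLBlast's actual flag):

#ifdef _WIN32
  #ifdef COMPILING_DLL
    #define PUBLIC_API __declspec(dllexport)  // building the library itself
  #else
    #define PUBLIC_API __declspec(dllimport)  // linking against the installed DLL
  #endif
#else
  #define PUBLIC_API                          // no decoration needed outside Windows
#endif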

View file

@ -13,6 +13,7 @@
#include <string>
#include <vector>
#include <chrono>
#include "routine.hpp"
@ -21,7 +22,8 @@ namespace clblast {
// Constructor: not much here, because no status codes can be returned
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision):
const std::vector<std::string> &routines, const Precision precision,
const std::vector<Database::DatabaseEntry> &userDatabase):
precision_(precision),
routine_name_(name),
queue_(queue),
@ -29,7 +31,7 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
context_(queue_.GetContext()),
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_) {
db_(queue_, routines, precision_, userDatabase) {
}
// =================================================================================================
@ -103,6 +105,13 @@ StatusCode Routine::SetUp() {
// Combines everything together into a single source string
const auto source_string = defines + common_header + source_string_;
// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE
printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n",
routine_name_.c_str(), ToString(precision_).c_str(), device_name_.c_str());
const auto start_time = std::chrono::steady_clock::now();
#endif
// Compiles the kernel
try {
auto program = Program(context_, source_string);
@ -123,6 +132,13 @@ StatusCode Routine::SetUp() {
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}

View file

@ -32,9 +32,11 @@ namespace clblast {
class Routine {
public:
// Base class constructor
// Base class constructor. The user database is an optional extra database to override the
// built-in database.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision);
const std::vector<std::string> &routines, const Precision precision,
const std::vector<Database::DatabaseEntry> &userDatabase = {});
// Set-up phase of the kernel
StatusCode SetUp();
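A hedged sketch of how a derived routine could forward an override database through this constructor (the subclass name is hypothetical, and the layout of Database::DatabaseEntry is not shown in this excerpt):

// Hypothetical subclass passing user-supplied tuning parameters:
MyRoutine::MyRoutine(Queue &queue, EventPointer event,
                     const std::vector<Database::DatabaseEntry> &user_db):
    Routine(queue, event, "MYROUTINE", {"Xgemm"}, Precision::kSingle, user_db) {
  source_string_ = "...";  // kernel sources, as in the existing routines
}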

View file

@ -12,6 +12,7 @@
// =================================================================================================
#include <vector>
#include <chrono>
#include "routines/common.hpp"
@ -21,45 +22,54 @@ namespace clblast {
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, std::vector<Event>& waitForEvents) {
EventPointer event, const std::vector<Event> &waitForEvents) {
// Tests for validity of the local thread sizes
if (local.size() > device.MaxWorkItemDimensions()) {
return StatusCode::kInvalidLocalNumDimensions;
}
const auto max_work_item_sizes = device.MaxWorkItemSizes();
for (auto i=size_t{0}; i<local.size(); ++i) {
if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
}
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
if (!local.empty()) {
// Tests for validity of the local thread sizes
if (local.size() > device.MaxWorkItemDimensions()) {
return StatusCode::kInvalidLocalNumDimensions;
}
const auto max_work_item_sizes = device.MaxWorkItemSizes();
for (auto i=size_t{0}; i<local.size(); ++i) {
if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
}
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
if (global[i] < local[i]) { global[i] = local[i]; }
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
if (global[i] < local[i]) { global[i] = local[i]; }
}
}
// Tests for local memory usage
const auto local_mem_usage = kernel.LocalMemUsage(device);
if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
// Prints the name of the kernel to launch in case of debugging in verbose mode
#ifdef VERBOSE
queue.Finish();
printf("[DEBUG] Running kernel '%s'\n", kernel.GetFunctionName().c_str());
const auto start_time = std::chrono::steady_clock::now();
#endif
// Launches the kernel (and checks for launch errors)
try {
kernel.Launch(queue, global, local, event, waitForEvents);
} catch (...) { return StatusCode::kKernelLaunchError; }
// Prints the elapsed execution time in case of debugging in verbose mode
#ifdef VERBOSE
queue.Finish();
const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// As above, but without an event waiting list
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event) {
auto emptyWaitingList = std::vector<Event>();
return RunKernel(kernel, queue, device, global, local, event, emptyWaitingList);
}
// =================================================================================================
} // namespace clblast

View file

@ -29,21 +29,16 @@ namespace clblast {
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, std::vector<Event>& waitForEvents);
// As above, but without an event waiting list
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event);
EventPointer event, const std::vector<Event> &waitForEvents = {});
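With the wait list defaulted to empty, the separate two-signature pair collapses into one entry point. A hypothetical call site (the global sizes are illustrative; an empty local vector now passes the validity checks in common.cpp and is presumably mapped to a NULL local work size, letting the OpenCL runtime pick the work-group shape):

const size_t global_x = 256, global_y = 256;  // illustrative sizes
auto event = Event();
auto status = RunKernel(kernel, queue, device,
                        {global_x, global_y},  // global thread sizes
                        {},                    // empty local: the runtime decides
                        event.pointer());      // waitForEvents defaults to {}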
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, std::vector<Event>& waitForEvents,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
@ -88,10 +83,6 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
}
}
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context, 1);
alpha_buffer.Write(queue, 1, &alpha);
// Retrieves the kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
@ -101,7 +92,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, alpha_buffer());
kernel.SetArgument(3, GetRealArg(alpha));
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
@ -114,7 +105,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, alpha_buffer());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}

View file

@ -59,20 +59,16 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));

View file

@ -22,7 +22,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue<T>()) {
Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) {
source_string_ =
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
@ -122,16 +122,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
if (fast_kernel_rot) {
kernel_name = "XgemvFastRot";
global_size = m_real / db_["WPT3"];
global_size = m_real;
local_size = db_["WGS3"];
}
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &beta);
// Retrieves the Xgemv kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@ -140,8 +134,8 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));
kernel.SetArgument(1, static_cast<int>(n_real));
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, static_cast<int>(a_rotated));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));

View file

@ -56,10 +56,6 @@ StatusCode Xger<T>::DoGer(const Layout layout,
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@ -68,7 +64,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));

View file

@ -70,10 +70,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
// Creates a matching version of alpha
const auto matching_alpha = GetAlpha(alpha);
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &matching_alpha);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@ -81,7 +77,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(1, GetRealArg(matching_alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));

View file

@ -58,10 +58,6 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@ -69,7 +65,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));

View file

@ -34,6 +34,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/convert_hermitian.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}
@ -63,9 +64,12 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
(layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
const auto c_rotated = (layout == Layout::kRowMajor);
const auto a_do_transpose = a_rotated;
const auto b_do_transpose = !b_rotated;
const auto c_do_transpose = c_rotated;
static const auto a_want_rotated = false;
static const auto b_want_rotated = true;
static const auto c_want_rotated = false;
const auto a_do_transpose = a_rotated != a_want_rotated;
const auto b_do_transpose = b_rotated != b_want_rotated;
const auto c_do_transpose = c_rotated != c_want_rotated;
// In case of complex data-types, the transpose can also become a conjugate transpose
const auto a_conjugate = (a_transpose == Transpose::kConjugate);
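// Worked example (assuming a_rotated is defined symmetrically to b_rotated above):
// column-major with A non-transposed and B transposed gives a_rotated = false,
// b_rotated = true and c_rotated = false, so:
//   a_do_transpose = (false != false) = false  -> A is used as-is
//   b_do_transpose = (true  != true)  = false  -> B is used as-is: the kernel
//   c_do_transpose = (false != false) = false     wants B in rotated form anyway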
@ -99,6 +103,15 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
const auto n_ceiled = Ceil(n, db_["NWG"]);
const auto k_ceiled = Ceil(k, db_["KWG"]);
// Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account
// whether the matrices need to be rotated or not for the kernel.
const auto a_one_i = (a_want_rotated) ? k_ceiled : m_ceiled;
const auto a_two_i = (a_want_rotated) ? m_ceiled : k_ceiled;
const auto b_one_i = (b_want_rotated) ? n_ceiled : k_ceiled;
const auto b_two_i = (b_want_rotated) ? k_ceiled : n_ceiled;
const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;
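// Example (hypothetical tuning values, with Ceil rounding up to the next multiple):
// for m = 100, k = 50 and MWG = KWG = 64, m_ceiled = 128 and k_ceiled = 64, so A is
// staged in a 128 x 64 temporary (a_one_i x a_two_i), since a_want_rotated is false.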
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
@ -106,23 +119,17 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
a_do_transpose == false && a_conjugate == false;
auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
b_do_transpose == false && b_conjugate == false;
auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 &&
auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
c_do_transpose == false;
// Creates the temporary matrices
const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*m_ceiled);
const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &beta);
const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
@ -133,9 +140,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
a_one_i, a_two_i, a_one_i, 0, a_temp,
ConstantOne<T>(), program,
true, a_do_transpose, a_conjugate);
if (ErrorIn(status)) { return status; }
@ -145,9 +152,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// As above, but now for matrix B
if (!b_no_temp) {
auto eventProcessB = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
b_one_i, b_two_i, b_one_i, 0, b_temp,
ConstantOne<T>(), program,
true, b_do_transpose, b_conjugate);
if (ErrorIn(status)) { return status; }
@ -157,9 +164,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one_i, c_two_i, c_one_i, 0, c_temp,
ConstantOne<T>(), program,
true, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
@ -174,16 +181,16 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
kernel.SetArgument(0, static_cast<int>(m_ceiled));
kernel.SetArgument(1, static_cast<int>(n_ceiled));
kernel.SetArgument(2, static_cast<int>(k_ceiled));
kernel.SetArgument(3, alpha_buffer());
kernel.SetArgument(4, beta_buffer());
kernel.SetArgument(3, GetRealArg(alpha));
kernel.SetArgument(4, GetRealArg(beta));
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, b_temp());
kernel.SetArgument(7, c_temp());
// Computes the global and local thread sizes
const auto global = std::vector<size_t>{
(m_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
(c_one_i * db_["MDIMC"]) / db_["MWG"],
(c_two_i * db_["NDIMC"]) / db_["NWG"]
};
const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
@ -196,8 +203,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Runs the post-processing kernel if needed
if (!c_no_temp) {
eventWaitList.push_back(eventKernel);
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
c_one_i, c_two_i, c_one_i, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
false, c_do_transpose, false);

View file

@ -31,6 +31,7 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/transpose_pad.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}
@ -107,12 +108,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
// Convert the arguments to complex versions
auto complex_beta = T{beta, static_cast<U>(0.0)};
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &complex_beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
@ -123,7 +120,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// case nothing has to be done, these kernels can be skipped.
if (!a1_no_temp) {
auto eventProcessA1 = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
ConstantOne<T>(), program,
@ -133,7 +130,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
}
if (!a2_no_temp) {
auto eventProcessA2 = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
ConstantOne<T>(), program,
@ -143,7 +140,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
}
if (!b1_no_temp) {
auto eventProcessB1 = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
ConstantOne<T>(), program,
@ -153,7 +150,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
}
if (!b2_no_temp) {
auto eventProcessB2 = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
ConstantOne<T>(), program,
@ -165,7 +162,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@ -180,8 +177,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(complex_beta));
kernel.SetArgument(4, a1_temp());
kernel.SetArgument(5, b2_temp());
kernel.SetArgument(6, c_temp());
@ -202,10 +199,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugates 'alpha'
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
alpha_buffer.Write(queue_, 1, &conjugate_alpha);
beta_buffer.Write(queue_, 1, &complex_one);
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(2, GetRealArg(conjugate_alpha));
kernel.SetArgument(3, GetRealArg(complex_one));
kernel.SetArgument(4, b1_temp());
kernel.SetArgument(5, a2_temp());
@ -218,7 +213,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
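
Throughout these routines the one-element alpha/beta buffers and their host-to-device writes are replaced by plain scalar kernel arguments via GetRealArg. A sketch of the difference in raw OpenCL terms (hypothetical helper names, not CLBlast code):

#include <CL/cl.h>

// Old style (sketch): one extra buffer object and a host-to-device write per
// scalar. The caller owns *out_buffer and must release it after the kernel ran.
cl_int SetAlphaViaBuffer(cl_context context, cl_command_queue queue,
                         cl_kernel kernel, cl_uint index, float alpha,
                         cl_mem *out_buffer) {
  cl_int err = CL_SUCCESS;
  *out_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float),
                               nullptr, &err);
  if (err != CL_SUCCESS) { return err; }
  err = clEnqueueWriteBuffer(queue, *out_buffer, CL_TRUE, 0, sizeof(float),
                             &alpha, 0, nullptr, nullptr);
  if (err != CL_SUCCESS) { return err; }
  return clSetKernelArg(kernel, index, sizeof(cl_mem), out_buffer);
}

// New style (sketch): the value goes in directly; no buffer, no transfer, no cleanup.
cl_int SetAlphaAsScalar(cl_kernel kernel, cl_uint index, float alpha) {
  return clSetKernelArg(kernel, index, sizeof(float), &alpha);
}

For half-precision, GetRealArg additionally promotes the value to float, since half is not usable as a kernel-argument type here; see the common.cpp and common.hpp hunks further down.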

View file

@ -31,6 +31,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/transpose_pad.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}
@ -98,13 +99,9 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
// Convert the arguments to complex versions
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
auto complex_beta = T{beta, static_cast<U>(0.0)};
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &complex_alpha);
beta_buffer.Write(queue_, 1, &complex_beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
@ -115,7 +112,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// case nothing has to be done, these kernels can be skipped. Two copies are created.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
@ -125,7 +122,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
}
if (!b_no_temp) {
auto eventProcessB = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
@ -137,7 +134,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@ -152,8 +149,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(2, GetRealArg(complex_alpha));
kernel.SetArgument(3, GetRealArg(complex_beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
@ -174,7 +171,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,

View file

@ -31,6 +31,7 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/transpose_pad.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}
@ -97,12 +98,6 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@ -112,7 +107,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
@ -122,7 +117,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
}
if (!b_no_temp) {
auto eventProcessB = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
@ -134,7 +129,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@ -149,8 +144,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
@ -170,8 +165,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
beta_buffer.Write(queue_, 1, &one);
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(3, GetRealArg(one));
kernel.SetArgument(4, b_temp());
kernel.SetArgument(5, a_temp());
@ -184,7 +178,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,

View file

@ -31,6 +31,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/transpose_pad.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
;
}
@ -90,12 +91,6 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@ -105,7 +100,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
auto eventProcessA = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
@ -117,7 +112,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
auto eventProcessC = Event();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
@ -132,8 +127,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, c_temp());
@ -154,7 +149,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,

View file

@ -72,7 +72,7 @@ StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto emptyEventList = std::vector<Event>();
status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList,
status = PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
b_one, b_two, b_ld, b_offset, b_buffer,
alpha, program, false, transpose, conjugate);

View file

@ -86,11 +86,10 @@ class TuneCopy {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
}
// Describes how to compute the performance metrics

View file

@ -86,7 +86,6 @@ class TunePad {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
@ -97,7 +96,7 @@ class TunePad {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentScalar(0);
}

View file

@ -91,11 +91,10 @@ class TuneTranspose {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
}
// Describes how to compute the performance metrics

View file

@ -90,7 +90,6 @@ class TunePadTranspose {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
@ -101,7 +100,7 @@ class TunePadTranspose {
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentScalar(0);
}

View file

@ -89,9 +89,8 @@ class TuneXaxpy {
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentOutput(y_vec);
}

View file

@ -7,7 +7,9 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels.
// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations:
// - V==1: This tests a limited set of tuning parameters exhaustively.
// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
//
// =================================================================================================
@ -21,18 +23,19 @@ namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
template <typename T, int V>
class TuneXgemm {
public:
// The representative kernel and the source code
static std::string KernelFamily() { return "xgemm"; }
static std::string KernelFamily() { return (V==1) ? "xgemm_1" : "xgemm_2"; }
static std::string KernelName() { return "Xgemm"; }
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
#include "../src/kernels/level3/xgemm_part1.opencl"
#include "../src/kernels/level3/xgemm_part2.opencl"
#include "../src/kernels/level3/xgemm_part3.opencl"
;
}
@ -48,7 +51,7 @@ class TuneXgemm {
static size_t DefaultM() { return 1024; }
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1024; }
static double DefaultFraction() { return 2048.0; }
static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
@ -60,20 +63,38 @@ class TuneXgemm {
// Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
tuner.AddParameter(id, "MWG", {16, 32, 64, 128});
tuner.AddParameter(id, "NWG", {16, 32, 64, 128});
tuner.AddParameter(id, "KWG", {16, 32});
tuner.AddParameter(id, "MDIMC", {8, 16, 32});
tuner.AddParameter(id, "NDIMC", {8, 16, 32});
tuner.AddParameter(id, "MDIMA", {8, 16, 32});
tuner.AddParameter(id, "NDIMB", {8, 16, 32});
tuner.AddParameter(id, "KWI", {2, 8});
tuner.AddParameter(id, "VWM", {1, 2, 4, 8});
tuner.AddParameter(id, "VWN", {1, 2, 4, 8});
tuner.AddParameter(id, "STRM", {0, 1});
tuner.AddParameter(id, "STRN", {0, 1});
tuner.AddParameter(id, "SA", {0, 1});
tuner.AddParameter(id, "SB", {0, 1});
if (V==1) { // limited subset of tuning parameters - but explorable exhaustively
tuner.AddParameter(id, "MWG", {16, 32, 64});
tuner.AddParameter(id, "NWG", {16, 32, 64});
tuner.AddParameter(id, "KWG", {32});
tuner.AddParameter(id, "MDIMC", {8, 16, 32});
tuner.AddParameter(id, "NDIMC", {8, 16, 32});
tuner.AddParameter(id, "MDIMA", {8, 16, 32});
tuner.AddParameter(id, "NDIMB", {8, 16, 32});
tuner.AddParameter(id, "KWI", {2});
tuner.AddParameter(id, "VWM", {1, 2, 4});
tuner.AddParameter(id, "VWN", {1, 2, 4});
tuner.AddParameter(id, "STRM", {0});
tuner.AddParameter(id, "STRN", {0});
tuner.AddParameter(id, "SA", {0, 1});
tuner.AddParameter(id, "SB", {0, 1});
} // a lot more tuning parameters - too many to test exhaustively, so sampled randomly
else {
tuner.AddParameter(id, "MWG", {16, 32, 64, 128});
tuner.AddParameter(id, "NWG", {16, 32, 64, 128});
tuner.AddParameter(id, "KWG", {16, 32});
tuner.AddParameter(id, "MDIMC", {8, 16, 32});
tuner.AddParameter(id, "NDIMC", {8, 16, 32});
tuner.AddParameter(id, "MDIMA", {8, 16, 32});
tuner.AddParameter(id, "NDIMB", {8, 16, 32});
tuner.AddParameter(id, "KWI", {2});
tuner.AddParameter(id, "VWM", {1, 2, 4, 8});
tuner.AddParameter(id, "VWN", {1, 2, 4, 8});
tuner.AddParameter(id, "STRM", {0, 1});
tuner.AddParameter(id, "STRN", {0, 1});
tuner.AddParameter(id, "SA", {0, 1});
tuner.AddParameter(id, "SB", {0, 1});
}
}
// Sets the constraints
@ -92,6 +113,14 @@ class TuneXgemm {
// KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...)
tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"});
tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"});
// Extra constraints for variation 1 to limit the set of options significantly
if (V==1) {
auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"});
tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"});
tuner.AddConstraint(id, IsEqual, {"SA", "SB"});
}
}
// Sets the local memory size
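
The three IsEqual constraints make the exhaustive V==1 search tractable by tying pairs of parameters together. A self-contained sketch (not CLTune itself) of how such predicate constraints prune a parameter grid:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  using Values = std::vector<size_t>;
  const Values mdimc_values = {8, 16, 32};
  const Values mdima_values = {8, 16, 32};
  // Mirrors AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"}) above
  auto IsEqual = [](const Values &v) { return v[0] == v[1]; };
  size_t kept = 0, total = 0;
  for (const auto mdimc : mdimc_values) {
    for (const auto mdima : mdima_values) {
      ++total;
      if (IsEqual({mdimc, mdima})) { ++kept; }
    }
  }
  std::cout << kept << " of " << total << " combinations survive\n";  // 3 of 9
  return 0;
}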
@ -121,13 +150,11 @@ class TuneXgemm {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
auto beta_buffer = std::vector<T>{args.beta};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.k));
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentInput(beta_buffer);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentScalar(GetRealArg(args.beta));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentInput(b_mat);
tuner.AddArgumentOutput(c_mat);
@ -147,15 +174,22 @@ class TuneXgemm {
using float2 = clblast::float2;
using double2 = clblast::double2;
// Function to tune a specific variation V (not within the clblast namespace)
template <int V>
void StartVariation(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half,V>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float,V>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double,V>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2,V>, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemm<double2,V>, double2>(argc, argv); break;
}
}
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2>, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemm<double2>, double2>(argc, argv); break;
}
StartVariation<1>(argc, argv);
StartVariation<2>(argc, argv);
return 0;
}
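
Dispatching on a compile-time V rather than a runtime flag gives each variation its own fully specialised tuner class and its own kernel-family name (xgemm_1 / xgemm_2), so their results are kept apart. A trivial standalone sketch of the dispatch pattern:

#include <iostream>

template <int V>
void StartVariation() {  // stand-in for the tuner launch above
  std::cout << "variation " << V
            << (V == 1 ? ": exhaustive over a limited parameter set\n"
                       : ": random sampling over the full parameter set\n");
}

int main() {
  StartVariation<1>();
  StartVariation<2>();
  return 0;
}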

View file

@ -29,7 +29,7 @@ class TuneXgemv {
public:
// The representative kernel and the source code
static std::string KernelFamily() { return "xgemv_"+std::to_string(V); }
static std::string KernelFamily() { return (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); }
static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); }
static std::string GetSources() {
return
@ -61,21 +61,42 @@ class TuneXgemv {
// Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256});
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
if (V==1) {
tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256});
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
}
if (V==2) {
tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128, 256});
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8});
}
if (V==3) {
tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128});
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32});
tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8});
}
}
// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
if (V==2 || V==3) {
auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
}
if (V==3) {
auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; };
tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
}
}
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
if (V==1 || V==2) {
auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
}
else {
auto LocalMemorySize = [args] (std::vector<size_t> v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); };
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
}
}
// Sets the base thread configuration
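
For the rotated variant (V==3) the local memory holds a WGS3-by-WPT3 tile plus a WPT3-sized vector chunk, which is what the (v[0]*v[1] + v[1]) term expresses. A standalone sketch of that estimate (the example numbers are illustrative):

#include <cstddef>
#include <iostream>

// Matches the lambda above: (WGS * WPT + WPT) elements of the given precision
size_t LocalMemoryBytes(const size_t wgs, const size_t wpt,
                        const size_t bytes_per_element) {
  return (wgs * wpt + wpt) * bytes_per_element;
}

int main() {
  // e.g. WGS3==64, WPT3==4 in single precision: (64*4 + 4) * 4 = 1040 bytes
  std::cout << LocalMemoryBytes(64, 4, sizeof(float)) << " bytes\n";
  return 0;
}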
@ -89,20 +110,21 @@ class TuneXgemv {
static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; }
static TransformVector DivLocal() { return {}; }
static TransformVector MulGlobal() { return {}; }
static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; }
static TransformVector DivGlobal() {
if (V==1 || V==2) return {{"WPT"+std::to_string(V)}};
return {};
}
// Sets the kernel's arguments
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
auto beta_buffer = std::vector<T>{args.beta};
auto a_rotated = (V==3) ? 1 : 0;
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentInput(beta_buffer);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentScalar(GetRealArg(args.beta));
tuner.AddArgumentScalar(static_cast<int>(a_rotated));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(0);

View file

@ -85,10 +85,9 @@ class TuneXger {
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentScalar(GetRealArg(args.alpha));
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentScalar(0); // x_offset
tuner.AddArgumentScalar(1); // x_increment

View file

@ -161,6 +161,8 @@ template <typename T>
T ConvertArgument(const char* value) {
return static_cast<T>(std::stoi(value));
}
template size_t ConvertArgument(const char* value);
template <> half ConvertArgument(const char* value) {
return FloatToHalf(static_cast<float>(std::stod(value)));
}
@ -179,6 +181,15 @@ template <> double2 ConvertArgument(const char* value) {
return double2{val, val};
}
// Variant of "ConvertArgument" with default values
template <typename T>
T ConvertArgument(const char* value, T default_value) {
if (value) { return ConvertArgument<T>(value); }
return default_value;
}
template size_t ConvertArgument(const char* value, size_t default_value);
// This function matches patterns in the form of "-option value" or "--option value". It returns a
// default value in case the option is not found in the argument string.
template <typename T>
@ -332,6 +343,14 @@ void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_com
result.Write(queue, size, result_cpu);
}
// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
// no conversion, but half-precision is not supported as a kernel argument, so it is converted to float.
template <> typename RealArg<half>::Type GetRealArg(const half value) { return HalfToFloat(value); }
template <> typename RealArg<float>::Type GetRealArg(const float value) { return value; }
template <> typename RealArg<double>::Type GetRealArg(const double value) { return value; }
template <> typename RealArg<float2>::Type GetRealArg(const float2 value) { return value; }
template <> typename RealArg<double2>::Type GetRealArg(const double2 value) { return value; }
// =================================================================================================
// Rounding functions performing ceiling and division operations
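
These specialisations pair with the RealArg trait declared in common.hpp (next file): every type maps to itself except half, which is promoted to float before being set as a kernel argument. A self-contained sketch of the pattern, with a minimal IEEE 754 binary16 decoder standing in for CLBlast's HalfToFloat:

#include <cmath>
#include <cstdint>
#include <iostream>

using half = std::uint16_t;  // stand-in for CLBlast's half typedef

// Minimal binary16 decoder (illustrative; not CLBlast's implementation)
float HalfToFloat(const half h) {
  const std::uint32_t sign = (h >> 15) & 0x1;
  const std::uint32_t exponent = (h >> 10) & 0x1F;
  const std::uint32_t fraction = h & 0x3FF;
  const float s = sign ? -1.0f : 1.0f;
  if (exponent == 0) { return s * std::ldexp(static_cast<float>(fraction), -24); }
  if (exponent == 31) { return fraction ? NAN : s * INFINITY; }
  return s * std::ldexp(1.0f + static_cast<float>(fraction) / 1024.0f,
                        static_cast<int>(exponent) - 15);
}

template <typename T> struct RealArg { using Type = T; };
template <> struct RealArg<half> { using Type = float; };

template <typename T>
typename RealArg<T>::Type GetRealArg(const T value) { return value; }
template <> float GetRealArg(const half value) { return HalfToFloat(value); }

int main() {
  const half one = 0x3C00;  // 1.0 in binary16
  std::cout << GetRealArg(one) << " " << GetRealArg(2.5) << "\n";  // prints: 1 2.5
  return 0;
}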

View file

@ -80,8 +80,9 @@ constexpr auto kArgComparecblas = "cblas";
constexpr auto kArgStepSize = "step";
constexpr auto kArgNumSteps = "num_steps";
constexpr auto kArgNumRuns = "runs";
constexpr auto kArgWarmUp = "warm_up";
// The client-specific arguments in string form
// The test-specific arguments in string form
constexpr auto kArgFullTest = "full_test";
constexpr auto kArgVerbose = "verbose";
@ -186,6 +187,10 @@ std::string ToString(T value);
template <typename T>
T ConvertArgument(const char* value);
// Variant of "ConvertArgument" with default values
template <typename T>
T ConvertArgument(const char* value, T default_value);
// Basic argument parser, matching patterns in the form of "-option value" and "--option value"
template <typename T>
T GetArgument(const int argc, char **argv, std::string &help,
@ -226,6 +231,12 @@ void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& sour
Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
// no conversion, but half-precision is not supported as a kernel argument, so it is converted to float.
template <typename T> struct RealArg { using Type = T; };
template <> struct RealArg<half> { using Type = float; };
template <typename T> typename RealArg<T>::Type GetRealArg(const T value);
// =================================================================================================
// Rounding functions

View file

@ -15,6 +15,7 @@
#include <vector>
#include <iostream>
#include <cmath>
#include <cstdlib>
#include "test/correctness/tester.hpp"
@ -27,8 +28,8 @@ template <typename T, typename U>
Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options):
help_("Options given/available:\n"),
platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))),
device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))),
platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})))),
device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})))),
context_(Context(device_)),
queue_(Queue(context_, device_)),
full_test_(CheckArgument(argc, argv, help_, kArgFullTest)),
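
The default platform and device indices now fall back to the CLBLAST_PLATFORM and CLBLAST_DEVICE environment variables when the corresponding flags are absent. A standalone sketch of the fallback (ParseIndex is an illustrative stand-in for ConvertArgument(const char*, size_t)):

#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <string>

size_t ParseIndex(const char *value, const size_t default_value) {
  if (value == nullptr) { return default_value; }  // variable not set
  return static_cast<size_t>(std::stoi(value));
}

int main() {
  const auto platform_id = ParseIndex(std::getenv("CLBLAST_PLATFORM"), size_t{0});
  const auto device_id = ParseIndex(std::getenv("CLBLAST_DEVICE"), size_t{0});
  std::cout << "defaulting to platform " << platform_id
            << ", device " << device_id << "\n";
  return 0;
}

Something like CLBLAST_DEVICE=1 ./test_binary would then target the second device without any flags (binary name illustrative); -platform and -device still take precedence.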

View file

@ -113,6 +113,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le
args.print_help = CheckArgument(argc, argv, help, kArgHelp);
args.silent = CheckArgument(argc, argv, help, kArgQuiet);
args.no_abbrv = CheckArgument(argc, argv, help, kArgNoAbbreviations);
warm_up_ = CheckArgument(argc, argv, help, kArgWarmUp);
// Prints the chosen (or defaulted) arguments to screen. This also serves as the help message,
// which is thus always displayed (unless silence is specified).
@ -244,12 +245,24 @@ template <typename T, typename U>
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
Buffers<T> &buffers, Queue &queue,
Routine run_blas, const std::string &library_name) {
auto status = StatusCode::kSuccess;
// Do an optional warm-up to omit compilation times and initialisations from the measurements
if (warm_up_) {
try {
status = run_blas(args, buffers, queue);
} catch (...) { status = static_cast<StatusCode>(kUnknownError); }
if (status != StatusCode::kSuccess) {
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
}
}
// Start the timed part
auto timings = std::vector<double>(num_runs);
for (auto &timing: timings) {
auto start_time = std::chrono::steady_clock::now();
// Executes the main computation
auto status = StatusCode::kSuccess;
try {
status = run_blas(args, buffers, queue);
} catch (...) { status = static_cast<StatusCode>(kUnknownError); }
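
The warm-up keeps one-off costs such as kernel compilation and first-touch initialisation out of the measured loop. A self-contained sketch of the pattern; taking the minimum over the runs is an assumption of this sketch, not necessarily what the client reports:

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <functional>
#include <vector>

double TimedExecutionMs(const size_t num_runs, const bool warm_up,
                        const std::function<void()> &run_blas) {
  if (warm_up) { run_blas(); }  // absorbs compilation/initialisation, not timed
  auto timings = std::vector<double>(num_runs);
  for (auto &timing : timings) {
    const auto start = std::chrono::steady_clock::now();
    run_blas();  // the routine under test
    const auto elapsed = std::chrono::steady_clock::now() - start;
    timing = std::chrono::duration<double, std::milli>(elapsed).count();
  }
  return *std::min_element(timings.begin(), timings.end());  // best of N runs
}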

View file

@ -82,6 +82,9 @@ class Client {
const std::vector<std::string> options_;
const GetMetric get_flops_;
const GetMetric get_bytes_;
// Extra arguments
bool warm_up_; // if enabled, do a warm-up run first before measuring execution time
};
// =================================================================================================