Merge pull request #60 from CNugteren/development

Update to version 0.7.1
This commit is contained in:
Cedric Nugteren 2016-05-18 21:18:07 +02:00
commit 181eb20bbf
22 changed files with 347 additions and 260 deletions

View file

@ -1,4 +1,9 @@
Version 0.7.1
- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
- Fixed a bug in the xGEMM routine where the output event was set incorrectly
- Made MSVC link the run-time libraries statically
Version 0.7.0
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
- Made the library thread-safe

View file

@ -9,12 +9,17 @@
#
# ==================================================================================================
# CMake project details
cmake_minimum_required(VERSION 2.8.10)
# Overrides for MSVC static runtime
set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake)
set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_flag_overrides.cmake)
# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 7)
set(clblast_VERSION_PATCH 0)
set(clblast_VERSION_PATCH 1)
# Options and their default values
option(SAMPLES "Enable compilation of the examples" OFF)
@ -32,40 +37,40 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically deter
# ==================================================================================================
# Compiler-version check (requires at least CMake 2.8.10)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
message(FATAL_ERROR "GCC version must be at least 4.7")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
message(FATAL_ERROR "Clang version must be at least 3.3")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
message(FATAL_ERROR "AppleClang version must be at least 5.0")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
message(FATAL_ERROR "ICC version must be at least 14.0")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
elseif(MSVC)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
message(FATAL_ERROR "MS Visual Studio version must be at least 18.0")
endif()
endif()
# C++ compiler settings
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
if(MSVC)
set(FLAGS "/Ox")
set(FLAGS "${FLAGS} /wd4715")
else ()
else()
set(FLAGS "-O3 -std=c++11")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang)
set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
@ -75,9 +80,9 @@ endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
# C compiler settings (for the sample)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
if(MSVC)
set(CFLAGS "/Ox")
else ()
else()
set(CFLAGS "-O3 -std=c99")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")
@ -228,7 +233,7 @@ if(TESTS)
if(CLBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
add_definitions(" -DCLBLAST_REF_CLBLAS")
@ -237,7 +242,7 @@ if(TESTS)
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
add_definitions(" -DCLBLAST_REF_CBLAS")

View file

@ -54,13 +54,13 @@ The pre-requisites for compilation of CLBlast are:
Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS (maintained by AMD)
* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
* A regular CPU Netlib BLAS library, e.g.:
- OpenBLAS
- BLIS
- Accelerate
An example of an out-of-source build (starting from the root of the CLBlast folder):
An example of an out-of-source build using a command-line compiler and make (starting from the root of the CLBlast folder):
mkdir build
cd build
@ -68,6 +68,12 @@ An example of an out-of-source build (starting from the root of the CLBlast fold
make
sudo make install
When using Visual Studio, the project-files can be generated as follows:
mkdir build
cd build
cmake -G "Visual Studio 14 Win64" ..
A custom installation folder can be specified when calling CMake:
cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory ..
@ -84,7 +90,7 @@ Or alternatively the plain C version:
#include <clblast_c.h>
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/api.md). Additionally, a couple of stand-alone example programs are included in `samples/`.
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in `samples/`.
Using the tuners (optional)
@ -254,6 +260,7 @@ The contributing authors (code, pull requests, testing) so far are:
* [Cedric Nugteren](http://www.cedricnugteren.nl)
* [Anton Lokhmotov](https://github.com/psyhtest)
* [Dragan Djuric](https://github.com/blueberry)
* [Marco Hutter](https://github.com/gpus)
* [Hugh Perkins](https://github.com/hughperkins)
Tuning and testing on a variety of OpenCL devices was made possible by:

View file

@ -0,0 +1,9 @@
# Initial C compiler flags for MSVC, selecting the static run-time libraries (/MT, or /MTd for the
# debug configuration) for each build configuration.
# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F
if(MSVC)
  # Optimized configurations
  set(CMAKE_C_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG")
  set(CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG")
  set(CMAKE_C_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG")

  # Debug configuration
  set(CMAKE_C_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1")
endif()

View file

@ -0,0 +1,9 @@
# Initial C++ compiler flags for MSVC, selecting the static run-time libraries (/MT, or /MTd for the
# debug configuration) for each build configuration.
# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F
if(MSVC)
  # Optimized configurations
  set(CMAKE_CXX_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG")
  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG")
  set(CMAKE_CXX_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG")

  # Debug configuration
  set(CMAKE_CXX_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1")
endif()

View file

@ -18,11 +18,11 @@ const Database::DatabaseEntry Database::XgemmSingle = {
"Xgemm", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs

View file

@ -48,14 +48,18 @@ void Tuner(int argc, char* argv[]) {
// Tests validity of the given arguments
C::TestValidArguments(args);
// Tests for validity of the precision
// Tests for validity of the precision and retrieves properties
auto isAMD = false;
auto isGPU = false;
{
auto platform = Platform(args.platform_id);
auto device = Device(platform, args.device_id);
const auto platform = Platform(args.platform_id);
const auto device = Device(platform, args.device_id);
if (!PrecisionSupported<T>(device)) {
printf("* Unsupported precision, skipping this tuning run\n\n");
return;
}
isAMD = device.Vendor() == "AMD" || device.Vendor() == "Advanced Micro Devices, Inc.";
isGPU = device.Type() == "GPU";
}
// Creates input buffers with random data
@ -84,8 +88,15 @@ void Tuner(int argc, char* argv[]) {
tuner.UseRandomSearch(1.0/args.fraction);
}
// Set extra settings for specific defines. This mimics src/routine.cc.
auto defines = std::string{""};
if (isAMD && isGPU) {
defines += "#define USE_CL_MAD 1\n";
defines += "#define USE_STAGGERED_INDICES 1\n";
}
// Loads the kernel sources and defines the kernel to tune
auto sources = C::GetSources();
auto sources = defines + C::GetSources();
auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize());
tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef());

View file

@ -92,6 +92,7 @@ def ConcatenateData(df1, df2):
def RemoveDuplicates(df):
    """Returns the result of df.drop_duplicates(); the input 'df' itself is not modified."""
    deduplicated = df.drop_duplicates()
    return deduplicated
# database = database[(database["device"] != "AMD Radeon R9 M370X Compute Engine") | (database["kernel_family"] != "xgemm") | (database["precision"] != "32")]
def RemoveEntriesByDevice(df, devicename):
    """Returns the rows of 'df' whose "device" column differs from 'devicename'."""
    keep_mask = df["device"] != devicename
    return df[keep_mask]

View file

@ -176,6 +176,32 @@ R"(
// =================================================================================================
// Work-group index helpers. By default these simply forward to get_group_id(). When
// USE_STAGGERED_INDICES is defined as 1 (done for specific devices, see src/routine.cc), the indices
// are shuffled instead to avoid partition camping.
#ifndef USE_STAGGERED_INDICES
#define USE_STAGGERED_INDICES 0
#endif

// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formulas are taken from:
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
// More details: https://github.com/CNugteren/CLBlast/issues/53
#if USE_STAGGERED_INDICES == 1

  // Flattened work-group index: the dimension-0 index plus the dimension-1 index multiplied by the
  // number of groups in dimension 0
  inline size_t GetGroupIDFlat() {
    const size_t group0 = get_group_id(0);
    const size_t group1 = get_group_id(1);
    return group0 + get_num_groups(0) * group1;
  }

  // Shuffled index for dimension 1: the flattened index modulo the number of groups in dimension 1
  inline size_t GetGroupID1() {
    const size_t flat_id = GetGroupIDFlat();
    return flat_id % get_num_groups(1);
  }

  // Shuffled index for dimension 0: the flattened index divided by the number of groups in
  // dimension 1, offset by the shuffled dimension-1 index and wrapped around in dimension 0
  inline size_t GetGroupID0() {
    const size_t flat_id = GetGroupIDFlat();
    const size_t num_groups1 = get_num_groups(1);
    const size_t shuffled1 = flat_id % num_groups1;
    return ((flat_id / num_groups1) + shuffled1) % get_num_groups(0);
  }

#else

  // Plain pass-through of the OpenCL work-group indices
  inline size_t GetGroupID1() {
    return get_group_id(1);
  }
  inline size_t GetGroupID0() {
    return get_group_id(0);
  }

#endif
// =================================================================================================
// End of the C++11 raw string literal
)"

View file

@ -199,7 +199,7 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al
// Computes the indices for the global memory
int kg = kia + la1*KWA;
int idm = mg + get_group_id(0)*(MWG/VWM);
int idm = mg + GetGroupID0() * (MWG/VWM);
int idk = kg + kwg;
// Loads the data from global memory (not transposed) into the local memory
@ -229,7 +229,7 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl
// Computes the indices for the global memory
int kg = kib + lb1*KWB;
int idn = ng + get_group_id(1)*(NWG/VWN);
int idn = ng + GetGroupID1() * (NWG/VWN);
int idk = kg + kwg;
// Loads the data from global memory (transposed) into the local memory
@ -257,7 +257,7 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V
#endif
// Computes the indices for the global memory
int idm = mg + get_group_id(0)*(MWG/VWM);
int idm = mg + GetGroupID0() * (MWG/VWM);
// Loads the data from global memory (not transposed) and stores into registers
apm[mi] = agm[idk*(kSizeM/VWM) + idm];
@ -280,7 +280,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V
#endif
// Computes the indices for the global memory
int idn = ng + get_group_id(1)*(NWG/VWN);
int idn = ng + GetGroupID1() * (NWG/VWN);
// Loads the data from global memory (transposed) and stores into registers
bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];

View file

@ -69,42 +69,43 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
for (int ni=0; ni<NWI/VWN; ++ni) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
const realM aval = apm[mi];
#if VWN == 1
cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni]);
cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], aval, bpm[ni]);
#elif VWN == 2
cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], aval, bpm[ni].x);
cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], aval, bpm[ni].y);
#elif VWN == 4
cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].x);
cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].y);
cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].z);
cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].w);
cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], aval, bpm[ni].x);
cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], aval, bpm[ni].y);
cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], aval, bpm[ni].z);
cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], aval, bpm[ni].w);
#elif VWN == 8
cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], apm[mi], bpm[ni].s0);
cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], apm[mi], bpm[ni].s1);
cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], apm[mi], bpm[ni].s2);
cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], apm[mi], bpm[ni].s3);
cpm[ni*VWN + 4][mi] = MultiplyAddVector(cpm[ni*VWN + 4][mi], apm[mi], bpm[ni].s4);
cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], apm[mi], bpm[ni].s5);
cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], apm[mi], bpm[ni].s6);
cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], apm[mi], bpm[ni].s7);
cpm[ni*VWN + 0][mi] = MultiplyAddVector(cpm[ni*VWN + 0][mi], aval, bpm[ni].s0);
cpm[ni*VWN + 1][mi] = MultiplyAddVector(cpm[ni*VWN + 1][mi], aval, bpm[ni].s1);
cpm[ni*VWN + 2][mi] = MultiplyAddVector(cpm[ni*VWN + 2][mi], aval, bpm[ni].s2);
cpm[ni*VWN + 3][mi] = MultiplyAddVector(cpm[ni*VWN + 3][mi], aval, bpm[ni].s3);
cpm[ni*VWN + 4][mi] = MultiplyAddVector(cpm[ni*VWN + 4][mi], aval, bpm[ni].s4);
cpm[ni*VWN + 5][mi] = MultiplyAddVector(cpm[ni*VWN + 5][mi], aval, bpm[ni].s5);
cpm[ni*VWN + 6][mi] = MultiplyAddVector(cpm[ni*VWN + 6][mi], aval, bpm[ni].s6);
cpm[ni*VWN + 7][mi] = MultiplyAddVector(cpm[ni*VWN + 7][mi], aval, bpm[ni].s7);
#elif VWN == 16
cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], apm[mi], bpm[ni].s0);
cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], apm[mi], bpm[ni].s1);
cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], apm[mi], bpm[ni].s2);
cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], apm[mi], bpm[ni].s3);
cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], apm[mi], bpm[ni].s4);
cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], apm[mi], bpm[ni].s5);
cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], apm[mi], bpm[ni].s6);
cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], apm[mi], bpm[ni].s7);
cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], apm[mi], bpm[ni].s8);
cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], apm[mi], bpm[ni].s9);
cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], apm[mi], bpm[ni].sA);
cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], apm[mi], bpm[ni].sB);
cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], apm[mi], bpm[ni].sC);
cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], apm[mi], bpm[ni].sD);
cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], apm[mi], bpm[ni].sE);
cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], apm[mi], bpm[ni].sF);
cpm[ni*VWN + 0 ][mi] = MultiplyAddVector(cpm[ni*VWN + 0 ][mi], aval, bpm[ni].s0);
cpm[ni*VWN + 1 ][mi] = MultiplyAddVector(cpm[ni*VWN + 1 ][mi], aval, bpm[ni].s1);
cpm[ni*VWN + 2 ][mi] = MultiplyAddVector(cpm[ni*VWN + 2 ][mi], aval, bpm[ni].s2);
cpm[ni*VWN + 3 ][mi] = MultiplyAddVector(cpm[ni*VWN + 3 ][mi], aval, bpm[ni].s3);
cpm[ni*VWN + 4 ][mi] = MultiplyAddVector(cpm[ni*VWN + 4 ][mi], aval, bpm[ni].s4);
cpm[ni*VWN + 5 ][mi] = MultiplyAddVector(cpm[ni*VWN + 5 ][mi], aval, bpm[ni].s5);
cpm[ni*VWN + 6 ][mi] = MultiplyAddVector(cpm[ni*VWN + 6 ][mi], aval, bpm[ni].s6);
cpm[ni*VWN + 7 ][mi] = MultiplyAddVector(cpm[ni*VWN + 7 ][mi], aval, bpm[ni].s7);
cpm[ni*VWN + 8 ][mi] = MultiplyAddVector(cpm[ni*VWN + 8 ][mi], aval, bpm[ni].s8);
cpm[ni*VWN + 9 ][mi] = MultiplyAddVector(cpm[ni*VWN + 9 ][mi], aval, bpm[ni].s9);
cpm[ni*VWN + 10][mi] = MultiplyAddVector(cpm[ni*VWN + 10][mi], aval, bpm[ni].sA);
cpm[ni*VWN + 11][mi] = MultiplyAddVector(cpm[ni*VWN + 11][mi], aval, bpm[ni].sB);
cpm[ni*VWN + 12][mi] = MultiplyAddVector(cpm[ni*VWN + 12][mi], aval, bpm[ni].sC);
cpm[ni*VWN + 13][mi] = MultiplyAddVector(cpm[ni*VWN + 13][mi], aval, bpm[ni].sD);
cpm[ni*VWN + 14][mi] = MultiplyAddVector(cpm[ni*VWN + 14][mi], aval, bpm[ni].sE);
cpm[ni*VWN + 15][mi] = MultiplyAddVector(cpm[ni*VWN + 15][mi], aval, bpm[ni].sF);
#endif
}
}
@ -130,49 +131,52 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
#elif STRN == 1
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
#endif
int idm = mg + get_group_id(0)*(MWG/VWM);
int idn = ng + get_group_id(1)*NWG;
int idm = mg + GetGroupID0() * (MWG/VWM);
int idn = ng + GetGroupID1() * NWG;
// The final multiplication with alpha and the addition with beta*C
int index = idn*(kSizeM/VWM) + idm;
realM cval = cgm[index];
realM result;
realM xval = cpm[ni][mi];
realM yval = cgm[index];
#if VWM == 1
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
AXPBY(result, alpha, xval, beta, yval);
#elif VWM == 2
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(result.x, alpha, xval.x, beta, yval.x);
AXPBY(result.y, alpha, xval.y, beta, yval.y);
#elif VWM == 4
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
AXPBY(result.x, alpha, xval.x, beta, yval.x);
AXPBY(result.y, alpha, xval.y, beta, yval.y);
AXPBY(result.z, alpha, xval.z, beta, yval.z);
AXPBY(result.w, alpha, xval.w, beta, yval.w);
#elif VWM == 8
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
#elif VWM == 16
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
#endif
cgm[index] = result;
}
}
}
@ -269,7 +273,7 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
__global realM* cgm) {
// Skip these threads if they do not contain threads contributing to the upper-triangle
if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
return;
}
@ -306,7 +310,7 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
__global realM* cgm) {
// Skip these threads if they do not contain threads contributing to the lower-triangle
if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
return;
}

View file

@ -88,12 +88,21 @@ StatusCode Routine<T>::SetUp() {
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
// Determines whether this is a specific device
const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc.";
const auto isGPU = device_.Type() == "GPU";
// For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in reduced accuracy.
if (device_.Vendor() == "AMD") {
if (isAMD && isGPU) {
defines += "#define USE_CL_MAD 1\n";
}
// For specific devices, use staggered/shuffled workgroup indices.
if (isAMD && isGPU) {
defines += "#define USE_STAGGERED_INDICES 1\n";
}
// Combines everything together into a single source string
auto source_string = defines + common_header + source_string_;

View file

@ -184,12 +184,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Launches the kernel
auto eventKernel = Event();
status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
status = RunKernel(kernel, global, local, eventPointer, eventWaitList);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventKernel);
// Runs the post-processing kernel if needed
if (!c_no_temp) {
eventWaitList.push_back(eventKernel);
status = PadCopyTransposeMatrix(event_, eventWaitList,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,

View file

@ -334,7 +334,7 @@ bool TestSimilarity(const T val1, const T val2) {
// Set the allowed error margin for floating-point comparisons
constexpr auto kErrorMarginRelative = T(0.025);
constexpr auto kErrorMarginAbsolute = T(1.0e-4);
constexpr auto kErrorMarginAbsolute = T(1.0e-3);
// Shortcut, handles infinities
if (val1 == val2) {

View file

@ -34,7 +34,7 @@ options("width"=170)
# Constants
num_runs <- 4
devices <- c("-platform","-device")
options_string <- "-q -no_abbrv"
options_string <- "-q -no_abbrv -cblas 0"
library_names <- c("CLBlast", "clBLAS")
# Command-line arguments

View file

@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
c(1024, 1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
)
)

View file

@ -35,22 +35,22 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c(256, 256, 1, 1, 1, 16, 256, num_runs, precision)),
list(c(256+1, 256+1, 1, 1, 1, 16, 256, num_runs, precision)),
list(c(2*kilo, 2*kilo, 1, 1, 1, 16, 1, num_runs, precision)),
list(c(256, 256, 1, 1, 0, 16, 256, num_runs, precision)),
list(c(256+1, 256+1, 1, 1, 0, 16, 256, num_runs, precision)),
list(c(256, 256, 1, 1, 102, 16, 256, num_runs, precision)),
list(c(256+1, 256+1, 1, 1, 102, 16, 256, num_runs, precision)),
list(c(2*kilo, 2*kilo, 1, 1, 102, 16, 1, num_runs, precision)),
list(c(256, 256, 1, 1, 101, 16, 256, num_runs, precision)),
list(c(256+1, 256+1, 1, 1, 101, 16, 256, num_runs, precision)),
list(
c(2*kilo, 2*kilo, 1, 1, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 2, 1, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 4, 1, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 8, 1, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 2, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 4, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 8, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 2, 2, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 4, 4, 1, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 8, 8, 1, 1, 0, num_runs, precision)
c(2*kilo, 2*kilo, 1, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 2, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 4, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 8, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 2, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 4, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 8, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 2, 2, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 4, 4, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 8, 8, 102, 1, 0, num_runs, precision)
)
)

View file

@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
)
)

View file

@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(1536, 1536, 1, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(1536, 1536, 102, 111, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
)
)

View file

@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
)
)

View file

@ -35,59 +35,59 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 1, 0, 0, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 1, 0, 0, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 1, 0, 0, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 1, 0, 0, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 1, 1, 0, num_runs, precision)
c(1024, 1024, 102, 142, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 112, 132, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 1, 0, 0, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 1, 0, 0, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 16, 16, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 32, 32, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 64, 64, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 128, 128, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 256, 256, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 512, 512, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(2048, 2048, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(4096, 4096, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(8192, 8192, 102, 141, 121, 111, 131, 1, 0, num_runs, precision)
)
)