diff --git a/CHANGELOG b/CHANGELOG index c77e5e48..76903180 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,9 @@ +Version 0.7.1 +- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs +- Fixed a bug in the xGEMM routine related to the event incorrectly set +- Made MSVC link the run-time libraries statically + Version 0.7.0 - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) - Made the library thread-safe diff --git a/CMakeLists.txt b/CMakeLists.txt index 44524537..02ffba1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,12 +9,17 @@ # # ================================================================================================== -# CMake project details cmake_minimum_required(VERSION 2.8.10) + +# Overrides for MSVC static runtime +set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake) +set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_flag_overrides.cmake) + +# CMake project details project("clblast" C CXX) set(clblast_VERSION_MAJOR 0) set(clblast_VERSION_MINOR 7) -set(clblast_VERSION_PATCH 0) +set(clblast_VERSION_PATCH 1) # Options and their default values option(SAMPLES "Enable compilation of the examples" OFF) @@ -32,40 +37,40 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically deter # ================================================================================================== # Compiler-version check (requires at least CMake 2.8.10) -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) +if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) message(FATAL_ERROR "GCC version must be at least 4.7") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) message(FATAL_ERROR "Clang version 
must be at least 3.3") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) message(FATAL_ERROR "AppleClang version must be at least 5.0") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) message(FATAL_ERROR "ICC version must be at least 14.0") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) +elseif(MSVC) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) message(FATAL_ERROR "MS Visual Studio version must be at least 18.0") endif() endif() # C++ compiler settings -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") +if(MSVC) set(FLAGS "/Ox") set(FLAGS "${FLAGS} /wd4715") -else () +else() set(FLAGS "-O3 -std=c++11") - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0) set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable") endif() - elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang) set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch") set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn") @@ -75,9 +80,9 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") # C compiler settings (for the sample) -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") +if(MSVC) set(CFLAGS "/Ox") -else () +else() set(CFLAGS "-O3 -std=c99") endif() 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") @@ -228,7 +233,7 @@ if(TESTS) if(CLBLAS_FOUND) set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS}) set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES}) - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + if(MSVC) add_definitions(" /DCLBLAST_REF_CLBLAS") else() add_definitions(" -DCLBLAST_REF_CLBLAS") @@ -237,7 +242,7 @@ if(TESTS) if(CBLAS_FOUND) set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS}) set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES}) - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + if(MSVC) add_definitions(" /DCLBLAST_REF_CBLAS") else() add_definitions(" -DCLBLAST_REF_CBLAS") diff --git a/README.md b/README.md index 8d9220a6..e4564c26 100644 --- a/README.md +++ b/README.md @@ -54,13 +54,13 @@ The pre-requisites for compilation of CLBlast are: Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either: -* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS (maintained by AMD) +* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD) * A regular CPU Netlib BLAS library, e.g.: - OpenBLAS - BLIS - Accelerate -An example of an out-of-source build (starting from the root of the CLBlast folder): +An example of an out-of-source build using a command-line compiler and make (starting from the root of the CLBlast folder): mkdir build cd build @@ -68,6 +68,12 @@ An example of an out-of-source build (starting from the root of the CLBlast fold make sudo make install +When using Visual Studio, the project-files can be generated as follows: + + mkdir build + cd build + cmake -G "Visual Studio 14 Win64" .. + A custom installation folder can be specified when calling CMake: cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory .. 
@@ -84,7 +90,7 @@ Or alternatively the plain C version: #include <clblast_c.h> -Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/api.md). Additionally, a couple of stand-alone example programs are included in `samples/`. +Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in `samples/`. Using the tuners (optional) @@ -254,6 +260,7 @@ The contributing authors (code, pull requests, testing) so far are: * [Cedric Nugteren](http://www.cedricnugteren.nl) * [Anton Lokhmotov](https://github.com/psyhtest) * [Dragan Djuric](https://github.com/blueberry) +* [Marco Hutter](https://github.com/gpus) * [Hugh Perkins](https://github.com/hughperkins) Tuning and testing on a variety of OpenCL devices was made possible by: diff --git a/cmake/c_flag_overrides.cmake b/cmake/c_flag_overrides.cmake new file mode 100644 index 00000000..3b38d297 --- /dev/null +++ b/cmake/c_flag_overrides.cmake @@ -0,0 +1,9 @@ +# Overriding the CMake flags to use static runtime libraries +# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F +if(MSVC) + set(CMAKE_C_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1") + set(CMAKE_C_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG") + set(CMAKE_C_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG") +endif() + diff --git a/cmake/cxx_flag_overrides.cmake b/cmake/cxx_flag_overrides.cmake new file mode 100644 index 00000000..b8935bf7 --- /dev/null +++ b/cmake/cxx_flag_overrides.cmake @@ -0,0 +1,9 @@ +# 
Overriding the CMake flags to use static runtime libraries +# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F +if(MSVC) + set(CMAKE_CXX_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG") + set(CMAKE_CXX_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG") +endif() + diff --git a/doc/performance/Radeon_M370X/SGEMM.pdf b/doc/performance/Radeon_M370X/SGEMM.pdf index 362d229d..5dca8f03 100644 Binary files a/doc/performance/Radeon_M370X/SGEMM.pdf and b/doc/performance/Radeon_M370X/SGEMM.pdf differ diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h index e24adb19..9ca2bff5 100644 --- a/include/internal/database/xgemm.h +++ b/include/internal/database/xgemm.h @@ -18,11 +18,11 @@ const Database::DatabaseEntry Database::XgemmSingle = { "Xgemm", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, 
{"VWM",4}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs diff --git a/include/internal/tuning.h b/include/internal/tuning.h index 5645a5e5..215beb59 100644 --- a/include/internal/tuning.h +++ b/include/internal/tuning.h @@ -48,14 +48,18 @@ void Tuner(int argc, char* argv[]) { // Tests validity of the given arguments C::TestValidArguments(args); - // Tests for validity of the precision + // Tests for validity of the precision and retrieves properties + auto isAMD = false; + auto isGPU = false; { - auto platform = Platform(args.platform_id); - auto device = Device(platform, args.device_id); + const auto platform = Platform(args.platform_id); + const auto device = Device(platform, args.device_id); if (!PrecisionSupported(device)) { printf("* Unsupported precision, skipping this tuning run\n\n"); return; } + isAMD = device.Vendor() == "AMD" || device.Vendor() == "Advanced Micro Devices, Inc."; + isGPU = device.Type() == "GPU"; } // Creates input buffers with random data @@ -84,8 +88,15 @@ void Tuner(int argc, char* argv[]) { tuner.UseRandomSearch(1.0/args.fraction); } + // Set extra settings for specific defines. This mimics src/routine.cc. 
+ auto defines = std::string{""}; + if (isAMD && isGPU) { + defines += "#define USE_CL_MAD 1\n"; + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + // Loads the kernel sources and defines the kernel to tune - auto sources = C::GetSources(); + auto sources = defines + C::GetSources(); auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); diff --git a/scripts/database/database.py b/scripts/database/database.py index d14e36cc..8e8f37f8 100644 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -92,6 +92,7 @@ def ConcatenateData(df1, df2): def RemoveDuplicates(df): return df.drop_duplicates() +# database = database[(database["device"] != "AMD Radeon R9 M370X Compute Engine") | (database["kernel_family"] != "xgemm") | (database["precision"] != "32")] def RemoveEntriesByDevice(df, devicename): return df[df["device"] != devicename] diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index d401744d..b9e52e17 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -176,6 +176,32 @@ R"( // ================================================================================================= +// Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is +// enabled (see src/routine.cc). +#ifndef USE_STAGGERED_INDICES + #define USE_STAGGERED_INDICES 0 +#endif + +// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). 
Formulas are taken from: +// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf +// More details: https://github.com/CNugteren/CLBlast/issues/53 +#if USE_STAGGERED_INDICES == 1 + inline size_t GetGroupIDFlat() { + return get_group_id(0) + get_num_groups(0) * get_group_id(1); + } + inline size_t GetGroupID1() { + return (GetGroupIDFlat()) % get_num_groups(1); + } + inline size_t GetGroupID0() { + return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); + } +#else + inline size_t GetGroupID1() { return get_group_id(1); } + inline size_t GetGroupID0() { return get_group_id(0); } +#endif + +// ================================================================================================= + // End of the C++11 raw string literal )" diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 4cb0585b..a2a555de 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -199,7 +199,7 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al // Computes the indices for the global memory int kg = kia + la1*KWA; - int idm = mg + get_group_id(0)*(MWG/VWM); + int idm = mg + GetGroupID0() * (MWG/VWM); int idk = kg + kwg; // Loads the data from global memory (not transposed) into the local memory @@ -229,7 +229,7 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl // Computes the indices for the global memory int kg = kib + lb1*KWB; - int idn = ng + get_group_id(1)*(NWG/VWN); + int idn = ng + GetGroupID1() * (NWG/VWN); int idk = kg + kwg; // Loads the data from global memory (transposed) into the local memory @@ -257,7 +257,7 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V #endif // Computes the indices for the global memory - int idm = mg + get_group_id(0)*(MWG/VWM); + int idm = mg + GetGroupID0() * (MWG/VWM); // Loads the data from global memory (not 
transposed) and stores into registers apm[mi] = agm[idk*(kSizeM/VWM) + idm]; @@ -280,7 +280,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V #endif // Computes the indices for the global memory - int idn = ng + get_group_id(1)*(NWG/VWN); + int idn = ng + GetGroupID1() * (NWG/VWN); // Loads the data from global memory (transposed) and stores into registers bpm[ni] = bgm[idk*(kSizeN/VWN) + idn]; diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index c0760db6..599e01d5 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -69,42 +69,43 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real for (int ni=0; ni get_group_id(0)*MWG) { + if (GetGroupID1()*NWG > GetGroupID0()*MWG) { return; } diff --git a/src/routine.cc b/src/routine.cc index e0cc9a90..eee4c7cc 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -88,12 +88,21 @@ StatusCode Routine::SetUp() { // Adds the name of the routine as a define defines += "#define ROUTINE_"+routine_name_+"\n"; + // Determines whether this is a specific device + const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc."; + const auto isGPU = device_.Type() == "GPU"; + // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve // performance, but might result in a reduced accuracy. - if (device_.Vendor() == "AMD") { + if (isAMD && isGPU) { defines += "#define USE_CL_MAD 1\n"; } + // For specific devices, use staggered/shuffled workgroup indices. 
+ if (isAMD && isGPU) { + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + // Combines everything together into a single source string auto source_string = defines + common_header + source_string_; diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index aa081e81..3699b548 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -184,12 +184,13 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Launches the kernel auto eventKernel = Event(); - status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); + auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; + status = RunKernel(kernel, global, local, eventPointer, eventWaitList); if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel); // Runs the post-processing kernel if needed if (!c_no_temp) { + eventWaitList.push_back(eventKernel); status = PadCopyTransposeMatrix(event_, eventWaitList, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 26c4ba59..85ae7091 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -334,7 +334,7 @@ bool TestSimilarity(const T val1, const T val2) { // Set the allowed error margin for floating-point comparisons constexpr auto kErrorMarginRelative = T(0.025); - constexpr auto kErrorMarginAbsolute = T(1.0e-4); + constexpr auto kErrorMarginAbsolute = T(1.0e-3); // Shortcut, handles infinities if (val1 == val2) { diff --git a/test/performance/graphs/common.r b/test/performance/graphs/common.r index 5b3e6e52..cd68cf26 100644 --- a/test/performance/graphs/common.r +++ b/test/performance/graphs/common.r @@ -34,7 +34,7 @@ options("width"=170) # Constants num_runs <- 4 devices <- c("-platform","-device") -options_string <- "-q -no_abbrv" +options_string <- "-q -no_abbrv -cblas 0" library_names <- c("CLBlast", "clBLAS") # Command-line arguments diff --git 
a/test/performance/graphs/xgemm.r b/test/performance/graphs/xgemm.r index 6533b44b..e758f460 100755 --- a/test/performance/graphs/xgemm.r +++ b/test/performance/graphs/xgemm.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 128, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 129, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 512, 1, 0, 0, 16, 1, num_runs, precision)), - list(c(2048, 2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), + list(c(2048, 2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), list( - c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 8, 1, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 16, 1, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 32, 1, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 64, 1, 0, 0, 
1, 0, num_runs, precision), - c( 128, 128, 128, 1, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 256, 1, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 512, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 2048, 1, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 4096, 1, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 8192, 1, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 8, 102, 111, 111, 1, 0, num_runs, precision), + c( 16, 16, 16, 102, 111, 111, 1, 0, num_runs, precision), + c( 32, 32, 32, 102, 111, 111, 1, 0, num_runs, precision), + c( 64, 64, 64, 102, 111, 111, 1, 0, num_runs, precision), + c( 128, 128, 128, 102, 111, 111, 1, 0, num_runs, precision), + c( 256, 256, 256, 102, 111, 111, 1, 0, num_runs, precision), + c( 512, 512, 512, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(2048, 2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), + c(4096, 4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), + c(8192, 8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xgemv.r b/test/performance/graphs/xgemv.r index a4e7a834..9a8040f7 100644 --- a/test/performance/graphs/xgemv.r +++ b/test/performance/graphs/xgemv.r @@ -35,22 +35,22 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c(256, 256, 1, 1, 1, 16, 256, num_runs, precision)), - list(c(256+1, 256+1, 1, 1, 1, 16, 256, num_runs, precision)), - list(c(2*kilo, 2*kilo, 1, 1, 1, 16, 1, num_runs, precision)), - list(c(256, 256, 1, 1, 0, 16, 256, num_runs, precision)), - list(c(256+1, 256+1, 1, 1, 0, 16, 256, num_runs, precision)), + list(c(256, 256, 1, 1, 102, 16, 256, num_runs, precision)), + list(c(256+1, 256+1, 1, 1, 102, 16, 256, num_runs, precision)), + list(c(2*kilo, 2*kilo, 1, 1, 102, 16, 1, num_runs, precision)), + list(c(256, 256, 1, 1, 101, 16, 256, num_runs, precision)), + list(c(256+1, 256+1, 1, 1, 101, 
16, 256, num_runs, precision)), list( - c(2*kilo, 2*kilo, 1, 1, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 2, 1, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 4, 1, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 8, 1, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 2, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 4, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 8, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 2, 2, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 4, 4, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 8, 8, 1, 1, 0, num_runs, precision) + c(2*kilo, 2*kilo, 1, 1, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 2, 1, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 4, 1, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 8, 1, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 1, 2, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 1, 4, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 1, 8, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 2, 2, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 4, 4, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 8, 8, 102, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xsymm.r b/test/performance/graphs/xsymm.r index c27de904..a65bb16f 100644 --- a/test/performance/graphs/xsymm.r +++ b/test/performance/graphs/xsymm.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)), - list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), + list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), list( - c(1024, 
1024, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), + c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), + c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), + c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), + c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), + c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision), + c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), + c(4096, 4096, 102, 111, 
111, 1, 0, num_runs, precision), + c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xsyr2k.r b/test/performance/graphs/xsyr2k.r index eb761e4c..4b2dd4a0 100644 --- a/test/performance/graphs/xsyr2k.r +++ b/test/performance/graphs/xsyr2k.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)), - list(c(1536, 1536, 1, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), + list(c(1536, 1536, 102, 111, 111, 16, 1, num_runs, precision)), list( - c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 
1, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), + c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), + c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), + c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), + c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), + c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision), + c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), + c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), + c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xsyrk.r b/test/performance/graphs/xsyrk.r index 04f7b515..4ab46c9f 100644 --- a/test/performance/graphs/xsyrk.r +++ b/test/performance/graphs/xsyrk.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)), - list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), + list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), list( - c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 
1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), + c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), + c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), + c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), + c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), + c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision), + c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), + c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), + c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xtrmm.r b/test/performance/graphs/xtrmm.r index 3b35f7c0..c2faaa8b 100644 --- 
a/test/performance/graphs/xtrmm.r +++ b/test/performance/graphs/xtrmm.r @@ -35,59 +35,59 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 1, 0, 0, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 1, 0, 0, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 1, 0, 0, 0, 0, 16, 1, num_runs, precision)), - list(c(2048, 2048, 1, 0, 0, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)), + list(c(2048, 2048, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)), list( - c(1024, 1024, 0, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 121, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 121, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 121, 112, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 1, 1, 0, num_runs, precision), - 
c(1024, 1024, 0, 1, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 112, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 112, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 
1024, 1, 1, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 102, 142, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 121, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 121, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 121, 112, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 112, 132, 1, 0, num_runs, precision) ), list( - c( 8, 8, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 1, 0, 0, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 16, 16, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 32, 32, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 64, 64, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 128, 128, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 256, 256, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 512, 512, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, 
precision), + c(2048, 2048, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(4096, 4096, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(8192, 8192, 102, 141, 121, 111, 131, 1, 0, num_runs, precision) ) )