From 0dacd04bcdd54ec5280b3929ef00592c77e5daa1 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 8 May 2016 21:30:04 +0200 Subject: [PATCH 01/13] Prepared the changelog for the next release --- CHANGELOG | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index c77e5e48..92c0c5ad 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,7 @@ +Development version (next release) +- + Version 0.7.0 - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) - Made the library thread-safe From 1c72d225c53c123ed810cf3f56f5c92603f7f791 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 10 May 2016 21:03:51 +0200 Subject: [PATCH 02/13] Fixed links in the README --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8d9220a6..869ef636 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ The pre-requisites for compilation of CLBlast are: Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either: -* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS (maintained by AMD) +* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD) * A regular CPU Netlib BLAS library, e.g.: - OpenBLAS - BLIS @@ -84,7 +84,7 @@ Or alternatively the plain C version: #include -Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/api.md). Additionally, a couple of stand-alone example programs are included in `samples/`. +Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in `samples/`. Using the tuners (optional) @@ -254,6 +254,7 @@ The contributing authors (code, pull requests, testing) so far are: * [Cedric Nugteren](http://www.cedricnugteren.nl) * [Anton Lokhmotov](https://github.com/psyhtest) * [Dragan Djuric](https://github.com/blueberry) +* [Marco Hutter](https://github.com/gpus) * [Hugh Perkins](https://github.com/hughperkins) Tuning and testing on a variety of OpenCL devices was made possible by: From 9065b3468478818e9c5918380af665f2d499a322 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 15 May 2016 14:04:34 +0200 Subject: [PATCH 03/13] Added support for staggered/shuffled offsets for GEMM to improve performance for large power-of-2 kernels on AMD GPUs --- CHANGELOG | 2 +- include/internal/tuning.h | 19 +++- src/kernels/common.opencl | 26 +++++ src/kernels/level3/xgemm_part1.opencl | 8 +- src/kernels/level3/xgemm_part2.opencl | 138 +++++++++++++------------- src/routine.cc | 11 +- 6 files changed, 127 insertions(+), 77 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 92c0c5ad..187fca73 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ Development version (next release) -- +- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs Version 0.7.0 - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) diff --git a/include/internal/tuning.h b/include/internal/tuning.h index 5645a5e5..215beb59 100644 --- a/include/internal/tuning.h +++ b/include/internal/tuning.h @@ -48,14 +48,18 @@ void Tuner(int argc, char* argv[]) { // Tests validity of the given arguments C::TestValidArguments(args); - // Tests for validity of the precision + // Tests for validity of the precision and retrieves properties + auto isAMD = false; + auto isGPU = false; { - auto platform = Platform(args.platform_id); - auto device = Device(platform, args.device_id); + const auto platform = Platform(args.platform_id); + const auto device = Device(platform, args.device_id); if (!PrecisionSupported(device)) { printf("* Unsupported precision, skipping this tuning run\n\n"); return; } + isAMD = device.Vendor() == "AMD" || device.Vendor() == "Advanced Micro Devices, Inc."; + isGPU = device.Type() == "GPU"; } // Creates input buffers with random data @@ -84,8 +88,15 @@ void Tuner(int argc, char* argv[]) { tuner.UseRandomSearch(1.0/args.fraction); } + // Set extra settings for specific defines. This mimics src/routine.cc. + auto defines = std::string{""}; + if (isAMD && isGPU) { + defines += "#define USE_CL_MAD 1\n"; + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + // Loads the kernel sources and defines the kernel to tune - auto sources = C::GetSources(); + auto sources = defines + C::GetSources(); auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index d401744d..b9e52e17 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -176,6 +176,32 @@ R"( // ================================================================================================= +// Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is +// enabled (see src/routine.cc). +#ifndef USE_STAGGERED_INDICES + #define USE_STAGGERED_INDICES 0 +#endif + +// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from: +// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf +// More details: https://github.com/CNugteren/CLBlast/issues/53 +#if USE_STAGGERED_INDICES == 1 + inline size_t GetGroupIDFlat() { + return get_group_id(0) + get_num_groups(0) * get_group_id(1); + } + inline size_t GetGroupID1() { + return (GetGroupIDFlat()) % get_num_groups(1); + } + inline size_t GetGroupID0() { + return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); + } +#else + inline size_t GetGroupID1() { return get_group_id(1); } + inline size_t GetGroupID0() { return get_group_id(0); } +#endif + +// ================================================================================================= + // End of the C++11 raw string literal )" diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 4cb0585b..a2a555de 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -199,7 +199,7 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al // Computes the indices for the global memory int kg = kia + la1*KWA; - int idm = mg + get_group_id(0)*(MWG/VWM); + int idm = mg + GetGroupID0() * (MWG/VWM); int idk = kg + kwg; // Loads the data from global memory (not transposed) into the local memory @@ -229,7 +229,7 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl // Computes the indices for the global memory int kg = kib + lb1*KWB; - int idn = ng + get_group_id(1)*(NWG/VWN); + int idn = ng + GetGroupID1() * (NWG/VWN); int idk = kg + kwg; // Loads the data from global memory (transposed) into the local memory @@ -257,7 +257,7 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V #endif // Computes the indices for the global memory - int idm = mg + get_group_id(0)*(MWG/VWM); + int idm = mg + GetGroupID0() * (MWG/VWM); // Loads the data from global memory (not transposed) and stores into registers apm[mi] = agm[idk*(kSizeM/VWM) + idm]; @@ -280,7 +280,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V #endif // Computes the indices for the global memory - int idn = ng + get_group_id(1)*(NWG/VWN); + int idn = ng + GetGroupID1() * (NWG/VWN); // Loads the data from global memory (transposed) and stores into registers bpm[ni] = bgm[idk*(kSizeN/VWN) + idn]; diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index c0760db6..599e01d5 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -69,42 +69,43 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real for (int ni=0; ni get_group_id(0)*MWG) { + if (GetGroupID1()*NWG > GetGroupID0()*MWG) { return; } diff --git a/src/routine.cc b/src/routine.cc index e0cc9a90..eee4c7cc 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -88,12 +88,21 @@ StatusCode Routine::SetUp() { // Adds the name of the routine as a define defines += "#define ROUTINE_"+routine_name_+"\n"; + // Determines whether this is a specific device + const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc."; + const auto isGPU = device_.Type() == "GPU"; + // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve // performance, but might result in a reduced accuracy. - if (device_.Vendor() == "AMD") { + if (isAMD && isGPU) { defines += "#define USE_CL_MAD 1\n"; } + // For specific devices, use staggered/shuffled workgroup indices. + if (isAMD && isGPU) { + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + // Combines everything together into a single source string auto source_string = defines + common_header + source_string_; From 9e36b3b20d5bc69e8744e76f347a3f5e1345778a Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 15 May 2016 14:31:37 +0200 Subject: [PATCH 04/13] Fixed the arguments in the performance graphs to reflect the changes in enum values --- test/performance/graphs/xgemm.r | 46 ++++++++-------- test/performance/graphs/xgemv.r | 30 +++++----- test/performance/graphs/xsymm.r | 46 ++++++++-------- test/performance/graphs/xsyr2k.r | 46 ++++++++-------- test/performance/graphs/xsyrk.r | 46 ++++++++-------- test/performance/graphs/xtrmm.r | 94 ++++++++++++++++---------------- 6 files changed, 154 insertions(+), 154 deletions(-) diff --git a/test/performance/graphs/xgemm.r b/test/performance/graphs/xgemm.r index 6533b44b..e758f460 100755 --- a/test/performance/graphs/xgemm.r +++ b/test/performance/graphs/xgemm.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 128, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 129, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 512, 1, 0, 0, 16, 1, num_runs, precision)), - list(c(2048, 2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), + list(c(2048, 2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), list( - c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 8, 1, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 16, 1, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 32, 1, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 64, 1, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 128, 1, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 256, 1, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 512, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 2048, 1, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 4096, 1, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 8192, 1, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 8, 102, 111, 111, 1, 0, num_runs, precision), + c( 16, 16, 16, 102, 111, 111, 1, 0, num_runs, precision), + c( 32, 32, 32, 102, 111, 111, 1, 0, num_runs, precision), + c( 64, 64, 64, 102, 111, 111, 1, 0, num_runs, precision), + c( 128, 128, 128, 102, 111, 111, 1, 0, num_runs, precision), + c( 256, 256, 256, 102, 111, 111, 1, 0, num_runs, precision), + c( 512, 512, 512, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(2048, 2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), + c(4096, 4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), + c(8192, 8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xgemv.r b/test/performance/graphs/xgemv.r index a4e7a834..9a8040f7 100644 --- a/test/performance/graphs/xgemv.r +++ b/test/performance/graphs/xgemv.r @@ -35,22 +35,22 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c(256, 256, 1, 1, 1, 16, 256, num_runs, precision)), - list(c(256+1, 256+1, 1, 1, 1, 16, 256, num_runs, precision)), - list(c(2*kilo, 2*kilo, 1, 1, 1, 16, 1, num_runs, precision)), - list(c(256, 256, 1, 1, 0, 16, 256, num_runs, precision)), - list(c(256+1, 256+1, 1, 1, 0, 16, 256, num_runs, precision)), + list(c(256, 256, 1, 1, 102, 16, 256, num_runs, precision)), + list(c(256+1, 256+1, 1, 1, 102, 16, 256, num_runs, precision)), + list(c(2*kilo, 2*kilo, 1, 1, 102, 16, 1, num_runs, precision)), + list(c(256, 256, 1, 1, 101, 16, 256, num_runs, precision)), + list(c(256+1, 256+1, 1, 1, 101, 16, 256, num_runs, precision)), list( - c(2*kilo, 2*kilo, 1, 1, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 2, 1, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 4, 1, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 8, 1, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 2, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 4, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 1, 8, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 2, 2, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 4, 4, 1, 1, 0, num_runs, precision), - c(2*kilo, 2*kilo, 8, 8, 1, 1, 0, num_runs, precision) + c(2*kilo, 2*kilo, 1, 1, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 2, 1, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 4, 1, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 8, 1, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 1, 2, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 1, 4, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 1, 8, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 2, 2, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 4, 4, 102, 1, 0, num_runs, precision), + c(2*kilo, 2*kilo, 8, 8, 102, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xsymm.r b/test/performance/graphs/xsymm.r index c27de904..a65bb16f 100644 --- a/test/performance/graphs/xsymm.r +++ b/test/performance/graphs/xsymm.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)), - list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), + list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), list( - c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), + c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), + c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), + c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), + c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), + c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision), + c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), + c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), + c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xsyr2k.r b/test/performance/graphs/xsyr2k.r index eb761e4c..4b2dd4a0 100644 --- a/test/performance/graphs/xsyr2k.r +++ b/test/performance/graphs/xsyr2k.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)), - list(c(1536, 1536, 1, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), + list(c(1536, 1536, 102, 111, 111, 16, 1, num_runs, precision)), list( - c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), + c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), + c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), + c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), + c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), + c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision), + c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), + c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), + c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xsyrk.r b/test/performance/graphs/xsyrk.r index 04f7b515..4ab46c9f 100644 --- a/test/performance/graphs/xsyrk.r +++ b/test/performance/graphs/xsyrk.r @@ -35,32 +35,32 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)), - list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)), + list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)), list( - c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision) ), list( - c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision), + c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision), + c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision), + c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision), + c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision), + c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision), + c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision), + c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision), + c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision), + c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision), + c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision) ) ) diff --git a/test/performance/graphs/xtrmm.r b/test/performance/graphs/xtrmm.r index 3b35f7c0..c2faaa8b 100644 --- a/test/performance/graphs/xtrmm.r +++ b/test/performance/graphs/xtrmm.r @@ -35,59 +35,59 @@ test_names <- list( # Defines the test-cases test_values <- list( - list(c( 128, 128, 1, 0, 0, 0, 0, 16, 128, num_runs, precision)), - list(c( 129, 129, 1, 0, 0, 0, 0, 16, 128, num_runs, precision)), - list(c( 512, 512, 1, 0, 0, 0, 0, 16, 1, num_runs, precision)), - list(c(2048, 2048, 1, 0, 0, 0, 0, 16, 1, num_runs, precision)), + list(c( 128, 128, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)), + list(c( 129, 129, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)), + list(c( 512, 512, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)), + list(c(2048, 2048, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)), list( - c(1024, 1024, 0, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 0, 1, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 121, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 121, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 121, 112, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 141, 122, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 0, 1, 1, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 121, 112, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 101, 142, 122, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 1, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 112, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 122, 112, 132, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 0, 1, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 0, 1, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 1, 1, 1, 1, 1, 0, num_runs, precision) + c(1024, 1024, 102, 142, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 121, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 121, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 121, 112, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 111, 132, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 112, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 142, 122, 112, 132, 1, 0, num_runs, precision) ), list( - c( 8, 8, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 16, 16, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 32, 32, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 64, 64, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 128, 128, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 256, 256, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c( 512, 512, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(1024, 1024, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(2048, 2048, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(4096, 4096, 1, 0, 0, 0, 0, 1, 0, num_runs, precision), - c(8192, 8192, 1, 0, 0, 0, 0, 1, 0, num_runs, precision) + c( 8, 8, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 16, 16, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 32, 32, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 64, 64, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 128, 128, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 256, 256, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c( 512, 512, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(2048, 2048, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(4096, 4096, 102, 141, 121, 111, 131, 1, 0, num_runs, precision), + c(8192, 8192, 102, 141, 121, 111, 131, 1, 0, num_runs, precision) ) ) From 716d7c67d91ef61e3d71e219f61c72859ac823eb Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 15 May 2016 16:10:56 +0200 Subject: [PATCH 05/13] Fixed a bug in the xGEMM routine related to the event incorrectly set --- CHANGELOG | 1 + src/routines/level3/xgemm.cc | 5 +++-- test/correctness/tester.cc | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 187fca73..6de365bf 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ Development version (next release) - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs +- Fixed a bug in the xGEMM routine related to the event incorrectly set Version 0.7.0 - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index aa081e81..3699b548 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -184,12 +184,13 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Launches the kernel auto eventKernel = Event(); - status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); + auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; + status = RunKernel(kernel, global, local, eventPointer, eventWaitList); if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel); // Runs the post-processing kernel if needed if (!c_no_temp) { + eventWaitList.push_back(eventKernel); status = PadCopyTransposeMatrix(event_, eventWaitList, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 26c4ba59..85ae7091 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -334,7 +334,7 @@ bool TestSimilarity(const T val1, const T val2) { // Set the allowed error margin for floating-point comparisons constexpr auto kErrorMarginRelative = T(0.025); - constexpr auto kErrorMarginAbsolute = T(1.0e-4); + constexpr auto kErrorMarginAbsolute = T(1.0e-3); // Shortcut, handles infinities if (val1 == val2) { From 802c1f48c7bb334e8060689adc898b17e216bffb Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 15 May 2016 17:06:36 +0200 Subject: [PATCH 06/13] Removed comparison to CBLAS for the graph scripts --- test/performance/graphs/common.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/performance/graphs/common.r b/test/performance/graphs/common.r index 5b3e6e52..cd68cf26 100644 --- a/test/performance/graphs/common.r +++ b/test/performance/graphs/common.r @@ -34,7 +34,7 @@ options("width"=170) # Constants num_runs <- 4 devices <- c("-platform","-device") -options_string <- "-q -no_abbrv" +options_string <- "-q -no_abbrv -cblas 0" library_names <- c("CLBlast", "clBLAS") # Command-line arguments From fd107c9b12e2c369ed2946efd53ddabc440388e1 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 15 May 2016 17:28:22 +0200 Subject: [PATCH 07/13] Added new tuning results for SGEMM and updated the performance graph for the Radeon M370X AMD GPU --- doc/performance/Radeon_M370X/SGEMM.pdf | Bin 13227 -> 13268 bytes include/internal/database/xgemm.h | 4 ++-- scripts/database/database.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/performance/Radeon_M370X/SGEMM.pdf b/doc/performance/Radeon_M370X/SGEMM.pdf index 362d229d5eb09dc75a86a4ef0587e9c1b110e544..ab4382ce431f3a11e87a85b7df82b92646b40a5e 100644 GIT binary patch delta 9451 zcmZXZbyS;A(C;bkP+W?%MT%>2D_Yz=IKf?t^B@IUv=AJMyA&zz?k>gMgIn=(!|%Q4 z{&C-&^PI_kcXoG@b26WuNvY+GSHJMgpuZhqWg1^G)o zKMWhJmjJFw-F`7i@%0bMY&gWPF|2Kyj*{3au8UV3M|-41_iwhnUmIk~H=X{{p1i&n zn`Rq--c0acJ6#jH@N@KkzCZbGBxzHn4YW8l+g*w-THPe9hLtEDUqCl5Ontn(u9KQG z+c(44v^o18LfHmyfNYn+h7}#2q+|cj3%<4s)wK_F;L;?Psrf5amYk|OC(+PBt8yj; zLbFz?sO#Fbq|s}}rnaPq34@B9Lj>U}ZC-Y-DpUjN#w$|fJ4uX#hSz>#m|)9;v$xYIOdPE8BBZ%bZ?0{tM^c^DdsYYsA!NYDo*9pEbiZZ z(q#xxJcxjmRle5$Aw+_I%_erS#W8HND$}vt$?PQz=S>2X=1STd~iWXncA7;m= z9X8-!+8T(`a0;J|?O)sT*|Cv__tt3(Vribn#C`jht8sB64JkgK~_tIR(4vUD4mC07+jyw z6?L+lJNuYxk-wQ>*tpmPBYrl)fG(=BrtE%Tu@9P~nEu!w=s_2;T*W_3f-ZI5v|8vr z$UjWSwl2sM_r#!6j(+-3H)8eX+^`X-duU-W#5tmIiSYEO&T^FHCLrVezV6;Bmr1S+ zWN}o8H|~9h)ObLz3a(gcD&eCL4Efc6U*~wPaQL_?lNg(SS_=}ePLjdFZ}~Nkzw;@R znDsr_z;0^(?0);YJ@Kj(hsEtjK-Im-@PI6f>&xM=(3m4Dpono&d zeYL3D$gpNZFM{v}F@3MLDs(<}&< zleR?U2DrRD_|5ujn_GuKD2twFU$asq-BpP5OfN*hi{uXb2)}t#4#fn`hA_E#XD9C> z^I>|bSa7@Iv7$7E+t)^%6#rcQX#ejsI4sF6&C3%M%}b>t?ka*?k}^OpoO%i!g>=PW z^DsiLo5xGA%$(%PU}grIX@rtNSfrbYebw`6Bkoy01(j!5vaq2jWF3sBtT^S}NBwFj zwOm9$t6pgqV?)EXS)x$|!9@gBh!+{cZFuS&W*(0z^zn3!0?2`{x513SYfe#l+f87G{+7yi+f#jo><8YQ!upv#JSLANN;ej|i6~TaWLJd#QkQ{mv4s${aA$go!r^qVccE zUGjFCIuZ%vJROSIHYBFO&+w^mOq58yZHarb*}?uD6UaK}4Ida>Gr4p$xmljOoTT6L zX;F6ODphv*=IePq<5qQGCqncl>d&ll7y5~G2siU_lMC~VHLFxMh|kaLT7GSgF6kS) z3tJIY4q*Z_IJ-evs+;b~bGW0QHKDB6C-EiYqy}at^A|Z7~svY6lnd0&z``B6~FmL}1 zav%pyL@H3bMa+{`)G|m@m72%zLU92xGl=lG^HKWo1f|mS>E!+^%jv5!^f5^wse`c4<+q!kS5(R5+7B&R1~nw%?H3<0cWe4`fH4QXk)P^g?>sBm11r zfQRodjaPSZAxC_9`vK@75??^+eNDJ=NZzPG^n~ccHSxooqd(s<@CV&$K~}Bv^oGs< zX_`%&7lD2CX@ta+yTNRcy-=9lxfbb;`y9h3fiw%ZapKu}tsNZOuZh{sY!w2WmWB*O zNEKW&(bubZ)n>`omlM5XBcg=jM$rCo_VuVgbG)0d`m8jL6-IL;PDa(vUGRFWpS#AX z;L@v+KiM?Iw(7G&Q{x$)QsY|!vyfp6^=wZQ;Sq0U+PqTZk=l^Kh$YIjHQEm23)5Fg zV|d2w3oM+%9$14f9PR;0Z~`#k*65YJC4~@UEa*0c3`K<4ZLh%K9kl8P^Ex6zvTyim z4x{tb7u1(U>nlk_Trk29f~`4VOB&n~Uxsil6|p=CNh(VwLbtaKZ)p%Rhujq`(r*ay z{}`cw7Z0Y+It^vOLw8g2I7U+7DVRlr*{-`OKHvvP3OrV?C!Gcp9IQSb`CN9kw)b9+ zYE|yEMh~Yu)(5SexPx|myG`b9v|x7FB*mAB)wS&ay{>-FIfXp+S7H4Y=U&>coSlmQ zYnSwd9K7DtDJY+Yh~O@7usp_BrNnO*qK=sh1Wl&w**S$~{8cvbI)>Okd;(O-%DUm2 znOT*wGmRra^Q3RZ#-gGJBD&oAQgB1-NCVcKozjs`^!47-zN;n_8Rhz+9sL_uNrSI` zJw)0Eze|yt_y)Uf7bIg*H11|7>rIZLt>m;qcX=0$^Vj6eOOycVq7Q6?U#R9lh)wq8 zl`X~Q$xzL7sV^<+S`M~CRRA!~sPUz?u7ka`b3v_cU=<7b0(qNJKdFKxmfi#HO*9_0 z<&E5T-*99F`t)RUKeu=u7Cr=G>Dy38h7bd<-k*LUJab-^!OZW+)b{oFQT5cojwlFT zfy0=Vm(1)Glk7-)tlw`mJYC5|+j63t&R>^x0EA;bHn>C^-Gz3Cr|bao{1)w!chYaZ znu69L-#iBcc%PQZJc0e~K;eji=Hc%~L&uuC0@ANV=W&T8$P9#xBtYnN>ZjGE-A6FE zkgA7;QXeNSTtkZ6*Tr$YGF?zN$+&zVjnVTQDePLW3z~yniT+8(dL)%LKwt-eWOiLO z3oc_?WOnCshaTePfvFo6m^*glT?e_QgL*pE{B=T}Vj+mb>`T4&3)FD1#pxPiIJ**i zw^zU-qo*)l%yG=u$60Sz0eHW3daNZ^4Olx)R*p5LDMA+cC$hd;h9Cs}59!KI@~qdE zqcn?t-reeYRWF-Rnsj|{QRa8pjD!8w8?kTyaRAnWC4~YLIJ2-EFn@`h*t=@}3*mVA zuKt@pgAUhk7!`Fv`7cDvt$N1BZ*}rf5$z~tdEskt;F1Ik-~#Km@{jr(j@kYJhw|Gi zJ7w<4et!jQ);okmAY0yqPZKaGQsR-P{q5fJu1AXU!O#nG^nygN)zE=M>GeLmUi;*`{L7{8LGP}*^8=l!3yAwnC*20ygOT15=MUAosgZIvp>Y<_t_QrcZcqx8mF)+3n}y?A1jy|!6{rd1BbtRG_>HD ze5JNbnnX$vap?oT`)A&(fnS5*Fv)qMfn+Ykw2TwDg8P;0g)J)d&$l$u-O0^}{*$Ax z6s{j;rhoi&CPAXXe2ORS=*gaHfY1fJ&?zYR2+r8kVOwBkM_jQ{3Wa_4R9*S>qnSSP zXOlD5;dJhXm2P5c%nnJQUDG5gnVGZ`UrU?QB9g;BB#xM*%Zk5!VT?U6$ zH6E$sNNI9Lo#OH$2p$jN))>xg4(9=h7)~8VCu)ucqz=zGmrAvCiK6!`|HVr7%?J(? zl8EpRDzbSy?*#J`x#m9JLR5Q2qj1%E{1%(;Yx)wraS5gp5E zV1jMn2lcrj35LhZ7$&Bkkqy}?AbR--zs5O%Rf=H-mN%ySQmo0sq{CG%M6gG>*Ciap zlw6e#FLRAI%&}m~Yw_&HZ270G>dn;(*>H;Tluq=4s8C|?^bks~OIgrOuL~pA9DY?g z*(IQ^ACD8gGheEcC%r@a@4QZ`A$%tp$xo&{0>&kzzg!)tzhAgS$KYIr>s3qLtFtPx zGi!=kqJ$Ak91$cRk)~hNqNgJ?z$~ICRFrtJ?5V;cv>aGg1C_PgcGF!%c5)cqq;0Kb zS7{=0=UN3doDm)MiNzD1w4e}#-Is>|&S0xN{ahWbc?8wLOpreR*A!`H?Z6^i3`mkO z64tPU>5n0YN%IIHmX`{#(Uy8>yBA)6o3_jp1~jU1yl z_svIK@P%n04+vfrhBtmnn5k&g*us5r{&Cy}i&&?_sJ13~(18QkUzrn&AB^bzA9CSp z>bIjyuEd$E%QV6)9;Qt|stdE9h(T`eKP_sguS)ig3*omM*NwX=mtm;Ny{}MS_UK{N z6&9NAe9fNO!DV)O^s%>4eA`>ue3>yGzIYKTxr0q1{&*45KTV!ZIQio-wbAx>&NV%k zX60rQ{%-&l-kyKRi;sMIn|D<_6TAKVL`x=VH&_3T5qR;V#`rTi|7p1XBRDTiL`Pv; z4Hxx$YDLUydw4h$1>M?<_LtNnXDD$)QIIlh=33y!WW?H7jq0R5Wf0zI;83ytUX^qp zUn`KRR?_CfRP{GGMKjiYIvwreXL0!k*IqeZPD=^M7j-*)nNF7^LCY?RGHKlO#jFT1 z{L!U{l1WZcKfm|Ksap&CDxc8_2hsKn!zkP4=M$2AoJ#OkmdhK+6WDL~F}}+zbpyA5 zJgQyPRYU{RSE$hb2=Sy7O{76KB#NJ6f1M>zv|yj*YUHF$L7!3SJ@kqN)L;2tve2Ff zFG3FJAA_$PVHM#MI-(Pd-kLIP;W%AnD_%;(uj8y$g%+6f4w#7^Np~gZ zCx01pqwAdUwEI9EcMV8NKH9MAW6)sDx&q*wdu3+b(djNpNyrE*RAh%d{6=96cz;$f$6jrVymPfml6}f8~cGk50kGMpsd%%7XXku`t}6r0;J0Wpt%z=rHoy^!P_)4lf0{Vc4S<(fQ3w z>W*n+d3%gb|4n#89M^LbAXX$vgGD_x-S+Tw@acHO;~BCnOSK>7aB;$!N$ydvy#TLV z|7!j?un>KX6h6SffqWg*DAkx+AQGLO;rKTizJo8>(a;{5-EjDB*V+OO15XnA<))4n z+so<42gU&VvvibZ;>q=X#a%&^0I(QkMovn$qUpgs2Ao1h&gvIHXCNoWNDY=xue#N_ z&t{~>O_tBseO@Q4<~ocbxa{Q5^&@vkO4z7W2|7B$T zYe{@0nv?s^gX+!2{pwTp|4Hi9ew{>On`xhZzEVi1BKb25!&38M1e*%_=Vdf_<4wu~ z-A-w^7matETT`sdG4PcF#fQ&B@nP^iQGZ(<575~mvc=5Ze*?d>d*(3dZY0z zM}QRhA73-6|1hWSGa4z4Ktr?WL{{Bk4R`;Bh64^ChLoC}v`=_QH2&CZ^U#4#ALjo8 zT{_#QK%(&#wX{qvE!4KgL3%;Q99$>7u~0ehm>RyNiZc2e7<<+X#=QVD6u53znrX!q zsezWLmzWINX_^_R&s|9-X0vBLPUQWEvouQ!Ch84-p+BQLGqmB3|7nx`P4IXz-9vD6 zO2w?$ZU`@ojd8X)J-_{4a^r?7e(1qN^hd-Xp$nYU%DAb8T7DOpL zQ$O0S189D*6t)ZDJ~O|^!4Wd4dd}*AXr8roKsbZ*Z$f{ztSZX_$o^QCq7jWGag?)i zN<=ZsmS&LPS}Q3R`z!Wb?@t8RG~uC2QVsfOPdKy3UC>J>G1zvdqGrkNPh4zEoU4-(>l z(d#&3!E3nzp6jq*Uj*v>1vcN=*;TbGUb@9ausFnk-=%Mx>SKsfZu-_#x>QlfWqLd^ z5iez5VJSFt^?v!_w|ZUyvJ0Eyb(%nXk-hrc<6vHEVwQ9-@?Nll+=~|%K5XtTyFk*; z_P`tAd-_==Y$qyPxo!Y9>0icxv6StmSI?t-q$h%nrq7xg7tGQj{KUaO&to_|aiAQk zYo#D6-u3t7@1qJjk3es)SMIms?02brMXqSAkyN3Gv|{K8t~+@LzS-}m93m^$(2>Zm zpgQoSiU~#pwhNH%YCbn{a~$*}>&UuGQtuR0VUg6wE=1cD49ZS=!-;L?eMw#$G;Mr zAEwu9Q@2S`vcTfH#R#*l_ap;4Pnnaj0y+<2)NJS)X`$*zo=IK%4lK3wAVn#&+Ko?J z1a|6fTI865ZI~?FJd9+eit+n#s+Ee)Cbb*9luDfi7v{Aa(F{qlojf}sv&hOT;p&ye zLb~=Up~bButJ@*7bYwo^)$@{O@*JeypFS+~ylA07COPbYok5#m&gCWx$BQnTZhS#O zq_iQ(EVBJTH2ZH}FWCOIS^rl8^G%ukrDH4O0nz=B5<1MT{%vuOwpRA#B=xcQUg`ff zJDTD2fe>qvQ@BEc?4vswY!ShWY_AEywVja`asuEgRai&y`)~V2ptPlOG0Sk5OoNC# z!6QlVpChaV2wQhmUs;YC5jE62ES26i-PVAwqigS$ol0O==hO(AWQH{meY1=Oku=At zjU%cTT}n5}sQx;9Qj;Jku@d>3*lQLb6|2-!PR5|SC;uw@KoL%NH4q`y4!-CeGp0cb zOR%J&*q>q#OVSd6j4Q{44qd~&Zopn#v%;>^6SXKO*I<7^sr6Ih4ZHO!%|#F%8851P zM{*&gGQ{GbeMHGuL{v+cwmN{+LUy<%ff7SQ)60RQ$i{a{<3uC0q@(?-xytOZiA%Wo zd@q}`c*2Z{7)|AHEvHo4^0f?_;<-uD(4#x%3Z~}8jJ+#R#HiM9p_1?TPYW5^j5g2y zo=;Zw5mjgo3a<&}(`3O~N+Ec=kFOU$D=CCUA|=#`SfbIaTz=$8RMb@&Zh}{d^!2ds zE92~my8h2ks@W(ARUY)lzfCMv^6zCkLg_~oghD7OC1=s*8(e=&B*4gm(*8ATB6u$} zBp|BUA&?P(rRFsnFY`b~A?<Wg|GSXy&_+g$YvOXS0m(9%uM==yWi<* z`URdRV`wm24u&p-x=WNI@s!1%0a0kQ#PB?@pp_k8SC0Cto=zPRhmPAOx%OZCdpRH+n1BGq&cxIV)0bY0gvIBF;+)b?wTv{A!D_0IKh9 z<&ijBpv%Lj(>##RJ5Y{Kv@!`hM|8DkmcD)$_8jt%2~owsAB!2uD@q>K)_%nvW3=?- z-pKT#kT9jAa*HEL=EU{E^VXHktq|p}Dx0L|_+SbjEjX!R5w;-HkW51H$*%E;Y@H36 z;18*eLI`hDgZOekXnV^me=&Mo2Ly}M*>VnJi8F)}vo>>*B72JG5POT+8e9&mkO_%} zqqF6cWqPDy%47S*H9(tUhbV$E^LIKyQ{RyODpl?sKb-b_x{XPySvNF%+`d*e%)G(b zjk=-GFi<`%Iq4@akxyqOy>*z%oqhVe*>}CFVhfQ#Y6ez~4zW(rGvF3Y0o8Kdxr5U$ z4lUEnn#LfKB4aywB+*qd*Ir@^cq8)7Q*o?u>$mx3InSYwpA|(#`YNvmL5u4ma$fG% zr$+596WqS%xn5yFgYS;tEotkaHXGm)xRh2W&(`Muv|E@5JT1g%xBHv4ua3B0S5d4t zxFC=C55J0Li@MJbX}=l;b`Ix&tct9g=R*(2Hoqr&*DZjuE}4qH{a$j_`G=Qrd-&ne z&buC1eYQne$Qna#WQ`+%GdK`88a%cHHq_`~`O!zPuYiej!tkbasZ#V`Y%e&_=vI&B zoBg^awtP{#tE1Wa@j1e8CIpUs2I=cOfAbL*U70(jFPPKg(RtoL1GJltNb`i&nZl6+ z+!6+5_)M-)LJvztv%DXViH!(#m^@v_+>CJ)O1u6AzO^Hxi=pZFfc@V=N~#vuSG1|# zdu?{>AO-8O3{4pS?e;5CkO`1*i_;&-BI+oRgSCvn_8EUMB3u_djiQhV(0`W8Lx}Sy zICOEK)BgMo!Hxh5hne$uKiYn8_y#;%jY8RmSgjDO}v6ET3|ml7={ zLAaD2tr&!J6x-!CV~8O_;q{2B-*~0~ZlitOmmYoAo&eA+St`g#N+z6!6q?lr$m7|N zD*+v8f48SOoRsCTpZYXJIV9y#1!O=7Mnr283PhJv4s!=Zux>~MhJGP&Q|u`idV(vV zfof(hPy%aw7@(TDd4m<>CMak!n97U=D1Or^D)?PE#VWB;#w8S*{dk*3qbpIE3ri7> zW2+X^Kxq+^fk)0!uEMYzGgL40ew)=U%}_@n#a!x3=^6gHtDQhNY}<3lUAmz`Va<^h z2aT<6q1hb6FK;qbh=#cmTic=Vci9O_7vklD+*KsrC!X}yRagQL6JvJa zkZ$ts?yE@j;D~e9+s>vboNY=6+-MQ(*_(H&c(V}=`x5JlBiyjw#D^{=UX~%64ep2x z8j4B+vc;d6#LQA!;h@uuNV;6x939pUT40V7< zFJA9YHSB*H8Vr0sjS-Edqd!1zJ%T^F_JL5nQ&tUNnPLkig*CM9Tnc}>v|*n%4GjI* z*n1r!!K=;&OBk4O#r0qD=#PH1V_4T4NP({6YB{bGw>Of&UW3<5)y%BkJ1bnRv`Tz~ zeROxNeN~;CIY#6&q&#EIR*}Z<-vU3n^S1UXe4rtLfm!hr7HdP%8{p|Xc_*ybkm~r> zjhG_FdZf5DhiConm3Qak_xjykiW?!0Z#fS_QevO9POtzM?PA&t_goJs&k55|~^?$92@ z{S58ZMS$glKW0lcM$yiGXDkuy+@xRZd>d7FV7=f-REvs>IQBte`W(y0QXzagReGwK z(betJ$YUkS>1(7eoQK1xQ)6qI~2Mwc8AU@ zoh#;Gkbo=G$J;fcoQYrcwFzv5N|!@S=s4+3tfWXSIHxhY7!v&4OM;bEef85IhrLnY zL*nE>l;)=GpM)l$D)NrMsd!wEzERr zv?6;xsJrsc_~D^3eIH6di?dd<)K!~z}cB$fesVB~t&w=_!l>$huCA`-dum6GUo0cR0 zKrnpn`{q-Cxsek2bDzDaklq%gW7%GZV}1srmLN z0_C&I?gdZA9zQLNMVAaLch+`(H9r98T9<*pDQJIUiG1XzW`B86J5Zzl_lBLHhnw?% vX}mli|F_1^2f5`GVC3ie|E4%Nc|WrMFO83z7XspvM&ss3r=^urmPY?SuDx>8 delta 9406 zcmZX3Wmr^S)bAkDr6}Fq-61I@-5@QEv~(Rp;E!~7cXz{ppwv(@q_i|h4c&L}z0dt{ z-+7)r>+IG0d|1D|)|s}hw2tGa z4OU+8@g|BO&;p8$WUVeMXS!L%dM-66wFp-0GMgU^x8}VSwDo`JGq&JDz890Km%qQc z*gHOO1by)u_8q5dUn^7<;3Vu4%QJby-UI#$$z^}K*r(ula$g{CIm9*S7ianrdb_>| z5jRq(Ck%*AZWvsv z<>0>VPA&Ngjr`i=Z_)wTlX>A_B0Rb$BlSCg_1*S4HI>6^D)|?m__&r*OTFwI+?{kz z>wzS_(&a3BK*-v37)M)R#Yorlzr%8&JBxMVItJEI^93jNC}nU}s3t^!0NY^TvNeev zpjt5&4htupo$?KEEfzBp9)5~#c3XBjZVV(}otrvr>xKrqd8g>(dgLJv9ymF1uo{tD zycjNFp6u<*Ut0OBe%s|x`lo(*#o=}Kipoz*-=6Nab05aiKXl70F6tfK#hR!7J^o;s zy(?Ump9d~-bNM^5nx`}?EAD9TOy<;M0rP1mVQK%(*OWyY`JZwXa6~enI7~zX&Ngsr zIRGXa*iX+VU+=r+r*81t(<-?8{wQ+Lr0eD<@OX4_0Cf939FSPg_P>7lcWbCv@sBQW zmf7pXO4A!^Wzer;%bUbyL%O6q&YKL)Mr-y}mERonUEo{rUMj&AlrKC2$}V zdOYvUTl79@gDOBl&iA&JWhw)yt&mqWAfH?~$y6rSdWylJx^uPYoylNcAWlk85$+5ggxw4TRV&i>Vg zv}ViU({(0t@AYA8gl50@2831$TRC56y2VdM$PpLc6_LbX`Hzr{33GnqUcgxqF2Y_Q zao$z$Q0+MJ;oXY1B5P3H`oSG>*qdzgr9(Bte5thSSC#awF1~e&W_nldbkI?)%9Z4S z#-a3z?QH!Q*=_T|+Z17D)hpR6g+){x??Qc)+jiGst=V79rP)*dlR%L61=TU{eq1F~ zIxbo*CREQtior;uGv*-=$Kftce7p1go;7>tS0OfcQ&Q~GyP{&D*G&+3enq*EfzSxL zAC~Qh&jF3faJ);OeY>a^M+sYy+IhgUV*#te1BRbB8k&3~%@kr+AE1(hyKrAuxe*jd6NYcQ|`ibP`l$bmey$ zt*pjrxkx>(?_Q>Ap&NB=HkhAoN|z~#%eAisNO~*&BtVj_;`BUt(I%QJ3q|SFPAPG8>pq}Yc4B|a zf}#OG8i=JyOtmMcKDn@} z?g88Bu$MYP%*k?dbFbttk*bnb8FKYpkSm9}pMLQP(`lgbe22R{*`gTFd85Zy)hq|h z-Q+3#CaBX0pzss*Y3lD~a9$Db(D_f%+n|?uzQg&^!?{y1D7lM6)v|;^3Mgs`OorWU zGQ4xEHl?#9K2TG}iBzv&|LD{^qB(NmxM+OmTO4~}9o@OUDe17`W@>dX>9 zUJ#s_Ik(N*Qc!9Jfoz#(W+iI^A0hADM`Hq3|6II!_tCpYqXVboefA#vaT9o0)AvuA zzQK=rjO4|&7B)Tu3>5J_vD*@gu!ef%f26-l>%D^Fe)55h^&VI^EB^P+U~-}anVfG)~d$)WUyb7zo@ zKK*Ore6sR(n~^T79^jBvE|40AefgVN(~*Ph(z6QS-n*O^m%Wp=xg!frBcM ze}H9jd*Ph((HO(UBfirye^&WwJ!EOc$zxM=(bDl)aBWn?BYn{qoBqJ_FL$e_(f8BK z-|S!H=9I|i*~tW zFQCyyHs_MS%=R+c|4vGT=0Ktsn z-v~}JEqCoeXY@_H)-=|+(qO1bO!5aaAzm!X2|;p;_HN|Ry_cY|z_;(rU5^!)BsA%i zPz>uDmuz5G5-WKgeYX3iWH5HB)2!dvhyW}K@%@T`w$b`uTF+!wU1CiUv&;n9orm7& z<9K1Kra~C3Ok%}3Fwi(?meeM#+--4LRzqtsJ~l6fa98+CzHHut=6ww5$rLqJYggGw z4L;@&$@z8LX~W>RE=4Whi58MLYa4wxy#b%a@ zCqgze8OuC*?D^@b^nEzy_?(_8J?7b*e zL2b8I3;dkt@LE5pJj2g^%zH2t3MA50#OUlwx1z(rXcF3}lIWS?E-x;Pr{gRjM@AUF z@0QD}UGfhRCnv($vCD)xbL(j%%RRDIl<_uUDvj9ZU$$WR9WQ0cSk<55lVK@IfY^PK znfbxE-6P+;IDTqZY**I7nBrM?Z>L0&vs|4^O2*7B_dNeHo(cezd==sY!kCpzYb5b> zxOFpr`A@>UF6$v8W5SaY73d+#2D^VMI4Y1)#;6Yeot?$BzaZ3hap?_ag&Cr`#GlZTSbl%Rb^h%E>d!Yp ze$^P#)>OG@&5CTg9{`d=m_@+2QHf|E2}C0AVZpPcRxf)f5nj!5+sbM~GEUx`3Y4Kd zPHFu!LJT7Gpkvg<>n$@JA zdmm%{F~eU3$AU3z4sbwen7I6<37EYew&kn|UYMXtAJ;|HV8YZ8-{~WeY#PY0RS5(E z`x54v=SY1LDE>um)AiM$UVkj7eic&!y#mRdFH|XoSQms*fk$@lKukdvx2bJ=;wd8w z@TVf6aBf`gotz z6AP}+mnIHE#pDbIF;K@&mzy02p+c+1Ez`d`2IF1{3$9sDr!8630=2OXTV6;M4O@Q$ z4|V_tp-$GH)-j9}4v^E5w&HHCLar(x-@i7c|NPjqqQQ zKTw;wB2)=JH>Scg6vtu+%Fj>V&V!*E=Ao%^=_0Ir#@B0ywD4 zTDO<&v(CGs6DcBp4dm8tE}t15c@q<#S!bc@+c!KTa(%AJjrTiE^?!u`Y!XFfpsN!6 z+5Bc3g|ETyY1Qe`+3YhAWaPwGvou*wVQKgsoXK6Q=VSU=)Kb#GT!lkq{LQ`}B34DO z>#U-^OGJg3qrMYMpk>Zab?VIb1wtert$ZGI-r;Uc5r-gL)yyr~yT9!?-RdRbrTa@w z@ZUh-mc+z*o=VosBg9rLQ3AU-MIhOcIbPR{Cu-WfG=(vPzQ)1L&qA@8jExpBXd@X> zdDFZ#94(549&XgUUWF*hW^?mhQ6$Kqwi*V>L4y!kOB%OU#~5A6YoK&lS?Z#8p$_an z4lBP4RUZs9cB+Q$m$up=Id(8e1Y;|QcPlE@(1`yAsT+;P zwPMs*CA2IE6tZ%`FSL2(-C6Z(I>G1oRR>I^){TM}lM2N)8D5$Mr!35Lg0K0cEMkKOqv1`I^MX5bRex4s34mr=5J!)C*~pg>ijR`b#CdH$$=u|bbX z{uJo|qQ$TPei{Bn^~wsc=gcwI?W9Bo6)@VHLI1FHE_5F5#EmT^40oWEx(8D!jFG-2 z>R@^)kVh6Ch3KXZ2UnUL#0mH3X3Qg^ch%Gqze9td7~V)#Z%~<(1{?%{I#=5oq{ZLb z5DH^=>H?1Y%KM&Ba|?6@Y}3gyh^d|-H3I7?!7a|KUacz<`oLd?ojk&Ups=~1SUYY| zWm@Wg7non$xN|;j6TC>1U7JJ+0_il9A-l|q1DnRanLMoLP$HkTZ3o; zMDdQu@?PwPC`LEy-2d!*>@ne-n&8-SIbsJgM7T8yg>39EY$Vza>PE5>iE?f8um9td zL!u(^$8hf>C!Qi7V$`$8&0{FDQ{a0<%tf;ggie+JSk@JM)FEx;U7Z<}pBwkD%lW!0PKGxwC$TOx4Q;GBn7 zW|?v*KJ!pa@Q4{e8zs$edcQf*mn(sQXPlwSJJ=)9{=8>lcpCjzXyaSz7x?lWZ=3ok zLZqAFw zlzi#U>n+|zdK>0U*eep_PY6xKj8U5pY1_FX9;FYOlC+KqN{6<@#L`MXEENqw!&YH(Un!C$=&e1SA^j|1V92|XW*EP!VCiNKu5Wv{E0WF>oxa8U{$u@dO&k;5$`F*`LEV(Op;NC6P$X` zBo;s0i7d{SpC4+DB}7*jUy0xr+t$FX7I!j=iQz*38vPRZwu#&!N2u8^sKI9ck$%w5Nb@ zy-WFyf0^4ZVhjc0ldBrZOBa#t(C7eP^)c+;k6}G0c_k=9R*nldisb1A*gAU)Cb$oJ z>6>5OEsQ4$+PhxImA6WD+BTw!BwGzto34`&Em0GOniN<&GG%_yb4-;U6sO3b#!NSm znh|E>i0p!_oTo;&L5yh}h%~9cRGa{*51-NvnvASazNO=AYKneQoc8dZO_o|5wz1XP zQI3hQ1k{_+a{DZ-qID=f>@E(3aEuG5u}hM1NFE=1J%Yu@2uoOwhchaE4v7ufiJj9M zmI@SGIet$g=NU_j_m2zT4s|u7vEEKeD|bKRHJ81V)$*BjF*x-{f z3ZGL7m@G%SK>?}x+VxTT$Ec&k73~PF&1hrp*0nL|4BG)X$8rC)$mZgCT=OVf)}w6+ z5s#0B<98-Ub29ql^MrP!3p-?@vFT6eLrb`dTA`5;k!}k|$@EuN8C->O+xVio(7N<4 zC_HXc_PR()`On?>b|gzepk@PRiMCol%El-inQ3Xq1@|eL-cUqD0(TioqMZqk41e)z zVL7Ah&ryC3*uiDyJ8ls=;ls}nzJK96_-6IbZvCziP+Hle`+_l;KL$T*EZk}T1so*c z2jhGaYWcrG9ju?)x}c@J<^qy47(hal3_n$Q|E*>1WNm2Q?F=H0BC$f3O`gp;KK31d z(<6$bx>>B9yt?lDgvG#$tpCWGWv3Kw5P2 zlTt@&$svygsr^wRC?71snFo|K2>PlP)QN>Se&Xt%rf+kpFPU# z6b$U#o6PU0Fe@6wn!#@@ZyssQxwv>|5~F@L(d1c4B*Uj+8Pbnmne7W~>52MT(Rqf` zT<+Gw_BLyF`@NfDY0U|>OMqjIGHyqTC5QbREFa3g>Y*mQvLI(0HA{E{u*lm%a&D?& zHqsW1h*(I(iC6JlR2lQw)k1d$=!W%9@<1WX@Twha=Vp|LKB4SDPlSI+ z=VNHO7}@g-6EeKd{O8Xl2nZn;a(9Af6KUs_rB)1&0o*n4v7DbUrdua)UFv83(^F53 zV^a&IKO?owAU;PKT>a-Ttx{6AtnV(2GcjtRf%KoARjeL7tnHu@_rOx8yHxV2Tml#S zMP*dl68Q$4=b{fc^oOO=e3iu6FE$$GmD`Qgpw}V-Xo=1azl2tBsVaB!0^8-1GijwE zJ)=+@_)Gd|6EnQic+KZ~9LMRCJSmWgd!{U|+*ePar^kAg zvR&Y%-%+1OL~N4ch-n>iCr-Q(8ha94JD|5tctzmHKY($38gV?JH*{3jsP}f2bIS1a zU7u&gEzW=s4mcq(w(W>l8Bdno4{M*qFY=VckG?*q1e1n9tV;qPU`a^8=xLdm{Z6HQ$)Ap8Ewl8tHTPdy^;i8@6~KM&Ttlu2_iIqKlruHiv0ej<}1{QV&zMP>9@MzEx_P2GWwS!ZQfGn z1WJ}(@IvJwh=CYP0N8mc;ONhm_Ev6m@;#huB@|4QaD}U~-0zkSbOC3#)R2?NT*Djq zw*}8Hm;k%O!}39{QJ!`xbe>?f%ll5Z4>&u^tqFLT0=h-Cr%PvRp=1s3@=XRaWqqN2 z0{*@?+hY`)Y()>i zm3#(;t2|?smb+(Bvbbwob0kfPOI8&e)aF)JU6^B&&ZaPa&V{u)zSuyQ&y!ZYn4n0w z%-~Y&SZ?oBe&KC#2(t8LH!yrcG z79IZT)x)D=>H1CZ(G0ufgQU@lt+IUYZYp5~J>V_Thrno5ps#ub$~3`u-M+ zey$Ex;m^1sRJqm~A-<^_imvON_Q?E}B`^D3zqS-u>W-NS4TSVe!uq!6r`edIwg{)y zBe0i;>OmmKeJoo@{T|0P47IV8BAss3r@zZ;DyDAXQzDVBcIJA!3Ad-)sF3W0snDNk zV6p*lL4c^2(BM(gcLv@6jC^9@p)^0s)TFV z3|I>B-%)-$JAk>oF)3aw)77QV8eE&TLsj)Zt_N z!VcqF2$#KGLPMerDO$C?5OWU>m;va88wJ$Ee&N(IO5;Uv$ZcJHU-dXVvlK#p80ame zaQ;=A0g+BMKXH16`N+TQ;@}-hV<#Gs?c*Z3**WjZ3%@-&&|}@1I^yW&>wSya(PGpY z(Pq@y7sC5yx$2DfjcyVCmw1yjz2S}b4Xp$Hr-nxx`cFG|yl-E`KZbtV0|bq$q)uTJ zYWZIK-*76`SZTL;D%3zMU?gPG3XMrmy=ireET(YNk5xXr`vE zl&&y)it|PMd^%q`VmcibRb!hN#CS`a!m%MER;F`lsZwTqEn{4!b8YDee^LdlCP)7y3wbjZQN4)|h`gT?Il{#Lr)bm5i!>}!_Jv+yFO zv^08sc~xZf>j}7CYmOzu$WYG<4cg_jBLjQsXX0=4aTPtlMSwOK>&t<=Ne8nin`el?B`9sXBNsXDx z@1X~yJ|FXwKcLD^N_cmjm|0L*s9V1$;CDq^!j$*NwcbX6_vdV;ZbP*QMd|A+Y4qn9 zpXn-civxf)r%6j_uW_oU4_ZeNjctxTBK)SOQh`JWvkh3UTED=y*q!qrCCN66f{%4m z6rfws1byfW`hJ}Y%|Lug+jG=3OscYtuMiP`ywj)jxALvWD#U-qJxXy0kF>n5HU|^h zMO-orEJQpQ7E(ngxIGw%di}4e3+*^N2F&M=+<;y+I}`JPuEf`3!C_VAQ@!6YMhf{o zN=wi)x)zR_kS?$mw~yR7i{wM@`9v^z-BUoXzh9)uGWksLzsnfh%oUX7CsgVpJf|5# z%^t)?nK>1qkbG3z?XtCb#~olVH*@KwqQ2t@?VJ}NWpP6B;W+M0O!hp|SP{sw=?o#i zJ_X7TI{vEs;>Xx<{30`owPWht{pu|aGDYW`i?(rP#UZexpmSXWk5so~OZM(j(bDT& z@bQ^%P|JE#h03r;ew=%+(ol;CQ!9>RIt9k%a!JK%u8Bk+Dq8EZ!VOK8=(MoAD|oT! z)B2$gOVCD6XyCz766;I96X+E7q605oA1Z(ro5TIxlR-6|^V;@kx@FOnoBJVX*^$$z zys&9`7(N^AFMvNFze#gc_CO4FFJkRXJe3m^=<1HaIe@aVE?(H;CqHC~jeTf`uXm-Y zje(!%O7;Dk51?72rGiI07q;}fx4e;49{w3K`?tJ10Zn3K)O99Rs%NlcXlXa{EnpRM zyxAyAmyIRHB}mEn{G)WFMF01|Da6mo^}l!m+`Rvr$0v|T#Usqj$Nm3_xVQy4h5i>$ Wkdr?#lt&hgQxKhwPDWK0{r>;}o`;YC diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h index e24adb19..9ca2bff5 100644 --- a/include/internal/database/xgemm.h +++ b/include/internal/database/xgemm.h @@ -18,11 +18,11 @@ const Database::DatabaseEntry Database::XgemmSingle = { "Xgemm", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs diff --git a/scripts/database/database.py b/scripts/database/database.py index d14e36cc..8e8f37f8 100644 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -92,6 +92,7 @@ def ConcatenateData(df1, df2): def RemoveDuplicates(df): return df.drop_duplicates() +# database = database[(database["device"] != "AMD Radeon R9 M370X Compute Engine") | (database["kernel_family"] != "xgemm") | (database["precision"] != "32")] def RemoveEntriesByDevice(df, devicename): return df[df["device"] != devicename] From 7f5cfd92ba280ebd2e27b68e58685db608cacef4 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 15 May 2016 17:31:19 +0200 Subject: [PATCH 08/13] Updated the performance graph for the Radeon M370X AMD GPU --- doc/performance/Radeon_M370X/SGEMM.pdf | Bin 13268 -> 13326 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/doc/performance/Radeon_M370X/SGEMM.pdf b/doc/performance/Radeon_M370X/SGEMM.pdf index ab4382ce431f3a11e87a85b7df82b92646b40a5e..5dca8f033c2f52e3ff1080c72a3156c104468415 100644 GIT binary patch delta 9505 zcmZX2Wl&sA&@N8!;K2#*?(XsicXtUc!C5#+2oQqn;t<>+xDzZ$a3{D!a9`{$`Rn)Ev{mn|}C;rV+>s zd*Ds=l^t^`Ao6$vc&8p;gonG{<3HV|hIebf?7k0oh)qpuau`ibCB*)qIRB3QVJFHk zz<$0G+`PZ+RXKZ2)hgz)(HdQ<>o!^J$~t*X#Reg+>g_V^mlfaS<(-e(hu~~AxM)^y z=BFZO`vK46T{F*5Pe<~Q+v`(PQ4l0b!R7CWxXE3I<0Sw0woo$X3^L1{E z?ku2+)!V(dzwK2$!a{dYk}ejk#=t zJ}3B5*b@Tg%vnQf-lM^MQP{#YpT(`>2OAO*pN3@5Tg^m&UGads3f(u1wi;+^J5%y}lrsu{JyiDjD343|WFdQ6>$X?T2X%P-q?fStBY>oE`ZHUc;@u@C zyo(M^ir6>laj=(I7lK>D{xk^4(QHJnLeISYyFLji3 zv6eo3)FJB{FZirn_fPKbEha`~v|1hKp(vNS%uAbH8cRYZGR=KaU*8>UZW zJZW|^yAqZtWRZDFmnu5}!&=%~LbIz@RiP06iMXE5%3MqjfZNx*#f<#exteP}Sd5(K z5w{3^cGAbY`r^dBcOUq(0{trBpT5B>&OYyBd@_JK;W4+c(EqC}N9OUj^~-2t#YVrJ zA66O%)v0FDK|Icgg$vibw6=$+ao@=|}gN^u*a zU7RYTs(bgsbFUJNFRCz%Pf2SQ=4sihF>HoEu@Q+Bs9BSblE9Ia7>r*zuZK_eVvpsUz?79X%s32WfUkY1Y6=POCQr<2z+? zSMn-%m*wKZ%GNULox_3UCDenG+c|}l!b;3$Iewj?tP%##cE2aIboh*-v%KCk8iZUk ztS=!QOqYLa$AV8u-OW@hTXj><={A2xkIoaeti2xqd~*uX?r&mfmp4!=fkihlK-M7u zv0saZ+U#W+4iV+PH`C^9MBNxP8~8Rr5IY`Z{e)EEjQkr;M~j+rmLWpPc#^Qj;#jO< z1cB=@Ql#QrKEqTuicZB!vCf$vV)=mPW}*7KXsmAB3YoTyl(Hedd%8RH(ry8aZb)Ou zY#fsku(qTT9x~3bklT^T^6MkqFp^IGn(_G2F?PjAglP_cI}ydAtueix|E}7OqS(if z#CQf#LqBuSw^*-7g(mNN&%3mH3kKmwpMOI%X~7Tr3{x5MiW0)+BUb#kR8RJ^6ED4l zQCP1uFbA@L7(r0;QvyNT9;BQ%A1wMkIQbBOdO%ZPxI-|^4D0EETh*ySZPY8zF8WaO ztab)Vt^1DV^A;ly4I&LoezC%cJbCodP*8kYdvc9_jpzLlJHya*Pv&MDT9CaENmwCB>kr zl`j#sxt2RQjoM5F3q6uXO8DI76NZCPh%1zDY?h3c?L#-!*_x~?~3?L5s_8fc%0CE1>|R^{?~+u&M8AEw;s5R_~k z*}!LW9V_pooH-5Q(sqCPGV37EDftPXwV$0}>6%Db7Y_F^!TS!h(1jkZy(_UZZt)v%pZv$v}Zx8 z91+}KtfZ-KeuH%73~@hfGskCP3CxDM5{<1jDQ!=cVP`uzm6(U-YUC}VsP^vhwt4jS zlTzWTH5eJ_8wkk1|xieiCeK*&1bts{0nnkaMceIsSH`MFG$f`C!wM z!rhs!^_^7AS5tP-e5sggs~^f4yK*fZ!=M7^6@eN|R8)E_>kgq*&X$n=<$px-2gj!d zVKe$L_fmk{R57BNkw1^IsAIT<8rf^Yip5f3X|EfTC>i?sEcEw$T*?>)JcD`}l?*qK zR^Wdv;z21RbC%HRl>orw+Io}S1{XDR_%S)&!}=hOX(&;JLcpapKSQ@*_mGP^^;V6^ zH=EL4iE!T*ES8)TwhCXu9`Bu+F(EtMDHwpT3a|A}5BrXhixt6;3(A7#&ZkfoJcKdD z|9sd3dkuRTQ3_NuJvnnJ^A~CV{zseO*c^OgFB=onWVjlUMiPJ$9|XXv{uX zv)fRP#qVZVW{)Dx;y<5nGH;kN)^^2U z+SWexf|@2&^_`Om#Hek&0^S5!^PtS=62g37Nj*^PCR^0FyXZm6L3YB};R=!dU{?L6 z6J16KHqEh+w#LT~)q|uQVm4ZPVxF&LV-FztxWbMgC2C$@XW*7p=XlvTIFwC1j zqBpk(k-iKdZzs_Od4t)@ki=}A?}S*CX*18)lq4*GoY4_Q!?Y0_xHO1kk%><^e2}%9jIsqh%QmVR zl!uwh}El|c;&-r`C8q;^V% zRocfP4*83x1EcwL0vX2a!|&D@`58}=7ZPph_7B^R%7nvtRQvmmdpHb`>q{Z)ex;TM z?r|>$%Kq-eIR)akZlbI(bdZ3<-Z&vb7)ux|bTfBu}k~+M;6qDGM0;LQo*osW)~7PR&ggUk+{q4P7(oTC+TWr&e@Kh;N^~9OEE}r z;i0-Sb84U`)w1s1dGy)NbgV)@K+wm#oZU7MNQAg6y*zi4X6metq*~&)kECkaHD{mg zXh}`eH!_?akJJ{W70d1PznmAzHlOdLg(1R#!{lK~`_W-FqG*wRv9P@P#Ug?VmFQ53 z36*c566aU;m{jnLMI@q-SeGG)15jh#X?e~y@M&g0m)HU6Hz#79OL*TU)B|TYZ}Co^ zcWJ5iG3Ar+Z{l)&Zu!|cCB1!WD{6%E6M#=nJL@*YwSNTiomrJkGVys>u!1F_yf!BvIK%D!N{b5%^?@F2>kEtEMs{cX|KTSYaJ!5kis_EI3bdcx@9lq` zmUJ>DQ>Ux0)!JGIBDj2OyS;#a9pstzhgk4m0!tEP`J|z!esht#yXIbO?b_%+*(6q) z@-1#MD_iLHLFw)+0b)kq`=7s1AGT}enUhe&6pqWM^N>~y)L!`l3Jy;L*xEnIVD-N% z_)KU(+L?&AhJ{JlP!`m3v=)(Ro( z*)^bDL(n{?$ZWB0>7GPQ2JhW;%+_XMU@=TcDmNj-)uw)iV(M%kafMSsz>k+x~413&+-A-^j z^??WfUJWNANc?U)3CoG$TX@gOqWcI>E{s9FdB6JE98XV0DL%=0z9-vNWOhKeU_+rB zB^3Ds2DD$$4OD~{`ka@+FUqiby1Q==*A35@plw78h48Vq9GQ~?$F6R4+qg2@G*H&w zHoDxia;&{8#1SvVJpe*(>mTTq-(IoC&;sA|o}x9iM^Y<-a%gPbczE4dz(#y1TwMN% zEiqFxFcHe7F~h0d+N;JzwIiPC9m7WQmgD5vIa#yk zb$oYlO~Y*?;EqCQc56}c;v-tXoAqbEH>1;$Q0r|>NIiQXeaCdDr5as!xt9EPvOKgM z6MTd0@XCm3SzH5e%@w%TqP2a!^#vF0y8bwvLi;30)+wuZd$|3D*iL~+(PGA^ClB2K zk8}KC?(Kv~jN33z>HTx_E376S+VnkgwS_Eq>-*3{5_aBDts7wP!gctLB%GeB38lz> z!XHMJ9_nZGW1T8=kC=zHXI@$^{Xfv#!=0hjpK{NAtj#w#9RTCiwN)A-9134Eq70kK z+|Co zo;a)F!Vj9xgj^UTev$yHy}nh_AmO@zt13v z>MfA|tJ$E0MhQmB4S!x=b=O==XNRCF+-3K8GW1VHg;c|&9eU5pn;PN8or!{udKA*g>%|dX zBLz4FhFT(WU`i9DBl(xO!Ei)kc|0`+#llQEhGPrPVItNnVop;my$|}lXKNd+E^3_h z^?z7N03Sc@tRwhHjjIH>%xXn>r4RD0JEU$7as-5AwnJ?`#D9hqqCWDlC~Liv+Hwqe zm1sewf!{j%osybuP-wER&?&%ZZpeDt}x)0YPDWGigQom(4R}n+s-Zp#A)1^qJS_A-t zxC9Tk?jrh#r_jHm2JhCa_SIOhGv;JrwiBC;AlRt0S>3R%(Xy(g)^44}LDt==TX32C zcWLv!8!vsciOl?9W)|&@6z&rUla2?`YBKB`bDGgbrjit>gan=>7cXC@QdEz?I^T? z>IYE$Rj#Xa6j?*%JX8l>2V+pc3M%`dfK>*hxBSM#@u_}Lwyw7S*(SX>v1MWNMvvy_ zkddRq#-MR#EWwXf_TJ7-{J+!61FzFU-2bLmhYCs; zFbhw|`k-K7i5*pAgGUE75Sh7cE}SGLKF5QxoOg9!Tci&)nr?0IwV1y!PpEk>wXwBh zhW0*ifz*+t+S=zpwN?M4l6b=Bem7@OQ9%MmU0Z!tv&e-EmwDO zWc`^d1er_2Y}p}3tA@#N-Iosqwu)1AY>W#68R#SFq^y_Z6gia0_)nn+EY7;>uDca{ zMqfVdDd>$ItE_GcHiY9D5d- zX-?KH$;jxQCyR6PP5xM-5 zRBhx39TE^VE=793+zxp5OIITIR7uAW6g1gQ#dBmDuiyh7nV=(06R2}Rl(GN87OLJK)GW-dB_tMQuL$_@>+ z*HSq&W^Li`x`h&vA4_Kz!=C?=&0tVj1&1g@ZNBxJdHW})R`9ip$f@=eK^cKy(__gN zOK`e^PvkkY7@C8(ew|2bz|E-Qt>i2>PK~Sxw>QNa^n6Nq(n@1w#9s@>OwB}EuwD#V&kISFS1_%!4khDB&e`QE%_cd9K zOrex8MT^H@+2b_5?Te)>9tk6r~I$P6a{6$KFLe# z3lkEyRv{tvyE3jEQNh0Oyt0}w2)mDz;__k+?vp+bz>c0<=5(Q`Te4|8ltL(%hnn-| zEo@mU$ic0|{t;yj&!!~7t56uHCeN*M48f5TD^gs$1P6NImY)4PkEuSN6K0g>^W5i1 z{1Oc)I0O%~t&QA*YmTxYHSePm%rt8N*ErECkHB60*Pd3?RO#H&z}fAtz5yaIahPW z)PC)D5QYg#HWg<@VM(y^4*#u8uW=45vmlZWGwRYLpiOOH}F z!Dv^m8vbY($2Ca`7+x#S50g!^2njG{kE8szekgPV&#yVQmEKGm+-QZS4Q-y9tsr>& zC|r*=aE+ATKS{mjQZ8;0%w!Us@M={{WA@r&?Hj%cBR|D5k7*Z2C~Fs2>beS-!s@C! zL|M2?wh3o~(wr*!yBp}s<0ZeSj=b|4k4&j!KxEyKL_JmN1az9DPk6CyLwM@+xvRl< zR-3#|Q9Ih-)w@{j=IiY=NyOBq8n{;3_tH&I0f*rR`aiU-dv_6-xFhNWSO;?iO}&-D zyBtycLvuy*GFK;j>KzZteP(W%Y5iB3C1GtU724C@R0L6%+Y8Q3cdjS6iVZjDFOfE6 z{weUXBVyyBfJIo&j@0DO@yk5D4f!kvViysz_rxIr9Sr<_XsTXpfv$h=W}^1}RB{hH z5%38+WN1`6kQh`%>DN6>L%&9eUE-&g61q)Jn|wSED`~z9o2c& zb$RpTE%o+!3z4-sRW7C<&jiF-_XU61bzw}16k04Znw;MD7+eOkZFHQp~bq?Q%`(MLX=xswtnt#W%$3S<1> zy|O<3-PAw^@6(;$m%>;%J8FM;wlv|w4&Z51YWggZBAV=Z<084MSGBa#wBv3Fv7X;4 zRqP%z3moMnJkJ)3uYw>g0e;@ubCa<+^Qt ztrLok&KR~Jb>wb(?LDHyCH>!^a z&`AuCDll<{!uYjfK#t?zk!|}qs-^J(yZ`23&+?i$&(g?pXl;K_?LpykDm1rp9x6;6Zn05+2wZb&FuA#2m z0$MKFh_g}6`_dmJA=hn~`D_^@mr_sms!rsdo1s=Un+!AAbZo=(%h9 zZ204VuYNweb|w1{s-D=}N7}CJlaYjX_%DLw0ewaE4eN&uWXl|cX{E(7|k^B5D>0QLI*S~Ksuv9)gf7FPGmHDOBBF@o7E*)s=>?}%52KN=D#-qoFKrr|% zx$+VA<SHOU;2sNnsRnl5i%XQ8mxSS3L^P|a2VS}o+m4%+N#@ot zXZK0;5y2Uhk#dbTLsYBzta6mz$gK;^#2}M1ooKQo;7osuvi&vF{rl~g$3LY=_GKdQ z{P$8g$X%r!LpaTXt}bs>o|OZMIWCk5p?A{ZNaE>Fu2ssRZ1+VDXBDu}QUDil(hw-ZOmpjC6q9YcY3>*>hk5-IU8tVLam250 zW=mHqR2a#1ivI+7N`<)HFT$8)`J@?ArT-M~2}1ixP@wvfOZ~$dF0OfZcssX3#p7K< z8*81A8>Bz4oCFEu_aJ3YQ=O3KHw@s9gr!CK9-EP8{Y)$w+V;98o^TDeQMFQ-m`v(+ zmHMWBjgY6Hwl>0X!1HI3^s zLzmSge)`@4o?nT%s|<#cUG=hfqN|+mKFq=R8fWoDR@tSJwsx*7_zJ!ay2Jp&XBCIb z`Pg=>drGw!iqQ(Tpra4o@kby8lXVqDEo-l%Ka>C{jP*@E-i zeA9q}VNy%D$zV2X=7{qDQ z`^zU1<6`wi)k#l-;e8?2Ydqs(*m=2mJ5a-Fu;f7BMl@WC;pZ%FTmsIu3-Q zy94=*ZFkn>=0zXHhol6Zt(~ntdAM7fJE1~HE*AEv+>{)YE*3wGX~QtY`8fG0IbI)1 zXG+w60}ercsP?}x0WRpO*XRGS6X4}be9tY&%*)I9zvei(1ULl#KTME=H!+M`292A8 OgCCWaR#sIG_5T2+43hu= delta 9424 zcmZXZWl&pfw6-bkP+W?%MT%>2D_Yz=IKf?tvylQVS_lrsU5XTUcbDSs!L4}t!h6oI z?__4r%D(p_E1AiCJ!>V}YR2-15CRtmU#uwUe?fl*frI}eMDP_Y5JyoTyK#z|k(EJcHyvDlpRA*p%dfQI0`Ofcobf=xQ!o=-nA?^8bhd3$I&kN=7xccAC zaOM-!)8fz$l}Gy%umWy=++YRyOFcgf8?2WAu1Vc~F-h_D56Nse#IG@|ZJUmg*eb4z zR~$!sq(t{`w!L2)WXd<4{?eYjz89Nj8-LzR@LxM!6T0wo^nboT`E4X=Q>6{GI5pc{ ziY{8+B&>#&C>~!xH!e(lyu7ZHnlsxs!`HMq`yN8s25+(fm%)Y=9iF6P|IZ7)whPs@ z4|L$tB$lc9D^-@9syZjp&_SzmCImvWR;sA$+O?$7YsRLwq=yNEikw3P;VNxjcCRW_ z4NF624MS$N#u$?-6|brepO{|ponCmyRU(W8Vhrya9avm>mvbGbpWLD|(L9&ms9?x^ zb|N2~(gMKlWAb}p_xme{Q>JumL{}%)2U?JoO+Gwd^M+o5Bj3uW-!UC=r)kWbnig{3 zo~k$=tTu?)+Y`I0W~M``vJz^Ypm%N?m6JY&wQgwVz6T^Ht~&B_kNXOKEp9nLOs!k# z-V$?G@1b5(%w-f&(I8h;oXV+L+`s#z%aCGV5CJQze69aOhy?$dP3&TeW7uX@reobV z>C>=HFTpj3SUur*iSxKIUQnI|{gH0OoVS0Al+6tucJER{_QAx(mWprBrDD>=h5hk0 zpU=~b{|3Sacb@ZF)olN|LsQF zY4M-h$v6$Nx-Iz#OiZ#AExw>X%#KexY{0*?H4vrY6h0fSQ^0_A%EYe>1_baj^+T{A_{&T~uRD+5Ny` zA2dfX{jopLgDzsZihq~{UFy7Pwa|Nzf0&MKU63d4i9x3v{q&)3#OlqtVPhTe(86Gd zb4243;ptJG^ovt_xPqE-)!?u2eCE1Pm@biPN_=%1%HQw+Y% zv)FjMExDtd1FYEie5Va&eR zM5#V2wdB9{@@PC4`{I+(h^@C}<(tpr^v-iVFkCTa;33|cW}}GxSaDYXuwu7_t|ohm z{aE%C`_|H|Fuo%&VhH{fOcqL~Sr9BIZHdSYaCv#~oAuW=w+?|&7Cq0tW~E5Fs}Sdz zUWkAf$sP6)e)FariV2zxVRG}%PToc4!}L_K;C97hMQIARuZ=h<{<-|o{@-VCSdv?s zmnSHimr6(6RRp&rWpV*H^%Ob^>59STVT4>akC$YbImwm5%nUTs2qlBCOg9tzs^`;2 z+_QcPD$lTFVM9^KIv7t`amu@o`qfZsxrlyNz0xekhK9XSTs0V*nwx&fezB1ze_Ds`*SToM&Krg^4fCl>{&B%P~$qasVngp$&v(0`7Qz1{=?6ry202 zh;EG4CzIxsQQCwCGgeXkZIO$@CVmCqb>Kc;m2`KY`y3ZAF|mXbUTQA}c7{*FLd4SG$xy?ytxm5iU!% z9^V`HQl$g>oh4Y6Ibfy<6K@hk<6o1zNx$dQqU_97s_gR3*YkSDt?Ix|gy>DwpIPND^b_e2 zZsy}A7v>pjR;g?dpP$*a{MsB{(l>S&wj!z=!khtcc7w81IWOR~;J4I=&j3UwN(!9a z9O`cNH)WE|X+(k;X#wG)HLBlqGG{F(-AGj`@s34mUt@hiWFo6j?2z*%JY*&b_LL{CgFjN z$#jbSJ^VUyV~nL9%7jSD=0$@8NtfWYa@Gnn7Hbxy2Rba-1B8=h=wrrwYSFa1;l{6e zdDYQHLh6iEJHoXy#pOr#v9(HIqy00;a{x3EsX*-(F;7-e%OFivY97A}#RbI7Aj0F$ zN9o5CluFa5ll!kMr?1M;#~eXno^j}{E)Fl#4U{X)jyXERl766E3TIP^>vYN`CKnf$ zM2t4}kIPkI8p>4?_VU9E9@nHq=I=YtSG&~EMmRIjSI7I~lH;9=^XcUfsoo9P#Dt2cU;Yd;z8RHQ~k~d7}c+ z6QU2-#1C_h{(Q&4A9Sk)S+&m78#e!^X*O+M1oqXZ5fV@C2D3%>LSc62TBJMfa}1vZ z(k$G@iD&Dzc5rOJCT2IYRmdq&8Zro2ok-B58X}8;}}VSr(h8c7Q61I z_aaIpG#v=M?hPUxoEsoO@}%a&{{IH!kT3Ie5LPQ&2t)5y4&FV0nzMN{QbrL>)61 z2%1dUvvUf~_^WKdWf_SewQLO@eOv}E=b0rXxz z=Ym?@z$zB<1@bndeo_TXEWHQXn`k_0%Nx1xzTwCU^y$gyes1wREPM#W(zl_G3?T+y zy+8dzc;>t+gPGrtsqO3Uqw1-F9Z?Xx0*5gzFPYgXCfSkpSij$Bc)F5_w&g@Ooxd*Y z00_r=Y;cJ-x(n?NPuY=y`7PQd@1);+H3h9hzIhG?@IEb*c>?>}fx;03&BNb~hK@CN z1*Bh#&f^kGkQoRWNr2Gl)K9BRyN_USAyp3xr9MtvxP}zBuZ!b)WxAkll5zP!8l&eq zQrNX#7c>XE68)2m^++mhfWS5Y$?Upn7F@=($n4JN4n4%nPXRY7u;JK|cOB%K4(jPt z^VbP^iiIE!voH19FHpn57N={7;p|H6-ChBUjGj7FaL#ee*T-3JR{?mxbb72MR}EM@ zPF9XJr71!d`6sfzT81D5{SWENPV%hRmZLO_e%{^cdQ~r*P?~goZ&BuV*o=ey)*G>J z|8ZCYf+d9l6F9Rl517A1PV8MZ|Alb8d{_U?pFxM~H;jt9p!^r2ozp@{jr(j@kYJhw|GiJ7w<4et!jQ);okmAY0yqPZKaGQsR-P{q5fJ zu1AXU!O#nG^nygN)zE=M>GeLmUi;*` z{L7{8LGP}*^8=l!3yAwnC*20ygOT15=M zUAosgZIvp>Y<_t_QrcZcqx z8mF)+3n}y?A1jy|!6{rd9|FI7G_>HDe5JNbnnX$vap?oT`)A&(fnS5*Fv)qMfn+Yk zw2TwDg8P;0g)J)d&$l$u-O0^}{*$Ax6s{j;rhoi&CPAXXe2ORS=*gaHfY1fJ&?zYR z2+r8kVOwBkM_jQ{3Wa_4R9*S>qnSSPXOlD5;dJhXm2P5c%nr$RK-q5}Z&>G5gn zVGZ`UrU?QBFj?8p#xM*%Zk5!VT?U6$H6E$sNNI9Lo#OH$2p$jN))>xg4(9=h7)~8V zCu)ucqz=zGmrAvCiK6!`|K&>d%?J(?l8EpRDzbSy?*tnsa?O3bg{bz5M&YXS_$@Zw z*YqWLO}m$IOnUKd8JIsB@0vP*S9KOQG~XTDS?PkM*;-+7%>L-uF^#0&b10^I3qgh6N@K2X+a?fyDtxM2EbN% z`nft<^9ZVgnIL`suPM^Z+JQy37?31mB&=Zx(;q_)ljadZEH4#eqb>E&b}zjCHf@WOl7}eG!4?1w{11oc4@q-b)|3fZZP5pLs$(1;Bb(uz3#KW`+NOfWM6EVo`{ij6@ z^;OBn=doQ!xt|? zC3mnX#2+sr`lrdW2`7I%rZ(FC&bg-N(yZJ}!v75mz}xc=dGV1?Z}YB-XJWUXpJ>S> z?dIy=F#<1s)EIvz=RXbCe+1`+iRdV7tKp)4PpybqZ4VEJqM%!Q(f*QpZ@izL*suhCjOWP%_CW>gV_VICX1bU*$78;UL?Q4^V(2u|~tfRURI?RMndE}L84&|Mb4cV`-D4agWq7zMutv5z}p_U(>u z6uOW$viDqdz8+ABS7?Dr?|_-;k#tvbe)5+wH@eOlPrDD)ao2#PM1xa&smyHl-n9u~3z>vhXR>}?w;s|x@mGF0^5_&?Y;+ZMsw{Yq9t*?GN&4=_ zUq)Aoh7KdIO^<&>=I~OG8-_h<5uM+>r0$qDmbb^~^xuRh#Bn`0i2+5DG+5SC(`^q= z2cM2dJf0!TvQ+zF4i_h!ndBbz+6(Z?^{?iS0}IjDNZ|tv9LU!}jZ%%N1tQVe8IFIW z;XC+}9S!Y~*$s#9cC9VoFz_UyUvBDXvAvvrd|(W)KTAhxCZ1gHSKJju2>^>xX5^$~ zE1DkMW56k7G(jzr@oOrM&(1y5cG{m+s0uO;!3Xin}s52`m8_p49Y|Hss+{W^)nHq$=+ ze5H_1Me=7BhNb4i2sRb;&&z1=#+#G}x}DN+FBJ5rHVo>)8mnecq#h|OTnS5_sa*r)$p0b?oKO|PCu z`AAO$8%>`zGcK5=L->h3UqN-?OBEB02y7Q1-PL?<;^sK$N!F2dm!#e) zsKO$tk6nniDIS3UNStM#%Vu|OKKVwP^+245v7wK*>0F*Pve6fbxZ2@!_m6*J0Fu&M zmC6vhYL{L+UzW6C_D>SVVZ?LAExcDOva7*cZ1a;zT}!#l%AN9#-vN_7jrh`v9}vAP zc@?_BF%olhxO|~gN3KqZO``_&oQ7S~u|G_&*QRchqGT6=b&C;ZTklB*be=LNVFh#^ z!l>EMHPS-Wk35sQ_8nMi=|PH8X0;oiwg~Lh-L%Lt1=}!LxOo`KN)_Yx<5VjZolR;t zcqx@S3oguSH=-GmWIK6wLS~VbSHjgRi-mOURYHqfM^?8(X6eX$!mH;c&Ez>qyFYzc z=y}mXGXZkg20Mc`!JNxY7LFHPHr@Dwf=FpYkXdB=foS&Myk4;NYqS1W0`pCo{iS0o z;{nnAj}kh}uKsOtkG59!WheEq_+IJ%7CV~Z^nnm-kyE%rg6yL^8Eg^3i)^n6!L^-{ z7IFgMDpgoV@%wN4#ZsW9axu$rmrR3*Ji#ML@Sh{B1qhpWRbN?-8WA&oHr>{M zucK@4mYqsqSLf6Snq-DG5Ph?Z1(7tzsf{D57hOs>$*BH1d{UDjD6taxn%HX=AQh|B zQ%=U9yC?rD`#=#+cQp_p)egSs9W$mu3QMr0q1c~d4@=S#GJq?`gbrQ9y>7r>T(iQi z(-XBQC)Z$qL8@(^$}HQ4hpXc<f zBWOtH@>WBV_{%mjI2ue>S@&)HugL3VJc`s{O}|#5YB5b*hbY;OzKS;#B^vF`A_KR& zKvwvgAKxqTRf%kdL3lMnPQ}cmzqtFIuBKn$c`}9uv*lpuLa4h$DH2av{2361MoSFO z1LKLZAp4=OmUKA3hbX0C}B*~n(K6u``vbhza{8eR> z^c){d;iCm7H7vpwWEzr5C_dRW9+9oHArt%|)lmrHZE6r-?gwpedF3xgkL!X!kvdz> zVJvZmP-50*PEuq~@f>1rF+iNj2++hL79V%7&RYIJ;3d6dDG~rzI!-g3tl{GWCU^MI#?80~g{llIjS*Xt^Z^#&K@5&z*=(QHxo`62CBqdUOi9FSF!b@P1a z;n?Q)MDMx0FWbb=)yw+>RU9?Q^#@!xL05(Sw63AZ@?fh?kq z0y$XA2yCD67bC)T!P6)TnE?G~xjck8e}Y372RiM~-w^DG0&ti)j|arQMex^LHE^hw z&HOvcdpt1FZ$kXGpu`US?Ate3!w4~-Kn-T6bcmE9v&Fs|up_p3;m2dmww#MQNlLLi zeOs$M_Aqvx9L_N3Yi0a1H=2k66u*>cDG9=*^k~H(oTJz-w;4kW5elzIRQ<*?1#lbf z>%R2pv-SiUV98QJMp822G^Eh1Hb5TFhFl5gNc+1z&EcdhhyB#2A<7{sk18MoLNFp) zlTaYKoN|~uD1voEA~5s|iJM|i!O#<22@O;;bAb|Ae%*`9D7&k#dlfhJGEI{#_ zPEo<{!YNjXjWRBw(Co+CG#Xur!d#d|IF7AaOqT*hOa>k~N4W~aZp=`<%=>LtyEH=` zg%oqCFQsSr=dN}F;jm@TA$RG928A_8Rva|8x`k$Q48Od|P$3%TN^EV1!rx^lC|!t` z3vySHc%OLETaR0NXm+<${9_A!-{=j&E+_+4m|9qJj6{m~NWY1x2z#8+@s^6jrSpe1 z;M*Nvjn<`e^50;;H$$`+c2r>rKunC;g+sc@ySuL<(Ssw-S#LX=rf{|?9dM&XuxD@H zsp8E>H0(>PD~@o(dJ`YIlz3T&Xg0VbGH57znI#RGE4A#0)in%RwuhIy+OM{7BZay# za^i{4Y6~sZz9NcRtc+a*HYB_@S~Jvn1iW~?Kh?1RX=pI;`7}l}mX7`az4ZwG=-LND z^-ftefH}n$N(yUe-MJM0bZNsrZ5kN*v9b3$M1ohH4Q3daamDpt@#v3!v}0J;8%Tk! z;%Ygr6Sp^#z+QvbOV!M*-a9K?t+YyfgMD;&t$kIUn>j}0Go(CY%~p}d@8A9ay7RX7 zDtw?Jfq_}^6BcVj(Hr3DJ9#Ip*O2P?){U4V#(JcB^-eLuPpl$Umk8I^jD6HxUZd&BGb_~ z|0V59ZC5eB%htzW>p?2okXuOrn^Z+S{zN~fQz5bK+KQZuBAIfqI;$sIx-uNJFeG|b zRThXG=)oyHt&-Q`;1-LF@pr{EeJRU1i-^r2PGrgW-PBOc_i7E9qYM26N&T_*Lto`Q zn|m8`hfZoxDYeojrX-*0ku+@v(cP3NmlU;M!p(h}0q4Ow-D%5`j5H;%XR{GEm^_(Q zfE9xJ#pr>ny97@ZoyI{5s2>LOi5whdh;b?1E0u>!ab2 ztvCZZCcPxCa{+|tF1Kcy&!@n<`@4M>TqY7nj;*G1W{!0Is3+PNTRbb-P3*~Z<=2+z zR@QQ&S$(4mC}i{W_{hk>^CuxnB8lFWo&t@ZHg=nF4c`Sx83Zl^U`HQK{12msVAr=cBc0YRHFjWDkQ)uRX=GH2s8#I zYkRuKnXK+YB|j#WFx*DnnH>sU7`sE~mChA&Fi60a>ErDhQO?A#`q~6GLZ!s=oSZki*_6@F8(>AWC!7_RuZvGjbHG{Lg+UqO!tF`%=`* zoztW|BR$BdZXNh6=ws2sBBMCx*%oHHI9icCAJkoWXZ-Ncn7$7spv76MS?a3I!B8Uj z=e}}A3|6h19UPF=Xhy3)Z4v-CZseiBv!C~DL3?&I$)CI-tb7S&muL+DI=kD>CLGa%zqlUo00YTRNPB~a$|C%aT} zz0?!u%jZD-qe=m!))L-pjMx7__D#!?e;^n>_x()*?oYNw1r=jbWu;ha0h8?Fyr> Date: Tue, 17 May 2016 22:55:11 +0200 Subject: [PATCH 09/13] Fixed warning CMP0054 --- CMakeLists.txt | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 44524537..e0511300 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,40 +32,40 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically deter # ================================================================================================== # Compiler-version check (requires at least CMake 2.8.10) -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) message(FATAL_ERROR "GCC version must be at least 4.7") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) message(FATAL_ERROR "Clang version must be at least 3.3") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) message(FATAL_ERROR "AppleClang version must be at least 5.0") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) message(FATAL_ERROR "ICC version must be at least 14.0") endif() -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) message(FATAL_ERROR "MS Visual Studio version must be at least 18.0") endif() endif() # C++ compiler settings -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") set(FLAGS "/Ox") set(FLAGS "${FLAGS} /wd4715") -else () +else() set(FLAGS "-O3 -std=c++11") - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0) set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable") endif() - elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch") set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn") @@ -75,9 +75,9 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") # C compiler settings (for the sample) -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") set(CFLAGS "/Ox") -else () +else() set(CFLAGS "-O3 -std=c99") endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") @@ -228,7 +228,7 @@ if(TESTS) if(CLBLAS_FOUND) set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS}) set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES}) - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") add_definitions(" /DCLBLAST_REF_CLBLAS") else() add_definitions(" -DCLBLAST_REF_CLBLAS") @@ -237,7 +237,7 @@ if(TESTS) if(CBLAS_FOUND) set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS}) set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES}) - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") add_definitions(" /DCLBLAST_REF_CBLAS") else() add_definitions(" -DCLBLAST_REF_CBLAS") From 7ad5cc89d0d71b145d8902e38bd7ce3cd79d3faa Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 17 May 2016 23:12:19 +0200 Subject: [PATCH 10/13] Made MSVC link the run-time libraries statically --- CHANGELOG | 1 + CMakeLists.txt | 7 ++++++- cmake/c_flag_overrides.cmake | 9 +++++++++ cmake/cxx_flag_overrides.cmake | 9 +++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 cmake/c_flag_overrides.cmake create mode 100644 cmake/cxx_flag_overrides.cmake diff --git a/CHANGELOG b/CHANGELOG index 6de365bf..579b1b66 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development version (next release) - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs - Fixed a bug in the xGEMM routine related to the event incorrectly set +- Made MSVC link the run-time libraries statically Version 0.7.0 - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0511300..72667c00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,13 @@ # # ================================================================================================== -# CMake project details cmake_minimum_required(VERSION 2.8.10) + +# Overrides for MSVC static runtime +set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake) +set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_flag_overrides.cmake) + +# CMake project details project("clblast" C CXX) set(clblast_VERSION_MAJOR 0) set(clblast_VERSION_MINOR 7) diff --git a/cmake/c_flag_overrides.cmake b/cmake/c_flag_overrides.cmake new file mode 100644 index 00000000..3b38d297 --- /dev/null +++ b/cmake/c_flag_overrides.cmake @@ -0,0 +1,9 @@ +# Overriding the CMake flags to use static runtime libraries +# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F +if(MSVC) + set(CMAKE_C_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1") + set(CMAKE_C_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG") + set(CMAKE_C_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG") +endif() + diff --git a/cmake/cxx_flag_overrides.cmake b/cmake/cxx_flag_overrides.cmake new file mode 100644 index 00000000..b8935bf7 --- /dev/null +++ b/cmake/cxx_flag_overrides.cmake @@ -0,0 +1,9 @@ +# Overriding the CMake flags to use static runtime libraries +# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F +if(MSVC) + set(CMAKE_CXX_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG") + set(CMAKE_CXX_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG") +endif() + From 9bccc2544a6777102eb3317e65f9b7bb195f081b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 18 May 2016 20:36:07 +0200 Subject: [PATCH 11/13] Fixes for CMake policy CMP0054 --- CMakeLists.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 72667c00..6cfa5281 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,40 +37,40 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically deter # ================================================================================================== # Compiler-version check (requires at least CMake 2.8.10) -if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") +if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) message(FATAL_ERROR "GCC version must be at least 4.7") endif() -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") +elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3) message(FATAL_ERROR "Clang version must be at least 3.3") endif() -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") +elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) message(FATAL_ERROR "AppleClang version must be at least 5.0") endif() -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") +elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) message(FATAL_ERROR "ICC version must be at least 14.0") endif() -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") +elseif(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) message(FATAL_ERROR "MS Visual Studio version must be at least 18.0") endif() endif() # C++ compiler settings -if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) set(FLAGS "/Ox") set(FLAGS "${FLAGS} /wd4715") else() set(FLAGS "-O3 -std=c++11") - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0) set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable") endif() - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang) set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch") set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn") @@ -80,7 +80,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") # C compiler settings (for the sample) -if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) set(CFLAGS "/Ox") else() set(CFLAGS "-O3 -std=c99") From 748df9bf753637e25ab9806ce994b758448f15d3 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Wed, 18 May 2016 20:53:40 +0200 Subject: [PATCH 12/13] Fixes for Visual Studio --- CMakeLists.txt | 6 +++--- README.md | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6cfa5281..3409196d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,14 +53,14 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0) message(FATAL_ERROR "ICC version must be at least 14.0") endif() -elseif(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) +elseif(MSVC) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0) message(FATAL_ERROR "MS Visual Studio version must be at least 18.0") endif() endif() # C++ compiler settings -if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) +if(MSVC) set(FLAGS "/Ox") set(FLAGS "${FLAGS} /wd4715") else() @@ -80,7 +80,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") # C compiler settings (for the sample) -if(CMAKE_CXX_COMPILER_ID STREQUAL MSVC) +if(MSVC) set(CFLAGS "/Ox") else() set(CFLAGS "-O3 -std=c99") diff --git a/README.md b/README.md index 869ef636..e4564c26 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Furthermore, to build the (optional) correctness and performance tests, another - BLIS - Accelerate -An example of an out-of-source build (starting from the root of the CLBlast folder): +An example of an out-of-source build using a command-line compiler and make (starting from the root of the CLBlast folder): mkdir build cd build @@ -68,6 +68,12 @@ An example of an out-of-source build (starting from the root of the CLBlast fold make sudo make install +When using Visual Studio, the project-files can be generated as follows: + + mkdir build + cd build + cmake -G "Visual Studio 14 Win64" .. + A custom installation folder can be specified when calling CMake: cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory .. From 9a061528eb006f6a59b8bdc12c0e802bd28941cf Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 18 May 2016 21:13:04 +0200 Subject: [PATCH 13/13] Updated to version 0.7.1 --- CHANGELOG | 2 +- CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 579b1b66..76903180 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,5 @@ -Development version (next release) +Version 0.7.1 - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs - Fixed a bug in the xGEMM routine related to the event incorrectly set - Made MSVC link the run-time libraries statically diff --git a/CMakeLists.txt b/CMakeLists.txt index 3409196d..02ffba1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla project("clblast" C CXX) set(clblast_VERSION_MAJOR 0) set(clblast_VERSION_MINOR 7) -set(clblast_VERSION_PATCH 0) +set(clblast_VERSION_PATCH 1) # Options and their default values option(SAMPLES "Enable compilation of the examples" OFF)