From eaa348735ee5cee396f9ec629f1486ebb3dbeff7 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 16 Jul 2016 15:18:28 +0200 Subject: [PATCH 01/19] Created infrastructure to support a direct GEMM kernel; added correct but slow reference kernel as a place-holder --- src/kernels/common.opencl | 2 +- src/kernels/level3/xgemm_direct.opencl | 71 +++++++++++++++++++ src/routines/level3/xgemm.cpp | 94 ++++++++++++++++++++++++++ src/routines/level3/xgemm.hpp | 23 +++++++ 4 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 src/kernels/level3/xgemm_direct.opencl diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 9d2bb65e..2fca6b73 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -197,7 +197,7 @@ R"( #if PRECISION == 3232 || PRECISION == 6464 #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y #else - #define COMPLEX_CONJUGATE(value) value = value + #define COMPLEX_CONJUGATE(value) #endif // ================================================================================================= diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl new file mode 100644 index 00000000..9d2a55c8 --- /dev/null +++ b/src/kernels/level3/xgemm_direct.opencl @@ -0,0 +1,71 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any +// pre- and post-processing kernels. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Main entry point of the kernel. This is the direct version. +__attribute__((reqd_work_group_size(16, 16, 1))) +__kernel void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global real* restrict agm, const int a_offset, const int a_ld, + const __global real* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int a_transpose, const int b_transpose, const int c_transpose, + const int a_conjugate, const int b_conjugate) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Thread identifiers + const int mid = get_global_id(0); // Row ID of cgm + const int nid = get_global_id(1); // Col ID of cgm + + // Allows for incomplete workgroups + if (mid < kSizeM && nid < kSizeN) { + + // Computes a single element + real acc; + SetToZero(acc); + for (int k=0; k::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/convert_hermitian.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_direct.opencl" ; } @@ -94,6 +95,42 @@ StatusCode Xgemm::DoGemm(const Layout layout, status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); if (ErrorIn(status)) { return status; } + // Optionally runs the direct version of GEMM. 
TODO: Set this based on the arguments + const auto do_gemm_direct = true; // for now, for testing + if (do_gemm_direct) { + return GemmDirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); + } + else { + return GemmIndirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, + a_one, a_two, b_one, b_two, c_one, c_two); + } +} + +// ================================================================================================= + +// The indirect version of GEMM. This uses the faster but non-general kernel. It has specific +// requirements, but several pre and post-processing kernels take care of those. However, the +// overhead of these extra kernels might not be ideal for certain devices/arguments. +template +StatusCode Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, + const size_t b_one, const size_t b_two, + const size_t c_one, const size_t c_two) { + auto status = StatusCode::kSuccess; + // Calculates the ceiled versions of m, n, and k const auto m_ceiled = Ceil(m, db_["MWG"]); const auto n_ceiled = Ceil(n, db_["NWG"]); @@ -204,6 +241,63 @@ StatusCode Xgemm::DoGemm(const Layout layout, } catch (...) 
{ return StatusCode::kTempBufferAllocFailure; } } + +// ================================================================================================= + +// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels. +template +StatusCode Xgemm::GemmDirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate) { + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + + // Retrieves the XgemmDirect kernel from the compiled binary + try { + auto kernel = Kernel(program, "XgemmDirect"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m)); + kernel.SetArgument(1, static_cast(n)); + kernel.SetArgument(2, static_cast(k)); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast(a_offset)); + kernel.SetArgument(7, static_cast(a_ld)); + kernel.SetArgument(8, b_buffer()); + kernel.SetArgument(9, static_cast(b_offset)); + kernel.SetArgument(10, static_cast(b_ld)); + kernel.SetArgument(11, c_buffer()); + kernel.SetArgument(12, static_cast(c_offset)); + kernel.SetArgument(13, static_cast(c_ld)); + kernel.SetArgument(14, static_cast(a_do_transpose)); + kernel.SetArgument(15, static_cast(b_do_transpose)); + kernel.SetArgument(16, static_cast(c_do_transpose)); + kernel.SetArgument(17, static_cast(a_conjugate)); + kernel.SetArgument(18, static_cast(b_conjugate)); + + // Computes the global and local thread sizes + const auto m_ceiled = Ceil(m, 16); + const auto n_ceiled = Ceil(n, 16); + const 
auto global = std::vector{m_ceiled, n_ceiled}; + const auto local = std::vector{16, 16}; + + // Launches the kernel + auto status = RunKernel(kernel, queue_, device_, global, local, event_); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + // ================================================================================================= // Compiles the templated class diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp index bc51c7f5..8db1cb11 100644 --- a/src/routines/level3/xgemm.hpp +++ b/src/routines/level3/xgemm.hpp @@ -35,6 +35,29 @@ class Xgemm: public Routine { const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + // Indirect version of GEMM (with pre and post-processing kernels) + StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, + const size_t b_one, const size_t b_two, + const size_t c_one, const size_t c_two); + + // Direct version of GEMM (no pre and post-processing kernels) + StatusCode GemmDirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool 
a_conjugate, const bool b_conjugate); }; // ================================================================================================= From 798d32edad091b6faaa1627a7514868fc28c5fd9 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 17 Jul 2016 14:36:51 +0200 Subject: [PATCH 02/19] Improved the GEMM direct kernel by adding register blocking. Still not fast though --- src/kernels/level3/xgemm_direct.opencl | 186 +++++++++++++++++++++---- src/routines/level3/xgemm.cpp | 11 +- 2 files changed, 166 insertions(+), 31 deletions(-) diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl index 9d2a55c8..a5e8ca3d 100644 --- a/src/kernels/level3/xgemm_direct.opencl +++ b/src/kernels/level3/xgemm_direct.opencl @@ -18,48 +18,180 @@ R"( // ================================================================================================= -// Main entry point of the kernel. This is the direct version. -__attribute__((reqd_work_group_size(16, 16, 1))) +// Initializes the accumulation registers to zero +inline void InitAccRegistersDirect(real cpm[NWI][MWI]) { + #pragma unroll + for (int mi=0; mi::GemmDirect(const size_t m, const size_t n, const size_t k, kernel.SetArgument(18, static_cast(b_conjugate)); // Computes the global and local thread sizes - const auto m_ceiled = Ceil(m, 16); - const auto n_ceiled = Ceil(n, 16); - const auto global = std::vector{m_ceiled, n_ceiled}; - const auto local = std::vector{16, 16}; + const auto m_ceiled = Ceil(m, db_["MWG"]); + const auto n_ceiled = Ceil(n, db_["NWG"]); + const auto global = std::vector{ + (m_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + const auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel auto status = RunKernel(kernel, queue_, device_, global, local, event_); From 5004a435ff984bba0dff0147a5c4f6a04d703562 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 26 Jul 2016 20:59:59 +0200 Subject: [PATCH 03/19] 
Fixed issues related to the recent changes in the Xgemm infrastructure --- src/routines/level3/xgemm.cpp | 10 ++++++---- src/routines/level3/xgemm.hpp | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index e2e8647e..a85f55b5 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -111,7 +111,9 @@ StatusCode Xgemm::DoGemm(const Layout layout, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, - a_one, a_two, b_one, b_two, c_one, c_two); + a_one, a_two, a_want_rotated, + b_one, b_two, b_want_rotated, + c_one, c_two, c_want_rotated); } } @@ -129,9 +131,9 @@ StatusCode Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, - const size_t a_one, const size_t a_two, - const size_t b_one, const size_t b_two, - const size_t c_one, const size_t c_two) { + const size_t a_one, const size_t a_two, const bool a_want_rotated, + const size_t b_one, const size_t b_two, const bool b_want_rotated, + const size_t c_one, const size_t c_two, const bool c_want_rotated) { auto status = StatusCode::kSuccess; // Calculates the ceiled versions of m, n, and k diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp index 8db1cb11..46e12453 100644 --- a/src/routines/level3/xgemm.hpp +++ b/src/routines/level3/xgemm.hpp @@ -45,9 +45,9 @@ class Xgemm: public Routine { const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate, - const size_t a_one, const size_t a_two, - const size_t b_one, const size_t b_two, - const size_t 
c_one, const size_t c_two); + const size_t a_one, const size_t a_two, const bool a_want_rotated, + const size_t b_one, const size_t b_two, const bool b_want_rotated, + const size_t c_one, const size_t c_two, const bool c_want_rotated); // Direct version of GEMM (no pre and post-processing kernels) StatusCode GemmDirect(const size_t m, const size_t n, const size_t k, From 140dc12854dd9521c1420ccba7eb9fb0d50e054e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 25 Sep 2016 11:38:35 +0200 Subject: [PATCH 04/19] Added a first version of the direct version of GEMM with local memory --- src/kernels/level3/xgemm_direct.opencl | 198 ++++++++++++++++++++++++- 1 file changed, 194 insertions(+), 4 deletions(-) diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl index a5e8ca3d..fb5972ba 100644 --- a/src/kernels/level3/xgemm_direct.opencl +++ b/src/kernels/level3/xgemm_direct.opencl @@ -18,6 +18,164 @@ R"( // ================================================================================================= +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. 
+inline void GlobalToLocalDirectA(const __global realM* restrict agm, __local real* alm, + const int a_ld, const int a_offset, const int tid, const int kwg, + const int a_transpose, const int a_conjugate) { + const int la0 = tid % MDIMA; + const int la1 = tid / MDIMA; + #pragma unroll + for (int mia=0; mia local (matrix A and B) + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, tid, kwg, a_transpose, a_conjugate); + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, tid, kwg, b_transpose, b_conjugate); + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWI + for (int pwi=0; pwi private (matrix A) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + + // Loads data: local --> private (matrix B) + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } // Loop over the remaining part (incomplete tile in K-dimension) for (; kwg < kSizeK; ++kwg) { From 669f43aed65ccd4aae9c4a478e994660f3e2a592 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 25 Sep 2016 13:52:08 +0200 Subject: [PATCH 05/19] Separated the tuning parameters of the new direct GEMM kernel from the indirect version --- src/database/database.cpp | 2 + src/database/database.hpp | 1 + src/database/kernels/xgemm_direct.hpp | 76 ++++++ src/kernels/level3/xgemm_direct.opencl | 358 +++++++++++++++---------- src/routines/level3/xgemm.cpp | 13 +- 5 files changed, 299 insertions(+), 151 deletions(-) create mode 100644 src/database/kernels/xgemm_direct.hpp diff --git a/src/database/database.cpp b/src/database/database.cpp index 34c44a29..2696fb9b 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -21,6 +21,7 @@ #include "database/kernels/xgemv_fast_rot.hpp" #include "database/kernels/xger.hpp" #include "database/kernels/xgemm.hpp" +#include "database/kernels/xgemm_direct.hpp" #include "database/kernels/copy.hpp" #include 
"database/kernels/pad.hpp" #include "database/kernels/transpose.hpp" @@ -38,6 +39,7 @@ const std::vector Database::database = { XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + XgemmDirectHalf, XgemmDirectSingle, XgemmDirectDouble, XgemmDirectComplexSingle, XgemmDirectComplexDouble, CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, diff --git a/src/database/database.hpp b/src/database/database.hpp index a6ab49c5..7c0afb46 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -75,6 +75,7 @@ class Database { static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; + static const DatabaseEntry XgemmDirectHalf, XgemmDirectSingle, XgemmDirectDouble, XgemmDirectComplexSingle, XgemmDirectComplexDouble; static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp new file mode 100644 index 00000000..76055ef2 --- /dev/null +++ b/src/database/kernels/xgemm_direct.hpp @@ -0,0 +1,76 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels. +// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectHalf = { + "XgemmDirect", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectSingle = { + "XgemmDirect", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectComplexSingle = { + "XgemmDirect", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const 
Database::DatabaseEntry Database::XgemmDirectDouble = { + "XgemmDirect", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemmDirectComplexDouble = { + "XgemmDirect", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl index fb5972ba..801887dd 100644 --- a/src/kernels/level3/xgemm_direct.opencl +++ b/src/kernels/level3/xgemm_direct.opencl @@ -16,68 +16,140 @@ // literal). Comment-out this line for syntax-highlighting when developing. R"( +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. Note that all parameters here have a +// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel. +#ifndef MWGD + #define MWGD 8 // Tile-size in dimension M (e.g. 64, 128) +#endif +#ifndef NWGD + #define NWGD 8 // Tile-size in dimension N (e.g. 64, 128) +#endif +#ifndef KWGD + #define KWGD 8 // Tile-size in dimension K (e.g. 8, 16) +#endif +#ifndef MDIMCD + #define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) +#endif +#ifndef NDIMCD + #define NDIMCD 8 // Threads per workgroup in N-dimension (e.g. 
8, 16, 32) +#endif +#ifndef MDIMAD + #define MDIMAD 8 // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD +#endif +#ifndef NDIMBD + #define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD +#endif +#ifndef KWID + #define KWID 1 // Unroll factor of the KWGD loop (smaller or equal than KWGD) +#endif +#ifndef VWMD + #define VWMD 1 // Vector width of matrices A and C +#endif +#ifndef VWND + #define VWND 1 // Vector width of matrix B +#endif + +// Helper parameters based on the above tuning parameters +#define MWID (MWGD/MDIMCD) // Work per work-item (M-dimension) +#define NWID (NWGD/NDIMCD) // Work per work-item (N-dimension) +#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD +#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD +#define MWAD (MWGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension) +#define KWAD (KWGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension) +#define KWBD (KWGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension) +#define NWBD (NWGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension) + +// ================================================================================================= + +// Data-widths in dimension M +#if VWMD == 1 + typedef real realMD; +#elif VWMD == 2 + typedef real2 realMD; +#elif VWMD == 4 + typedef real4 realMD; +#elif VWMD == 8 + typedef real8 realMD; +#elif VWMD == 16 + typedef real16 realMD; +#endif + +// Data-widths in dimension N +#if VWND == 1 + typedef real realND; +#elif VWND == 2 + typedef real2 realND; +#elif VWND == 4 + typedef real4 realND; +#elif VWND == 8 + typedef real8 realND; +#elif VWND == 16 + typedef real16 realND; +#endif + // ================================================================================================= // Caches global off-chip memory into local (shared) memory on-chip. 
This function is specific for // caching the A input matrix. -inline void GlobalToLocalDirectA(const __global realM* restrict agm, __local real* alm, +inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, const int a_ld, const int a_offset, const int tid, const int kwg, const int a_transpose, const int a_conjugate) { - const int la0 = tid % MDIMA; - const int la1 = tid / MDIMA; + const int la0 = tid % MDIMAD; + const int la1 = tid / MDIMAD; #pragma unroll - for (int mia=0; mia local (matrix A and B) GlobalToLocalDirectA(agm, alm, a_ld, a_offset, tid, kwg, a_transpose, a_conjugate); GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, tid, kwg, b_transpose, b_conjugate); barrier(CLK_LOCAL_MEM_FENCE); - // Loops over all workitem tiles, unrolled by a factor KWI - for (int pwi=0; pwi private (matrix A) @@ -303,7 +375,7 @@ __kernel void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, // Loads A into register memory #pragma unroll - for (int mi=0; mi Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue()) { + Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm", "XgemmDirect"}, + PrecisionValue()) { source_string_ = #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" @@ -299,13 +300,13 @@ StatusCode Xgemm::GemmDirect(const size_t m, const size_t n, const size_t k, kernel.SetArgument(18, static_cast(b_conjugate)); // Computes the global and local thread sizes - const auto m_ceiled = Ceil(m, db_["MWG"]); - const auto n_ceiled = Ceil(n, db_["NWG"]); + const auto m_ceiled = Ceil(m, db_["MWGD"]); + const auto n_ceiled = Ceil(n, db_["NWGD"]); const auto global = std::vector{ - (m_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] + (m_ceiled * db_["MDIMCD"]) / db_["MWGD"], + (n_ceiled * db_["NDIMCD"]) / db_["NWGD"] }; - 
const auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + const auto local = std::vector{db_["MDIMCD"], db_["NDIMCD"]}; // Launches the kernel auto status = RunKernel(kernel, queue_, device_, global, local, event_); From 73d135c2cef9763b47d410b125eb8bb89ece8432 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 25 Sep 2016 14:48:34 +0200 Subject: [PATCH 06/19] Added a first version of a tuner for the GEMM direct kernel; collapsed MWGD, NWGD and KWGD into one WGD parameter --- CMakeLists.txt | 3 +- src/database/kernels/xgemm_direct.hpp | 10 +- src/kernels/level3/xgemm_direct.opencl | 186 ++++++++++++------------ src/routines/level3/xgemm.cpp | 8 +- src/tuning/kernels/xgemm_direct.cpp | 191 +++++++++++++++++++++++++ 5 files changed, 292 insertions(+), 106 deletions(-) create mode 100644 src/tuning/kernels/xgemm_direct.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 178ac9bb..e90fdc56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -134,7 +134,8 @@ endif() # ================================================================================================== # Sets the supported routines and the used kernels. New routines and kernels should be added here. 
-set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv) +set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger + xgemm xgemm_direct xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp index 76055ef2..dc69f61b 100644 --- a/src/database/kernels/xgemm_direct.hpp +++ b/src/database/kernels/xgemm_direct.hpp @@ -18,7 +18,7 @@ const Database::DatabaseEntry Database::XgemmDirectHalf = { "XgemmDirect", Precision::kHalf, { { // Default kDeviceTypeAll, "default", { - { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, } }, } @@ -30,7 +30,7 @@ const Database::DatabaseEntry Database::XgemmDirectSingle = { "XgemmDirect", Precision::kSingle, { { // Default kDeviceTypeAll, "default", { - { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, } }, } @@ -42,7 +42,7 @@ const Database::DatabaseEntry Database::XgemmDirectComplexSingle = { "XgemmDirect", Precision::kComplexSingle, { { // Default kDeviceTypeAll, "default", { - { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, } }, } @@ -54,7 +54,7 @@ const Database::DatabaseEntry Database::XgemmDirectDouble = { 
"XgemmDirect", Precision::kDouble, { { // Default kDeviceTypeAll, "default", { - { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, } }, } @@ -66,7 +66,7 @@ const Database::DatabaseEntry Database::XgemmDirectComplexDouble = { "XgemmDirect", Precision::kComplexDouble, { { // Default kDeviceTypeAll, "default", { - { "default", { {"KWGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"MWGD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"NWGD",32}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, } }, } diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl index 801887dd..705ced9c 100644 --- a/src/kernels/level3/xgemm_direct.opencl +++ b/src/kernels/level3/xgemm_direct.opencl @@ -19,14 +19,8 @@ R"( // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. Note that all parameters here have a // suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel. -#ifndef MWGD - #define MWGD 8 // Tile-size in dimension M (e.g. 64, 128) -#endif -#ifndef NWGD - #define NWGD 8 // Tile-size in dimension N (e.g. 64, 128) -#endif -#ifndef KWGD - #define KWGD 8 // Tile-size in dimension K (e.g. 8, 16) +#ifndef WGD + #define WGD 8 // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64) #endif #ifndef MDIMCD #define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 
8, 16, 32) @@ -41,7 +35,7 @@ R"( #define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD #endif #ifndef KWID - #define KWID 1 // Unroll factor of the KWGD loop (smaller or equal than KWGD) + #define KWID 1 // Unroll factor of the WGD loop (smaller or equal than WGD) #endif #ifndef VWMD #define VWMD 1 // Vector width of matrices A and C @@ -51,14 +45,14 @@ R"( #endif // Helper parameters based on the above tuning parameters -#define MWID (MWGD/MDIMCD) // Work per work-item (M-dimension) -#define NWID (NWGD/NDIMCD) // Work per work-item (N-dimension) +#define MWID (WGD/MDIMCD) // Work per work-item (M-dimension) +#define NWID (WGD/NDIMCD) // Work per work-item (N-dimension) #define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD #define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD -#define MWAD (MWGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension) -#define KWAD (KWGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension) -#define KWBD (KWGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension) -#define NWBD (NWGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension) +#define MWAD (WGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension) +#define KWAD (WGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension) +#define KWBD (WGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension) +#define NWBD (WGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension) // ================================================================================================= @@ -105,51 +99,51 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re // Computes the indices for the global memory int mg = mia + la0*(MWAD/VWMD); int kg = kia + la1*KWAD; - int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(MWGD/VWMD); - int idk = (a_transpose) ? 
kg + GetGroupID0()*MWGD : kg + kwg; + int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD); + int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg; // Loads the data from global memory into the local memory const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset]; #if VWMD == 1 - alm[kg*MWGD + mg] = avec; + alm[kg*WGD + mg] = avec; #elif VWMD == 2 - alm[kg*MWGD + mg*VWMD + 0] = avec.x; - alm[kg*MWGD + mg*VWMD + 1] = avec.y; + alm[kg*WGD + mg*VWMD + 0] = avec.x; + alm[kg*WGD + mg*VWMD + 1] = avec.y; #elif VWMD == 4 - alm[kg*MWGD + mg*VWMD + 0] = avec.x; - alm[kg*MWGD + mg*VWMD + 1] = avec.y; - alm[kg*MWGD + mg*VWMD + 2] = avec.z; - alm[kg*MWGD + mg*VWMD + 3] = avec.w; + alm[kg*WGD + mg*VWMD + 0] = avec.x; + alm[kg*WGD + mg*VWMD + 1] = avec.y; + alm[kg*WGD + mg*VWMD + 2] = avec.z; + alm[kg*WGD + mg*VWMD + 3] = avec.w; #elif VWMD == 8 - alm[kg*MWGD + mg*VWMD + 0] = avec.s0; - alm[kg*MWGD + mg*VWMD + 1] = avec.s1; - alm[kg*MWGD + mg*VWMD + 2] = avec.s2; - alm[kg*MWGD + mg*VWMD + 3] = avec.s3; - alm[kg*MWGD + mg*VWMD + 4] = avec.s4; - alm[kg*MWGD + mg*VWMD + 5] = avec.s5; - alm[kg*MWGD + mg*VWMD + 6] = avec.s6; - alm[kg*MWGD + mg*VWMD + 7] = avec.s7; + alm[kg*WGD + mg*VWMD + 0] = avec.s0; + alm[kg*WGD + mg*VWMD + 1] = avec.s1; + alm[kg*WGD + mg*VWMD + 2] = avec.s2; + alm[kg*WGD + mg*VWMD + 3] = avec.s3; + alm[kg*WGD + mg*VWMD + 4] = avec.s4; + alm[kg*WGD + mg*VWMD + 5] = avec.s5; + alm[kg*WGD + mg*VWMD + 6] = avec.s6; + alm[kg*WGD + mg*VWMD + 7] = avec.s7; #elif VWMD == 16 - alm[kg*MWGD + mg*VWMD + 0] = avec.s0; - alm[kg*MWGD + mg*VWMD + 1] = avec.s1; - alm[kg*MWGD + mg*VWMD + 2] = avec.s2; - alm[kg*MWGD + mg*VWMD + 3] = avec.s3; - alm[kg*MWGD + mg*VWMD + 4] = avec.s4; - alm[kg*MWGD + mg*VWMD + 5] = avec.s5; - alm[kg*MWGD + mg*VWMD + 6] = avec.s6; - alm[kg*MWGD + mg*VWMD + 7] = avec.s7; - alm[kg*MWGD + mg*VWMD + 8] = avec.s8; - alm[kg*MWGD + mg*VWMD + 9] = avec.s9; - alm[kg*MWGD + mg*VWMD + 10] = avec.sA; - alm[kg*MWGD + mg*VWMD + 11] = 
avec.sB; - alm[kg*MWGD + mg*VWMD + 12] = avec.sC; - alm[kg*MWGD + mg*VWMD + 13] = avec.sD; - alm[kg*MWGD + mg*VWMD + 14] = avec.sE; - alm[kg*MWGD + mg*VWMD + 15] = avec.sF; + alm[kg*WGD + mg*VWMD + 0] = avec.s0; + alm[kg*WGD + mg*VWMD + 1] = avec.s1; + alm[kg*WGD + mg*VWMD + 2] = avec.s2; + alm[kg*WGD + mg*VWMD + 3] = avec.s3; + alm[kg*WGD + mg*VWMD + 4] = avec.s4; + alm[kg*WGD + mg*VWMD + 5] = avec.s5; + alm[kg*WGD + mg*VWMD + 6] = avec.s6; + alm[kg*WGD + mg*VWMD + 7] = avec.s7; + alm[kg*WGD + mg*VWMD + 8] = avec.s8; + alm[kg*WGD + mg*VWMD + 9] = avec.s9; + alm[kg*WGD + mg*VWMD + 10] = avec.sA; + alm[kg*WGD + mg*VWMD + 11] = avec.sB; + alm[kg*WGD + mg*VWMD + 12] = avec.sC; + alm[kg*WGD + mg*VWMD + 13] = avec.sD; + alm[kg*WGD + mg*VWMD + 14] = avec.sE; + alm[kg*WGD + mg*VWMD + 15] = avec.sF; #endif if (a_conjugate) { for (int vm=0; vm local (matrix A and B) GlobalToLocalDirectA(agm, alm, a_ld, a_offset, tid, kwg, a_transpose, a_conjugate); @@ -351,7 +345,7 @@ __kernel void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, barrier(CLK_LOCAL_MEM_FENCE); // Loops over all workitem tiles, unrolled by a factor KWID - for (int pwi=0; pwi::GemmDirect(const size_t m, const size_t n, const size_t k, kernel.SetArgument(18, static_cast(b_conjugate)); // Computes the global and local thread sizes - const auto m_ceiled = Ceil(m, db_["MWGD"]); - const auto n_ceiled = Ceil(n, db_["NWGD"]); + const auto m_ceiled = Ceil(m, db_["WGD"]); + const auto n_ceiled = Ceil(n, db_["WGD"]); const auto global = std::vector{ - (m_ceiled * db_["MDIMCD"]) / db_["MWGD"], - (n_ceiled * db_["NDIMCD"]) / db_["NWGD"] + (m_ceiled * db_["MDIMCD"]) / db_["WGD"], + (n_ceiled * db_["NDIMCD"]) / db_["WGD"] }; const auto local = std::vector{db_["MDIMCD"], db_["NDIMCD"]}; diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp new file mode 100644 index 00000000..c2e8710f --- /dev/null +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -0,0 +1,191 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations: +// - V==1: This tests some limited set of tuning parameters exhaustively. +// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. +// +// ================================================================================================= + +#include +#include + +#include "utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneXgemmDirect { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; } + static std::string KernelName() { return "XgemmDirect"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/xgemm_direct.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction}; + } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 128; } + static size_t DefaultN() { return 128; } + static size_t DefaultK() { return 128; } + static double DefaultFraction() { return (V==1) ? 
1.0 : 16.0; } // test all or sample randomly + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.k; } + static size_t GetSizeB(const Arguments &args) { return args.n * args.k; } + static size_t GetSizeC(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + if (V==1) { // limited subset of tuning parameters - but explorable exhaustively + tuner.AddParameter(id, "WGD", {8, 16, 32}); + tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); + tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); + tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); + tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); + tuner.AddParameter(id, "KWID", {2}); + tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); + } // a lot more tuning parameters - has to be sampled randomly, too much to test all + else { + tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128}); + tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); + tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); + tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); + tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); + tuner.AddParameter(id, "KWID", {2, 8, 16}); + tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); + } + } + + // Sets the constraints + static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; + auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], 
(v[1]*v[2])/v[3]); }; + // Requirement for unrolling the WGD loop + tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"}); + // Required for integer MWID and NWID + tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}); + // Required for integer MWIAD and NWIBD + tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}); + tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}); + // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) + tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}); + tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}); + + // Extra constraints for variation 1 to limit the set of options significantly + if (V==1) { + auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; + tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"}); + tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"}); + } + } + + // Sets the local memory size + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return ((v[0]*v[1] + v[2]*v[3])*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "WGD", "WGD", "WGD"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; } + static 
TransformVector DivGlobal() { return {{"WGD", "WGD"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, + std::vector &) { + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.k)); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); + tuner.AddArgumentScalar(GetRealArg(args.beta)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(0); // a_offset + tuner.AddArgumentScalar(static_cast(args.k)); // a_ld + tuner.AddArgumentInput(b_mat); + tuner.AddArgumentScalar(0); // b_offset + tuner.AddArgumentScalar(static_cast(args.n)); // b_ld + tuner.AddArgumentOutput(c_mat); + tuner.AddArgumentScalar(0); // c_offset + tuner.AddArgumentScalar(static_cast(args.n)); // c_ld + tuner.AddArgumentScalar(1); // a_do_transpose + tuner.AddArgumentScalar(1); // b_do_transpose + tuner.AddArgumentScalar(1); // c_do_transpose + tuner.AddArgumentScalar(0); // a_conjugate + tuner.AddArgumentScalar(0); // b_conjugate + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * args.k; + } + static std::string PerformanceUnit() { return "GFLOPS"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, 
double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); + return 0; +} + +// ================================================================================================= From ecc704cc76625fa0601b06ce5246831a14f18c8a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 1 Oct 2016 16:55:21 +0200 Subject: [PATCH 07/19] Added default num-runs to the tuner adding averaging over 10 runs as a default for the GEMM direct kernel --- src/tuning/kernels/copy_fast.cpp | 1 + src/tuning/kernels/copy_pad.cpp | 1 + src/tuning/kernels/transpose_fast.cpp | 1 + src/tuning/kernels/transpose_pad.cpp | 1 + src/tuning/kernels/xaxpy.cpp | 1 + src/tuning/kernels/xdot.cpp | 1 + src/tuning/kernels/xgemm.cpp | 1 + src/tuning/kernels/xgemm_direct.cpp | 1 + src/tuning/kernels/xgemv.cpp | 1 + src/tuning/kernels/xger.cpp | 1 + src/tuning/tuning.hpp | 2 +- 11 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 78ded56e..da4124fd 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -47,6 +47,7 @@ class TuneCopy { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 90f5ea82..cdd1efcb 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ 
b/src/tuning/kernels/copy_pad.cpp @@ -47,6 +47,7 @@ class TunePad { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 10fa80cb..8d39ead7 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -47,6 +47,7 @@ class TuneTranspose { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 507718eb..937b6c76 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -47,6 +47,7 @@ class TunePadTranspose { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 0033b3c6..1fd4ebad 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -51,6 +51,7 @@ class TuneXaxpy { static 
size_t DefaultN() { return 4096*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index 1581e13f..0994e619 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -47,6 +47,7 @@ class TuneXdot { static size_t DefaultN() { return 2*1024*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 1abc5e8a..ea2bd677 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -52,6 +52,7 @@ class TuneXgemm { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index c2e8710f..98714da8 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -50,6 +50,7 @@ class TuneXgemmDirect { static size_t DefaultN() { return 128; } static size_t DefaultK() { return 128; } static double DefaultFraction() { return (V==1) ? 
1.0 : 16.0; } // test all or sample randomly + static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 7229602d..dd5b01ca 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -50,6 +50,7 @@ class TuneXgemv { static size_t DefaultN() { return 2048; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index 1fb5c531..e1640fd6 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -47,6 +47,7 @@ class TuneXger { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel + static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.m; } diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 8fa93efc..afb092bc 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -46,7 +46,7 @@ void Tuner(int argc, char* argv[]) { if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar()); } if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); } } - const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{1}); + const auto num_runs = GetArgument(argc, argv, 
help, kArgNumRuns, C::DefaultNumRuns()); fprintf(stdout, "%s\n", help.c_str()); From a45992010591bfbf46fdc99496e68982cad163b9 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 1 Oct 2016 16:58:53 +0200 Subject: [PATCH 08/19] Added padding to the local memory of the GEMM direct kernel --- src/database/kernels/xgemm_direct.hpp | 10 +- src/kernels/level3/xgemm_direct.opencl | 173 ++++++++++++++----------- src/tuning/kernels/xgemm_direct.cpp | 10 +- 3 files changed, 106 insertions(+), 87 deletions(-) diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp index dc69f61b..bc91fdc2 100644 --- a/src/database/kernels/xgemm_direct.hpp +++ b/src/database/kernels/xgemm_direct.hpp @@ -18,7 +18,7 @@ const Database::DatabaseEntry Database::XgemmDirectHalf = { "XgemmDirect", Precision::kHalf, { { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, } }, } @@ -30,7 +30,7 @@ const Database::DatabaseEntry Database::XgemmDirectSingle = { "XgemmDirect", Precision::kSingle, { { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, } }, } @@ -42,7 +42,7 @@ const Database::DatabaseEntry Database::XgemmDirectComplexSingle = { "XgemmDirect", Precision::kComplexSingle, { { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, 
{"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, } }, } @@ -54,7 +54,7 @@ const Database::DatabaseEntry Database::XgemmDirectDouble = { "XgemmDirect", Precision::kDouble, { { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, } }, } @@ -66,7 +66,7 @@ const Database::DatabaseEntry Database::XgemmDirectComplexDouble = { "XgemmDirect", Precision::kComplexDouble, { { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1} } }, + { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, } }, } diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl index 705ced9c..75618e8c 100644 --- a/src/kernels/level3/xgemm_direct.opencl +++ b/src/kernels/level3/xgemm_direct.opencl @@ -43,6 +43,12 @@ R"( #ifndef VWND #define VWND 1 // Vector width of matrix B #endif +#ifndef PADA + #define PADA 1 // Local memory padding for matrix A +#endif +#ifndef PADB + #define PADB 1 // Local memory padding for matrix B +#endif // Helper parameters based on the above tuning parameters #define MWID (WGD/MDIMCD) // Work per work-item (M-dimension) @@ -87,10 +93,16 @@ R"( // Caches global off-chip memory into local (shared) memory on-chip. This function is specific for // caching the A input matrix. 
inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, - const int a_ld, const int a_offset, const int tid, const int kwg, + const int a_ld, const int a_offset, const int kwg, const int a_transpose, const int a_conjugate) { - const int la0 = tid % MDIMAD; - const int la1 = tid / MDIMAD; + #if MDIMCD == MDIMAD + const int la0 = get_local_id(0); + const int la1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int la0 = tid % MDIMAD; + const int la1 = tid / MDIMAD; + #endif #pragma unroll for (int mia=0; mia local (matrix A and B) - GlobalToLocalDirectA(agm, alm, a_ld, a_offset, tid, kwg, a_transpose, a_conjugate); - GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, tid, kwg, b_transpose, b_conjugate); + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); barrier(CLK_LOCAL_MEM_FENCE); // Loops over all workitem tiles, unrolled by a factor KWID diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 98714da8..6ab6d1f0 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -71,6 +71,8 @@ class TuneXgemmDirect { tuner.AddParameter(id, "KWID", {2}); tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); + tuner.AddParameter(id, "PADA", {1}); + tuner.AddParameter(id, "PADB", {1}); } // a lot more tuning parameters - has to be sampled randomly, too much to test all else { tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128}); @@ -81,6 +83,8 @@ class TuneXgemmDirect { tuner.AddParameter(id, "KWID", {2, 8, 16}); tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); + tuner.AddParameter(id, "PADA", {0, 1}); + tuner.AddParameter(id, "PADB", {0, 1}); } } @@ -112,9 +116,9 @@ class TuneXgemmDirect { // Sets the local memory size static void 
SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1] + v[2]*v[3])*GetBytes(args.precision)); + return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision)); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "WGD", "WGD", "WGD"}); + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"}); } // Sets the base thread configuration @@ -150,7 +154,7 @@ class TuneXgemmDirect { tuner.AddArgumentScalar(0); // c_offset tuner.AddArgumentScalar(static_cast(args.n)); // c_ld tuner.AddArgumentScalar(1); // a_do_transpose - tuner.AddArgumentScalar(1); // b_do_transpose + tuner.AddArgumentScalar(0); // b_do_transpose tuner.AddArgumentScalar(1); // c_do_transpose tuner.AddArgumentScalar(0); // a_conjugate tuner.AddArgumentScalar(0); // b_conjugate From 61f489e370c56075e166caff6d1ad671ca6787b9 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 2 Oct 2016 15:06:59 +0200 Subject: [PATCH 09/19] Split the GEMM direct kernel into two files; set the default tuning target to 256-256-256 --- ...irect.opencl => xgemm_direct_part1.opencl} | 180 +-------------- src/kernels/level3/xgemm_direct_part2.opencl | 207 ++++++++++++++++++ src/routines/level3/xgemm.cpp | 3 +- src/tuning/kernels/xgemm_direct.cpp | 9 +- 4 files changed, 216 insertions(+), 183 deletions(-) rename src/kernels/level3/{xgemm_direct.opencl => xgemm_direct_part1.opencl} (63%) create mode 100644 src/kernels/level3/xgemm_direct_part2.opencl diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct_part1.opencl similarity index 63% rename from src/kernels/level3/xgemm_direct.opencl rename to src/kernels/level3/xgemm_direct_part1.opencl index 75618e8c..cb407824 100644 --- a/src/kernels/level3/xgemm_direct.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -10,6 +10,8 @@ // This is a generic GEMM kernel that works for all sizes and configurations: it 
doesn't require any // pre and and post-processing kernels. // +// This kernel is seperated into three files. This is part 1 out of 2. +// // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string @@ -286,184 +288,6 @@ inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real // ================================================================================================= -// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication -// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm -inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], - const int kSizeM, const int kSizeN, - const real alpha, const real beta, - const int c_ld, const int c_offset, const int c_transpose) { - #pragma unroll - for (int ni=0; ni local (matrix A and B) - GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); - GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); - barrier(CLK_LOCAL_MEM_FENCE); - - // Loops over all workitem tiles, unrolled by a factor KWID - for (int pwi=0; pwi private (matrix A) - LocalToPrivateDirectA(alm, apm, kg, a_transpose); - - // Loads data: local --> private (matrix B) - LocalToPrivateDirectB(blm, bpm, kg, b_transpose); - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulateDirect(cpm, apm, bpm); - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Loop over the remaining part (incomplete tile in K-dimension) - for (; kwg < kSizeK; ++kwg) { - const int idk = kwg; - - // Loads A into register memory - #pragma unroll - for (int mi=0; mi +// +// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication +// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm +inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], + const int kSizeM, const int kSizeN, + const real alpha, const real beta, + const int c_ld, const int c_offset, const int c_transpose) { + #pragma unroll + for (int ni=0; ni local (matrix A and B) + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWID + for (int pwi=0; pwi private (matrix A) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + + // Loads data: local --> private (matrix B) + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Loop over the remaining part (incomplete tile in K-dimension) + for (; kwg < kSizeK; ++kwg) { + const int idk = kwg; + + // Loads A into register memory + #pragma unroll + for (int mi=0; mi::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - #include "../../kernels/level3/xgemm_direct.opencl" + #include "../../kernels/level3/xgemm_direct_part1.opencl" + #include 
"../../kernels/level3/xgemm_direct_part2.opencl" ; } diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 6ab6d1f0..c3864348 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -33,7 +33,8 @@ class TuneXgemmDirect { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/xgemm_direct.opencl" + #include "../src/kernels/level3/xgemm_direct_part1.opencl" + #include "../src/kernels/level3/xgemm_direct_part2.opencl" ; } @@ -46,9 +47,9 @@ class TuneXgemmDirect { static void TestValidArguments(const Arguments &) { } // Sets the default values for the arguments - static size_t DefaultM() { return 128; } - static size_t DefaultN() { return 128; } - static size_t DefaultK() { return 128; } + static size_t DefaultM() { return 256; } + static size_t DefaultN() { return 256; } + static size_t DefaultK() { return 256; } static double DefaultFraction() { return (V==1) ? 
1.0 : 16.0; } // test all or sample randomly static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging From d8827e908cd7ff70e1bf294468c12e76c749317e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 2 Oct 2016 17:59:05 +0200 Subject: [PATCH 10/19] Specialised the GEMM direct kernel in four ways for transposing/non-transposing: NN, NT, TN, TT --- src/kernels/level3/xgemm_direct_part2.opencl | 86 ++++++++++++++++---- src/routines/level3/xgemm.cpp | 14 ++-- src/tuning/kernels/xgemm_direct.cpp | 8 +- 3 files changed, 82 insertions(+), 26 deletions(-) diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl index 36804f4e..0d066186 100644 --- a/src/kernels/level3/xgemm_direct_part2.opencl +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -51,16 +51,16 @@ inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], // ================================================================================================= -// Main entry point of the kernel. This is the direct version without restrictions. -__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) -__kernel void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realMD* restrict agm, const int a_offset, const int a_ld, - const __global realND* restrict bgm, const int b_offset, const int b_ld, - __global real* cgm, const int c_offset, const int c_ld, - const int a_transpose, const int b_transpose, const int c_transpose, - const int a_conjugate, const int b_conjugate) { +// Main body of the kernel. This is the direct version without restrictions. 
+inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + __local real* alm, __local real* blm, + const int a_transpose, const int b_transpose, const int c_transpose, + const int a_conjugate, const int b_conjugate) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -68,10 +68,6 @@ __kernel void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, const __global real* restrict agms = (const __global real* restrict) agm; const __global real* restrict bgms = (const __global real* restrict) bgm; - // Allocates workgroup-private memory (local memory) - __local real alm[WGD * (WGD + PADA)]; - __local real blm[WGD * (WGD + PADB)]; - // Allocates workitem-private memory (registers) real apm[MWID]; real bpm[NWID]; @@ -201,6 +197,68 @@ __kernel void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, // ================================================================================================= +// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, 
c_offset, c_ld, + alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + 
const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate); +} + +// ================================================================================================= + // End of the C++11 raw string literal )" diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index ee33c8be..143ef3c1 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -275,9 +275,11 @@ StatusCode Xgemm::GemmDirect(const size_t m, const size_t n, const size_t k, // Loads the program from the database const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - // Retrieves the XgemmDirect kernel from the compiled binary + // Retrieves the proper XgemmDirect kernel from the compiled binary try { - auto kernel = Kernel(program, "XgemmDirect"); + const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : + (b_do_transpose ? 
"XgemmDirectNT" : "XgemmDirectNN"); + auto kernel = Kernel(program, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(m)); @@ -294,11 +296,9 @@ StatusCode Xgemm::GemmDirect(const size_t m, const size_t n, const size_t k, kernel.SetArgument(11, c_buffer()); kernel.SetArgument(12, static_cast(c_offset)); kernel.SetArgument(13, static_cast(c_ld)); - kernel.SetArgument(14, static_cast(a_do_transpose)); - kernel.SetArgument(15, static_cast(b_do_transpose)); - kernel.SetArgument(16, static_cast(c_do_transpose)); - kernel.SetArgument(17, static_cast(a_conjugate)); - kernel.SetArgument(18, static_cast(b_conjugate)); + kernel.SetArgument(14, static_cast(c_do_transpose)); + kernel.SetArgument(15, static_cast(a_conjugate)); + kernel.SetArgument(16, static_cast(b_conjugate)); // Computes the global and local thread sizes const auto m_ceiled = Ceil(m, db_["WGD"]); diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index c3864348..c6948ef5 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -29,7 +29,7 @@ class TuneXgemmDirect { // The representative kernel and the source code static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; } - static std::string KernelName() { return "XgemmDirect"; } + static std::string KernelName() { return "XgemmDirectTN"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" @@ -50,8 +50,8 @@ class TuneXgemmDirect { static size_t DefaultM() { return 256; } static size_t DefaultN() { return 256; } static size_t DefaultK() { return 256; } - static double DefaultFraction() { return (V==1) ? 1.0 : 16.0; } // test all or sample randomly - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging + static double DefaultFraction() { return (V==1) ? 
1.0 : 32.0; } // test all or sample randomly + static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel @@ -154,8 +154,6 @@ class TuneXgemmDirect { tuner.AddArgumentOutput(c_mat); tuner.AddArgumentScalar(0); // c_offset tuner.AddArgumentScalar(static_cast(args.n)); // c_ld - tuner.AddArgumentScalar(1); // a_do_transpose - tuner.AddArgumentScalar(0); // b_do_transpose tuner.AddArgumentScalar(1); // c_do_transpose tuner.AddArgumentScalar(0); // a_conjugate tuner.AddArgumentScalar(0); // b_conjugate From 243cef73db53b5d8ce6c55f95e18c4412539d210 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 2 Oct 2016 21:23:23 +0200 Subject: [PATCH 11/19] Set the default number of runs for all kernels to at least 2 runs --- src/tuning/kernels/copy_fast.cpp | 2 +- src/tuning/kernels/copy_pad.cpp | 2 +- src/tuning/kernels/transpose_fast.cpp | 2 +- src/tuning/kernels/transpose_pad.cpp | 2 +- src/tuning/kernels/xaxpy.cpp | 2 +- src/tuning/kernels/xdot.cpp | 2 +- src/tuning/kernels/xgemm.cpp | 2 +- src/tuning/kernels/xgemv.cpp | 2 +- src/tuning/kernels/xger.cpp | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index da4124fd..c57aab39 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -47,7 +47,7 @@ class TuneCopy { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A 
for this kernel diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index cdd1efcb..9486ee8d 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -47,7 +47,7 @@ class TunePad { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 8d39ead7..2d9d5e49 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -47,7 +47,7 @@ class TuneTranspose { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 937b6c76..d364dabe 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -47,7 +47,7 @@ class TunePadTranspose { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for 
averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 1fd4ebad..403ee9e4 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -51,7 +51,7 @@ class TuneXaxpy { static size_t DefaultN() { return 4096*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index 0994e619..f8416761 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -47,7 +47,7 @@ class TuneXdot { static size_t DefaultN() { return 2*1024*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index ea2bd677..0eb1875b 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -52,7 +52,7 @@ class TuneXgemm { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } static double DefaultFraction() { return (V==1) ? 
1.0 : 512.0; } // test all or sample randomly - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index dd5b01ca..f332f52a 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -50,7 +50,7 @@ class TuneXgemv { static size_t DefaultN() { return 2048; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.n; } diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index e1640fd6..c3d0c7dd 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -47,7 +47,7 @@ class TuneXger { static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 1; } // run every kernel this many times for averaging + static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &args) { return args.m; } From c1c4bc5d209280e4ec9be5c0a26f7c94077a6b20 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 3 Oct 2016 19:32:01 +0200 Subject: [PATCH 12/19] Re-organised GEMM direct kernel and added faster fall-back version for incomplete 
rectangles --- src/kernels/level3/xgemm_direct_part1.opencl | 296 +++++------- src/kernels/level3/xgemm_direct_part2.opencl | 478 ++++++++++--------- src/kernels/level3/xgemm_direct_part3.opencl | 206 ++++++++ src/routines/level3/xgemm.cpp | 1 + src/tuning/kernels/xgemm_direct.cpp | 1 + 5 files changed, 575 insertions(+), 407 deletions(-) create mode 100644 src/kernels/level3/xgemm_direct_part3.opencl diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index cb407824..2e5addef 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -10,7 +10,7 @@ // This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any // pre and and post-processing kernels. // -// This kernel is seperated into three files. This is part 1 out of 2. +// This kernel is seperated into three files. This is part 1 out of 3. // // ================================================================================================= @@ -92,176 +92,6 @@ R"( // ================================================================================================= -// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for -// caching the A input matrix. -inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, - const int a_ld, const int a_offset, const int kwg, - const int a_transpose, const int a_conjugate) { - #if MDIMCD == MDIMAD - const int la0 = get_local_id(0); - const int la1 = get_local_id(1); - #else - const int tid = get_local_id(0) + MDIMCD*get_local_id(1); - const int la0 = tid % MDIMAD; - const int la1 = tid / MDIMAD; - #endif - #pragma unroll - for (int mia=0; mia // -// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. +// This is part 2 of 3 of the GEMM kernel. See part 1 for more information. 
// // ================================================================================================= @@ -17,248 +17,254 @@ R"( // ================================================================================================= -// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication -// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm -inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], - const int kSizeM, const int kSizeN, - const real alpha, const real beta, - const int c_ld, const int c_offset, const int c_transpose) { +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. +inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate) { + #if MDIMCD == MDIMAD + const int la0 = get_local_id(0); + const int la1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int la0 = tid % MDIMAD; + const int la1 = tid / MDIMAD; + #endif + #pragma unroll + for (int mia=0; mia local (matrix A and B) - GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); - GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); - barrier(CLK_LOCAL_MEM_FENCE); - - // Loops over all workitem tiles, unrolled by a factor KWID - for (int pwi=0; pwi private (matrix A) - LocalToPrivateDirectA(alm, apm, kg, a_transpose); - - // Loads data: local --> private (matrix B) - LocalToPrivateDirectB(blm, bpm, kg, b_transpose); - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulateDirect(cpm, apm, bpm); - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Loop over the remaining part (incomplete tile in K-dimension) - for (; kwg < kSizeK; ++kwg) { - const int idk = kwg; - - // Loads A into register memory - #pragma 
unroll - for (int mi=0; mi +// +// This is part 3 of 3 of the GEMM kernel. See part 1 for more information. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Main body of the kernel. This is the direct version without pre/post processing and restrictions. +inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + __local real* alm, __local real* blm, + const int a_transpose, const int b_transpose, const int c_transpose, + const int a_conjugate, const int b_conjugate) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Extra pointers to scalar versions of global memory + const __global real* restrict agms = (const __global real* restrict) agm; + const __global real* restrict bgms = (const __global real* restrict) bgm; + + // Allocates workitem-private memory (registers) + real apm[MWID]; + real bpm[NWID]; + real cpm[NWID][MWID]; + + // Initializes the accumulation registers + InitAccRegistersDirect(cpm); + + // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section + // processes only the main parts: output blocks of WGD by WGD. 
+ const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD; + const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD; + + if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD) && + (a_ld % VWMD == 0) && (b_ld % VWND == 0)) { + + // Loops over all complete workgroup tiles (K-dimension) + int kwg = 0; + for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) { + + // Loads data: off-chip --> local (matrix A and B) + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWID + for (int pwi=0; pwi private (matrix A and B) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Loop over the remaining part (incomplete tile in K-dimension) + for (; kwg < kSizeK; ++kwg) { + + // Loads data: off-chip --> private (matrix A and B) + GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate); + GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + + // Stores a tile of results and performs the multiplication with alpha and beta + StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose); + } + + // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions) + else { + + // Loops over all complete workgroup tiles (K-dimension) + int kwg = 0; + for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) { + + // Loads data: off-chip --> local (matrix A and B) + GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK); + GlobalToLocalCheckedB(bgms, blm, b_ld, 
b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK); + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWID + for (int pwi=0; pwi private (matrix A and B) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Loop over the remaining part (incomplete tile in K-dimension) + for (; kwg < kSizeK; ++kwg) { + + // Loads data: off-chip --> private (matrix A and B) + GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM); + GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + + // Stores a tile of results and performs the multiplication with alpha and beta + StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose); + } +} + +// ================================================================================================= + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 0, c_transpose, 
a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate); +} + +// Direct version of the GEMM kernel with [A, B] = [transposed, transposed] +__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1))) +__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, const real_arg arg_beta, + const __global realMD* restrict agm, const int 
a_offset, const int a_ld, + const __global realND* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int c_transpose, const int a_conjugate, const int b_conjugate) { + __local real alm[WGD * (WGD + PADA)]; + __local real blm[WGD * (WGD + PADB)]; + XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta, + agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld, + alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate); +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 143ef3c1..93f5d30c 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -38,6 +38,7 @@ Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part3.opencl" #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" + #include "../../kernels/level3/xgemm_direct_part3.opencl" ; } diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index c6948ef5..204e0be4 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -35,6 +35,7 @@ class TuneXgemmDirect { #include "../src/kernels/common.opencl" #include "../src/kernels/level3/xgemm_direct_part1.opencl" #include "../src/kernels/level3/xgemm_direct_part2.opencl" + #include "../src/kernels/level3/xgemm_direct_part3.opencl" ; } From ca0c075de2a73f250046876b0ca5f90dc4ef0e77 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 3 Oct 2016 20:09:15 +0200 Subject: [PATCH 13/19] Added functions to load from off-chip to local memory without vector loads for the GEMM direct kernels --- 
src/kernels/level3/xgemm_direct_part1.opencl | 25 ++++++ src/kernels/level3/xgemm_direct_part2.opencl | 93 ++++++++++++++------ src/kernels/level3/xgemm_direct_part3.opencl | 18 ++-- 3 files changed, 106 insertions(+), 30 deletions(-) diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index 2e5addef..a8bd450e 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -182,6 +182,31 @@ inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm // ================================================================================================= +// Caches on-chip local memory into per-thread private memory (registers). This function is specific +// for caching the A input matrix. +inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg, + const int a_transpose) { + #pragma unroll + for (int mi=0; mi local (matrix A and B) - GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); - GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + if (a_ld % VWMD == 0) { + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + } + else { + GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + } + if (b_ld % VWND == 0) { + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + } + else { + GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + } barrier(CLK_LOCAL_MEM_FENCE); // Loops over all workitem tiles, unrolled by a factor KWID From 7052a00a3edc0d37444c88914ece4c468c3e4e96 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 3 Oct 2016 20:13:19 +0200 Subject: [PATCH 14/19] Fixed a const-correctness issue with complex conjugation in the GEMM direct kernel --- src/kernels/level3/xgemm_direct_part2.opencl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) 
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl index 5f5c6883..d77cbf65 100644 --- a/src/kernels/level3/xgemm_direct_part2.opencl +++ b/src/kernels/level3/xgemm_direct_part2.opencl @@ -188,7 +188,7 @@ inline void GlobalToLocalScalarA(const __global real* restrict agms, __local rea int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg; // Loads the data from global memory into the local memory - const real result = agms[idk*a_ld + idm + a_offset]; + real result = agms[idk*a_ld + idm + a_offset]; if (a_conjugate) { COMPLEX_CONJUGATE(result); } alm[kg*(WGD + PADA) + mg] = result; } @@ -219,7 +219,7 @@ inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local rea int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg; // Loads the data from global memory into the local memory - const real result = bgms[idk*b_ld + idn + b_offset]; + real result = bgms[idk*b_ld + idn + b_offset]; if (b_conjugate) { COMPLEX_CONJUGATE(result); } blm[kg*(WGD + PADB) + ng] = result; } @@ -257,7 +257,7 @@ inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local re // Loads the data from global memory into the local memory int condition = (a_transpose) ? idm < kSizeK : idm < kSizeM; if (condition) { - const real result = agms[idk*a_ld + idm + a_offset]; + real result = agms[idk*a_ld + idm + a_offset]; if (a_conjugate) { COMPLEX_CONJUGATE(result); } alm[kg*(WGD + PADA) + mg] = result; } @@ -295,7 +295,7 @@ inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local re // Loads the data from global memory into the local memory int condition = (b_transpose) ? 
idn < kSizeK : idn < kSizeN; if (condition) { - const real result = bgms[idk*b_ld + idn + b_offset]; + real result = bgms[idk*b_ld + idn + b_offset]; if (b_conjugate) { COMPLEX_CONJUGATE(result); } blm[kg*(WGD + PADB) + ng] = result; } From a3e67f2be2ea9f964c8077d379ca522c6c439036 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 6 Oct 2016 19:51:12 +0200 Subject: [PATCH 15/19] Added a kernel selection database to select between the direct and indirect GEMM kernels --- src/database/database.cpp | 4 +- src/database/database.hpp | 1 + src/database/kernel_selection.hpp | 79 +++++++++++++++++++++++++++++++ src/routines/level3/xgemm.cpp | 11 +++-- 4 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 src/database/kernel_selection.hpp diff --git a/src/database/database.cpp b/src/database/database.cpp index 2696fb9b..df9ac373 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -26,6 +26,7 @@ #include "database/kernels/pad.hpp" #include "database/kernels/transpose.hpp" #include "database/kernels/padtranspose.hpp" +#include "database/kernel_selection.hpp" namespace clblast { // ================================================================================================= @@ -43,7 +44,8 @@ const std::vector Database::database = { CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, - PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble + PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble, + KernelSelectionHalf, KernelSelectionSingle, KernelSelectionDouble, KernelSelectionComplexSingle, KernelSelectionComplexDouble }; // ================================================================================================= diff --git 
a/src/database/database.hpp b/src/database/database.hpp index 7c0afb46..912f0f15 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -80,6 +80,7 @@ class Database { static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; + static const DatabaseEntry KernelSelectionHalf, KernelSelectionSingle, KernelSelectionDouble, KernelSelectionComplexSingle, KernelSelectionComplexDouble; static const std::vector database; // The constructor with a user-provided database overlay (potentially an empty vector) diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp new file mode 100644 index 00000000..bccfb0c0 --- /dev/null +++ b/src/database/kernel_selection.hpp @@ -0,0 +1,79 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel +// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the +// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use +// some common default values. 
+// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionHalf = { + "KernelSelection", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionSingle = { + "KernelSelection", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionComplexSingle = { + "KernelSelection", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionDouble = { + "KernelSelection", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::KernelSelectionComplexDouble = { + "KernelSelection", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git 
a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 93f5d30c..9d912374 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -22,7 +22,8 @@ namespace clblast { // Constructor: forwards to base class constructor template Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm", "XgemmDirect"}, + Routine(queue, event, name, + {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, PrecisionValue()) { source_string_ = #include "../../kernels/level3/level3.opencl" @@ -102,15 +103,15 @@ StatusCode Xgemm::DoGemm(const Layout layout, status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); if (ErrorIn(status)) { return status; } - // Optionally runs the direct version of GEMM. TODO: Set this based on the arguments - const auto do_gemm_direct = true; // for now, for testing - if (do_gemm_direct) { + // Selects which version of GEMM to run + const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]); + if (do_gemm_direct) { // for small sizes (single kernel) return GemmDirect(m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); } - else { + else { // for larger sizes (pre/post-processing plus a very fast kernel) return GemmIndirect(m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, c_buffer, c_offset, c_ld, From b698e454782d6347fbd329dded24c4ef3895b566 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 6 Oct 2016 21:13:14 +0200 Subject: [PATCH 16/19] Added first tuning results for the single-kernel direct GEMM implementation --- CHANGELOG | 5 +- src/database/kernels/xgemm_direct.hpp | 72 ++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 9adb6e64..87ecccce 100644 --- 
a/CHANGELOG +++ b/CHANGELOG @@ -1,8 +1,9 @@ Development version (next release) -- It is now possible to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS -- Fixed a bug in the tests and samples related to waiting for an invalid event - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header +- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation +- Fixed a bug in the tests and samples related to waiting for an invalid event +- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS - Added an option to run tuned kernels multiple times to average execution times - Various minor fixes and enhancements diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp index bc91fdc2..202deb1f 100644 --- a/src/database/kernels/xgemm_direct.hpp +++ b/src/database/kernels/xgemm_direct.hpp @@ -7,7 +7,7 @@ // Author(s): // Database generator // -// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels. +// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels. 
// // ================================================================================================= @@ -18,7 +18,7 @@ const Database::DatabaseEntry Database::XgemmDirectHalf = { "XgemmDirect", Precision::kHalf, { { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, } }, } @@ -28,9 +28,27 @@ const Database::DatabaseEntry Database::XgemmDirectHalf = { const Database::DatabaseEntry Database::XgemmDirectSingle = { "XgemmDirect", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, + } + }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, 
{"VWND",1}, {"PADA",0}, {"PADB",0} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } }, } }, } @@ -40,9 +58,27 @@ const Database::DatabaseEntry Database::XgemmDirectSingle = { const Database::DatabaseEntry Database::XgemmDirectComplexSingle = { "XgemmDirect", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, + } + }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, } }, } @@ -52,9 +88,21 @@ const Database::DatabaseEntry Database::XgemmDirectComplexSingle = { const Database::DatabaseEntry Database::XgemmDirectDouble = { 
"XgemmDirect", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } }, + } + }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, } }, } @@ -64,9 +112,21 @@ const Database::DatabaseEntry Database::XgemmDirectDouble = { const Database::DatabaseEntry Database::XgemmDirectComplexDouble = { "XgemmDirect", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + { "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + { "default", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, 
{"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } }, + } + }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGD",32}, {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"VWMD",1}, {"VWND",1}, {"PADA",0}, {"PADB",0} } }, + { "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } }, } }, } From 7baac46e723088bba1b6845d7dfd709563174a87 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 8 Oct 2016 21:56:06 +0200 Subject: [PATCH 17/19] Fixed a performance bug for Intel Iris Pro GPUs due to incorrect tuning results --- src/database/kernels/xgemm.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index d19c55b5..e289c542 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -59,8 +59,8 @@ const Database::DatabaseEntry Database::XgemmSingle = { { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, - { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, 
{"VWM",1}, {"VWN",1} } }, + { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Intel accelerators From d7cfb6aa9bac8207f76a818749d0654337d51533 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 8 Oct 2016 22:05:54 +0200 Subject: [PATCH 18/19] Added benchmark script for small matrix sizes, testing the direct GEMM kernels --- scripts/graphs/common.r | 35 ++++++++++++++++------ scripts/graphs/xgemm_small.r | 56 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 8 deletions(-) create mode 100644 scripts/graphs/xgemm_small.r diff --git a/scripts/graphs/common.r b/scripts/graphs/common.r index cd68cf26..e5dad616 100644 --- a/scripts/graphs/common.r +++ b/scripts/graphs/common.r @@ -31,8 +31,12 @@ options("width"=170) # ================================================================================================== -# Constants -num_runs <- 4 +# Settings +num_runs <- 5 +num_runs_short <- 50 +xtics_subset_threshold <- 100 +xtics_subset_stepsize <- 8 + devices <- c("-platform","-device") options_string <- "-q -no_abbrv -cblas 0" library_names <- c("CLBlast", "clBLAS") @@ -66,11 +70,21 @@ main <- function(routine_name, precision, test_names, test_values, executable <- paste("./clblast_client_", routine_name, sep="") # Configures the outputfile - pdf(paste(display_name, ".pdf", sep=""), height=8, width=13) - par(mfrow=c(2, 3)) - par(oma=c(0, 0, 0, 0)) - par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)] - par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)] + file_name <- paste(display_name, ".pdf", sep="") + if (length(test_names) == 6) { + 
pdf(file_name, height=8, width=13) + par(mfrow=c(2, 3)) + par(oma=c(0, 0, 0, 0)) + par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)] + par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)] + } + else { # length(test_names) == 2 + pdf(file_name, height=8, width=13) + par(mfrow=c(2, 1)) + par(oma=c(0, 0, 0, 0)) + par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)] + par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)] + } # Loops over the test-cases for (test_id in 1:length(test_names)) { @@ -169,7 +183,12 @@ plot_graph <- function(xdata, ydata, log_setting, main="", xlab="", ylab="", ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n") axis(side=2, las=2) - axis(side=1, at=xdata, labels=xtics, las=2) + if (length(xdata) > xtics_subset_threshold) { # Too many indices to print, plot only every Nth + subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize) + axis(side=1, at=xdata[subset], labels=xtics[subset], las=2) + } else { + axis(side=1, at=xdata, labels=xtics, las=2) + } title(xlab=xlabel, line=-1) title(ylab=ylabel, line=2) title(graph_title, line=-2) diff --git a/scripts/graphs/xgemm_small.r b/scripts/graphs/xgemm_small.r new file mode 100644 index 00000000..ef94ef20 --- /dev/null +++ b/scripts/graphs/xgemm_small.r @@ -0,0 +1,56 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project uses a tab-size of two spaces and a max-width of 100 characters per line. 
+# +# Author(s): +# Cedric Nugteren +# +# This file implements the performance script for small sizes of Xgemm, testing the direct kernel +# +# ================================================================================================== + +# Includes the common functions +args <- commandArgs(trailingOnly = FALSE) +thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) +source(file.path(dirname(thisfile), "common.r")) + +# ================================================================================================== + +# Settings +routine_name <- "xgemm" +parameters <- c("-m","-n","-k","-layout","-transA","-transB", + "-num_steps","-step","-runs","-precision") +precision <- 32 + +# Sets the names of the test-cases +test_names <- list( + "small matrices in steps of 16", + "small matrices in steps of 1" +) + +# Defines the test-cases +test_values <- list( + list(c( 128, 128, 128, 102, 111, 111, 57, 16, num_runs_short, precision)), + list(c( 128, 128, 128, 102, 111, 111, 385, 1, num_runs_short, precision)) +) + +# Defines the x-labels corresponding to the test-cases +test_xlabels <- list( + "matrix sizes (m=n=k)", + "matrix sizes (m=n=k)" +) + +# Defines the x-axis of the test-cases +test_xaxis <- list( + c("m", ""), + c("m", "") +) + +# ================================================================================================== + +# Start the script +main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, + test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) + +# ================================================================================================== \ No newline at end of file From 7c228f6a674a748ec9ef4907552f5043fb424224 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 10 Oct 2016 16:01:02 +0200 Subject: [PATCH 19/19] Changed the thresholds for the direct/indirect GEMM kernels for NVIDIA and Intel GPUs --- src/database/kernel_selection.hpp | 50 
+++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp index bccfb0c0..c9462c7a 100644 --- a/src/database/kernel_selection.hpp +++ b/src/database/kernel_selection.hpp @@ -19,6 +19,16 @@ namespace clblast { const Database::DatabaseEntry Database::KernelSelectionHalf = { "KernelSelection", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, { // Default kDeviceTypeAll, "default", { { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, @@ -31,6 +41,16 @@ const Database::DatabaseEntry Database::KernelSelectionHalf = { const Database::DatabaseEntry Database::KernelSelectionSingle = { "KernelSelection", Precision::kSingle, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, { // Default kDeviceTypeAll, "default", { { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, @@ -43,6 +63,16 @@ const Database::DatabaseEntry Database::KernelSelectionSingle = { const Database::DatabaseEntry Database::KernelSelectionComplexSingle = { "KernelSelection", Precision::kComplexSingle, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, { // Default kDeviceTypeAll, "default", { { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, @@ -55,6 +85,16 @@ const Database::DatabaseEntry Database::KernelSelectionComplexSingle = { const Database::DatabaseEntry Database::KernelSelectionDouble = { "KernelSelection", 
Precision::kDouble, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, { // Default kDeviceTypeAll, "default", { { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } }, @@ -67,6 +107,16 @@ const Database::DatabaseEntry Database::KernelSelectionDouble = { const Database::DatabaseEntry Database::KernelSelectionComplexDouble = { "KernelSelection", Precision::kComplexDouble, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } }, + } + }, { // Default kDeviceTypeAll, "default", { { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },