Merge pull request #345 from CNugteren/convolution-fixes-and-tuner

Convolution with single kernel
Cedric Nugteren 2019-01-19 17:56:05 +01:00 committed by GitHub
commit 9a9c24e811
24 changed files with 585 additions and 43 deletions

@@ -1,3 +1,6 @@
+Development (next version)
+- Implemented single-kernel version of convolution as GEMM
+- Various minor fixes and enhancements
+
 Version 1.5.0
 - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')

@@ -212,10 +212,10 @@ endif()
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
 set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
-            xgemm xgemm_direct xgemv invert)
+            xgemm xgemm_direct xgemv invert xconvgemm)
 set(DATABASES copy pad padtranspose transpose xaxpy xdot
               xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger invert
-              gemm_routine trsv_routine)
+              gemm_routine trsv_routine xconvgemm)
 set(ROUTINE_TUNERS xgemm xtrsv)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
@@ -434,7 +434,8 @@ if(TUNERS)
   endif()
   # Adds tuning executables
-  foreach(KERNEL ${KERNELS})
+  set(ALLKERNELS ${KERNELS})
+  foreach(KERNEL ${ALLKERNELS})
     add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
     target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
     target_include_directories(clblast_tuner_${KERNEL} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})

@@ -79,6 +79,7 @@ More detailed documentation is available in separate files:
 * [Testing the library for correctness](doc/testing.md)
 * [Bindings / wrappers for other languages](doc/bindings.md)
 * [More details on the GEMM kernel](doc/details_gemm.md)
+* [More details on the convolution implementation](doc/details_conv.md)
 * [Glossary with some terms explained](doc/glossary.md)
 * [Frequently asked questions (FAQ) and their answers](doc/faq.md)

@@ -20,6 +20,6 @@ This file gives an overview of the main features planned for addition to CLBlast
 | [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | ✔ | Improving performance for Qualcomm Adreno GPUs |
 | [#270](https://github.com/CNugteren/CLBlast/issues/270) | Oct '18 | CNugteren | ✔ | Implement col2im |
 | - | ?? | CNugteren | | Add support for OpenCL image buffers |
-| [#267](https://github.com/CNugteren/CLBlast/issues/267) | ?? | CNugteren | WIP | Merge im2col and GEMM into a direct kernel |
+| [#267](https://github.com/CNugteren/CLBlast/issues/267) | Jan '19 | vbkaisetsu | ✔ | Merge im2col and GEMM into a direct kernel |
 | [#136](https://github.com/CNugteren/CLBlast/issues/136) | ?? | CNugteren | | Implement xAXPBY and xSET |
 | [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection |

doc/details_conv.md (new file)

@@ -0,0 +1,22 @@
CLBlast: Details on the CONVGEMM routine
================

This document gives more details on how the CONVGEMM routine is organised and implemented. For other information about CLBlast, see the [main README](../README.md).

CONVGEMM: Two approaches
-------------

CLBlast implements two approaches to batched convolution using GEMM: through im2col, or stand-alone:

* `ConvGemmMethod::kWithIm2Col`: first runs a batched version of im2col to lay the data out in a temporary buffer, followed by a batched version of GEMM. The implementation mirrors the regular im2col and GEMM kernels in CLBlast, but as a separate kernel, so that unneeded features can be stripped out and some extra optimizations made. It uses the tuning parameters of the regular im2col and GEMM kernels.
* `ConvGemmMethod::kSingleKernel`: a single-kernel approach that loads the image data directly in the layout the im2col transformation would produce, so the separate im2col kernel and the large intermediate buffer are no longer needed. It uses its own set of tuning parameters and can be tuned using the `clblast_tuner_xconvgemm` binary (see the construction sketch below).
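
As an illustration of how a caller selects between the two paths, here is a minimal sketch (not part of CLBlast itself), assuming an internal build where `src/routines/levelx/xconvgemm.hpp` is on the include path and a valid `Queue` and `EventPointer` are already set up:

```cpp
#include "routines/levelx/xconvgemm.hpp"  // internal CLBlast header

// A minimal sketch, assuming valid queue/event objects from the caller.
void ConstructBothVariants(clblast::Queue &queue, clblast::EventPointer event) {
  // Two-kernel path: batched im2col into a temporary buffer, then batched GEMM
  auto with_im2col = clblast::Xconvgemm<float>(
      queue, event, "CONVGEMM",
      clblast::Xconvgemm<float>::ConvGemmMethod::kWithIm2Col);

  // Single-kernel path: loads the image in im2col order on the fly, compiling
  // the XconvgemmFlip/XconvgemmNormal kernels instead
  auto single_kernel = clblast::Xconvgemm<float>(
      queue, event, "CONVGEMM",
      clblast::Xconvgemm<float>::ConvGemmMethod::kSingleKernel);
}
```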
CONVGEMM: Selecting which approach to use
-------------

Since CONVGEMM is a relatively new and experimental feature, selection of the approach is hard-coded in [xconvgemm.hpp on line 32](../src/routines/levelx/xconvgemm.hpp:32), but it can be changed there in a single place.

The main drawback of the `ConvGemmMethod::kWithIm2Col` approach is its extra memory usage, although depending on the device and the setting it may still be faster than the `ConvGemmMethod::kSingleKernel` approach. The latter has the additional advantage of its own tuning parameters, so it can be fine-tuned for a specific use-case a bit better than the two-kernel approach with im2col.
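
To make the single-kernel data loading concrete, the following stand-alone sketch (not part of the commit) mirrors the index arithmetic that `GlobalToPrivateCheckedImage` in this change uses to map a GEMM coordinate onto an image element, with the `kernel_flip` case omitted for brevity:

```cpp
// Maps GEMM row 'idm' (output patch index) and depth index 'idk' to a linear offset
// in a CHW image, or returns -1 when the access falls into the zero padding.
int ImageIndex(int idm, int idk, int output_w,
               int input_h, int input_w,
               int kernel_h, int kernel_w,
               int pad_h, int pad_w, int stride_h, int stride_w,
               int dilation_h, int dilation_w) {
  const int w_id = idm % output_w;                   // patch position in the output
  const int h_id = idm / output_w;
  const int kernel_2d_index = idk % (kernel_h * kernel_w);
  const int kw_id = kernel_2d_index % kernel_w;      // position within the filter
  const int kh_id = kernel_2d_index / kernel_w;
  const int c_id = idk / (kernel_h * kernel_w);      // input channel
  const int h_index = -pad_h + kh_id * dilation_h + stride_h * h_id;
  const int w_index = -pad_w + kw_id * dilation_w + stride_w * w_id;
  if (h_index < 0 || h_index >= input_h || w_index < 0 || w_index >= input_w) {
    return -1;                                       // out of bounds: contributes zero
  }
  return (c_id * input_h + h_index) * input_w + w_index;
}
```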

@@ -94,7 +94,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classified
 | xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ | (Out-of-place copying/transposing/scaling of matrices) |
 | xIM2COL | ✔ | ✔ | ✔ | ✔ | ✔ | (Image to column transform as used to express convolution as GEMM) |
 | xCOL2IM | ✔ | ✔ | ✔ | ✔ | ✔ | (Column to image transform as used in machine learning) |
-| xCONVGEMM | ✔ | ✔ | - | - | ✔ | (Experimental, implemented as im2col followed by batched GEMM) |
+| xCONVGEMM | ✔ | ✔ | - | - | ✔ | (Experimental, implemented as either im2col followed by batched GEMM or as a single kernel) |

 Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.

@@ -24,7 +24,9 @@ DEVICE_ATTRIBUTES = ["clblast_device_name", "clblast_device_architecture",
                      "device_core_clock", "device_compute_units"]
 KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
 ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta",
-                       "arg_from", "arg_to", "arg_step"]
+                       "arg_from", "arg_to", "arg_step",
+                       "arg_channels", "arg_height", "arg_width", "arg_kernel_h", "arg_kernel_w",
+                       "arg_num_kernels", "arg_batch_count"]
 ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
 GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES

@@ -49,6 +49,9 @@ const DatabaseEntry XgemmApple = {
 const DatabaseEntry XgemmDirectApple = {
   "XgemmDirect", Precision::kAny, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0 } } } } } } }
 };
+const DatabaseEntry XconvgemmApple = {
+  "Xconvgemm", Precision::kAny, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0 } } } } } } }
+};
 const DatabaseEntry CopyApple = {
   "Copy", Precision::kAny, {"COPY_DIMX", "COPY_DIMY", "COPY_VW", "COPY_WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
 };

@@ -25,6 +25,7 @@
 #include "database/kernels/xger/xger.hpp"
 #include "database/kernels/xgemm/xgemm.hpp"
 #include "database/kernels/xgemm_direct/xgemm_direct.hpp"
+#include "database/kernels/xconvgemm/xconvgemm.hpp"
 #include "database/kernels/copy/copy.hpp"
 #include "database/kernels/pad/pad.hpp"
 #include "database/kernels/transpose/transpose.hpp"
@@ -43,7 +44,7 @@ std::vector<database::DatabaseEntry> Database::database = std::vector<database::
 const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::vector<database::DatabaseEntry>{
   database::XaxpyApple, database::XdotApple,
   database::XgemvApple, database::XgemvFastApple, database::XgemvFastRotApple, database::XgerApple, database::XtrsvApple,
-  database::XgemmApple, database::XgemmDirectApple,
+  database::XgemmApple, database::XgemmDirectApple, database::XconvgemmApple,
   database::CopyApple, database::PadApple, database::TransposeApple, database::PadtransposeApple,
   database::InvertApple,
   database::TrsvRoutineApple
@@ -71,6 +72,7 @@ Database::Database(const Device &device, const std::string &kernel_name,
   database::XgerHalf, database::XgerSingle, database::XgerDouble, database::XgerComplexSingle, database::XgerComplexDouble,
   database::XgemmHalf, database::XgemmSingle, database::XgemmDouble, database::XgemmComplexSingle, database::XgemmComplexDouble,
   database::XgemmDirectHalf, database::XgemmDirectSingle, database::XgemmDirectDouble, database::XgemmDirectComplexSingle, database::XgemmDirectComplexDouble,
+  database::XconvgemmHalf, database::XconvgemmSingle, database::XconvgemmDouble, database::XconvgemmComplexSingle, database::XconvgemmComplexDouble,
   database::CopyHalf, database::CopySingle, database::CopyDouble, database::CopyComplexSingle, database::CopyComplexDouble,
   database::PadHalf, database::PadSingle, database::PadDouble, database::PadComplexSingle, database::PadComplexDouble,
   database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble,

@@ -0,0 +1,15 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xconvgemm' kernels.
//
// =================================================================================================
#include "database/kernels/xconvgemm/xconvgemm.hpp"
#include "database/kernels/xconvgemm/xconvgemm_16.hpp"
#include "database/kernels/xconvgemm/xconvgemm_32.hpp"
#include "database/kernels/xconvgemm/xconvgemm_3232.hpp"
#include "database/kernels/xconvgemm/xconvgemm_64.hpp"
#include "database/kernels/xconvgemm/xconvgemm_6464.hpp"

@@ -0,0 +1,22 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xconvgemm' kernels.
//
// =================================================================================================
#include "database/database_structure.hpp"
namespace clblast {
namespace database {
extern const DatabaseEntry XconvgemmHalf;
extern const DatabaseEntry XconvgemmSingle;
extern const DatabaseEntry XconvgemmComplexSingle;
extern const DatabaseEntry XconvgemmDouble;
extern const DatabaseEntry XconvgemmComplexDouble;
} // namespace database
} // namespace clblast

@@ -0,0 +1,34 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xconvgemm16' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
const DatabaseEntry XconvgemmHalf = {
"Xconvgemm", Precision::kHalf, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 8, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
} },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
{ kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
} },
}
},
}
};
} // namespace database
} // namespace clblast

@@ -0,0 +1,35 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xconvgemm32' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
const DatabaseEntry XconvgemmSingle = {
"Xconvgemm", Precision::kSingle, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) Gen9 HD Graphics NEO "}, Params{ 1, 16, 32, 8, 8, 0, 0, 1, 4, 32, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 1, 16, 8, 8, 16, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
} },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
{ kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
} },
}
},
}
};
} // namespace database
} // namespace clblast

@@ -0,0 +1,26 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xconvgemm3232' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
const DatabaseEntry XconvgemmComplexSingle = {
"Xconvgemm", Precision::kComplexSingle, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
{ kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
} },
}
},
}
};
} // namespace database
} // namespace clblast

@@ -0,0 +1,34 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xconvgemm64' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
const DatabaseEntry XconvgemmDouble = {
"Xconvgemm", Precision::kDouble, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
{ Name{"Intel(R) Gen9 HD Graphics NEO "}, Params{ 1, 8, 16, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } },
} },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
{ kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 2, 32, 0, 0, 0, 0, 0, 0 } },
} },
}
},
}
};
} // namespace database
} // namespace clblast

@@ -0,0 +1,26 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It
// is auto-generated by the 'scripts/database/database.py' Python script.
//
// This file populates the database with best-found tuning parameters for the 'Xconvgemm6464' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
const DatabaseEntry XconvgemmComplexDouble = {
"Xconvgemm", Precision::kComplexDouble, {"KWID", "MDIMAD", "MDIMCD", "NDIMBD", "NDIMCD", "PADA", "PADB", "VWMD", "VWND", "WGD"}, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
{ kDeviceNameDefault , Params{ 1, 8, 16, 16, 8, 0, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
} },
}
},
}
};
} // namespace database
} // namespace clblast

@@ -11,7 +11,6 @@
 // uses parameters from the direct GEMM kernel. This is the part with the loads from memory (1/2).
 // This uses "CONVGEMM_WITH_IM2COL" as a switch to select between direct convgemm or first running
 // the im2col kernel to create a 'col' temporary matrix.
-// TODO: Currently only works with 'CONVGEMM_WITH_IM2COL' set
 //
 // =================================================================================================
@@ -30,12 +29,17 @@ INLINE_FUNC real GlobalToPrivateCheckedImage(const __global real* restrict image
                                              const int kernel_h, const int kernel_w,
                                              const int pad_h, const int pad_w,
                                              const int stride_h, const int stride_w,
-                                             const int dilation_h, const int dilation_w) {
+                                             const int dilation_h, const int dilation_w,
+                                             const bool kernel_flip) {
   // Im2col indices
   const int kernel_2d_index = kwg % (kernel_h * kernel_w);
-  const int kw_id = kernel_2d_index % kernel_w;
-  const int kh_id = kernel_2d_index / kernel_w;
+  const int kw_id = (kernel_flip)
+                  ? kernel_w - kernel_2d_index % kernel_w - 1
+                  : kernel_2d_index % kernel_w;
+  const int kh_id = (kernel_flip)
+                  ? kernel_h - kernel_2d_index / kernel_w - 1
+                  : kernel_2d_index / kernel_w;
   const int c_id = kwg / (kernel_h * kernel_w);
   const int h_index = -pad_h + kh_id * dilation_h + stride_h * h_id;
   const int w_index = -pad_w + kw_id * dilation_w + stride_w * w_id;
@@ -55,14 +59,15 @@ INLINE_FUNC real GlobalToPrivateCheckedImage(const __global real* restrict image
 // Loads global off-chip memory into local (shared) memory on-chip. This function is specific for
 // loading the image input tensor. This includes a bounds check.
-INLINE_FUNC real GlobalToLocalCheckedImage(const __global realMD* restrict imagegm, LOCAL_PTR real* alm,
+INLINE_FUNC real GlobalToLocalCheckedImage(const __global real* restrict imagegm, LOCAL_PTR real* alm,
                                            const int image_offset_batch,
-                                           const int h_id, const int w_id, const int kwg,
+                                           const int output_w, const int kwg,
                                            const int input_h, const int input_w, const int channels,
                                            const int kernel_h, const int kernel_w,
                                            const int pad_h, const int pad_w,
                                            const int stride_h, const int stride_w,
-                                           const int dilation_h, const int dilation_w) {
+                                           const int dilation_h, const int dilation_w,
+                                           const bool kernel_flip) {
   #if MDIMCD == MDIMAD
     const int la0 = get_local_id(0);
     const int la1 = get_local_id(1);
@@ -82,10 +87,17 @@ INLINE_FUNC real GlobalToLocalCheckedImage(const __global realMD* restrict image
       int idm = mg + GetGroupID0()*WGD;
       int idk = kg + kwg;
+      const int w_id = idm % output_w;
+      const int h_id = idm / output_w;
       // Im2col indices
       const int kernel_2d_index = idk % (kernel_h * kernel_w);
-      const int kw_id = kernel_2d_index % kernel_w;
-      const int kh_id = kernel_2d_index / kernel_w;
+      const int kw_id = (kernel_flip)
+                      ? kernel_w - kernel_2d_index % kernel_w - 1
+                      : kernel_2d_index % kernel_w;
+      const int kh_id = (kernel_flip)
+                      ? kernel_h - kernel_2d_index / kernel_w - 1
+                      : kernel_2d_index / kernel_w;
       const int c_id = idk / (kernel_h * kernel_w);
       const int h_index = -pad_h + kh_id * dilation_h + stride_h * h_id;
       const int w_index = -pad_w + kw_id * dilation_w + stride_w * w_id;
@@ -104,7 +116,8 @@ INLINE_FUNC real GlobalToLocalCheckedImage(const __global realMD* restrict image
     }
   }
-#endif
+#endif // defined(ROUTINE_CONVGEMM) && !defined(CONVGEMM_WITH_IM2COL)

 // =================================================================================================

 // End of the C++11 raw string literal

@@ -11,7 +11,6 @@
 // uses parameters from the direct GEMM kernel. This part contains the main kernel (2/2).
 // This uses "CONVGEMM_WITH_IM2COL" as a switch to select between direct convgemm or first running
 // the im2col kernel to create a 'col' temporary matrix.
-// TODO: Currently only works with 'CONVGEMM_WITH_IM2COL' set
 //
 // =================================================================================================
@@ -23,20 +22,25 @@ R"(
 #if defined(ROUTINE_CONVGEMM)

 // ConvGEMM kernel
+#if defined(CONVGEMM_WITH_IM2COL)
 __kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
 void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size,
                const __global realND* restrict kernelgm, const int kernel_offset,
                __global real* resultgm, const int result_offset, const int result_stride,
-#if defined(CONVGEMM_WITH_IM2COL)
                const __global realMD* restrict colgm, const int col_offset, const int col_stride)
 #else
-               const __global realMD* restrict imagegm, const int image_offset,
-               const int input_h, const int input_w, const int channels,
-               const int kernel_h, const int kernel_w,
-               const int pad_h, const int pad_w,
-               const int stride_h, const int stride_w,
-               const int dilation_h, const int dilation_w,
-               const int output_h, const int output_w)
+INLINE_FUNC void Xconvgemm(const int num_patches, const int num_kernels, const int patch_size,
+                           const __global realND* restrict kernelgm, const int kernel_offset,
+                           __global real* resultgm, const int result_offset, const int result_stride,
+                           const __global realMD* restrict imagegm, const int image_offset,
+                           const int input_h, const int input_w, const int channels,
+                           const int kernel_h, const int kernel_w,
+                           const int pad_h, const int pad_w,
+                           const int stride_h, const int stride_w,
+                           const int dilation_h, const int dilation_w,
+                           const int output_h, const int output_w,
+                           LOCAL_PTR real* alm, LOCAL_PTR real* blm,
+                           const bool kernel_flip)
 #endif
 {
@@ -49,12 +53,16 @@ void Xconvgemm(const int num_patches, const int num_kernels, const int patch_siz
   #endif
   const int result_offset_batch = result_offset + result_stride * batch;

+  #if defined(CONVGEMM_WITH_IM2COL)
   __local real alm[WGD * (WGD + PADA)];
   __local real blm[WGD * (WGD + PADB)];
+  #endif

   // Extra pointers to scalar versions of global memory
   #if defined(CONVGEMM_WITH_IM2COL)
   const __global real* restrict colgms = (const __global real* restrict) colgm;
+  #else
+  const __global real* restrict imagegms = (const __global real* restrict) imagegm;
   #endif
   const __global real* restrict kernelgms = (const __global real* restrict) kernelgm;
@@ -100,10 +108,10 @@ void Xconvgemm(const int num_patches, const int num_kernels, const int patch_siz
         GlobalToLocalScalarA(colgms, alm, num_patches, col_offset_batch, kwg, false, false);
       }
     #else
-      GlobalToLocalCheckedImage(imagegm, alm, image_offset_batch, h_id, w_id, kwg,
+      GlobalToLocalCheckedImage(imagegms, alm, image_offset_batch, output_w, kwg,
                                 input_h, input_w, channels, kernel_h, kernel_w,
                                 pad_h, pad_w, stride_h, stride_w,
-                                dilation_h, dilation_w);
+                                dilation_h, dilation_w, kernel_flip);
     #endif
     if (patch_size % VWND == 0 && kernel_offset % VWND == 0) {
       GlobalToLocalDirectB(kernelgm, blm, patch_size, kernel_offset, kwg, true, false);
@@ -151,10 +159,12 @@ void Xconvgemm(const int num_patches, const int num_kernels, const int patch_siz
       #if defined(CONVGEMM_WITH_IM2COL)
         apd[_mi] = GlobalToPrivateDirectA(colgms, _mi, num_patches, col_offset_batch, idm, kwg, false, false);
       #else
-        apd[_mi] = GlobalToPrivateCheckedImage(imagegm, image_offset_batch, h_id, w_id, kwg,
+        const int w_id = (idm + _mi) % output_w;
+        const int h_id = (idm + _mi) / output_w;
+        apd[_mi] = GlobalToPrivateCheckedImage(imagegms, image_offset_batch, h_id, w_id, kwg,
                                                input_h, input_w, channels, kernel_h, kernel_w,
                                                pad_h, pad_w, stride_h, stride_w,
-                                               dilation_h, dilation_w);
+                                               dilation_h, dilation_w, kernel_flip);
       #endif
     }
     #pragma unroll
@@ -193,10 +203,10 @@ void Xconvgemm(const int num_patches, const int num_kernels, const int patch_siz
     #if defined(CONVGEMM_WITH_IM2COL)
       GlobalToLocalCheckedA(colgms, alm, num_patches, col_offset_batch, kwg, false, false, num_patches, patch_size);
     #else
-      GlobalToLocalCheckedImage(imagegm, alm, image_offset_batch, h_id, w_id, kwg,
+      GlobalToLocalCheckedImage(imagegms, alm, image_offset_batch, output_w, kwg,
                                 input_h, input_w, channels, kernel_h, kernel_w,
                                 pad_h, pad_w, stride_h, stride_w,
-                                dilation_h, dilation_w);
+                                dilation_h, dilation_w, kernel_flip);
     #endif
     GlobalToLocalCheckedB(kernelgms, blm, patch_size, kernel_offset, kwg, true, false, num_kernels, patch_size);
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -239,10 +249,12 @@ void Xconvgemm(const int num_patches, const int num_kernels, const int patch_siz
       #if defined(CONVGEMM_WITH_IM2COL)
         apd[_mi] = GlobalToPrivateCheckedA(colgms, _mi, num_patches, col_offset_batch, idm, kwg, false, false, num_patches);
       #else
-        apd[_mi] = GlobalToPrivateCheckedImage(imagegm, image_offset_batch, h_id, w_id, kwg,
+        const int w_id = (idm + _mi) % output_w;
+        const int h_id = (idm + _mi) / output_w;
+        apd[_mi] = GlobalToPrivateCheckedImage(imagegms, image_offset_batch, h_id, w_id, kwg,
                                                input_h, input_w, channels, kernel_h, kernel_w,
                                                pad_h, pad_w, stride_h, stride_w,
-                                               dilation_h, dilation_w);
+                                               dilation_h, dilation_w, kernel_flip);
       #endif
     }
     #pragma unroll
@@ -272,7 +284,53 @@ void Xconvgemm(const int num_patches, const int num_kernels, const int patch_siz
   }
 }

-#endif
+#if !defined(CONVGEMM_WITH_IM2COL)
+
+__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+void XconvgemmFlip(const int num_patches, const int num_kernels, const int patch_size,
+                   const __global realND* restrict kernelgm, const int kernel_offset,
+                   __global real* resultgm, const int result_offset, const int result_stride,
+                   const __global realMD* restrict imagegm, const int image_offset,
+                   const int input_h, const int input_w, const int channels,
+                   const int kernel_h, const int kernel_w,
+                   const int pad_h, const int pad_w,
+                   const int stride_h, const int stride_w,
+                   const int dilation_h, const int dilation_w,
+                   const int output_h, const int output_w) {
+  const bool kernel_flip = true;
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  Xconvgemm(num_patches, num_kernels, patch_size,
+            kernelgm, kernel_offset, resultgm, result_offset, result_stride,
+            imagegm, image_offset, input_h, input_w, channels, kernel_h, kernel_w,
+            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+            output_h, output_w, alm, blm, kernel_flip);
+}
+
+__kernel __attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+void XconvgemmNormal(const int num_patches, const int num_kernels, const int patch_size,
+                     const __global realND* restrict kernelgm, const int kernel_offset,
+                     __global real* resultgm, const int result_offset, const int result_stride,
+                     const __global realMD* restrict imagegm, const int image_offset,
+                     const int input_h, const int input_w, const int channels,
+                     const int kernel_h, const int kernel_w,
+                     const int pad_h, const int pad_w,
+                     const int stride_h, const int stride_w,
+                     const int dilation_h, const int dilation_w,
+                     const int output_h, const int output_w) {
+  const bool kernel_flip = false;
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  Xconvgemm(num_patches, num_kernels, patch_size,
+            kernelgm, kernel_offset, resultgm, result_offset, result_stride,
+            imagegm, image_offset, input_h, input_w, channels, kernel_h, kernel_w,
+            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+            output_h, output_w, alm, blm, kernel_flip);
+}
+
+#endif // !defined(CONVGEMM_WITH_IM2COL)
+#endif // defined(ROUTINE_CONVGEMM)

 // =================================================================================================

 // End of the C++11 raw string literal

@@ -25,7 +25,7 @@ namespace clblast {
 template <typename T>
 Xconvgemm<T>::Xconvgemm(Queue &queue, EventPointer event, const std::string &name,
                         const ConvGemmMethod method):
-    Routine(queue, event, name, {"XgemmDirect"},
+    Routine(queue, event, name, {"Xconvgemm"},
             PrecisionValue<T>(), {}, {
     (method == ConvGemmMethod::kWithIm2Col) ? "#define CONVGEMM_WITH_IM2COL\n" : "",
     #include "../../kernels/level3/level3.opencl"
@@ -53,9 +53,6 @@ void Xconvgemm<T>::DoConvgemm(const KernelMode kernel_mode,
                               const Buffer<T> &kernel_buffer, const size_t kernel_offset,
                               const Buffer<T> &result_buffer, const size_t result_offset) {

-  // TODO: Implement single-kernel approach
-  assert(method_ == ConvGemmMethod::kWithIm2Col);
-
   // Tests for a valid batch count
   if (batch_count == 0) {
     throw BLASError(StatusCode::kInvalidBatchCount);
@@ -121,7 +118,12 @@ void Xconvgemm<T>::DoConvgemm(const KernelMode kernel_mode,
   }

   // Retrieves the proper XgemmDirect kernel from the compiled binary
-  auto kernel = Kernel(program_, "Xconvgemm");
+  const std::string kernel_name = (method_ == ConvGemmMethod::kWithIm2Col)
+                                ? "Xconvgemm"
+                                : (kernel_mode == KernelMode::kConvolution)
+                                ? "XconvgemmFlip"
+                                : "XconvgemmNormal";
+  auto kernel = Kernel(program_, kernel_name);

   // Sets the kernel arguments
   kernel.SetArgument(0, static_cast<int>(num_patches));

@@ -29,7 +29,7 @@ class Xconvgemm: public Routine {
   // Constructor
   enum class ConvGemmMethod {kWithIm2Col, kSingleKernel};
   Xconvgemm(Queue &queue, EventPointer event, const std::string &name = "CONVGEMM",
-            const ConvGemmMethod method = ConvGemmMethod::kWithIm2Col);
+            const ConvGemmMethod method = ConvGemmMethod::kSingleKernel);

   // Templated-precision implementation of the routine
   void DoConvgemm(const KernelMode kernel_mode,

@@ -0,0 +1,38 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the convgemm kernels.
//
// =================================================================================================
#include "tuning/kernels/xconvgemm.hpp"
// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;
// Function to tune a specific variation V (not within the clblast namespace)
template <int V>
void StartVariation(int argc, char *argv[]) {
const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
switch(clblast::GetPrecision(command_line_args)) {
case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XConvGemmGetTunerDefaults, clblast::XConvGemmGetTunerSettings<half>, clblast::XConvGemmTestValidArguments<half>, clblast::XConvGemmSetConstraints, clblast::XConvGemmComputeLocalMemSize<half>, clblast::XConvGemmSetArguments<half>); break;
case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XConvGemmGetTunerDefaults, clblast::XConvGemmGetTunerSettings<float>, clblast::XConvGemmTestValidArguments<float>, clblast::XConvGemmSetConstraints, clblast::XConvGemmComputeLocalMemSize<float>, clblast::XConvGemmSetArguments<float>); break;
case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XConvGemmGetTunerDefaults, clblast::XConvGemmGetTunerSettings<double>, clblast::XConvGemmTestValidArguments<double>, clblast::XConvGemmSetConstraints, clblast::XConvGemmComputeLocalMemSize<double>, clblast::XConvGemmSetArguments<double>); break;
}
}
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
StartVariation<1>(argc, argv);
return 0;
}
// =================================================================================================

@@ -0,0 +1,186 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the auto-tuner to tune the ConvGemm kernels. These kernels are based on the
// direct GEMM kernel and use its parameters; running this tuner is optional and intended for
// advanced users.
//
// =================================================================================================
#include <string>
#include <vector>
#include "utilities/utilities.hpp"
#include "tuning/tuning.hpp"
namespace clblast {
// =================================================================================================
// Helper functions
template <typename T>
size_t OutputHeight(const Arguments<T> &args) {
const auto size = args.height + 2 * args.pad_h;
const auto padding = args.dilation_h * (args.kernel_h - 1) + 1;
if (size >= padding) { return (size - padding) / args.stride_h + 1; }
return 1;
}
template <typename T>
size_t OutputWidth(const Arguments<T> &args) {
const auto size = args.width + 2 * args.pad_w;
const auto padding = args.dilation_w * (args.kernel_w - 1) + 1;
if (size >= padding) { return (size - padding) / args.stride_w + 1; }
return 1;
}
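
// Editor's note (not in the original commit): with the tuner defaults below (height =
// width = 66, pad = 0, 3x3 kernel, stride = 1, dilation = 1), the padded size is 66 and
// the dilated filter footprint is 1 * (3 - 1) + 1 = 3, so each output dimension is
// (66 - 3) / 1 + 1 = 64, giving num_patches = 64 * 64 = 4096.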
// Settings for this kernel (default command-line arguments)
TunerDefaults XConvGemmGetTunerDefaults(const int) {
auto settings = TunerDefaults();
settings.options = {kArgChannels, kArgHeight, kArgWidth, kArgKernelH, kArgKernelW,
kArgNumKernels, kArgBatchCount, kArgFraction};
settings.channels = 32;
settings.height = 66;
settings.width = 66; // num_patches = 64x64 = 4096
settings.kernel_h = 3;
settings.kernel_w = 3;
settings.num_kernels = 32;
settings.default_batch_count = 16;
settings.default_fraction = 1.0;
settings.default_num_runs = 2;
return settings;
}
// Settings for this kernel (general)
template <typename T>
TunerSettings XConvGemmGetTunerSettings(const int, const Arguments<T> &args) {
auto settings = TunerSettings();
// Identification of the kernel
settings.kernel_family = "xconvgemm";
settings.kernel_name = "XconvgemmNormal";
settings.sources =
"#define ROUTINE_CONVGEMM"
#include "../src/kernels/level3/xgemm_direct_part1.opencl"
#include "../src/kernels/level3/xgemm_direct_part2.opencl"
#include "../src/kernels/level3/xgemm_direct_part3.opencl"
#include "../src/kernels/levelx/xconvgemm_part1.opencl"
#include "../src/kernels/levelx/xconvgemm_part2.opencl"
;
// Helper variables
const auto patch_size = args.kernel_h * args.kernel_w * args.channels;
const auto num_patches = OutputHeight(args) * OutputWidth(args);
// Buffer sizes
settings.size_a = args.batch_count * args.channels * args.height * args.width;
settings.size_b = args.num_kernels * args.channels * args.kernel_h * args.kernel_w;
settings.size_c = args.batch_count * args.num_kernels * OutputHeight(args) * OutputWidth(args);
// Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
settings.inputs = {2, 3, 4};
settings.outputs = {4};
// Sets the base thread configuration
settings.global_size = {num_patches, args.num_kernels, args.batch_count};
settings.global_size_ref = settings.global_size;
settings.local_size = {1, 1, 1};
settings.local_size_ref = {8, 8, 1};
// Transforms the thread configuration based on the parameters
settings.mul_local = {{"MDIMCD", "NDIMCD"}};
settings.mul_global = {{"MDIMCD", "NDIMCD"}};
settings.div_global = {{"WGD", "WGD"}};
// Sets the tuning parameters and their possible values
settings.parameters = {
{"WGD", {8, 16, 32}},
{"MDIMCD", {8, 16, 32}},
{"NDIMCD", {8, 16, 32}},
{"MDIMAD", {8, 16, 32}},
{"NDIMBD", {8, 16, 32}},
{"KWID", {1}},
{"VWMD", {1, 2, 4, 8}},
{"VWND", {1, 2, 4, 8}},
{"PADA", {0}},
{"PADB", {0}},
};
// Describes how to compute the performance metrics
settings.metric_amount = args.batch_count * 2 * num_patches * args.num_kernels * patch_size;
settings.performance_unit = "GFLOPS";
return settings;
}
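
// Editor's note (not in the original commit): with the defaults above, metric_amount is
// 16 * 2 * 4096 * 32 * (3 * 3 * 32) ≈ 1.21e9 FLOPs per run, so a run taking 10 ms
// corresponds to roughly 121 GFLOPS.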
// Tests for valid arguments
template <typename T>
void XConvGemmTestValidArguments(const int, const Arguments<T> &) { }
std::vector<Constraint> XConvGemmSetConstraints(const int) {
auto constraints = std::vector<Constraint>();
auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
// Requirement for unrolling the WGD loop
constraints.push_back({MultipleOfX, {"WGD", "KWID"}});
// Required for integer MWID and NWID
constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}});
constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}});
// Required for integer MWIAD and NWIBD
constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}});
constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}});
// WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}});
constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}});
return constraints;
}
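
// Editor's note (not in the original commit): as a concrete check of the constraints above,
// WGD = 32, MDIMCD = 8, VWMD = 4 satisfies "WGD is a multiple of MDIMCD * VWMD" since
// 8 * 4 = 32 divides 32, whereas MDIMCD = 16 with VWMD = 4 would be rejected because
// 16 * 4 = 64 does not divide 32.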
template <typename T>
LocalMemSizeInfo XConvGemmComputeLocalMemSize(const int) {
return {
[] (std::vector<size_t> v) -> size_t {
return GetBytes(PrecisionValue<T>()) * ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2])));
},
{"WGD", "PADA", "PADB"}
};
}
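
// Editor's note (not in the original commit): this returns the local-memory footprint of the
// alm and blm tiles: GetBytes * (WGD*(WGD+PADA) + WGD*(WGD+PADB)). In single precision with
// WGD = 32 and PADA = PADB = 0 that is 4 * (1024 + 1024) = 8192 bytes, letting the tuner skip
// configurations that exceed the device's local memory.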
// Sets the kernel's arguments
template <typename T>
void XConvGemmSetArguments(const int, Kernel &kernel, const Arguments<T> &args, std::vector<Buffer<T>>& buffers) {
const auto output_h = OutputHeight(args);
const auto output_w = OutputWidth(args);
const auto patch_size = args.kernel_h * args.kernel_w * args.channels;
const auto num_patches = output_h * output_w;
const auto result_stride = args.num_kernels * output_h * output_w;
kernel.SetArgument(0, static_cast<int>(num_patches));
kernel.SetArgument(1, static_cast<int>(args.num_kernels));
kernel.SetArgument(2, static_cast<int>(patch_size));
kernel.SetArgument(3, buffers[3]()); // 3 == B matrix ==> kernel buffer
kernel.SetArgument(4, 0); // kernel offset
kernel.SetArgument(5, buffers[4]()); // 4 == C matrix ==> result buffer
kernel.SetArgument(6, 0); // result offset
kernel.SetArgument(7, static_cast<int>(result_stride));
kernel.SetArgument(8, buffers[2]()); // 2 == A matrix ==> image buffer
kernel.SetArgument(9, 0); // image offset
kernel.SetArgument(10, static_cast<int>(args.height));
kernel.SetArgument(11, static_cast<int>(args.width));
kernel.SetArgument(12, static_cast<int>(args.channels));
kernel.SetArgument(13, static_cast<int>(args.kernel_h));
kernel.SetArgument(14, static_cast<int>(args.kernel_w));
kernel.SetArgument(15, 0); // pad_h
kernel.SetArgument(16, 0); // pad_w
kernel.SetArgument(17, 1); // stride_h
kernel.SetArgument(18, 1); // stride_w
kernel.SetArgument(19, 1); // dilation_h
kernel.SetArgument(20, 1); // dilation_w
kernel.SetArgument(21, static_cast<int>(output_h));
kernel.SetArgument(22, static_cast<int>(output_w));
}
// =================================================================================================
} // namespace clblast

@@ -122,8 +122,14 @@ void Tuner(int argc, char* argv[], const int V,
     if (o == kArgM) { args.m = GetArgument(command_line_args, help, kArgM, defaults.default_m); }
     if (o == kArgN) { args.n = GetArgument(command_line_args, help, kArgN, defaults.default_n); }
     if (o == kArgK) { args.k = GetArgument(command_line_args, help, kArgK, defaults.default_k); }
-    if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); }
-    if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); }
+    if (o == kArgChannels) { args.channels = GetArgument(command_line_args, help, kArgChannels, defaults.channels); }
+    if (o == kArgHeight) { args.height = GetArgument(command_line_args, help, kArgHeight, defaults.height); }
+    if (o == kArgWidth) { args.width = GetArgument(command_line_args, help, kArgWidth, defaults.width); }
+    if (o == kArgKernelH) { args.kernel_h = GetArgument(command_line_args, help, kArgKernelH, defaults.kernel_h); }
+    if (o == kArgKernelW) { args.kernel_w = GetArgument(command_line_args, help, kArgKernelW, defaults.kernel_w); }
+    if (o == kArgNumKernels) { args.num_kernels = GetArgument(command_line_args, help, kArgNumKernels, defaults.num_kernels); }
+    if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); }
+    if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); }
     if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); }
   }
   args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction);
@@ -383,6 +389,12 @@
     if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
     if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); }
     if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); }
+    if (o == kArgHeight) { metadata.push_back({"arg_height", ToString(args.height)}); }
+    if (o == kArgWidth) { metadata.push_back({"arg_width", ToString(args.width)}); }
+    if (o == kArgKernelH) { metadata.push_back({"arg_kernel_h", ToString(args.kernel_h)}); }
+    if (o == kArgKernelW) { metadata.push_back({"arg_kernel_w", ToString(args.kernel_w)}); }
+    if (o == kArgChannels) { metadata.push_back({"arg_channels", ToString(args.channels)}); }
+    if (o == kArgNumKernels) { metadata.push_back({"arg_num_kernels", ToString(args.num_kernels)}); }
   }
   PrintTimingsToFileAsJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json",
                            device, platform, metadata, results);

@@ -41,6 +41,13 @@ struct TunerDefaults {
   size_t default_m = 1;
   size_t default_n = 1;
   size_t default_k = 1;
+  size_t channels = 1;
+  size_t height = 1;
+  size_t width = 1;
+  size_t kernel_h = 3;
+  size_t kernel_w = 3;
+  size_t num_kernels = 1;
+  size_t batch_count = 1;

   // Other defaults
   size_t default_batch_count = 1;