From b24d36474334a74c286ffddc6af8fb74a3bae445 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 2 Apr 2017 18:06:15 +0200 Subject: [PATCH] Layed the groundwork for cuBLAS comparisons in the clients --- CMakeLists.txt | 27 +++++++-- src/utilities/utilities.hpp | 2 + test/correctness/tester.cpp | 38 ++++++++---- test/correctness/tester.hpp | 1 + test/performance/client.cpp | 35 ++++++++++-- test/performance/client.hpp | 12 +++- test/wrapper_cuda.hpp | 111 ++++++++++++++++++++++++++++++++++++ 7 files changed, 200 insertions(+), 26 deletions(-) create mode 100644 test/wrapper_cuda.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 62cf00cc..d9af0b91 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,17 +130,23 @@ if(TUNERS) endif() # Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake" -# and "FindCBLAS.cmake" are included. +# and "FindCBLAS.cmake" are included, "FindCUDA.cmake" is provided by CMake. if(CLIENTS OR TESTS) find_package(clBLAS) find_package(CBLAS) - if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND) + find_package(CUDA QUIET) # for cuBLAS + if(CUDA_FOUND) + message(STATUS "CUDA and cuBLAS found") + else() + message(STATUS "Could not find cuBLAS as a reference") + endif() + if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND AND NOT CUDA_FOUND) if(TESTS) - message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests") + message(STATUS "Could NOT find clBLAS nor a CPU BLAS nor cuBLAS, disabling the compilation of the tests") set(TESTS OFF) endif() if(CLIENTS) - message(STATUS "Could NOT find clBLAS nor a CPU BLAS, head-to-head performance comparison not supported in the clients") + message(STATUS "Could NOT find clBLAS nor a CPU BLAS nor cuBLAS, head-to-head performance comparison not supported in the clients") endif() endif() endif() @@ -320,13 +326,22 @@ if(CLIENTS OR TESTS) add_definitions(" -DCLBLAST_REF_CBLAS") endif() endif() + if(CUDA_FOUND) + set(REF_INCLUDES ${REF_INCLUDES} ${CUDA_INCLUDE_DIRS}) + set(REF_LIBRARIES ${REF_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES}) + if(MSVC) + add_definitions(" /DCLBLAST_REF_CUBLAS") + else() + add_definitions(" -DCLBLAST_REF_CUBLAS") + endif() + endif() endif() # ================================================================================================== # Section for the performance tests (i.e. the client). These compare against optionally a reference -# library, either clBLAS or a CPU BLAS. +# library, either clBLAS, a CPU BLAS, or CUDA's cuBLAS. if(CLIENTS) # Visual Studio requires the sources of non-exported objects/libraries @@ -372,7 +387,7 @@ endif() # ================================================================================================== # Section for the correctness tests. Note that these tests require the presence of clBLAS and/or a -# CPU BLAS library to act as a reference. +# CPU BLAS library, and/or cuBLAS to act as a reference. if(TESTS) enable_testing() diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 535560a3..7aadb983 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -81,6 +81,7 @@ constexpr auto kArgFraction = "fraction"; // The client-specific arguments in string form constexpr auto kArgCompareclblas = "clblas"; constexpr auto kArgComparecblas = "cblas"; +constexpr auto kArgComparecublas = "cublas"; constexpr auto kArgStepSize = "step"; constexpr auto kArgNumSteps = "num_steps"; constexpr auto kArgNumRuns = "runs"; @@ -188,6 +189,7 @@ struct Arguments { // Client-specific arguments int compare_clblas = 1; int compare_cblas = 1; + int compare_cublas = 1; size_t step = 1; size_t num_steps = 0; size_t num_runs = 10; diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index 40784fdb..b352c1aa 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -116,24 +116,38 @@ Tester::Tester(const std::vector &arguments, const bool silent tests_failed_{0} { options_ = options; + // Determines which reference is the default + auto default_clblas = 0; + auto default_cblas = 0; + auto default_cublas = 0; + #if defined(CLBLAST_REF_CBLAS) + default_cblas = 1; + #elif defined(CLBLAST_REF_CLBLAS) + default_clblas = 1; + #elif defined(CLBLAST_REF_CUBLAS) + default_cublas = 1; + #endif + // Determines which reference to test against - #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) - compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, 0); - compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, 1); - #elif CLBLAST_REF_CLBLAS - compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, 1); - compare_cblas_ = 0; - #elif CLBLAST_REF_CBLAS - compare_clblas_ = 0; - compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, 1); - #else - compare_clblas_ = 0; - compare_cblas_ = 0; + compare_clblas_ = 0; + compare_cblas_ = 0; + compare_cublas_ = 0; + #if defined(CLBLAST_REF_CBLAS) + compare_cblas_ = GetArgument(arguments, help_, kArgComparecblas, default_cblas); + #endif + #if defined(CLBLAST_REF_CLBLAS) + compare_clblas_ = GetArgument(arguments, help_, kArgCompareclblas, default_clblas); + #endif + #if defined(CLBLAST_REF_CUBLAS) + compare_cublas_ = GetArgument(arguments, help_, kArgComparecublas, default_cublas); #endif // Prints the help message (command-line arguments) if (!silent) { fprintf(stdout, "\n* %s\n", help_.c_str()); } + // Support for cuBLAS not available yet + if (compare_cublas_) { throw std::runtime_error("Cannot test against cuBLAS; not implemented yet"); } + // Can only test against a single reference (not two, not zero) if (compare_clblas_ && compare_cblas_) { throw std::runtime_error("Cannot test against both clBLAS and CBLAS references; choose one using the -cblas and -clblas arguments"); diff --git a/test/correctness/tester.hpp b/test/correctness/tester.hpp index f60be04b..8cfa702f 100644 --- a/test/correctness/tester.hpp +++ b/test/correctness/tester.hpp @@ -113,6 +113,7 @@ class Tester { // Testing against reference implementations int compare_cblas_; int compare_clblas_; + int compare_cublas_; private: diff --git a/test/performance/client.cpp b/test/performance/client.cpp index 48d6708e..a2f0f9f4 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -30,13 +30,14 @@ template const int Client::kSeed = 42; // fixed se template Client::Client(const Routine run_routine, const Reference1 run_reference1, const Reference2 run_reference2, - const std::vector &options, + const Reference3 run_reference3, const std::vector &options, const std::vector &buffers_in, const std::vector &buffers_out, const GetMetric get_flops, const GetMetric get_bytes): run_routine_(run_routine), run_reference1_(run_reference1), run_reference2_(run_reference2), + run_reference3_(run_reference3), options_(options), buffers_in_(buffers_in), buffers_out_(buffers_out), @@ -119,6 +120,11 @@ Arguments Client::ParseArguments(int argc, char *argv[], const size_t le #else args.compare_cblas = 0; #endif + #ifdef CLBLAST_REF_CUBLAS + args.compare_cublas = GetArgument(command_line_args, help, kArgComparecublas, 1); + #else + args.compare_cublas = 0; + #endif args.step = GetArgument(command_line_args, help, kArgStepSize, size_t{1}); args.num_steps = GetArgument(command_line_args, help, kArgNumSteps, size_t{0}); args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10}); @@ -133,24 +139,26 @@ Arguments Client::ParseArguments(int argc, char *argv[], const size_t le // Comparison against a non-BLAS routine is not supported if (level == 4) { // level-4 == level-X - if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (args.compare_clblas != 0 || args.compare_cblas != 0 || args.compare_cublas != 0) { if (!args.silent) { - fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for this non-BLAS routine\n\n"); + fprintf(stdout, "* Disabling clBLAS/CBLAS/cuBLAS comparisons for this non-BLAS routine\n\n"); } } args.compare_clblas = 0; args.compare_cblas = 0; + args.compare_cublas = 0; } - // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision + // Comparison against other BLAS libraries is not supported in case of half-precision if (args.precision == Precision::kHalf) { - if (args.compare_clblas != 0 || args.compare_cblas != 0) { + if (args.compare_clblas != 0 || args.compare_cblas != 0 || args.compare_cublas != 0) { if (!args.silent) { - fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for half-precision\n\n"); + fprintf(stdout, "* Disabling clBLAS/CBLAS/cuBLAS comparisons for half-precision\n\n"); } } args.compare_clblas = 0; args.compare_cblas = 0; + args.compare_cublas = 0; } // Returns the arguments @@ -174,6 +182,9 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) #ifdef CLBLAST_REF_CLBLAS if (args.compare_clblas) { clblasSetup(); } #endif + #ifdef CLBLAST_REF_CUBLAS + cudaSetDevice(static_cast(args.device_id)); + #endif // Iterates over all "num_step" values jumping by "step" each time auto s = size_t{0}; @@ -232,6 +243,16 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) HostToDevice(args, buffers, buffers_host, queue, buffers_out_); timings.push_back(std::pair("CPU BLAS", ms_cblas)); } + if (args.compare_cublas) { + auto buffers_host = BuffersHost(); + auto buffers_cuda = BuffersCUDA(); + DeviceToHost(args, buffers, buffers_host, queue, buffers_in_); + HostToCUDA(args, buffers_cuda, buffers_host, buffers_in_); + auto ms_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS"); + CUDAToHost(args, buffers_cuda, buffers_host, buffers_out_); + HostToDevice(args, buffers, buffers_host, queue, buffers_out_); + timings.push_back(std::pair("cuBLAS", ms_cublas)); + } // Prints the performance of the tested libraries PrintTableRow(args, timings); @@ -307,6 +328,7 @@ void Client::PrintTableHeader(const Arguments& args) { fprintf(stdout, " | <-- CLBlast -->"); if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); } if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); } + if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); } fprintf(stdout, " |\n"); } @@ -315,6 +337,7 @@ void Client::PrintTableHeader(const Arguments& args) { fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1"); if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); } if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); } + if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); } fprintf(stdout, "\n"); } diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 12fd113d..47a13017 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -31,6 +31,7 @@ #ifdef CLBLAST_REF_CLBLAS #include #endif +#include "test/wrapper_cuda.hpp" #include "clblast.h" namespace clblast { @@ -46,12 +47,13 @@ class Client { using Routine = std::function&, Buffers&, Queue&)>; using Reference1 = std::function&, Buffers&, Queue&)>; using Reference2 = std::function&, BuffersHost&, Queue&)>; + using Reference3 = std::function&, BuffersCUDA&, Queue&)>; using SetMetric = std::function&)>; using GetMetric = std::function&)>; // The constructor Client(const Routine run_routine, const Reference1 run_reference1, const Reference2 run_reference2, - const std::vector &options, + const Reference3 run_reference3, const std::vector &options, const std::vector &buffers_in, const std::vector &buffers_out, const GetMetric get_flops, const GetMetric get_bytes); @@ -84,6 +86,7 @@ class Client { const Routine run_routine_; const Reference1 run_reference1_; const Reference2 run_reference2_; + const Reference3 run_reference3_; const std::vector options_; const std::vector buffers_in_; const std::vector buffers_out_; @@ -118,9 +121,14 @@ void RunClient(int argc, char *argv[]) { #else auto reference2 = ReferenceNotAvailable>; #endif + #ifdef CLBLAST_REF_CUBLAS + auto reference3 = C::RunReference3; // cuBLAS when available + #else + auto reference3 = ReferenceNotAvailable>; + #endif // Creates a new client - auto client = Client(C::RunRoutine, reference1, reference2, C::GetOptions(), + auto client = Client(C::RunRoutine, reference1, reference2, reference3, C::GetOptions(), C::BuffersIn(), C::BuffersOut(), C::GetFlops, C::GetBytes); // Simple command line argument parser with defaults diff --git a/test/wrapper_cuda.hpp b/test/wrapper_cuda.hpp new file mode 100644 index 00000000..0f24d0d9 --- /dev/null +++ b/test/wrapper_cuda.hpp @@ -0,0 +1,111 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains all the CUDA related code; used only in case of testing against cuBLAS +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_WRAPPER_CUDA_H_ +#define CLBLAST_TEST_WRAPPER_CUDA_H_ + +#include +#include +#include +#include + +#include "utilities/utilities.hpp" + +#ifdef CLBLAST_REF_CUBLAS + #include + #include +#endif + +namespace clblast { +// ================================================================================================= + +// Copies data from the CUDA device to the host and frees-up the CUDA memory afterwards +#ifdef CLBLAST_REF_CUBLAS + template + void CUDAToHost(const T* buffer_cuda, const std::vector &buffer_host, const size_t size) { + cudaMemcpy( + std::reinterpret_cast(buffer_host.data()), + std::reinterpret_cast(buffer_cuda), + size*sizeof(T), + cudaMemcpyDeviceToHost + ); + cudaFree(buffer_cuda); +} +#else + template void CUDAToHost(const T*, const std::vector&, const size_t) { } +#endif + +// Allocates space on the CUDA device and copies in data from the host +#ifdef CLBLAST_REF_CUBLAS + template + void HostToCUDA(const T* buffer_cuda, const std::vector &buffer_host, const size_t size) { + cudaMalloc(std::reinterpret_cast&buffer_cuda, size*sizeof(T)); + cudaMemcpy( + std::reinterpret_cast(buffer_cuda), + std::reinterpret_cast(buffer_host.data()), + size*sizeof(T), + cudaMemcpyHostToDevice + ); + } +#else + template void HostToCUDA(const T*, const std::vector&, const size_t) { } +#endif + +// ================================================================================================= + +template +struct BuffersCUDA { + T* x_vec; + T* y_vec; + T* a_mat; + T* b_mat; + T* c_mat; + T* ap_mat; + T* scalar; +}; + +template +void CUDAToHost(const Arguments &args, BuffersCUDA &buffers, BuffersHost &buffers_host, + const std::vector &names) { + for (auto &name: names) { + if (name == kBufVecX) { buffers_host.x_vec = std::vector(args.x_size, static_cast(0)); CUDAToHost(buffers.x_vec, buffers_host.x_vec, args.x_size); } + else if (name == kBufVecY) { buffers_host.y_vec = std::vector(args.y_size, static_cast(0)); CUDAToHost(buffers.y_vec, buffers_host.y_vec, args.y_size); } + else if (name == kBufMatA) { buffers_host.a_mat = std::vector(args.a_size, static_cast(0)); CUDAToHost(buffers.a_mat, buffers_host.a_mat, args.a_size); } + else if (name == kBufMatB) { buffers_host.b_mat = std::vector(args.b_size, static_cast(0)); CUDAToHost(buffers.b_mat, buffers_host.b_mat, args.b_size); } + else if (name == kBufMatC) { buffers_host.c_mat = std::vector(args.c_size, static_cast(0)); CUDAToHost(buffers.c_mat, buffers_host.c_mat, args.c_size); } + else if (name == kBufMatAP) { buffers_host.ap_mat = std::vector(args.ap_size, static_cast(0)); CUDAToHost(buffers.ap_mat, buffers_host.ap_mat, args.ap_size); } + else if (name == kBufScalar) { buffers_host.scalar = std::vector(args.scalar_size, static_cast(0)); CUDAToHost(buffers.scalar, buffers_host.scalar, args.scalar_size); } + else { throw std::runtime_error("Invalid buffer name"); } + } +} + +template +void HostToCUDA(const Arguments &args, BuffersCUDA &buffers, BuffersHost &buffers_host, + const std::vector &names) { + for (auto &name: names) { + if (name == kBufVecX) { HostToCUDA(buffers.x_vec, buffers_host.x_vec, args.x_size); } + else if (name == kBufVecY) { HostToCUDA(buffers.y_vec, buffers_host.y_vec, args.y_size); } + else if (name == kBufMatA) { HostToCUDA(buffers.a_mat, buffers_host.a_mat, args.a_size); } + else if (name == kBufMatB) { HostToCUDA(buffers.b_mat, buffers_host.b_mat, args.b_size); } + else if (name == kBufMatC) { HostToCUDA(buffers.c_mat, buffers_host.c_mat, args.c_size); } + else if (name == kBufMatAP) { HostToCUDA(buffers.ap_mat, buffers_host.ap_mat, args.ap_size); } + else if (name == kBufScalar) { HostToCUDA(buffers.scalar, buffers_host.scalar, args.scalar_size); } + else { throw std::runtime_error("Invalid buffer name"); } + } +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_WRAPPER_CUDA_H_ +#endif