diff --git a/samples/sgemm_cuda.cpp b/samples/sgemm_cuda.cpp index ed2ad588..f1138316 100644 --- a/samples/sgemm_cuda.cpp +++ b/samples/sgemm_cuda.cpp @@ -19,7 +19,7 @@ #include // Includes the CUDA driver API -#include +#include // Includes the CLBlast library #include @@ -43,14 +43,15 @@ int main() { const auto c_ld = n; // Initializes the OpenCL device + cuInit(0); CUdevice device; cuDeviceGet(&device, device_id); // Creates the OpenCL context and stream CUcontext context; - cuCtxCreate(context, 0, device); + cuCtxCreate(&context, 0, device); CUstream stream; - cuStreamCreate(queue, CU_STREAM_NON_BLOCKING); + cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING); // Populate host matrices with some example data auto host_a = std::vector(m*k); @@ -64,12 +65,12 @@ int main() { CUdeviceptr device_a; CUdeviceptr device_b; CUdeviceptr device_c; - cuMemAlloc(device_a, host_a.size()*sizeof(float)); - cuMemAlloc(device_b, host_b.size()*sizeof(float)); - cuMemAlloc(device_c, host_c.size()*sizeof(float)); - cuMemcpyHtoDAsync(device_a, host_a.data()), host_a.size()*sizeof(T), queue); - cuMemcpyHtoDAsync(device_b, host_c.data()), host_b.size()*sizeof(T), queue); - cuMemcpyHtoDAsync(device_c, host_b.data()), host_c.size()*sizeof(T), queue); + cuMemAlloc(&device_a, host_a.size()*sizeof(float)); + cuMemAlloc(&device_b, host_b.size()*sizeof(float)); + cuMemAlloc(&device_c, host_c.size()*sizeof(float)); + cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream); + cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(float), stream); + cuMemcpyHtoDAsync(device_c, host_c.data(), host_c.size()*sizeof(float), stream); // Start the timer auto start_time = std::chrono::steady_clock::now(); @@ -79,11 +80,12 @@ int main() { clblast::Transpose::kNo, clblast::Transpose::kNo, m, n, k, alpha, - device_a(), 0, a_ld, - device_b(), 0, b_ld, + device_a, 0, a_ld, + device_b, 0, b_ld, beta, - device_c(), 0, c_ld, + device_c, 0, c_ld, context, device); + 
cuStreamSynchronize(stream); // Record the execution time auto elapsed_time = std::chrono::steady_clock::now() - start_time; diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 5413906a..2d18655f 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -56,7 +56,8 @@ def clblast_cc(routine, cuda=False): result += " auto queue_cpp = Queue(context_cpp, device_cpp);" + NL else: result += " auto queue_cpp = Queue(*queue);" + NL - result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL + event = "nullptr" if cuda else "event" + result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, " + event + ");" + NL if routine.batched: result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL result += " routine.Do" + routine.capitalized_name() + "(" diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp index f9a24236..0e3d949d 100644 --- a/src/clblast_cuda.cpp +++ b/src/clblast_cuda.cpp @@ -120,7 +120,7 @@ StatusCode Swap(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xswap(queue_cpp, event); + auto routine = Xswap(queue_cpp, nullptr); routine.DoSwap(n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); @@ -158,7 +158,7 @@ StatusCode Scal(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xscal(queue_cpp, event); + auto routine = Xscal(queue_cpp, nullptr); routine.DoScal(n, alpha, Buffer(x_buffer), x_offset, x_inc); @@ -196,7 +196,7 @@ StatusCode Copy(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xcopy(queue_cpp, event); 
+ auto routine = Xcopy(queue_cpp, nullptr); routine.DoCopy(n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); @@ -235,7 +235,7 @@ StatusCode Axpy(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xaxpy(queue_cpp, event); + auto routine = Xaxpy(queue_cpp, nullptr); routine.DoAxpy(n, alpha, Buffer(x_buffer), x_offset, x_inc, @@ -280,7 +280,7 @@ StatusCode Dot(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xdot(queue_cpp, event); + auto routine = Xdot(queue_cpp, nullptr); routine.DoDot(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, @@ -315,7 +315,7 @@ StatusCode Dotu(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xdotu(queue_cpp, event); + auto routine = Xdotu(queue_cpp, nullptr); routine.DoDotu(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, @@ -345,7 +345,7 @@ StatusCode Dotc(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xdotc(queue_cpp, event); + auto routine = Xdotc(queue_cpp, nullptr); routine.DoDotc(n, Buffer(dot_buffer), dot_offset, Buffer(x_buffer), x_offset, x_inc, @@ -374,7 +374,7 @@ StatusCode Nrm2(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xnrm2(queue_cpp, event); + auto routine = Xnrm2(queue_cpp, nullptr); routine.DoNrm2(n, Buffer(nrm2_buffer), nrm2_offset, Buffer(x_buffer), x_offset, x_inc); @@ -412,7 +412,7 @@ StatusCode Asum(const size_t n, const auto context_cpp = Context(context); const auto device_cpp 
= Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xasum(queue_cpp, event); + auto routine = Xasum(queue_cpp, nullptr); routine.DoAsum(n, Buffer(asum_buffer), asum_offset, Buffer(x_buffer), x_offset, x_inc); @@ -450,7 +450,7 @@ StatusCode Sum(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsum(queue_cpp, event); + auto routine = Xsum(queue_cpp, nullptr); routine.DoSum(n, Buffer(sum_buffer), sum_offset, Buffer(x_buffer), x_offset, x_inc); @@ -488,7 +488,7 @@ StatusCode Amax(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xamax(queue_cpp, event); + auto routine = Xamax(queue_cpp, nullptr); routine.DoAmax(n, Buffer(imax_buffer), imax_offset, Buffer(x_buffer), x_offset, x_inc); @@ -526,7 +526,7 @@ StatusCode Amin(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xamin(queue_cpp, event); + auto routine = Xamin(queue_cpp, nullptr); routine.DoAmin(n, Buffer(imin_buffer), imin_offset, Buffer(x_buffer), x_offset, x_inc); @@ -564,7 +564,7 @@ StatusCode Max(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xmax(queue_cpp, event); + auto routine = Xmax(queue_cpp, nullptr); routine.DoMax(n, Buffer(imax_buffer), imax_offset, Buffer(x_buffer), x_offset, x_inc); @@ -602,7 +602,7 @@ StatusCode Min(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xmin(queue_cpp, event); + auto routine = Xmin(queue_cpp, nullptr); routine.DoMin(n, Buffer(imin_buffer), imin_offset, Buffer(x_buffer), 
x_offset, x_inc); @@ -648,7 +648,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgemv(queue_cpp, event); + auto routine = Xgemv(queue_cpp, nullptr); routine.DoGemv(layout, a_transpose, m, n, alpha, @@ -714,7 +714,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgbmv(queue_cpp, event); + auto routine = Xgbmv(queue_cpp, nullptr); routine.DoGbmv(layout, a_transpose, m, n, kl, ku, alpha, @@ -780,7 +780,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhemv(queue_cpp, event); + auto routine = Xhemv(queue_cpp, nullptr); routine.DoHemv(layout, triangle, n, alpha, @@ -822,7 +822,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhbmv(queue_cpp, event); + auto routine = Xhbmv(queue_cpp, nullptr); routine.DoHbmv(layout, triangle, n, k, alpha, @@ -864,7 +864,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhpmv(queue_cpp, event); + auto routine = Xhpmv(queue_cpp, nullptr); routine.DoHpmv(layout, triangle, n, alpha, @@ -906,7 +906,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = 
Xsymv(queue_cpp, event); + auto routine = Xsymv(queue_cpp, nullptr); routine.DoSymv(layout, triangle, n, alpha, @@ -956,7 +956,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsbmv(queue_cpp, event); + auto routine = Xsbmv(queue_cpp, nullptr); routine.DoSbmv(layout, triangle, n, k, alpha, @@ -1006,7 +1006,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xspmv(queue_cpp, event); + auto routine = Xspmv(queue_cpp, nullptr); routine.DoSpmv(layout, triangle, n, alpha, @@ -1053,7 +1053,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtrmv(queue_cpp, event); + auto routine = Xtrmv(queue_cpp, nullptr); routine.DoTrmv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, @@ -1098,7 +1098,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtbmv(queue_cpp, event); + auto routine = Xtbmv(queue_cpp, nullptr); routine.DoTbmv(layout, triangle, a_transpose, diagonal, n, k, Buffer(a_buffer), a_offset, a_ld, @@ -1143,7 +1143,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtpmv(queue_cpp, event); + auto routine = Xtpmv(queue_cpp, nullptr); routine.DoTpmv(layout, triangle, 
a_transpose, diagonal, n, Buffer(ap_buffer), ap_offset, @@ -1188,7 +1188,7 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtrsv(queue_cpp, event); + auto routine = Xtrsv(queue_cpp, nullptr); routine.DoTrsv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, @@ -1290,7 +1290,7 @@ StatusCode Ger(const Layout layout, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xger(queue_cpp, event); + auto routine = Xger(queue_cpp, nullptr); routine.DoGer(layout, m, n, alpha, @@ -1335,7 +1335,7 @@ StatusCode Geru(const Layout layout, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgeru(queue_cpp, event); + auto routine = Xgeru(queue_cpp, nullptr); routine.DoGeru(layout, m, n, alpha, @@ -1373,7 +1373,7 @@ StatusCode Gerc(const Layout layout, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgerc(queue_cpp, event); + auto routine = Xgerc(queue_cpp, nullptr); routine.DoGerc(layout, m, n, alpha, @@ -1410,7 +1410,7 @@ StatusCode Her(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xher,T>(queue_cpp, event); + auto routine = Xher,T>(queue_cpp, nullptr); routine.DoHer(layout, triangle, n, alpha, @@ -1444,7 +1444,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = 
Xhpr,T>(queue_cpp, event); + auto routine = Xhpr,T>(queue_cpp, nullptr); routine.DoHpr(layout, triangle, n, alpha, @@ -1479,7 +1479,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xher2(queue_cpp, event); + auto routine = Xher2(queue_cpp, nullptr); routine.DoHer2(layout, triangle, n, alpha, @@ -1517,7 +1517,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhpr2(queue_cpp, event); + auto routine = Xhpr2(queue_cpp, nullptr); routine.DoHpr2(layout, triangle, n, alpha, @@ -1554,7 +1554,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsyr(queue_cpp, event); + auto routine = Xsyr(queue_cpp, nullptr); routine.DoSyr(layout, triangle, n, alpha, @@ -1594,7 +1594,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xspr(queue_cpp, event); + auto routine = Xspr(queue_cpp, nullptr); routine.DoSpr(layout, triangle, n, alpha, @@ -1635,7 +1635,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsyr2(queue_cpp, event); + auto routine = Xsyr2(queue_cpp, nullptr); routine.DoSyr2(layout, triangle, n, alpha, @@ -1680,7 +1680,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); 
auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xspr2(queue_cpp, event); + auto routine = Xspr2(queue_cpp, nullptr); routine.DoSpr2(layout, triangle, n, alpha, @@ -1730,7 +1730,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xgemm(queue_cpp, event); + auto routine = Xgemm(queue_cpp, nullptr); routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha, @@ -1796,7 +1796,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsymm(queue_cpp, event); + auto routine = Xsymm(queue_cpp, nullptr); routine.DoSymm(layout, side, triangle, m, n, alpha, @@ -1862,7 +1862,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xhemm(queue_cpp, event); + auto routine = Xhemm(queue_cpp, nullptr); routine.DoHemm(layout, side, triangle, m, n, alpha, @@ -1903,7 +1903,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsyrk(queue_cpp, event); + auto routine = Xsyrk(queue_cpp, nullptr); routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha, @@ -1962,7 +1962,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xherk,T>(queue_cpp, event); + auto routine = 
Xherk,T>(queue_cpp, nullptr); routine.DoHerk(layout, triangle, a_transpose, n, k, alpha, @@ -2001,7 +2001,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xsyr2k(queue_cpp, event); + auto routine = Xsyr2k(queue_cpp, nullptr); routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha, @@ -2067,7 +2067,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xher2k(queue_cpp, event); + auto routine = Xher2k(queue_cpp, nullptr); routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha, @@ -2107,7 +2107,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtrmm(queue_cpp, event); + auto routine = Xtrmm(queue_cpp, nullptr); routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, @@ -2159,7 +2159,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xtrsm(queue_cpp, event); + auto routine = Xtrsm(queue_cpp, nullptr); routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, @@ -2209,7 +2209,7 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xomatcopy(queue_cpp, event); + auto routine = Xomatcopy(queue_cpp, nullptr); routine.DoOmatcopy(layout, 
a_transpose, m, n, alpha, @@ -2259,7 +2259,7 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = Xim2col(queue_cpp, event); + auto routine = Xim2col(queue_cpp, nullptr); routine.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, Buffer(im_buffer), im_offset, Buffer(col_buffer), col_offset); @@ -2299,7 +2299,7 @@ StatusCode AxpyBatched(const size_t n, const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = XaxpyBatched(queue_cpp, event); + auto routine = XaxpyBatched(queue_cpp, nullptr); auto alphas_cpp = std::vector(); auto x_offsets_cpp = std::vector(); auto y_offsets_cpp = std::vector(); @@ -2362,7 +2362,7 @@ StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const T const auto context_cpp = Context(context); const auto device_cpp = Device(device); auto queue_cpp = Queue(context_cpp, device_cpp); - auto routine = XgemmBatched(queue_cpp, event); + auto routine = XgemmBatched(queue_cpp, nullptr); auto alphas_cpp = std::vector(); auto betas_cpp = std::vector(); auto a_offsets_cpp = std::vector(); diff --git a/src/cupp11.hpp b/src/cupp11.hpp index 988366ea..854c0be9 100644 --- a/src/cupp11.hpp +++ b/src/cupp11.hpp @@ -370,6 +370,8 @@ using ContextPointer = CUcontext*; // C++11 version of 'nvrtcProgram'. Additionally holds the program's source code. 
class Program { public: + Program() = default; + // Note that there is no constructor based on the regular CUDA data-type because of extra state // Source-based constructor with memory management @@ -404,7 +406,7 @@ public: // Confirms whether a certain status code is an actual compilation error or warning bool StatusIsCompilationWarningOrError(const nvrtcResult status) const { - return (status == NVRTC_ERROR_INVALID_INPUT); + return (status == NVRTC_ERROR_COMPILATION); } // Retrieves the warning/error message from the compiler (if any) @@ -433,8 +435,8 @@ public: const nvrtcProgram& operator()() const { return *program_; } private: std::shared_ptr program_; - const std::string source_; - const bool from_binary_; + std::string source_; + bool from_binary_; }; // ================================================================================================= @@ -730,7 +732,7 @@ public: // TODO: Implement this function void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event, - std::vector& waitForEvents) { + const std::vector& waitForEvents) { if (local.size() == 0) { throw LogicError("Kernel: launching with a default workgroup size is not implemented for the CUDA back-end"); } diff --git a/src/cxpp11_common.hpp b/src/cxpp11_common.hpp index 6ac008be..5097eac4 100644 --- a/src/cxpp11_common.hpp +++ b/src/cxpp11_common.hpp @@ -15,6 +15,7 @@ #ifndef CLBLAST_CXPP11_COMMON_H_ #define CLBLAST_CXPP11_COMMON_H_ +#include // strchr #include // std::string #include // std::runtime_error diff --git a/src/kernels/opencl_to_cuda.h b/src/kernels/opencl_to_cuda.h index 43a26a2f..2e46bc2b 100644 --- a/src/kernels/opencl_to_cuda.h +++ b/src/kernels/opencl_to_cuda.h @@ -11,6 +11,11 @@ // // ================================================================================================= +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). 
Comment-out this line for syntax-highlighting when developing. +R"( +// ================================================================================================= + // Replaces the OpenCL keywords with CUDA equivalent #define __kernel __placeholder__ #define __global @@ -49,3 +54,9 @@ typedef struct { float s0; float s1; float s2; float s3; float s12; float s13; float s14; float s15; } float16; // ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= +