// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file implements the common routine functions (see the header for more information). // // ================================================================================================= #include #include #include #include "routines/common.hpp" namespace clblast { // ================================================================================================= // Enqueues a kernel, waits for completion, and checks for errors void RunKernel(Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local, EventPointer event, const std::vector &waitForEvents) { if (!local.empty()) { // Tests for validity of the local thread sizes if (local.size() > device.MaxWorkItemDimensions()) { throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions); } const auto max_work_item_sizes = device.MaxWorkItemSizes(); for (auto i=size_t{0}; i max_work_item_sizes[i]) { throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim); } } auto local_size = size_t{1}; for (auto &item: local) { local_size *= item; } if (local_size > device.MaxWorkGroupSize()) { throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal, ToString(local_size) + " is larger than " + ToString(device.MaxWorkGroupSize())); } // Make sure the global thread sizes are at least equal to the local sizes for (auto i=size_t{0}; i(elapsed_time).count(); printf("[DEBUG] Completed kernel in %.2lf ms\n", timing); #endif } // ================================================================================================= // Sets all elements of a matrix to a constant value template void FillMatrix(Queue &queue, const Device &device, const std::shared_ptr program, EventPointer event, const std::vector &waitForEvents, const size_t m, const size_t n, const size_t ld, const size_t offset, const Buffer &dest, const T constant_value, const size_t local_size) { auto kernel = Kernel(program, "FillMatrix"); kernel.SetArgument(0, static_cast(m)); kernel.SetArgument(1, static_cast(n)); kernel.SetArgument(2, static_cast(ld)); kernel.SetArgument(3, static_cast(offset)); kernel.SetArgument(4, dest()); kernel.SetArgument(5, GetRealArg(constant_value)); auto local = std::vector{local_size, 1}; auto global = std::vector{Ceil(m, local_size), n}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } // Compiles the above function template void FillMatrix(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const half, const size_t); template void FillMatrix(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const float, const size_t); template void FillMatrix(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const double, const size_t); template void FillMatrix(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const float2, const size_t); template void FillMatrix(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const double2, const size_t); // Sets all elements of a vector to a constant value template void FillVector(Queue &queue, const Device &device, const std::shared_ptr program, EventPointer event, const std::vector &waitForEvents, const size_t n, const size_t inc, const size_t offset, const Buffer &dest, const T constant_value, const size_t local_size) { auto kernel = Kernel(program, "FillVector"); kernel.SetArgument(0, static_cast(n)); kernel.SetArgument(1, static_cast(inc)); kernel.SetArgument(2, static_cast(offset)); kernel.SetArgument(3, dest()); kernel.SetArgument(4, GetRealArg(constant_value)); auto local = std::vector{local_size}; auto global = std::vector{Ceil(n, local_size)}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } // Compiles the above function template void FillVector(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const half, const size_t); template void FillVector(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const float, const size_t); template void FillVector(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const double, const size_t); template void FillVector(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const float2, const size_t); template void FillVector(Queue&, const Device&, const std::shared_ptr, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const double2, const size_t); // ================================================================================================= } // namespace clblast