From d94d086d6f92ff1f73bd2a8595a974f6802b3f24 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Wed, 10 May 2023 12:48:25 +0200
Subject: [PATCH] TBMV/TPMV/TRSV: Use the minimum x buffer size for copying to a temp buffer (#461)

---
 CHANGELOG                     | 4 ++--
 src/routines/level2/xtbmv.cpp | 5 +++--
 src/routines/level2/xtpmv.cpp | 5 +++--
 src/routines/level2/xtrsv.cpp | 2 +-
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 39813f94..345bab2a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,4 @@
 Development version (next version)
-- Fixes a minor issue with the expected input buffer size in the TRMV routine
-- Fixes two small issues in the plotting script
 - Modifications to improve performance on Qualcomm Adreno GPUs:
   * Unique database entries for specific Adreno devices
   * Toggle OpenCL kernel compilation options for Adreno
@@ -9,6 +7,8 @@ Development version (next version)
 - Fixed a bug in XAMAX/XAMIN routines related to inadvertently including the increment and offset in the result
 - Fixed a bug in XAMAX/XAMIN routines that would cause only the real part of a complex number to be taken into account
 - Fixed a bug that caused tests to not properly do integer-output testing (for XAMAX/XAMIN)
+- Fixes a minor issue with the expected input buffer size in the TRMV/TBMV/TPMV/TRSV routines
+- Fixes two small issues in the plotting script
 - Fixed a documentation bug in the 'ld' requirements
 - Added tuned parameters for various devices (see doc/tuning.md)
 
diff --git a/src/routines/level2/xtbmv.cpp b/src/routines/level2/xtbmv.cpp
index 117d26e0..87053deb 100644
--- a/src/routines/level2/xtbmv.cpp
+++ b/src/routines/level2/xtbmv.cpp
@@ -36,8 +36,9 @@ void Xtbmv::DoTbmv(const Layout layout, const Triangle triangle,
                    const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) {
 
   // Creates a copy of X: a temporary scratch buffer
-  auto scratch_buffer = Buffer(context_, n*x_inc + x_offset);
-  x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  const auto x_size = (1 + (n - 1) * x_inc) + x_offset;
+  auto scratch_buffer = Buffer(context_, x_size);
+  x_buffer.CopyTo(queue_, x_size, scratch_buffer);
 
   // The data is either in the upper or lower triangle
   size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
diff --git a/src/routines/level2/xtpmv.cpp b/src/routines/level2/xtpmv.cpp
index 00282378..2190a6f5 100644
--- a/src/routines/level2/xtpmv.cpp
+++ b/src/routines/level2/xtpmv.cpp
@@ -36,8 +36,9 @@ void Xtpmv::DoTpmv(const Layout layout, const Triangle triangle,
                    const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) {
 
   // Creates a copy of X: a temporary scratch buffer
-  auto scratch_buffer = Buffer(context_, n*x_inc + x_offset);
-  x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  const auto x_size = (1 + (n - 1) * x_inc) + x_offset;
+  auto scratch_buffer = Buffer(context_, x_size);
+  x_buffer.CopyTo(queue_, x_size, scratch_buffer);
 
   // The data is either in the upper or lower triangle
   size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index 2a5a5664..b50b259b 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -99,7 +99,7 @@ void Xtrsv::DoTrsv(const Layout layout, const Triangle triangle,
   // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels
   const auto x_offset = b_offset;
   const auto x_inc = b_inc;
-  const auto x_size = n*x_inc + x_offset;
+  const auto x_size = (1 + (n - 1) * x_inc) + x_offset;
   auto x_buffer = Buffer(context_, x_size);
   b_buffer.CopyTo(queue_, x_size, x_buffer);
 