From f04d0028444bc9b3d4225fba47e19d4c3aeb3741 Mon Sep 17 00:00:00 2001 From: Engininja2 <139037756+Engininja2@users.noreply.github.com> Date: Fri, 1 Sep 2023 15:33:19 -0600 Subject: [PATCH] cuda : vsubss4 for older versions of ROCm/clang (#2942) --- ggml-cuda.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5fd625630..8357f32f7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -81,12 +81,29 @@ #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { const int8x4_t va = reinterpret_cast(a); const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) } static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {