diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5fd625630..8357f32f7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -81,12 +81,29 @@ #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { const int8x4_t va = reinterpret_cast(a); const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) } static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {