From e5d06cfc0f51a2f0845b235d21fdf9e814f19685 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:21:39 +0200 Subject: [PATCH] ggml : always define ggml_fp16_t as uint16_t (llama/5666) * ggml : always define ggml_fp16_t as uint16_t ggml-ci * ggml : cont ggml-ci * ggml : cont * ggml : cont ggml-ci * ggml : cont ggml-ci * cuda : no longer ggml headers last ggml-ci * ggml : fix q6_K FP16 -> FP32 conversion ggml-ci * ggml : more FP16 -> FP32 conversion fixes ggml-ci --- ggml-cuda.cu | 9 ++++----- ggml-impl.h | 27 ++++++++++++++++++++------- ggml-quants.c | 30 +++++++++++++++--------------- ggml.c | 6 +++--- ggml.h | 6 ------ 5 files changed, 42 insertions(+), 36 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e7c211d..b0e454e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,3 +1,7 @@ +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" + #include #include #include @@ -121,11 +125,6 @@ #endif // defined(GGML_USE_HIPBLAS) -// ggml-cuda need half type so keep ggml headers include at last -#include "ggml-cuda.h" -#include "ggml.h" -#include "ggml-backend-impl.h" - #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CC_PASCAL 600 diff --git a/ggml-impl.h b/ggml-impl.h index 19df66b..c5637e4 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -53,11 +53,23 @@ extern "C" { // #include -#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define GGML_COMPUTE_FP32_TO_FP16(x) (x) +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -#define GGML_FP16_TO_FP32(x) ((float) (x)) -#define GGML_FP32_TO_FP16(x) (x) +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __fp16 tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; +} #else @@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16]; // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) - +#if !defined(GGML_FP16_TO_FP32) inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); @@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { } #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif +#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif #define GGML_HASHTABLE_FULL ((size_t)-1) diff --git a/ggml-quants.c b/ggml-quants.c index 8917c8a..b15977f 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; - const float dmin = -y[i].d * (float)x[i].dmin; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; - const float dmin = -y[i].d * (float)x[i].dmin; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1)); q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2)); @@ -6660,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r aux16[1] = (a[0] >> 4) & 0x0f0f; const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]); - sum_mins += y[i].d * (float)x[i].d[1] * summi; + sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi; - const float d = y[i].d * (float)x[i].d[0]; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); @@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const int8_t * sc = x[i].scales; const uint8_t * restrict q5 = x[i].qs; @@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const int8_t * sc = x[i].scales; const uint8_t * restrict q5 = x[i].qs; @@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d_all = (float)x[i].d; + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d_all = (float)x[i].d; + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int ib = 0; ib < nb; ib += 2) { - q4bits.val[0] = vld1q_u8(x[ib+0].qs); q4bits.val[1] = vld1q_u8(x[ib+1].qs); q8b.val[0] = vld1q_s8(y[ib+0].qs); @@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); - sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2); - + sumf += + GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) + + GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2); } *s = sumf; diff --git a/ggml.c b/ggml.c index 5b9fa74..d710fe7 100644 --- a/ggml.c +++ b/ggml.c @@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16]; // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API float ggml_fp16_to_fp32(ggml_fp16_t x) { - return (float) GGML_FP16_TO_FP32(x); + return GGML_FP16_TO_FP32(x); } ggml_fp16_t ggml_fp32_to_fp16(float x) { @@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16x8 float16x8_t #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD vld1q_f16 + #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) #define GGML_F16x8_STORE vst1q_f16 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 @@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F32Cx4 float32x4_t #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32Cx4_ADD vaddq_f32 diff --git a/ggml.h b/ggml.h index bed7a36..37eff62 100644 --- a/ggml.h +++ b/ggml.h @@ -315,13 +315,7 @@ extern "C" { #endif -#if defined(__ARM_NEON) && defined(__CUDACC__) - typedef half ggml_fp16_t; -#elif defined(__ARM_NEON) && !defined(_MSC_VER) - typedef __fp16 ggml_fp16_t; -#else typedef uint16_t ggml_fp16_t; -#endif // convert FP16 <-> FP32 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);