From c82642ce356bc73c4f9ab7c8f23d0e08ac96f4d9 Mon Sep 17 00:00:00 2001 From: Rob Sykes Date: Fri, 4 Mar 2016 22:18:36 +0000 Subject: [PATCH] use latest pffft --- src/pffft-wrap.c | 108 +++++++++ src/pffft.c | 621 +++++++++++++++++++++++++++++++---------------- src/pffft.h | 52 ++-- src/pffft32.c | 2 +- src/pffft32s.c | 2 +- 5 files changed, 557 insertions(+), 228 deletions(-) create mode 100644 src/pffft-wrap.c diff --git a/src/pffft-wrap.c b/src/pffft-wrap.c new file mode 100644 index 0000000..dd2d668 --- /dev/null +++ b/src/pffft-wrap.c @@ -0,0 +1,108 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#if !defined PFFT_MACROS_ONLY + +#include "simd.h" +#include + +#define pffft_aligned_free _soxr_simd_aligned_free +#define pffft_aligned_malloc _soxr_simd_aligned_malloc +#define pffft_aligned_calloc _soxr_simd_aligned_calloc + +#if defined _MSC_VER + #define sin (float)sin + #define cos (float)cos +#else + #define sin(x) sinf((float)(x)) + #define cos(x) cosf((float)(x)) +#endif + +#endif + + + +#include "pffft.c" + + + +#if !defined PFFT_MACROS_ONLY + +#if !defined PFFFT_SIMD_DISABLE + +static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) { + int i, Ncvec = s->Ncvec; + const v4sf * /*RESTRICT*/ va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * /*RESTRICT*/ vab = (v4sf*)ab; + + float ar, ai, br, bi; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + ar = ((v4sf_union*)va)[0].f[0]; + ai = ((v4sf_union*)va)[1].f[0]; + br = ((v4sf_union*)vb)[0].f[0]; + bi = ((v4sf_union*)vb)[1].f[0]; + + for (i=0; i < Ncvec; i += 2) { + v4sf ar, ai, br, bi; + ar = va[2*i+0]; ai = va[2*i+1]; + br = vb[2*i+0]; bi = vb[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+0] = ar; + vab[2*i+1] = ai; + ar = va[2*i+2]; ai = va[2*i+3]; + br = vb[2*i+2]; bi = vb[2*i+3]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+2] = ar; + vab[2*i+3] = ai; + } + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = ar*br; + ((v4sf_union*)vab)[1].f[0] = ai*bi; + } +} + +#else + +static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) { + int i, Ncvec = s->Ncvec; + + if (s->transform == PFFFT_REAL) { + /* take care of the fftpack ordering */ + ab[0] = a[0]*b[0]; + ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1]; + ++ab; ++a; ++b; --Ncvec; + } + for (i=0; i < Ncvec; ++i) { + float ar, ai, br, bi; + ar = a[2*i+0]; ai = a[2*i+1]; + br = b[2*i+0]; bi = b[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + ab[2*i+0] = ar; + ab[2*i+1] = ai; + } +} + +#endif + +#include + +static void pffft_reorder_back(int length, void * setup, float * data, float * work) +{ + memcpy(work, data, (unsigned)length * sizeof(*work)); + pffft_zreorder(setup, work, data, PFFFT_BACKWARD); +} + +#endif diff --git a/src/pffft.c b/src/pffft.c index 9b4f59d..b68f86c 100644 --- a/src/pffft.c +++ b/src/pffft.c @@ -1,4 +1,7 @@ -/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com ) +/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.c + * with minor changes for libsoxr. 
*/ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) Based on original fortran 77 code from FFTPACKv4 from NETLIB (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber @@ -57,29 +60,12 @@ - 2011/10/02, version 1: This is the very first release of this file. */ -#if !defined PFFT_MACROS_ONLY #include "pffft.h" -#include "simd.h" -#include #include +#include #include #include -#define pffft_aligned_free _soxr_simd_aligned_free -#define pffft_aligned_malloc _soxr_simd_aligned_malloc -#define pffft_aligned_calloc _soxr_simd_aligned_calloc -#endif - -/* - vector support macros: the rest of the code is independant of - SSE/Altivec/NEON -- adding support for other platforms with 4-element - vectors should be limited to these macros -*/ - - -/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */ -/*#define PFFFT_SIMD_DISABLE */ - /* detect compiler flavour */ #if defined(_MSC_VER) # define COMPILER_MSVC @@ -91,14 +77,25 @@ # define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) # define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) # define RESTRICT __restrict -/*# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; */ +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; #elif defined(COMPILER_MSVC) # define ALWAYS_INLINE(return_type) __forceinline return_type # define NEVER_INLINE(return_type) __declspec(noinline) return_type # define RESTRICT __restrict -/*# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (v4sf*)_alloca(size__ * sizeof(type__)) */ +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) #endif + +/* + vector support macros: the rest of the code is independant of + SSE/Altivec/NEON -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ + + +/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */ +/*#define PFFFT_SIMD_DISABLE */ + /* Altivec support macros */ @@ -166,7 +163,7 @@ typedef float32x4_t v4sf; # define LD_PS1(p) vld1q_dup_f32(&(p)) # define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } # define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } -# define VTRANSPOSE4_(x0,x1,x2,x3) { \ +# define VTRANSPOSE4(x0,x1,x2,x3) { \ float32x4x2_t t0_ = vzipq_f32(x0, x2); \ float32x4x2_t t1_ = vzipq_f32(x1, x3); \ float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ @@ -174,7 +171,7 @@ typedef float32x4_t v4sf; x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ } /* marginally faster version */ -# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +/*# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } */ # define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) # define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0) #else @@ -200,6 +197,10 @@ typedef float v4sf; /* shortcuts for complex multiplcations */ #define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } #define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; 
tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } +#ifndef SVMUL +/* multiply a scalar with a vector */ +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif #if !defined(PFFFT_SIMD_DISABLE) typedef union v4sf_union { @@ -252,20 +253,25 @@ void validate_pffft_simd() { #endif #endif /*!PFFFT_SIMD_DISABLE */ -#if !defined PFFT_MACROS_ONLY +#if 0 +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */ +void *pffft_aligned_malloc(size_t nb_bytes) { + void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); + if (!p0) return (void *) 0; + p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); + *((void **) p - 1) = p0; + return p; +} +void pffft_aligned_free(void *p) { + if (p) free(*((void **) p - 1)); +} -#if defined (COMPILER_MSVC) - #define sin (float)sin - #define cos (float)cos -#else - #define sin sinf - #define cos cosf +int pffft_simd_size() { return SIMD_SZ; } #endif -/* -int pffft_simd_size() { return SIMD_SZ; } -*/ +#if !defined PFFT_MACROS_ONLY /* passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 @@ -299,6 +305,7 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c /* passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 */ +#if 0 static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, const float *wa2, float fsign) { static const float taur = -0.5f; @@ -311,13 +318,13 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { for (i=0; i 2); + for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { + for (i = 0; i < ido-1; i += 2) { + ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); + ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); + ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); + ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); + tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); + tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); + ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); + cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); + ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); + cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); + ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); + cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + dr3 = VSUB(cr3, ci4); + dr4 = VADD(cr3, ci4); + di3 = VADD(ci3, cr4); + di4 = VSUB(ci3, cr4); + dr5 = VADD(cr2, ci5); + dr2 = VSUB(cr2, ci5); + di5 = VSUB(ci2, cr5); + di2 = VADD(ci2, cr5); + wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; + wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} +#endif + 
static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { static const float minus_one = -1.f; int i, k, l1ido = l1*ido; @@ -425,7 +505,7 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4 if (ido % 2 == 1) return; } for (k=0; k < l1ido; k += ido) { - ch[2*k + ido] = VMUL(LD_PS1(minus_one), cc[ido-1 + k + l1ido]); + ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]); ch[2*k + ido-1] = cc[k + ido-1]; } } /* radf2 */ @@ -460,10 +540,11 @@ static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, co for (k = 0; k < l1ido; k += ido) { a = cc[2*k + ido-1]; b = cc[2*k + ido]; ch[k + ido-1] = VADD(a,a); - ch[k + ido-1 + l1ido] = VMUL(LD_PS1(minus_two), b); + ch[k + ido-1 + l1ido] = SVMUL(minus_two, b); } } /* radb2 */ +#if 0 static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1, const float *wa2) { static const float taur = -0.5f; @@ -473,8 +554,8 @@ static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT for (k=0; k= 32); } - if (transform == PFFFT_COMPLEX) { assert(N >= 16); } + int k, m; + if (!s) return s; + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } + if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } /*assert((N % 32) == 0); */ s->N = N; s->transform = transform; /* nb of complex simd vectors */ s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; s->data = (v4sf*)pffft_aligned_malloc(2*(size_t)s->Ncvec * sizeof(v4sf)); - if (!s->data) { - free(s); - return 0; - } + if (!s->data) {free(s); return 0;} s->e = (float*)s->data; s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); @@ -988,15 +1278,22 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { } cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); } + + /* check that N is decomposable with allowed prime factors */ + for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } + if (m != N/SIMD_SZ) { + pffft_destroy_setup(s); s = 0; + } + return s; } -static void pffft_destroy_setup(PFFFT_Setup *s) { - if(s){ - pffft_aligned_free(s->data); - free(s); - } +static +void pffft_destroy_setup(PFFFT_Setup *s) { + if (!s) return; + pffft_aligned_free(s->data); + free(s); } #if !defined(PFFFT_SIMD_DISABLE) @@ -1035,7 +1332,8 @@ static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { UNINTERLEAVE2(h0, g1, out[0], out[1]); } -static void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { +static +void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { int k, N = setup->N, Ncvec = setup->Ncvec; const v4sf *vin = (const v4sf*)in; v4sf *vout = (v4sf*)out; @@ -1072,7 +1370,8 @@ static void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pfff } } -static void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { +static +void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ v4sf r0, i0, r1, i1, r2, i2, r3, i3; v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; @@ -1116,7 +1415,8 @@ static void pffft_cplx_finalize(int Ncvec, const v4sf *in, 
v4sf *out, const v4sf } } -static void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { +static +void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ v4sf r0, i0, r1, i1, r2, i2, r3, i3; v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; @@ -1342,22 +1642,23 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf } -static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, +static +void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, pffft_direction_t direction, int ordered) { int k, Ncvec = setup->Ncvec; int nf_odd = (setup->ifac[1] & 1); +#if 0 /* temporary buffer is allocated on the stack if the scratch pointer is NULL */ - /*int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); */ - /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */ + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); +#endif - int ib = (nf_odd ^ ordered ? 1 : 0); const v4sf *vinput = (const v4sf*)finput; v4sf *voutput = (v4sf*)foutput; v4sf *buff[2]; - buff[0] = voutput, buff[1] = scratch /*? scratch : scratch_on_stack*/; - - /*if (scratch == 0) scratch = scratch_on_stack; */ + int ib = (nf_odd ^ ordered ? 1 : 0); + buff[0] = voutput; buff[1] = scratch; assert(VALIGNED(finput) && VALIGNED(foutput)); @@ -1415,8 +1716,8 @@ static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, fl } #if 0 -static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { - int i, Ncvec = s->Ncvec; +void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + int Ncvec = s->Ncvec; const v4sf * RESTRICT va = (const v4sf*)a; const v4sf * RESTRICT vb = (const v4sf*)b; v4sf * RESTRICT vab = (v4sf*)ab; @@ -1434,10 +1735,16 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo __builtin_prefetch(va+6); __builtin_prefetch(vb+6); __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif #endif float ar, ai, br, bi, abr, abi; +#ifndef ZCONVOLVE_USING_INLINE_ASM v4sf vscal = LD_PS1(scaling); + int i; +#endif assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); ar = ((v4sf_union*)va)[0].f[0]; @@ -1447,8 +1754,7 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo abr = ((v4sf_union*)vab)[0].f[0]; abi = ((v4sf_union*)vab)[1].f[0]; -#ifdef __arm__ -# if 1 /* inline asm version */ +#ifdef ZCONVOLVE_USING_INLINE_ASM /* inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. 
so this will be restricted to gcc */ const float *a_ = a, *b_ = b; float *ab_ = ab; int N = Ncvec; asm volatile("mov r8, %2 \n" @@ -1484,49 +1790,7 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo "subs %3, #2 \n" "bne 1b \n" : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); - -# else /* neon instrinsics version, 30% slower that the asm one with gcc 4.6 */ - v4sf a1r, a1i, b1r, b1i; - v4sf a2r, a2i, b2r, b2i; - v4sf ab1r, ab1i, ab2r, ab2i; - for (i=0; i < Ncvec; i += 2) { - __builtin_prefetch(va+8); - __builtin_prefetch(va+10); - - a1r = *va++; a1i = *va++; - a2r = *va++; a2i = *va++; - b1r = *vb++; b1i = *vb++; - b2r = *vb++; b2i = *vb++; - ab1r = vab[0]; ab1i = vab[1]; - ab2r = vab[2]; ab2i = vab[3]; - - v4sf z1r = VMUL(a1r, b1r); - v4sf z2r = VMUL(a2r, b2r); - v4sf z1i = VMUL(a1r, b1i); - v4sf z2i = VMUL(a2r, b2i); - - __builtin_prefetch(vb+4); - __builtin_prefetch(vb+6); - - z1r = vmlsq_f32(z1r, a1i, b1i); - z2r = vmlsq_f32(z2r, a2i, b2i); - z1i = vmlaq_f32(z1i, a1i, b1r); - z2i = vmlaq_f32(z2i, a2i, b2r); - - __builtin_prefetch(vab+4); - __builtin_prefetch(vab+6); - - ab1r = vmlaq_f32(ab1r, z1r, vscal); - ab2r = vmlaq_f32(ab2r, z2r, vscal); - ab1i = vmlaq_f32(ab1i, z1i, vscal); - ab2i = vmlaq_f32(ab2i, z2i, vscal); - - *vab++ = ab1r; *vab++ = ab1i; - *vab++ = ab2r; *vab++ = ab2i; - } -# endif - -#else /* not ARM, no need to use a special routine */ +#else /* default routine, works fine for non-arm cpus with current compilers */ for (i=0; i < Ncvec; i += 2) { v4sf ar, ai, br, bi; ar = va[2*i+0]; ai = va[2*i+1]; @@ -1548,50 +1812,14 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo } #endif -static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) { - int i, Ncvec = s->Ncvec; - const v4sf * /*RESTRICT*/ va = (const v4sf*)a; - const v4sf * RESTRICT vb = (const v4sf*)b; - v4sf * /*RESTRICT*/ vab = (v4sf*)ab; - - float ar, ai, br, bi; - -#ifdef __arm__ -#error -#endif - assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); - ar = ((v4sf_union*)va)[0].f[0]; - ai = ((v4sf_union*)va)[1].f[0]; - br = ((v4sf_union*)vb)[0].f[0]; - bi = ((v4sf_union*)vb)[1].f[0]; - - for (i=0; i < Ncvec; i += 2) { - v4sf ar, ai, br, bi; - ar = va[2*i+0]; ai = va[2*i+1]; - br = vb[2*i+0]; bi = vb[2*i+1]; - VCPLXMUL(ar, ai, br, bi); - vab[2*i+0] = ar; - vab[2*i+1] = ai; - ar = va[2*i+2]; ai = va[2*i+3]; - br = vb[2*i+2]; bi = vb[2*i+3]; - VCPLXMUL(ar, ai, br, bi); - vab[2*i+2] = ar; - vab[2*i+3] = ai; - } - if (s->transform == PFFFT_REAL) { - ((v4sf_union*)vab)[0].f[0] = ar*br; - ((v4sf_union*)vab)[1].f[0] = ai*bi; - } -} - - #else /* defined(PFFFT_SIMD_DISABLE) */ /* standard routine using scalar floats, without SIMD stuff. 
*/ #define pffft_zreorder_nosimd pffft_zreorder -static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { +static +void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { int k, N = setup->N; if (setup->transform == PFFFT_COMPLEX) { for (k=0; k < 2*N; ++k) out[k] = in[k]; @@ -1611,19 +1839,22 @@ static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *ou } #define pffft_transform_internal_nosimd pffft_transform_internal -static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, +static +void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, pffft_direction_t direction, int ordered) { int Ncvec = setup->Ncvec; int nf_odd = (setup->ifac[1] & 1); +#if 0 /* temporary buffer is allocated on the stack if the scratch pointer is NULL */ - /*int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); */ - /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */ - /*if (scratch == 0) scratch = scratch_on_stack; */ - - int ib; + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); +#endif float *buff[2]; - buff[0] = output, buff[1] = scratch; + int ib; + /* if (scratch == 0) scratch = scratch_on_stack; */ + buff[0] = output; buff[1] = scratch; + if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */ ib = (nf_odd ^ ordered ? 1 : 0); @@ -1669,7 +1900,7 @@ static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *inp #if 0 #define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate -static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, +void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { int i, Ncvec = s->Ncvec; @@ -1690,40 +1921,16 @@ static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, co } #endif -#define pffft_zconvolve_nosimd pffft_zconvolve -static void pffft_zconvolve_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab) { - int i, Ncvec = s->Ncvec; - - if (s->transform == PFFFT_REAL) { - /* take care of the fftpack ordering */ - ab[0] = a[0]*b[0]; - ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1]; - ++ab; ++a; ++b; --Ncvec; - } - for (i=0; i < Ncvec; ++i) { - float ar, ai, br, bi; - ar = a[2*i+0]; ai = a[2*i+1]; - br = b[2*i+0]; bi = b[2*i+1]; - VCPLXMUL(ar, ai, br, bi); - ab[2*i+0] = ar; - ab[2*i+1] = ai; - } -} - #endif /* defined(PFFFT_SIMD_DISABLE) */ -static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { +static +void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0); } -static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { +static +void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1); } - -static void pffft_reorder_back(int length, void * setup, float * data, float * work) -{ - memcpy(work, data, (unsigned)length * sizeof(*work)); - pffft_zreorder(setup, work, data, 
PFFFT_BACKWARD); -} #endif diff --git a/src/pffft.h b/src/pffft.h index 78d936b..5ff8466 100644 --- a/src/pffft.h +++ b/src/pffft.h @@ -1,4 +1,9 @@ -/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com ) +/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.h + * with minor changes for libsoxr. */ + +#if !defined PFFT_MACROS_ONLY + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) Based on original fortran 77 code from FFTPACKv4 from NETLIB, authored by Dr Paul Swarztrauber of NCAR, in 1985. @@ -60,8 +65,9 @@ - 1D transforms only, with 32-bit single precision. - supports only transforms for inputs of length N of the form - N=(2^a)*(3^b), a >= 5 and b >=0 (32, 48, 64, 96, 128, 144 etc - are all acceptable lengths). Performance is best for 128<=N<=8192. + N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, + 144, 160, etc are all acceptable lengths). Performance is best for + 128<=N<=8192. - all (float*) pointers in the functions below are expected to have an "simd-compatible" alignment, that is 16 bytes on x86 and @@ -99,8 +105,10 @@ extern "C" { PFFFT_Setup structure is read-only so it can safely be shared by multiple concurrent threads. */ - static PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); - static void pffft_destroy_setup(PFFFT_Setup *); + static + PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); + static + void pffft_destroy_setup(PFFFT_Setup *); /* Perform a Fourier transform , The z-domain data is stored in the most efficient order for transforming it back, or using it for @@ -113,13 +121,14 @@ extern "C" { Typically you will want to scale the backward transform by 1/N. The 'work' pointer should point to an area of N (2*N for complex - fft) floats, properly aligned. [del]If 'work' is NULL, then stack will - be used instead (this is probably the beest strategy for small - FFTs, say for N < 16384).[/del] + fft) floats, properly aligned. If 'work' is NULL, then stack will + be used instead (this is probably the best strategy for small + FFTs, say for N < 16384). input and output may alias. */ - static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + static + void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); /* Similar to pffft_transform, but makes sure that the output is @@ -128,7 +137,8 @@ extern "C" { input and output may alias. */ - static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + static + void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); /* call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., @@ -142,7 +152,8 @@ extern "C" { input and output should not alias. */ - static void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); + static + void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); /* Perform a multiplication of the frequency components of dft_a and @@ -155,23 +166,26 @@ extern "C" { the operation performed is: dft_ab += (dft_a * fdt_b)*scaling The dft_a, dft_b and dft_ab pointers may alias. 
- void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); */ + void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); /* - the operation performed is: dft_ab = (dft_a * fdt_b) - - The dft_a, dft_b and dft_ab pointers may alias. + the float buffers must have the correct alignment (16-byte boundary + on intel and powerpc). This function may be used to obtain such + correctly aligned buffers. */ - static void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab); +#if 0 + void *pffft_aligned_malloc(size_t nb_bytes); + void pffft_aligned_free(void *); /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */ - int pffft_simd_size(void); - - static void pffft_reorder_back(int length, void * setup, float * data, float * work); + int pffft_simd_size(); +#endif #ifdef __cplusplus } #endif #endif + +#endif diff --git a/src/pffft32.c b/src/pffft32.c index 21bd845..fb7400f 100644 --- a/src/pffft32.c +++ b/src/pffft32.c @@ -4,7 +4,7 @@ #define _soxr_simd_aligned_free free #define _soxr_simd_aligned_malloc malloc #define PFFFT_SIMD_DISABLE -#include "pffft.c" +#include "pffft-wrap.c" #include "filter.h" static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);} diff --git a/src/pffft32s.c b/src/pffft32s.c index d049990..b067ad2 100644 --- a/src/pffft32s.c +++ b/src/pffft32s.c @@ -1,7 +1,7 @@ /* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ -#include "pffft.c" +#include "pffft-wrap.c" static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);} static void forward (int length, void * setup, float * h, float * scratch) {pffft_transform (setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
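
For reference, below is a minimal usage sketch (not part of the patch) showing how the wrapped API introduced above fits together: create a real-transform setup, forward-transform two blocks, multiply their spectra with the new pffft_zconvolve from src/pffft-wrap.c, then inverse-transform and scale. It is a sketch under assumptions: it presumes compilation inside libsoxr's src directory with SIMD enabled, so that simd.h and the _soxr_simd_aligned_* allocators are available and the static functions become visible by including pffft-wrap.c into the same translation unit (as src/pffft32s.c does). The file name example.c, the length 480 and the impulse test data are illustrative only; 480 = 2^5 * 3 * 5 exercises the radix-5 support this update adds, whereas the previous copy accepted only 2^a * 3^b lengths.

/* example.c -- hypothetical standalone sketch, not part of the patch.
 * FFT-based circular convolution of two real blocks via the wrapped pffft API. */
#include <stdlib.h>
#include <assert.h>
#include "pffft-wrap.c"   /* pulls in pffft.c plus pffft_zconvolve / pffft_reorder_back */

int main(void)
{
  int i, N = 480;          /* with SIMD, real-FFT lengths must be multiples of 32 and
                              factor only into 2, 3 and 5; 480 = 2^5 * 3 * 5 qualifies */
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);     /* returns 0 on a bad length */
  size_t bytes = (size_t)N * sizeof(float);
  float *a  = (float*)pffft_aligned_malloc(bytes);     /* time-domain inputs */
  float *b  = (float*)pffft_aligned_malloc(bytes);
  float *A  = (float*)pffft_aligned_malloc(bytes);     /* z-domain, internal order */
  float *B  = (float*)pffft_aligned_malloc(bytes);
  float *wk = (float*)pffft_aligned_malloc(bytes);     /* scratch: must not be NULL here,
                                                          the on-stack path is compiled out */
  assert(s && a && b && A && B && wk);

  for (i = 0; i < N; ++i) a[i] = b[i] = 0;
  a[0] = 1; b[1] = 1;                                  /* impulses: result is a shifted impulse */

  pffft_transform(s, a, A, wk, PFFFT_FORWARD);         /* unordered z-domain output */
  pffft_transform(s, b, B, wk, PFFFT_FORWARD);
  pffft_zconvolve(s, A, B, A);                         /* A *= B, incl. the packed DC/Nyquist pair */
  pffft_transform(s, A, a, wk, PFFFT_BACKWARD);        /* unscaled inverse */
  for (i = 0; i < N; ++i) a[i] *= 1.0f / N;            /* caller applies the 1/N scaling */

  pffft_aligned_free(wk); pffft_aligned_free(B); pffft_aligned_free(A);
  pffft_aligned_free(b);  pffft_aligned_free(a);
  pffft_destroy_setup(s);
  return 0;
}

Passing a real scratch buffer rather than NULL mirrors how pffft32.c and pffft32s.c drive the transforms: the patch keeps the VLA/alloca scratch path disabled (#if 0) inside pffft_transform_internal, so the wrapped build always expects a caller-supplied work area of N floats (2*N for complex transforms).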