clean-up; set version as 0.1.3b1

2016-06-05 19:12:38 +01:00 · 2016-06-05 19:12:38 +01:00 · 5fa7eeb9a3
parent 7110101172
commit 5fa7eeb9a3
23 changed files with 198 additions and 133 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -13,7 +13,7 @@ set (DESCRIPTION_SUMMARY

 set (PROJECT_VERSION_MAJOR 0)
 set (PROJECT_VERSION_MINOR 1)
-set (PROJECT_VERSION_PATCH 2)
+set (PROJECT_VERSION_PATCH "3b1")

 # For shared-object; if, since the last public release:
 #   1) library code changed at all: ++revision
@ -22,7 +22,7 @@ set (PROJECT_VERSION_PATCH 2)
 #   4) interfaces removed:          age = 0

 set (SO_VERSION_CURRENT  1)
-set (SO_VERSION_REVISION 1)
+set (SO_VERSION_REVISION 2)
 set (SO_VERSION_AGE      1)

 math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}")
@ -63,18 +63,22 @@ cmake_dependent_option (WITH_CR64S
 cmake_dependent_option (WITH_CR32S
  "Include HQ SIMD constant-rate resampling engine." ON
  "WITH_VR32 OR WITH_CR64 OR WITH_CR32 OR WITH_CR64S" ON)
-cmake_dependent_option (WITH_AVFFT
-  "Use libavcodec (LGPL) for HQ SIMD DFT." OFF
-  "WITH_CR32S;NOT WITH_PFFFT" OFF)
 cmake_dependent_option (WITH_PFFFT
  "Use PFFFT (BSD-like licence) for HQ SIMD DFT." ON
  "WITH_CR32S;NOT WITH_AVFFT" OFF)
+cmake_dependent_option (WITH_AVFFT
+  "Use libavcodec (LGPL) for HQ SIMD DFT." OFF
+  "WITH_CR32S;NOT WITH_PFFFT" OFF)
 cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
  "UNIX;NOT CMAKE_CROSSCOMPILING;EXISTS ${PROJECT_SOURCE_DIR}/lsr-tests;WITH_LSR_BINDINGS" OFF)

+option (WITH_HI_PREC_CLOCK "Enable high-precision time-base." ON)
+option (WITH_FLOAT_STD_PREC_CLOCK
+	"Use floating-point for standard-precision time-base." OFF)
 option (WITH_DEV_TRACE "Enable developer trace capability." ON)
 option (WITH_DEV_GPROF "Enable developer grpof output." OFF)
-mark_as_advanced (WITH_DEV_TRACE WITH_DEV_GPROF)
+mark_as_advanced (WITH_HI_PREC_CLOCK WITH_FLOAT_STD_PREC_CLOCK
+       	WITH_DEV_TRACE WITH_DEV_GPROF)



@ -130,7 +134,7 @@ if (WITH_AVFFT)
  endif ()
 endif ()

-if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND))
+if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND AND WITH_CR32))
  find_package (LibAVUtil)
  if (AVUTIL_FOUND)
    include_directories (${AVUTIL_INCLUDE_DIRS})
--- a/5
+++ b/5
@ -1,2 +1,3 @@
-* SOXR_ALLOW_ALIASING
-* Explicit flush API fn, perhaps.
+* vr32s
+* vr32 with 1-delay-clear
+* fir_to_phase with RDFT32
--- a/go.bat
+++ b/go.bat
@ -1,5 +1,5 @@
@echo off
-rem SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+rem SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
 rem Licence for this file: LGPL v2.1                  See LICENCE for details.

 set build=%1
--- a/msvc/soxr-config.h
+++ b/msvc/soxr-config.h
@ -9,6 +9,7 @@

 #define AVCODEC_FOUND 0
 #define AVUTIL_FOUND 0
+#define WITH_PFFFT 1

 #define HAVE_FENV_H 1
 #define HAVE_STDBOOL_H 1
@ -22,6 +23,8 @@
 #define WITH_CR64S 1
 #define WITH_VR32 1

+#define WITH_HI_PREC_CLOCK 1
+#define WITH_FLOAT_STD_PREC_CLOCK 0
 #define WITH_DEV_TRACE 1

 #endif
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -e

-# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
 # Licence for this file: LGPL v2.1                  See LICENCE for details.

 rm -f CMakeCache.txt             # Prevent interference from any in-tree build
--- a/soxr-config.h.in
+++ b/soxr-config.h.in
@ -6,6 +6,7 @@

 #cmakedefine01 AVCODEC_FOUND
 #cmakedefine01 AVUTIL_FOUND
+#cmakedefine01 WITH_PFFFT

 #cmakedefine01 HAVE_FENV_H
 #cmakedefine01 HAVE_STDBOOL_H
@ -19,6 +20,8 @@
 #cmakedefine01 WITH_CR64S
 #cmakedefine01 WITH_VR32

+#cmakedefine01 WITH_HI_PREC_CLOCK
+#cmakedefine01 WITH_FLOAT_STD_PREC_CLOCK
 #cmakedefine01 WITH_DEV_TRACE

 #endif
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -32,13 +32,16 @@ elseif (WITH_PFFFT)
  #set (RDFT32 pffft32)
  set (RDFT32S pffft32s)
 elseif (WITH_CR32S)
-  set (RDFT32S fft4g32s fft4g32)
+  set (RDFT32S fft4g32s)
+  if (NOT WITH_CR32)
+    list (APPEND RDFT32S fft4g32)
+  endif ()
 endif ()

 set (SOURCES ${PROJECT_NAME}.c data-io)

 if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
-  list (APPEND SOURCES dbesi0 filter fft4g64 cr.c)
+  list (APPEND SOURCES dbesi0 filter fft4g64 cr)
 endif ()

 if (WITH_CR32)
--- a/src/cr-core.c
+++ b/src/cr-core.c
@ -139,7 +139,8 @@ static half_fir_info_t const half_firs[] = {
  #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_POLY) && defined __AVX__)
  #define SIMD_SSE 0
 #else
-  #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) && (defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86))
+  #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) && \
+      (defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86))
  #define SIMD_AVX 0
 #endif

@ -147,8 +148,6 @@ static half_fir_info_t const half_firs[] = {



-
-#define HI_PREC_CLOCK
 #define COEFS (sample_t * __restrict)p->shared->poly_fir_coefs
 #define VAR_LENGTH p->n
 #define VAR_CONVOLVE(n) while (j < (n)) _
--- a/src/cr.c
+++ b/src/cr.c
@ -343,7 +343,7 @@ STATIC char const * _soxr_init(
  for (i = 0; i < p->num_stages; ++i) {
    p->stages[i].num = i;
    p->stages[i].shared = shared;
-    p->stages[i].input_size = 4096;
+    p->stages[i].input_size = 8192;
  }
  p->stages[0].is_input = true;

@ -443,8 +443,8 @@ STATIC char const * _soxr_init(
    s->L = arbL;
    s->use_hi_prec_clock =
      mode>1 && (q_spec->flags & SOXR_HI_PREC_CLOCK) && !rational;
-#if FLOAT_HI_PREC_CLOCK
-    if (s->use_hi_prec_clock) {
+#if WITH_FLOAT_STD_PREC_CLOCK
+    if (order && !s->use_hi_prec_clock) {
      s->at.flt = at;
      s->step.flt = arbM;
      s->out_in_ratio = (double)(arbL / s->step.flt);
@ -452,7 +452,7 @@ STATIC char const * _soxr_init(
 #endif
    {
      s->at.whole = (int64_t)(at * MULT32 + .5);
-#if !FLOAT_HI_PREC_CLOCK
+#if WITH_HI_PREC_CLOCK
      if (s->use_hi_prec_clock) {
        double M = arbM * MULT32;
        s->at.fix.ls.parts.ms = 0x80000000ul;
@ -474,7 +474,7 @@ STATIC char const * _soxr_init(
        s++, postL, postM, &multiplier, r_spec->log2_min_dft_size,
        r_spec->log2_large_dft_size, core_flags, core->rdft_cb);

-  lsx_debug("%g: »%i⋅%i/%i⋅%i/%g⋅%i/%i %x", 1/io_ratio,
+  lsx_debug("%g: >>%i %i/%i %i/%g %i/%i (%x)", 1/io_ratio,
      shr, preL, preM, arbL, arbM, postL, postM, core_flags);

  for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
--- a/src/cr.h
+++ b/src/cr.h
@ -10,7 +10,12 @@
 typedef void real; /* float or double */
 struct stage;
 typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
-typedef struct half_fir_info {int num_coefs; real const * coefs; stage_fn_t fn, dfn; float att;} half_fir_info_t;
+typedef struct half_fir_info {
+  int num_coefs;
+  real const * coefs;
+  stage_fn_t fn, dfn;
+  float att;
+} half_fir_info_t;
 typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
 typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;

@ -48,9 +53,6 @@ typedef union { /* Uint64 in parts */
  uint64_t all;
 } uint64p_t;

-#define FLOAT_HI_PREC_CLOCK 0    /* Non-float hi-prec has ~96 bits. */
-#define float_step_t long double /* __float128 is also a (slow) option */
-
 typedef struct {
  int        dft_length, num_taps, post_peak;
  void       * dft_forward_setup, * dft_backward_setup;
@ -62,11 +64,17 @@ typedef struct { /* So generated filter coefs may be shared between channels */
  dft_filter_t dft_filter[2];
 } rate_shared_t;

+typedef double float_step_t; /* Or long double or __float128. */
+
 typedef union { /* Fixed point arithmetic */
-  struct {uint64p_t ls; int64p_t ms;} fix;
+  struct {uint64p_t ls; int64p_t ms;} fix;  /* Hi-prec has ~96 bits. */
  float_step_t flt;
 } step_t;

+#define integer  fix.ms.parts.ms
+#define fraction fix.ms.parts.ls
+#define whole    fix.ms.all
+
 #define CORE_DBL       1
 #define CORE_SIMD_POLY 2
 #define CORE_SIMD_HALF 4
@ -113,16 +121,11 @@ typedef struct stage {

 #define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
 #define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
-#define integer  fix.ms.parts.ms
-#define fraction fix.ms.parts.ls
-#define whole    fix.ms.all
-

 #define lq_bw0  (1385/2048.) /* ~.67625, FP exact. */

 typedef enum {rolloff_small, rolloff_medium, rolloff_none} rolloff_t;

-
 typedef struct {
  void * (* alloc)(size_t);
  void * (* calloc)(size_t, size_t);
--- a/src/fft4g32.c
+++ b/src/fft4g32.c
@ -5,8 +5,10 @@
 #include "filter.h"
 #define FFT4G_FLOAT
 #include "fft4g.c"
-#include "rdft_t.h"
+#include "soxr-config.h"

+#if WITH_CR32
+#include "rdft_t.h"
 static void * null(void) {return 0;}
 static void forward (int length, void * setup, double * H) {lsx_safe_rdft_f(length,  1, H); (void)setup;}
 static void backward(int length, void * setup, double * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
@ -31,3 +33,4 @@ fn_t _soxr_rdft32_cb[] = {
  (fn_t)free,
  (fn_t)flags,
 };
+#endif
--- a/src/filter.c
+++ b/src/filter.c
@ -28,7 +28,7 @@
 #include "fft4g_cache.h"
 #endif

-#if WITH_CR32 && !AVCODEC_FOUND
+#if (WITH_CR32 && !AVCODEC_FOUND) || (WITH_CR32S && !AVCODEC_FOUND && !WITH_PFFFT)
 #define DFT_FLOAT float
 #define DONE_WITH_FFT_CACHE done_with_fft_cache_f
 #define FFT_CACHE_CCRW fft_cache_ccrw_f
@ -93,7 +93,7 @@ double * lsx_make_lpf(
  double * h = malloc((size_t)num_taps * sizeof(*h));
  double mult = scale / lsx_bessel_I_0(beta), mult1 = 1 / (.5 * m + rho);
  assert(Fc >= 0 && Fc <= 1);
-  lsx_debug("make_lpf(n=%i Fc=%.7g β=%g ρ=%g scale=%g)",
+  lsx_debug("make_lpf(n=%i Fc=%.7g beta=%g rho=%g scale=%g)",
      num_taps, Fc, beta, rho, scale);

  if (h) for (i = 0; i <= m / 2; ++i) {
@ -120,7 +120,7 @@ double * lsx_design_lpf(
    double Fn,      /* Nyquist freq; e.g. 0.5, 1, PI */
    double att,     /* Stop-band attenuation in dB */
    int * num_taps, /* 0: value will be estimated */
-    int k,          /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
+    int k,          /* >0: number of phases; <0: num_taps = 1 (mod -k) */
    double beta)    /* <0: value will be estimated */
 {
  int n = *num_taps, phases = max(k, 1), modulo = max(-k, 1);
--- a/src/filter.h
+++ b/src/filter.h
@ -31,7 +31,7 @@ double * lsx_design_lpf(
    double Fn,      /* Nyquist freq; e.g. 0.5, 1, PI; < 0: dummy run */
    double att,     /* Stop-band attenuation in dB */
    int * num_taps, /* 0: value will be estimated */
-    int k,          /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
+    int k,          /* >0: number of phases; <0: num_taps = 1 (mod -k) */
    double beta);   /* <0: value will be estimated */

 void lsx_fir_to_phase(double * * h, int * len,
--- a/src/internal.h
+++ b/src/internal.h
@ -55,6 +55,7 @@

 #define SOXR_ROLLOFF_LSR2Q     3u    /* Reserved for internal use. */
 #define SOXR_ROLLOFF_MASK      3u    /* For masking these bits. */
+#define SOXR_MAINTAIN_3DB_PT   4u    /* Reserved for internal use. */
 #define SOXR_PROMOTE_TO_LQ    64u    /* Reserved for internal use. */


--- a/src/poly-fir.h
+++ b/src/poly-fir.h
@ -26,8 +26,6 @@

  #define BEGINNING v4_t X = vLds(x), sum = vZero(); \
      v4_t const * const __restrict coefs = (v4_t *)COEFS
-  #define MIDDLE switch (N) {case 3: CONVOLVE(3); break; case 4: CONVOLVE(4); \
-      break; case 5: CONVOLVE(5); break;  default: CONVOLVE(N); }
  #define END vStorSum(output+i, sum)
  #define cc(n) case n: core(n); break
  #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);}
@ -48,60 +46,74 @@
  #define d (coef(COEFS, COEF_INTERP, N, phase, 3,j))

  #define BEGINNING sample_t sum = 0
-  #define MIDDLE CONVOLVE(N)
  #define END output[i] = sum
  #define CORE(n) core(n)
 #endif

-#define fphpCore(n) \
-  if (p->use_hi_prec_clock) { \
-    float_step_t at = p->at.flt; \
-    for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \
-      sample_t const * const __restrict in = input + (int)at; \
-      float_step_t frac = at - (int)at; \
-      int phase = (int)(frac * (1 << PHASE_BITS)); \
-      sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); \
-      int j = 0; \
-      BEGINNING; CONVOLVE(n); END; \
-    } \
-    fifo_read(&p->fifo, (int)at, NULL); \
-    p->at.flt = at - (int)at; \
-  } else

-#define hpCore(n) \
-  if (p->use_hi_prec_clock) { \
-    for (i = 0; p->at.integer < num_in; ++i, \
-        p->at.fix.ls.all += p->step.fix.ls.all, \
-        p->at.whole += p->step.whole + (p->at.fix.ls.all < p->step.fix.ls.all)) { \
-      sample_t const * const __restrict in = input + p->at.integer; \
-      uint32_t frac = p->at.fraction; \
-      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
-      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); /* low-order bits, scaled to [0,1) */ \
-      int j = 0; \
-      BEGINNING; CONVOLVE(n); END; \
-    } \
-    fifo_read(&p->fifo, p->at.integer, NULL); \
-    p->at.integer = 0; \
-  } else

-#define spCore(n) { \
-    for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) { \
-      sample_t const * const __restrict in = input + p->at.integer; \
-      uint32_t frac = p->at.fraction; \
-      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
-      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); /* low-order bits, scaled to [0,1) */ \
-      int j = 0; \
-      BEGINNING; CONVOLVE(n); END; \
-    } \
-    fifo_read(&p->fifo, p->at.integer, NULL); \
-    p->at.integer = 0; }
+#define floatPrecCore(n) { \
+  float_step_t at = p->at.flt; \
+  for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \
+    sample_t const * const __restrict in = input + (int)at; \
+    float_step_t frac = at - (int)at; \
+    int phase = (int)(frac * (1 << PHASE_BITS)); \
+    sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, (int)at, NULL); \
+  p->at.flt = at - (int)at; } /* Could round to 1 in some circumstances. */

-#if defined HI_PREC_CLOCK && FLOAT_HI_PREC_CLOCK
-  #define core(n) fphpCore(n) spCore(n)
-#elif defined HI_PREC_CLOCK
-  #define core(n) hpCore(n) spCore(n)
+
+
+#define highPrecCore(n) { \
+  step_t at; at.fix = p->at.fix; \
+  for (i = 0; at.integer < num_in; ++i, \
+      at.fix.ls.all += p->step.fix.ls.all, \
+      at.whole += p->step.whole + (at.fix.ls.all < p->step.fix.ls.all)) { \
+    sample_t const * const __restrict in = input + at.integer; \
+    uint32_t frac = at.fraction; \
+    int phase = (int)(frac >> (32 - PHASE_BITS)); /* High-order bits */ \
+    /* Low-order bits, scaled to [0,1): */ \
+    sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, at.integer, NULL); \
+  p->at.whole = at.fraction; \
+  p->at.fix.ls = at.fix.ls; }
+
+
+
+#define stdPrecCore(n) { \
+  int64p_t at; at.all = p->at.whole; \
+  for (i = 0; at.parts.ms < num_in; ++i, at.all += p->step.whole) { \
+    sample_t const * const __restrict in = input + at.parts.ms; \
+    uint32_t const frac = at.parts.ls; \
+    int phase = (int)(frac >> (32 - PHASE_BITS)); /* High-order bits */ \
+    /* Low-order bits, scaled to [0,1): */ \
+    sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, at.parts.ms, NULL); \
+  p->at.whole = at.parts.ls; }
+
+
+
+#if WITH_FLOAT_STD_PREC_CLOCK
+  #define SPCORE floatPrecCore
 #else
-  #define core(n) spCore(n)
+  #define SPCORE stdPrecCore
+#endif
+
+
+
+#if WITH_HI_PREC_CLOCK
+  #define core(n) if (p->use_hi_prec_clock) highPrecCore(n) else SPCORE(n)
+#else
+  #define core(n) SPCORE(n)
 #endif


@ -131,7 +143,6 @@ static void FUNCTION(stage_t * p, fifo_t * output_fifo)
 #undef COEF_INTERP
 #undef N
 #undef BEGINNING
-#undef MIDDLE
 #undef END
 #undef CONVOLVE
 #undef FIR_LENGTH
--- a/src/poly-fir0.h
+++ b/src/poly-fir0.h
@ -22,9 +22,8 @@
 #endif

 #define core(n) \
-  for (i = 0; p->at.integer < num_in * p->L; ++i, \
-      p->at.integer += p->step.integer) { \
-    int const div = p->at.integer / p->L, rem = p->at.integer % p->L; \
+  for (i = 0; at < num_in * p->L; ++i, at += step) { \
+    int const div = at / p->L, rem = at % p->L; \
    sample_t const * const __restrict at = input + div; \
    int j = 0; BEGINNING; CONVOLVE(n); END;}

@ -33,13 +32,14 @@ static void FUNCTION(stage_t * p, fifo_t * output_fifo)
  int num_in = min(stage_occupancy(p), p->input_size);
  if (num_in) {
    sample_t const * input = stage_read_p(p);
-    int i, num_out = (num_in * p->L - p->at.integer + p->step.integer - 1) / p->step.integer;
+    int at = p->at.integer, step = p->step.integer;
+    int i, num_out = (num_in * p->L - at + step - 1) / step;
    sample_t * __restrict output = fifo_reserve(output_fifo, num_out);

    CORE(N);
    assert(i == num_out);
-    fifo_read(&p->fifo, p->at.integer / p->L, NULL);
-    p->at.integer = p->at.integer % p->L;
+    fifo_read(&p->fifo, at / p->L, NULL);
+    p->at.integer = at % p->L;
  }
 }

--- a/src/rint.h
+++ b/src/rint.h
@ -6,6 +6,9 @@

 #include "std-types.h"

+/* For x86, compiler-supplied versions of these functions (where available)
+ * can have poor performance (e.g. mingw32), so prefer these asm versions: */
+
 #if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
  #define FPU_RINT32
  #define FPU_RINT16
@ -23,7 +26,7 @@
    int32_t status[7];
    __asm__ __volatile__("fnstenv %0": "=m"(status));
    status[1] &= ~FE_INVALID;
-    __asm__ __volatile__("fldenv %0": : "m"(status));
+    __asm__ __volatile__("fldenv %0": : "m"(*status));
    return 0;
  }
 #elif defined _MSC_VER && defined _M_IX86
@ -69,7 +72,7 @@
  #define rint16F(y,x) rint16d(&(y),(double)(x))
  #define FE_INVALID 1
  #define fe_test_invalid() (_statusfp() & _SW_INVALID)
-  #define fe_clear_invalid _clearfp /* Note: clears all */
+  #define fe_clear_invalid _clearfp /* Note: clears all. */
 #elif HAVE_LRINT && LONG_MAX == 2147483647L && HAVE_FENV_H
  #include <math.h>
  #include <fenv.h>
--- a/src/soxr.c
+++ b/src/soxr.c
@ -92,10 +92,10 @@ struct soxr {



-#if !WITH_CR32 && !WITH_CR32S && !WITH_CR64 && !WITH_CR64S
-  #define lsx_to_3dB(x) ((x)/(x))
-#else
+#if WITH_CR32 || WITH_CR32S || WITH_CR64 || WITH_CR64S
  #include "filter.h"
+#else
+  #define lsx_to_3dB(x) ((x)/(x))
 #endif


@ -193,7 +193,7 @@ soxr_io_spec_t soxr_io_spec(



-#if WITH_CR32S || WITH_CR64S
+#if (WITH_CR32S && WITH_CR32) || (WITH_CR64S && WITH_CR64)
  #if defined __GNUC__ && defined __x86_64__
    #define CPUID(type, eax_, ebx_, ecx_, edx_) \
      __asm__ __volatile__ ( \
@ -240,7 +240,7 @@ soxr_io_spec_t soxr_io_spec(



-#if WITH_CR32S
+#if WITH_CR32S && WITH_CR32
  static bool cpu_has_simd32(void)
  {
  #if defined __x86_64__ || defined _M_X64
@ -259,14 +259,17 @@ soxr_io_spec_t soxr_io_spec(

  static bool should_use_simd32(void)
  {
-    char const * e = getenv("SOXR_USE_SIMD32");
-    return e? !!atoi(e) : cpu_has_simd32();
+    char const * e;
+    return ((e = getenv("SOXR_USE_SIMD"  )))? !!atoi(e) :
+           ((e = getenv("SOXR_USE_SIMD32")))? !!atoi(e) : cpu_has_simd32();
  }
+#else
+  #define should_use_simd32() true
 #endif



-#if WITH_CR64S
+#if WITH_CR64S && WITH_CR64
  #if defined __GNUC__
    #define XGETBV(type, eax_, edx_) \
      __asm__ __volatile__ ( \
@ -306,9 +309,12 @@ soxr_io_spec_t soxr_io_spec(

  static bool should_use_simd64(void)
  {
-    char const * e = getenv("SOXR_USE_SIMD64");
-    return e? !!atoi(e) : cpu_has_simd64();
+    char const * e;
+    return ((e = getenv("SOXR_USE_SIMD"  )))? !!atoi(e) :
+           ((e = getenv("SOXR_USE_SIMD64")))? !!atoi(e) : cpu_has_simd64();
  }
+#else
+  #define should_use_simd64() true
 #endif


@ -322,7 +328,8 @@ extern control_block_t



-static void runtime_num(char const * env_name, int min, int max, unsigned * field)
+static void runtime_num(char const * env_name,
+    int min, int max, unsigned * field)
 {
  char const * e = getenv(env_name);
  if (e) {
@ -334,7 +341,8 @@ static void runtime_num(char const * env_name, int min, int max, unsigned * fiel



-static void runtime_flag(char const * env_name, unsigned n_bits, unsigned n_shift, unsigned long * flags)
+static void runtime_flag(char const * env_name,
+    unsigned n_bits, unsigned n_shift, unsigned long * flags)
 {
  char const * e = getenv(env_name);
  if (e) {
@ -355,14 +363,28 @@ soxr_t soxr_create(
  soxr_quality_spec_t const * q_spec,
  soxr_runtime_spec_t const * runtime_spec)
 {
-  double io_ratio = output_rate!=0? input_rate!=0? input_rate / output_rate : -1 : input_rate!=0? -1 : 0;
+  double io_ratio = output_rate!=0? input_rate!=0?
+    input_rate / output_rate : -1 : input_rate!=0? -1 : 0;
  static const float datatype_full_scale[] = {1, 1, 65536.*32768, 32768};
  soxr_t p = 0;
  soxr_error_t error = 0;

 #if WITH_DEV_TRACE
+#define _(x) (char)(sizeof(x)>=10? 'a'+(char)(sizeof(x)-10):'0'+(char)sizeof(x))
  char const * e = getenv("SOXR_TRACE");
  _soxr_trace_level = e? atoi(e) : 0;
+  {
+    char const arch[] = {_(char), _(short), _(int), _(long), _(long long)
+      , ' ', _(float), _(double), _(long double)
+      , ' ', _(int *), _(int (*)(int))
+      , ' ', HAVE_BIGENDIAN ? 'B' : 'L'
+#if defined _OPENMP
+      , ' ', 'O', 'M', 'P'
+#endif
+      , 0};
+#undef _
+    lsx_debug("arch: %s", arch);
+  }
 #endif

  if (q_spec && q_spec->e)  error = q_spec->e;
--- a/src/soxr.h
+++ b/src/soxr.h
@ -65,8 +65,8 @@ input or output (e.g. ilen, olen).                                            */
 /* E.g. #if SOXR_THIS_VERSION >= SOXR_VERSION(0,1,1) ...                      */

 #define SOXR_VERSION(x,y,z)     (((x)<<16)|((y)<<8)|(z))
-#define SOXR_THIS_VERSION       SOXR_VERSION(0,1,2)
-#define SOXR_THIS_VERSION_STR               "0.1.2"
+#define SOXR_THIS_VERSION       SOXR_VERSION(0,1,3)
+#define SOXR_THIS_VERSION_STR               "0.1.3b1"



@ -249,7 +249,6 @@ struct soxr_quality_spec {                                       /* Typically */
 #define SOXR_ROLLOFF_MEDIUM    1u    /* <= 0.35 dB */
 #define SOXR_ROLLOFF_NONE      2u    /* For Chebyshev bandwidth. */

-#define SOXR_MAINTAIN_3DB_PT   4u  /* Reserved for internal use. */
 #define SOXR_HI_PREC_CLOCK     8u  /* Increase `irrational' ratio accuracy. */
 #define SOXR_DOUBLE_PRECISION 16u  /* Use D.P. calcs even if precision <= 20. */
 #define SOXR_VR               32u  /* Variable-rate resampling. */
@ -257,12 +256,12 @@ struct soxr_quality_spec {                                       /* Typically */


 struct soxr_runtime_spec {                                       /* Typically */
-  unsigned log2_min_dft_size;/* For DFT efficiency. [8,15]              10    */
-  unsigned log2_large_dft_size;/* For DFT efficiency. [8,20]            17    */
-  unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below).      400   */
-  unsigned num_threads;      /* If built so. 0 means `automatic'.        1    */
-  void * e;                  /* Reserved for internal use.               0    */
-  unsigned long flags;       /* Per the following #defines.              0    */
+  unsigned log2_min_dft_size;   /* For DFT efficiency. [8,15]           11    */
+  unsigned log2_large_dft_size; /* For DFT efficiency. [8,20]           17    */
+  unsigned coef_size_kbytes;    /* For SOXR_COEF_INTERP_AUTO (below).   400   */
+  unsigned num_threads;         /* If built so. 0 means `automatic'.     1    */
+  void * e;                     /* Reserved for internal use.            0    */
+  unsigned long flags;          /* Per the following #defines.           0    */
 };
                                   /* For `irrational' ratios only: */
 #define SOXR_COEF_INTERP_AUTO  0u    /* Auto select coef. interpolation. */
@ -293,7 +292,7 @@ SOXR soxr_quality_spec_t soxr_quality_spec(
 #define SOXR_24_BITQ            5
 #define SOXR_28_BITQ            6
 #define SOXR_32_BITQ            7
-                                    /* For internal use only; to be removed: */
+                                /* Reserved for internal use (to be removed): */
 #define SOXR_LSR0Q              8     /* 'Best sinc'. */
 #define SOXR_LSR1Q              9     /* 'Medium sinc'. */
 #define SOXR_LSR2Q              10    /* 'Fast sinc'. */
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -44,8 +44,12 @@ if (WITH_CR64 OR WITH_CR64S)
  set (test_bits ${test_bits} 28)
 endif ()

+set (rates 192000)
+if (WITH_HI_PREC_CLOCK)
+  set (rates ${rates} 65537)
+endif ()
 foreach (b ${test_bits})
-  foreach (r 192000 65537)
+  foreach (r ${rates})
    add_cmp_test (${base_rate} ${r} ${b})
    add_cmp_test (${r} ${base_rate} ${b})
  endforeach ()
--- a/tests/throughput-test
+++ b/tests/throughput-test
@ -1,6 +1,9 @@
 #!/bin/sh
 set -e

+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
 test -r throughput.exe && wine=wine

 test /$1 = / && list="`seq 0 3`" || list="$*"
--- a/tests/throughput-test.bat
+++ b/tests/throughput-test.bat
@ -1,2 +1,5 @@
@echo off
+rem SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1                  See LICENCE for details.
+
 for /L %%i in (0,1,3) DO throughput 44.1 48 1 0 %%i
--- a/tests/throughput.c
+++ b/tests/throughput.c
@ -16,23 +16,23 @@
  #define timerRunning() (QueryPerformanceCounter(&tmp), \
      (tmp.QuadPart-start.QuadPart < stop.QuadPart))
 #else
-  #include <time.h>
-  #include <unistd.h>
-  #if defined _POSIX_TIMERS && _POSIX_TIMERS > 0
-    #define K (k*k)
-    #define tv_frac tv_nsec
-    #if defined _POSIX_MONOTONIC_CLOCK
-      #define get_time(x) clock_gettime(CLOCK_MONOTONIC, x)
-    #else
-      #define get_time(x) clock_gettime(CLOCK_REALTIME, x)
-    #endif
+  #include <sys/time.h>
+  #if defined timeradd
+    #define K k
+    #define tv_frac tv_usec
+    #define timespec timeval
+    #define get_time(x) gettimeofday(x, NULL)
  #else
-    #include <sys/time.h>
-    #if defined timeradd
-      #define K k
-      #define tv_frac tv_usec
-      #define timespec timeval
-      #define get_time(x) gettimeofday(x, NULL)
+    #include <time.h>
+    #include <unistd.h>
+    #if defined _POSIX_TIMERS && _POSIX_TIMERS > 0
+      #define K (k*k)
+      #define tv_frac tv_nsec
+      #if defined _POSIX_MONOTONIC_CLOCK
+        #define get_time(x) clock_gettime(CLOCK_MONOTONIC, x)
+      #else
+        #define get_time(x) clock_gettime(CLOCK_REALTIME, x)
+      #endif
    #else
      #include <sys/timeb.h>
      #define K 1