diff --git a/CMakeLists.txt b/CMakeLists.txt
index c8d6bae..b9ee5d0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,7 @@ set (DESCRIPTION_SUMMARY
 
 set (PROJECT_VERSION_MAJOR 0)
 set (PROJECT_VERSION_MINOR 1)
-set (PROJECT_VERSION_PATCH 2)
+set (PROJECT_VERSION_PATCH "3b1")
 
 # For shared-object; if, since the last public release:
 #   1) library code changed at all: ++revision
@@ -22,7 +22,7 @@ set (PROJECT_VERSION_PATCH 2)
 #   4) interfaces removed:          age = 0
 
 set (SO_VERSION_CURRENT  1)
-set (SO_VERSION_REVISION 1)
+set (SO_VERSION_REVISION 2)
 set (SO_VERSION_AGE      1)
 
 math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}")
@@ -63,18 +63,22 @@ cmake_dependent_option (WITH_CR64S
 cmake_dependent_option (WITH_CR32S
   "Include HQ SIMD constant-rate resampling engine." ON
   "WITH_VR32 OR WITH_CR64 OR WITH_CR32 OR WITH_CR64S" ON)
-cmake_dependent_option (WITH_AVFFT
-  "Use libavcodec (LGPL) for HQ SIMD DFT." OFF
-  "WITH_CR32S;NOT WITH_PFFFT" OFF)
 cmake_dependent_option (WITH_PFFFT
   "Use PFFFT (BSD-like licence) for HQ SIMD DFT." ON
   "WITH_CR32S;NOT WITH_AVFFT" OFF)
+cmake_dependent_option (WITH_AVFFT
+  "Use libavcodec (LGPL) for HQ SIMD DFT." OFF
+  "WITH_CR32S;NOT WITH_PFFFT" OFF)
 cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
   "UNIX;NOT CMAKE_CROSSCOMPILING;EXISTS ${PROJECT_SOURCE_DIR}/lsr-tests;WITH_LSR_BINDINGS" OFF)
 
+option (WITH_HI_PREC_CLOCK "Enable high-precision time-base." ON)
+option (WITH_FLOAT_STD_PREC_CLOCK
+  "Use floating-point for standard-precision time-base." OFF)
 option (WITH_DEV_TRACE "Enable developer trace capability." ON)
 option (WITH_DEV_GPROF "Enable developer grpof output." OFF)
-mark_as_advanced (WITH_DEV_TRACE WITH_DEV_GPROF)
+mark_as_advanced (WITH_HI_PREC_CLOCK WITH_FLOAT_STD_PREC_CLOCK
+  WITH_DEV_TRACE WITH_DEV_GPROF)
 
 
 
@@ -130,7 +134,7 @@ if (WITH_AVFFT)
   endif ()
 endif ()
 
-if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND))
+if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND AND WITH_CR32))
   find_package (LibAVUtil)
   if (AVUTIL_FOUND)
     include_directories (${AVUTIL_INCLUDE_DIRS})
diff --git a/TODO b/TODO
index c699c0c..2d1bc19 100644
--- a/TODO
+++ b/TODO
@@ -1,2 +1,3 @@
-* SOXR_ALLOW_ALIASING
-* Explicit flush API fn, perhaps.
+* vr32s
+* vr32 with 1-delay-clear
+* fir_to_phase with RDFT32
diff --git a/go.bat b/go.bat
index c73d4c2..aabff75 100644
--- a/go.bat
+++ b/go.bat
@@ -1,5 +1,5 @@
 @echo off
-rem SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+rem SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
 rem Licence for this file: LGPL v2.1                  See LICENCE for details.
 
 set build=%1
diff --git a/msvc/soxr-config.h b/msvc/soxr-config.h
index d17ae6b..89f7a91 100644
--- a/msvc/soxr-config.h
+++ b/msvc/soxr-config.h
@@ -9,6 +9,7 @@
 
 #define AVCODEC_FOUND 0
 #define AVUTIL_FOUND 0
+#define WITH_PFFFT 1
 
 #define HAVE_FENV_H 1
 #define HAVE_STDBOOL_H 1
@@ -22,6 +23,8 @@
 #define WITH_CR64S 1
 #define WITH_VR32 1
 
+#define WITH_HI_PREC_CLOCK 1
+#define WITH_FLOAT_STD_PREC_CLOCK 0
 #define WITH_DEV_TRACE 1
 
 #endif
diff --git a/multi-arch b/multi-arch
index 63bb223..288b578 100755
--- a/multi-arch
+++ b/multi-arch
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -e
 
-# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
 # Licence for this file: LGPL v2.1                  See LICENCE for details.
 
 rm -f CMakeCache.txt             # Prevent interference from any in-tree build
diff --git a/soxr-config.h.in b/soxr-config.h.in
index 8d654d8..00b3b45 100644
--- a/soxr-config.h.in
+++ b/soxr-config.h.in
@@ -6,6 +6,7 @@
 
 #cmakedefine01 AVCODEC_FOUND
 #cmakedefine01 AVUTIL_FOUND
+#cmakedefine01 WITH_PFFFT
 
 #cmakedefine01 HAVE_FENV_H
 #cmakedefine01 HAVE_STDBOOL_H
@@ -19,6 +20,8 @@
 #cmakedefine01 WITH_CR64S
 #cmakedefine01 WITH_VR32
 
+#cmakedefine01 WITH_HI_PREC_CLOCK
+#cmakedefine01 WITH_FLOAT_STD_PREC_CLOCK
 #cmakedefine01 WITH_DEV_TRACE
 
 #endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3c45b13..bb01a0d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -32,13 +32,16 @@ elseif (WITH_PFFFT)
   #set (RDFT32 pffft32)
   set (RDFT32S pffft32s)
 elseif (WITH_CR32S)
-  set (RDFT32S fft4g32s fft4g32)
+  set (RDFT32S fft4g32s)
+  if (NOT WITH_CR32)
+    list (APPEND RDFT32S fft4g32)
+  endif ()
 endif ()
 
 set (SOURCES ${PROJECT_NAME}.c data-io)
 
 if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
-  list (APPEND SOURCES dbesi0 filter fft4g64 cr.c)
+  list (APPEND SOURCES dbesi0 filter fft4g64 cr)
 endif ()
 
 if (WITH_CR32)
diff --git a/src/cr-core.c b/src/cr-core.c
index 3f35ff0..d45a3fd 100644
--- a/src/cr-core.c
+++ b/src/cr-core.c
@@ -139,7 +139,8 @@ static half_fir_info_t const half_firs[] = {
   #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_POLY) && defined __AVX__)
   #define SIMD_SSE 0
 #else
-  #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) && (defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86))
+  #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) && \
+      (defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86))
   #define SIMD_AVX 0
 #endif
 
@@ -147,8 +148,6 @@ static half_fir_info_t const half_firs[] = {
 
 
 
-
-#define HI_PREC_CLOCK
 #define COEFS (sample_t * __restrict)p->shared->poly_fir_coefs
 #define VAR_LENGTH p->n
 #define VAR_CONVOLVE(n) while (j < (n)) _
diff --git a/src/cr.c b/src/cr.c
index eb65a04..5f09604 100644
--- a/src/cr.c
+++ b/src/cr.c
@@ -343,7 +343,7 @@ STATIC char const * _soxr_init(
   for (i = 0; i < p->num_stages; ++i) {
     p->stages[i].num = i;
     p->stages[i].shared = shared;
-    p->stages[i].input_size = 4096;
+    p->stages[i].input_size = 8192;
   }
   p->stages[0].is_input = true;
 
@@ -443,8 +443,8 @@ STATIC char const * _soxr_init(
     s->L = arbL;
     s->use_hi_prec_clock =
       mode>1 && (q_spec->flags & SOXR_HI_PREC_CLOCK) && !rational;
-#if FLOAT_HI_PREC_CLOCK
-    if (s->use_hi_prec_clock) {
+#if WITH_FLOAT_STD_PREC_CLOCK
+    if (order && !s->use_hi_prec_clock) {
       s->at.flt = at;
       s->step.flt = arbM;
       s->out_in_ratio = (double)(arbL / s->step.flt);
@@ -452,7 +452,7 @@ STATIC char const * _soxr_init(
 #endif
     {
       s->at.whole = (int64_t)(at * MULT32 + .5);
-#if !FLOAT_HI_PREC_CLOCK
+#if WITH_HI_PREC_CLOCK
       if (s->use_hi_prec_clock) {
         double M = arbM * MULT32;
         s->at.fix.ls.parts.ms = 0x80000000ul;
@@ -474,7 +474,7 @@ STATIC char const * _soxr_init(
         s++, postL, postM, &multiplier, r_spec->log2_min_dft_size,
         r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
 
-  lsx_debug("%g: »%i⋅%i/%i⋅%i/%g⋅%i/%i %x", 1/io_ratio,
+  lsx_debug("%g: >>%i %i/%i %i/%g %i/%i (%x)", 1/io_ratio,
       shr, preL, preM, arbL, arbM, postL, postM, core_flags);
 
   for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
diff --git a/src/cr.h b/src/cr.h
index 1b707e6..d6e8637 100644
--- a/src/cr.h
+++ b/src/cr.h
@@ -10,7 +10,12 @@
 typedef void real; /* float or double */
 struct stage;
 typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
-typedef struct half_fir_info {int num_coefs; real const * coefs; stage_fn_t fn, dfn; float att;} half_fir_info_t;
+typedef struct half_fir_info {
+  int num_coefs;
+  real const * coefs;
+  stage_fn_t fn, dfn;
+  float att;
+} half_fir_info_t;
 typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
 typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
 
@@ -48,9 +53,6 @@ typedef union { /* Uint64 in parts */
   uint64_t all;
 } uint64p_t;
 
-#define FLOAT_HI_PREC_CLOCK 0    /* Non-float hi-prec has ~96 bits. */
-#define float_step_t long double /* __float128 is also a (slow) option */
-
 typedef struct {
   int        dft_length, num_taps, post_peak;
   void       * dft_forward_setup, * dft_backward_setup;
@@ -62,11 +64,17 @@ typedef struct { /* So generated filter coefs may be shared between channels */
   dft_filter_t dft_filter[2];
 } rate_shared_t;
 
+typedef double float_step_t; /* Or long double or __float128. */
+
 typedef union { /* Fixed point arithmetic */
-  struct {uint64p_t ls; int64p_t ms;} fix;
+  struct {uint64p_t ls; int64p_t ms;} fix;  /* Hi-prec has ~96 bits. */
   float_step_t flt;
 } step_t;
 
+#define integer  fix.ms.parts.ms
+#define fraction fix.ms.parts.ls
+#define whole    fix.ms.all
+
 #define CORE_DBL       1
 #define CORE_SIMD_POLY 2
 #define CORE_SIMD_HALF 4
@@ -113,16 +121,11 @@ typedef struct stage {
 
 #define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
 #define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
-#define integer  fix.ms.parts.ms
-#define fraction fix.ms.parts.ls
-#define whole    fix.ms.all
-
 
 #define lq_bw0  (1385/2048.) /* ~.67625, FP exact. */
 
 typedef enum {rolloff_small, rolloff_medium, rolloff_none} rolloff_t;
 
-
 typedef struct {
   void * (* alloc)(size_t);
   void * (* calloc)(size_t, size_t);
diff --git a/src/fft4g32.c b/src/fft4g32.c
index 5dcf34d..7a31ba4 100644
--- a/src/fft4g32.c
+++ b/src/fft4g32.c
@@ -5,8 +5,10 @@
 #include "filter.h"
 #define FFT4G_FLOAT
 #include "fft4g.c"
-#include "rdft_t.h"
+#include "soxr-config.h"
 
+#if WITH_CR32
+#include "rdft_t.h"
 static void * null(void) {return 0;}
 static void forward (int length, void * setup, double * H) {lsx_safe_rdft_f(length,  1, H); (void)setup;}
 static void backward(int length, void * setup, double * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
@@ -31,3 +33,4 @@ fn_t _soxr_rdft32_cb[] = {
   (fn_t)free,
   (fn_t)flags,
 };
+#endif
diff --git a/src/filter.c b/src/filter.c
index aec0b6e..019d24d 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -28,7 +28,7 @@
 #include "fft4g_cache.h"
 #endif
 
-#if WITH_CR32 && !AVCODEC_FOUND
+#if (WITH_CR32 && !AVCODEC_FOUND) || (WITH_CR32S && !AVCODEC_FOUND && !WITH_PFFFT)
 #define DFT_FLOAT float
 #define DONE_WITH_FFT_CACHE done_with_fft_cache_f
 #define FFT_CACHE_CCRW fft_cache_ccrw_f
@@ -93,7 +93,7 @@ double * lsx_make_lpf(
   double * h = malloc((size_t)num_taps * sizeof(*h));
   double mult = scale / lsx_bessel_I_0(beta), mult1 = 1 / (.5 * m + rho);
   assert(Fc >= 0 && Fc <= 1);
-  lsx_debug("make_lpf(n=%i Fc=%.7g β=%g ρ=%g scale=%g)",
+  lsx_debug("make_lpf(n=%i Fc=%.7g beta=%g rho=%g scale=%g)",
       num_taps, Fc, beta, rho, scale);
 
   if (h) for (i = 0; i <= m / 2; ++i) {
@@ -120,7 +120,7 @@ double * lsx_design_lpf(
     double Fn,      /* Nyquist freq; e.g. 0.5, 1, PI */
     double att,     /* Stop-band attenuation in dB */
     int * num_taps, /* 0: value will be estimated */
-    int k,          /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
+    int k,          /* >0: number of phases; <0: num_taps = 1 (mod -k) */
     double beta)    /* <0: value will be estimated */
 {
   int n = *num_taps, phases = max(k, 1), modulo = max(-k, 1);
diff --git a/src/filter.h b/src/filter.h
index 56333ff..ccb3ba8 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -31,7 +31,7 @@ double * lsx_design_lpf(
     double Fn,      /* Nyquist freq; e.g. 0.5, 1, PI; < 0: dummy run */
     double att,     /* Stop-band attenuation in dB */
     int * num_taps, /* 0: value will be estimated */
-    int k,          /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
+    int k,          /* >0: number of phases; <0: num_taps = 1 (mod -k) */
     double beta);   /* <0: value will be estimated */
 
 void lsx_fir_to_phase(double * * h, int * len,
diff --git a/src/internal.h b/src/internal.h
index ee691a0..08924d5 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -55,6 +55,7 @@
 
 #define SOXR_ROLLOFF_LSR2Q     3u    /* Reserved for internal use. */
 #define SOXR_ROLLOFF_MASK      3u    /* For masking these bits. */
+#define SOXR_MAINTAIN_3DB_PT   4u    /* Reserved for internal use. */
 #define SOXR_PROMOTE_TO_LQ    64u    /* Reserved for internal use. */
 
 
diff --git a/src/poly-fir.h b/src/poly-fir.h
index 94db90e..d138e03 100644
--- a/src/poly-fir.h
+++ b/src/poly-fir.h
@@ -26,8 +26,6 @@
 
   #define BEGINNING v4_t X = vLds(x), sum = vZero(); \
       v4_t const * const __restrict coefs = (v4_t *)COEFS
-  #define MIDDLE switch (N) {case 3: CONVOLVE(3); break; case 4: CONVOLVE(4); \
-      break; case 5: CONVOLVE(5); break;  default: CONVOLVE(N); }
   #define END vStorSum(output+i, sum)
   #define cc(n) case n: core(n); break
   #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);}
@@ -48,60 +46,74 @@
   #define d (coef(COEFS, COEF_INTERP, N, phase, 3,j))
 
   #define BEGINNING sample_t sum = 0
-  #define MIDDLE CONVOLVE(N)
   #define END output[i] = sum
   #define CORE(n) core(n)
 #endif
 
-#define fphpCore(n) \
-  if (p->use_hi_prec_clock) { \
-    float_step_t at = p->at.flt; \
-    for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \
-      sample_t const * const __restrict in = input + (int)at; \
-      float_step_t frac = at - (int)at; \
-      int phase = (int)(frac * (1 << PHASE_BITS)); \
-      sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); \
-      int j = 0; \
-      BEGINNING; CONVOLVE(n); END; \
-    } \
-    fifo_read(&p->fifo, (int)at, NULL); \
-    p->at.flt = at - (int)at; \
-  } else
 
-#define hpCore(n) \
-  if (p->use_hi_prec_clock) { \
-    for (i = 0; p->at.integer < num_in; ++i, \
-        p->at.fix.ls.all += p->step.fix.ls.all, \
-        p->at.whole += p->step.whole + (p->at.fix.ls.all < p->step.fix.ls.all)) { \
-      sample_t const * const __restrict in = input + p->at.integer; \
-      uint32_t frac = p->at.fraction; \
-      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
-      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); /* low-order bits, scaled to [0,1) */ \
-      int j = 0; \
-      BEGINNING; CONVOLVE(n); END; \
-    } \
-    fifo_read(&p->fifo, p->at.integer, NULL); \
-    p->at.integer = 0; \
-  } else
 
-#define spCore(n) { \
-    for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) { \
-      sample_t const * const __restrict in = input + p->at.integer; \
-      uint32_t frac = p->at.fraction; \
-      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
-      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); /* low-order bits, scaled to [0,1) */ \
-      int j = 0; \
-      BEGINNING; CONVOLVE(n); END; \
-    } \
-    fifo_read(&p->fifo, p->at.integer, NULL); \
-    p->at.integer = 0; }
+#define floatPrecCore(n) { \
+  float_step_t at = p->at.flt; \
+  for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \
+    sample_t const * const __restrict in = input + (int)at; \
+    float_step_t frac = at - (int)at; \
+    int phase = (int)(frac * (1 << PHASE_BITS)); \
+    sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, (int)at, NULL); \
+  p->at.flt = at - (int)at; } /* Could round to 1 in some circumstances. */
 
-#if defined HI_PREC_CLOCK && FLOAT_HI_PREC_CLOCK
-  #define core(n) fphpCore(n) spCore(n)
-#elif defined HI_PREC_CLOCK
-  #define core(n) hpCore(n) spCore(n)
+
+
+#define highPrecCore(n) { \
+  step_t at; at.fix = p->at.fix; \
+  for (i = 0; at.integer < num_in; ++i, \
+      at.fix.ls.all += p->step.fix.ls.all, /* Unsigned; wrap => carry: */ \
+      at.whole += p->step.whole + (at.fix.ls.all < p->step.fix.ls.all)) { \
+    sample_t const * const __restrict in = input + at.integer; \
+    uint32_t frac = at.fraction; \
+    int phase = (int)(frac >> (32 - PHASE_BITS)); /* High-order bits */ \
+    /* Low-order bits, scaled to [0,1): */ \
+    sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, at.integer, NULL); \
+  p->at.whole = at.fraction; /* Retain the fractional part only. */ \
+  p->at.fix.ls = at.fix.ls; } /* Low-order word is carried over as-is. */
+
+
+
+#define stdPrecCore(n) { \
+  int64p_t at; at.all = p->at.whole; \
+  for (i = 0; at.parts.ms < num_in; ++i, at.all += p->step.whole) { \
+    sample_t const * const __restrict in = input + at.parts.ms; \
+    uint32_t const frac = at.parts.ls; \
+    int phase = (int)(frac >> (32 - PHASE_BITS)); /* High-order bits */ \
+    /* Low-order bits, scaled to [0,1): */ \
+    sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, at.parts.ms, NULL); \
+  p->at.whole = at.parts.ls; } /* Retain the fractional part only. */
+
+
+
+#if WITH_FLOAT_STD_PREC_CLOCK
+  #define SPCORE floatPrecCore
 #else
-  #define core(n) spCore(n)
+  #define SPCORE stdPrecCore
+#endif
+
+
+
+#if WITH_HI_PREC_CLOCK
+  #define core(n) if (p->use_hi_prec_clock) highPrecCore(n) else SPCORE(n)
+#else
+  #define core(n) SPCORE(n)
 #endif
 
 
@@ -131,7 +143,6 @@ static void FUNCTION(stage_t * p, fifo_t * output_fifo)
 #undef COEF_INTERP
 #undef N
 #undef BEGINNING
-#undef MIDDLE
 #undef END
 #undef CONVOLVE
 #undef FIR_LENGTH
diff --git a/src/poly-fir0.h b/src/poly-fir0.h
index 0f28c69..76fca2d 100644
--- a/src/poly-fir0.h
+++ b/src/poly-fir0.h
@@ -22,9 +22,8 @@
 #endif
 
 #define core(n) \
-  for (i = 0; p->at.integer < num_in * p->L; ++i, \
-      p->at.integer += p->step.integer) { \
-    int const div = p->at.integer / p->L, rem = p->at.integer % p->L; \
+  for (i = 0; at < num_in * p->L; ++i, at += step) { \
+    int const div = at / p->L, rem = at % p->L; \
     sample_t const * const __restrict at = input + div; \
     int j = 0; BEGINNING; CONVOLVE(n); END;}
 
@@ -33,13 +32,14 @@ static void FUNCTION(stage_t * p, fifo_t * output_fifo)
   int num_in = min(stage_occupancy(p), p->input_size);
   if (num_in) {
     sample_t const * input = stage_read_p(p);
-    int i, num_out = (num_in * p->L - p->at.integer + p->step.integer - 1) / p->step.integer;
+    int at = p->at.integer, step = p->step.integer;
+    int i, num_out = (num_in * p->L - at + step - 1) / step;
     sample_t * __restrict output = fifo_reserve(output_fifo, num_out);
 
     CORE(N);
     assert(i == num_out);
-    fifo_read(&p->fifo, p->at.integer / p->L, NULL);
-    p->at.integer = p->at.integer % p->L;
+    fifo_read(&p->fifo, at / p->L, NULL);
+    p->at.integer = at % p->L;
   }
 }
 
diff --git a/src/rint.h b/src/rint.h
index d3629ae..2f1dfbe 100644
--- a/src/rint.h
+++ b/src/rint.h
@@ -6,6 +6,9 @@
 
 #include "std-types.h"
 
+/* For x86, compiler-supplied versions of these functions (where available)
+ * can have poor performance (e.g. mingw32), so prefer these asm versions: */
+
 #if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
   #define FPU_RINT32
   #define FPU_RINT16
@@ -23,7 +26,7 @@
     int32_t status[7];
     __asm__ __volatile__("fnstenv %0": "=m"(status));
     status[1] &= ~FE_INVALID;
-    __asm__ __volatile__("fldenv %0": : "m"(status));
+    __asm__ __volatile__("fldenv %0": : "m"(*status));
     return 0;
   }
 #elif defined _MSC_VER && defined _M_IX86
@@ -69,7 +72,7 @@
   #define rint16F(y,x) rint16d(&(y),(double)(x))
   #define FE_INVALID 1
   #define fe_test_invalid() (_statusfp() & _SW_INVALID)
-  #define fe_clear_invalid _clearfp /* Note: clears all */
+  #define fe_clear_invalid _clearfp /* Note: clears all. */
 #elif HAVE_LRINT && LONG_MAX == 2147483647L && HAVE_FENV_H
   #include <math.h>
   #include <fenv.h>
diff --git a/src/soxr.c b/src/soxr.c
index 4a7b2da..9bd5fcb 100644
--- a/src/soxr.c
+++ b/src/soxr.c
@@ -92,10 +92,10 @@ struct soxr {
 
 
 
-#if !WITH_CR32 && !WITH_CR32S && !WITH_CR64 && !WITH_CR64S
-  #define lsx_to_3dB(x) ((x)/(x))
-#else
+#if WITH_CR32 || WITH_CR32S || WITH_CR64 || WITH_CR64S
   #include "filter.h"
+#else
+  #define lsx_to_3dB(x) ((x)/(x))
 #endif
 
 
@@ -193,7 +193,7 @@ soxr_io_spec_t soxr_io_spec(
 
 
 
-#if WITH_CR32S || WITH_CR64S
+#if (WITH_CR32S && WITH_CR32) || (WITH_CR64S && WITH_CR64)
   #if defined __GNUC__ && defined __x86_64__
     #define CPUID(type, eax_, ebx_, ecx_, edx_) \
       __asm__ __volatile__ ( \
@@ -240,7 +240,7 @@ soxr_io_spec_t soxr_io_spec(
 
 
 
-#if WITH_CR32S
+#if WITH_CR32S && WITH_CR32
   static bool cpu_has_simd32(void)
   {
   #if defined __x86_64__ || defined _M_X64
@@ -259,14 +259,17 @@ soxr_io_spec_t soxr_io_spec(
 
   static bool should_use_simd32(void)
   {
-    char const * e = getenv("SOXR_USE_SIMD32");
-    return e? !!atoi(e) : cpu_has_simd32();
+    char const * e;
+    return ((e = getenv("SOXR_USE_SIMD"  )))? !!atoi(e) :
+           ((e = getenv("SOXR_USE_SIMD32")))? !!atoi(e) : cpu_has_simd32();
   }
+#else
+  #define should_use_simd32() true
 #endif
 
 
 
-#if WITH_CR64S
+#if WITH_CR64S && WITH_CR64
   #if defined __GNUC__
     #define XGETBV(type, eax_, edx_) \
       __asm__ __volatile__ ( \
@@ -306,9 +309,12 @@ soxr_io_spec_t soxr_io_spec(
 
   static bool should_use_simd64(void)
   {
-    char const * e = getenv("SOXR_USE_SIMD64");
-    return e? !!atoi(e) : cpu_has_simd64();
+    char const * e;
+    return ((e = getenv("SOXR_USE_SIMD"  )))? !!atoi(e) :
+           ((e = getenv("SOXR_USE_SIMD64")))? !!atoi(e) : cpu_has_simd64();
   }
+#else
+  #define should_use_simd64() true
 #endif
 
 
@@ -322,7 +328,8 @@ extern control_block_t
 
 
 
-static void runtime_num(char const * env_name, int min, int max, unsigned * field)
+static void runtime_num(char const * env_name,
+    int min, int max, unsigned * field)
 {
   char const * e = getenv(env_name);
   if (e) {
@@ -334,7 +341,8 @@ static void runtime_num(char const * env_name, int min, int max, unsigned * fiel
 
 
 
-static void runtime_flag(char const * env_name, unsigned n_bits, unsigned n_shift, unsigned long * flags)
+static void runtime_flag(char const * env_name,
+    unsigned n_bits, unsigned n_shift, unsigned long * flags)
 {
   char const * e = getenv(env_name);
   if (e) {
@@ -355,14 +363,28 @@ soxr_t soxr_create(
   soxr_quality_spec_t const * q_spec,
   soxr_runtime_spec_t const * runtime_spec)
 {
-  double io_ratio = output_rate!=0? input_rate!=0? input_rate / output_rate : -1 : input_rate!=0? -1 : 0;
+  double io_ratio = output_rate!=0? input_rate!=0?
+    input_rate / output_rate : -1 : input_rate!=0? -1 : 0;
   static const float datatype_full_scale[] = {1, 1, 65536.*32768, 32768};
   soxr_t p = 0;
   soxr_error_t error = 0;
 
 #if WITH_DEV_TRACE
+#define _(x) (char)(sizeof(x)>=10? 'a'+(char)(sizeof(x)-10):'0'+(char)sizeof(x))
   char const * e = getenv("SOXR_TRACE");
   _soxr_trace_level = e? atoi(e) : 0;
+  {
+    char const arch[] = {_(char), _(short), _(int), _(long), _(long long)
+      , ' ', _(float), _(double), _(long double)
+      , ' ', _(int *), _(int (*)(int))
+      , ' ', HAVE_BIGENDIAN ? 'B' : 'L'
+#if defined _OPENMP
+      , ' ', 'O', 'M', 'P'
+#endif
+      , 0};
+#undef _
+    lsx_debug("arch: %s", arch);
+  }
 #endif
 
   if (q_spec && q_spec->e)  error = q_spec->e;
diff --git a/src/soxr.h b/src/soxr.h
index 640b698..022ba26 100644
--- a/src/soxr.h
+++ b/src/soxr.h
@@ -65,8 +65,8 @@ input or output (e.g. ilen, olen).                                            */
 /* E.g. #if SOXR_THIS_VERSION >= SOXR_VERSION(0,1,1) ...                      */
 
 #define SOXR_VERSION(x,y,z)     (((x)<<16)|((y)<<8)|(z))
-#define SOXR_THIS_VERSION       SOXR_VERSION(0,1,2)
-#define SOXR_THIS_VERSION_STR               "0.1.2"
+#define SOXR_THIS_VERSION       SOXR_VERSION(0,1,3)
+#define SOXR_THIS_VERSION_STR               "0.1.3b1"
 
 
 
@@ -249,7 +249,6 @@ struct soxr_quality_spec {                                       /* Typically */
 #define SOXR_ROLLOFF_MEDIUM    1u    /* <= 0.35 dB */
 #define SOXR_ROLLOFF_NONE      2u    /* For Chebyshev bandwidth. */
 
-#define SOXR_MAINTAIN_3DB_PT   4u  /* Reserved for internal use. */
 #define SOXR_HI_PREC_CLOCK     8u  /* Increase `irrational' ratio accuracy. */
 #define SOXR_DOUBLE_PRECISION 16u  /* Use D.P. calcs even if precision <= 20. */
 #define SOXR_VR               32u  /* Variable-rate resampling. */
@@ -257,12 +256,12 @@ struct soxr_quality_spec {                                       /* Typically */
 
 
 struct soxr_runtime_spec {                                       /* Typically */
-  unsigned log2_min_dft_size;/* For DFT efficiency. [8,15]              10    */
-  unsigned log2_large_dft_size;/* For DFT efficiency. [8,20]            17    */
-  unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below).      400   */
-  unsigned num_threads;      /* If built so. 0 means `automatic'.        1    */
-  void * e;                  /* Reserved for internal use.               0    */
-  unsigned long flags;       /* Per the following #defines.              0    */
+  unsigned log2_min_dft_size;   /* For DFT efficiency. [8,15]           11    */
+  unsigned log2_large_dft_size; /* For DFT efficiency. [8,20]           17    */
+  unsigned coef_size_kbytes;    /* For SOXR_COEF_INTERP_AUTO (below).   400   */
+  unsigned num_threads;         /* If built so. 0 means `automatic'.     1    */
+  void * e;                     /* Reserved for internal use.            0    */
+  unsigned long flags;          /* Per the following #defines.           0    */
 };
                                    /* For `irrational' ratios only: */
 #define SOXR_COEF_INTERP_AUTO  0u    /* Auto select coef. interpolation. */
@@ -293,7 +292,7 @@ SOXR soxr_quality_spec_t soxr_quality_spec(
 #define SOXR_24_BITQ            5
 #define SOXR_28_BITQ            6
 #define SOXR_32_BITQ            7
-                                    /* For internal use only; to be removed: */
+                                /* Reserved for internal use (to be removed): */
 #define SOXR_LSR0Q              8     /* 'Best sinc'. */
 #define SOXR_LSR1Q              9     /* 'Medium sinc'. */
 #define SOXR_LSR2Q              10    /* 'Fast sinc'. */
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 55cb55d..ee8dd0b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -44,8 +44,12 @@ if (WITH_CR64 OR WITH_CR64S)
   set (test_bits ${test_bits} 28)
 endif ()
 
+set (rates 192000)
+if (WITH_HI_PREC_CLOCK)  # 65537 is only exercised when hi-prec clock is built.
+  list (APPEND rates 65537)
+endif ()
 foreach (b ${test_bits})
-  foreach (r 192000 65537)
+  foreach (r ${rates})
     add_cmp_test (${base_rate} ${r} ${b})
     add_cmp_test (${r} ${base_rate} ${b})
   endforeach ()
diff --git a/tests/throughput-test b/tests/throughput-test
index b03a2a4..aef36f6 100755
--- a/tests/throughput-test
+++ b/tests/throughput-test
@@ -1,6 +1,9 @@
 #!/bin/sh
 set -e
 
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
 test -r throughput.exe && wine=wine
 
 test /$1 = / && list="`seq 0 3`" || list="$*"
diff --git a/tests/throughput-test.bat b/tests/throughput-test.bat
index 482c93b..46b8f7d 100644
--- a/tests/throughput-test.bat
+++ b/tests/throughput-test.bat
@@ -1,2 +1,5 @@
 @echo off
+rem SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1                  See LICENCE for details.
+
 for /L %%i in (0,1,3) DO throughput 44.1 48 1 0 %%i
diff --git a/tests/throughput.c b/tests/throughput.c
index 80256ed..e81d530 100644
--- a/tests/throughput.c
+++ b/tests/throughput.c
@@ -16,23 +16,23 @@
   #define timerRunning() (QueryPerformanceCounter(&tmp), \
       (tmp.QuadPart-start.QuadPart < stop.QuadPart))
 #else
-  #include <time.h>
-  #include <unistd.h>
-  #if defined _POSIX_TIMERS && _POSIX_TIMERS > 0
-    #define K (k*k)
-    #define tv_frac tv_nsec
-    #if defined _POSIX_MONOTONIC_CLOCK
-      #define get_time(x) clock_gettime(CLOCK_MONOTONIC, x)
-    #else
-      #define get_time(x) clock_gettime(CLOCK_REALTIME, x)
-    #endif
+  #include <sys/time.h>
+  #if defined timeradd
+    #define K k
+    #define tv_frac tv_usec
+    #define timespec timeval
+    #define get_time(x) gettimeofday(x, NULL)
   #else
-    #include <sys/time.h>
-    #if defined timeradd
-      #define K k
-      #define tv_frac tv_usec
-      #define timespec timeval
-      #define get_time(x) gettimeofday(x, NULL)
+    #include <time.h>
+    #include <unistd.h>
+    #if defined _POSIX_TIMERS && _POSIX_TIMERS > 0
+      #define K (k*k)
+      #define tv_frac tv_nsec
+      #if defined _POSIX_MONOTONIC_CLOCK
+        #define get_time(x) clock_gettime(CLOCK_MONOTONIC, x)
+      #else
+        #define get_time(x) clock_gettime(CLOCK_REALTIME, x)
+      #endif
     #else
       #include <sys/timeb.h>
       #define K 1