diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69e6425..60a5c33 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,8 @@
 cmake_minimum_required (VERSION 3.0 FATAL_ERROR)
 
 project (soxr C)
-set (DESCRIPTION_SUMMARY "High quality, one-dimensional sample-rate conversion library")
+set (DESCRIPTION_SUMMARY
+    "High quality, one-dimensional sample-rate conversion library")
 
 
 
@@ -15,15 +16,20 @@ set (PROJECT_VERSION_MINOR 1)
 set (PROJECT_VERSION_PATCH 2)
 
 # For shared-object; if, since the last public release:
-#  * library code changed at all: ++revision
-#  * interfaces changed at all:   ++current, revision = 0
-#  * interfaces added:            ++age
-#  * interfaces removed:          age = 0
+#
+#  1) library code changed at all: ++revision
+#  2) interfaces changed at all:   ++current, revision = 0
+#  3) interfaces added:            ++age
+#  4) interfaces removed:          age = 0
 
 set (SO_VERSION_CURRENT  1)
 set (SO_VERSION_REVISION 1)
 set (SO_VERSION_AGE      1)
 
+math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}")
+math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}")
+math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}")
+
 
 
 # Main options:
@@ -31,31 +37,45 @@ set (SO_VERSION_AGE      1)
 include (CMakeDependentOption)
 
 if (NOT CMAKE_BUILD_TYPE)
-  set (CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
+  set (CMAKE_BUILD_TYPE Release CACHE STRING
+    "Build type, one of: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
 endif ()
 
-option (BUILD_SHARED_LIBS "Build shared libraries." ON)
+option (BUILD_TESTS "Build sanity-tests." ON)
 option (BUILD_EXAMPLES "Build examples." OFF)
 option (WITH_OPENMP "Include OpenMP threading." ON)
 option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." ON)
 
-cmake_dependent_option (BUILD_TESTS "Build sanity-tests."  ON
-  "NOT CMAKE_CROSSCOMPILING" OFF)
-cmake_dependent_option (WITH_SINGLE_PRECISION "Build with single precision (for up to 20-bit accuracy)." ON
-  "WITH_DOUBLE_PRECISION" ON)
-cmake_dependent_option (WITH_DOUBLE_PRECISION "Build with double precision (for up to 32-bit accuracy)." ON
-  "WITH_SINGLE_PRECISION" ON)
-cmake_dependent_option (WITH_SIMD "Use SIMD (for faster single precision)." ON
-  "WITH_SINGLE_PRECISION" OFF)
-cmake_dependent_option (WITH_AVFFT "Use libavcodec (LGPL) for SIMD DFT." OFF
-  "WITH_SIMD;NOT WITH_PFFFT" OFF)
-cmake_dependent_option (WITH_PFFFT "Use PFFFT (BSD-like licence) for SIMD DFT." ON
-  "WITH_SIMD;NOT WITH_AVFFT" OFF)
+cmake_dependent_option (BUILD_SHARED_LIBS
+  "Build shared (dynamic) soxr libraries." ON
+  "NOT WITH_DEV_GPROF" OFF)
+cmake_dependent_option (WITH_VR32
+  "Include HQ variable-rate resampling engine." ON
+  "WITH_CR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S OR NOT DEFINED WITH_VR32" ON)
+cmake_dependent_option (WITH_CR32
+  "Include HQ constant-rate resampling engine." ON
+  "WITH_VR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_CR64
+  "Include VHQ constant-rate resampling engine." ON
+  "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_CR64S
+  "Include VHQ SIMD constant-rate resampling engine." ON
+  "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64" ON)
+cmake_dependent_option (WITH_CR32S
+  "Include HQ SIMD constant-rate resampling engine." ON
+  "WITH_VR32 OR WITH_CR64 OR WITH_CR32 OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_AVFFT
+  "Use libavcodec (LGPL) for HQ SIMD DFT." OFF
+  "WITH_CR32S;NOT WITH_PFFFT" OFF)
+cmake_dependent_option (WITH_PFFFT
+  "Use PFFFT (BSD-like licence) for HQ SIMD DFT." ON
+  "WITH_CR32S;NOT WITH_AVFFT" OFF)
 cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
   "UNIX;NOT CMAKE_CROSSCOMPILING;EXISTS ${PROJECT_SOURCE_DIR}/lsr-tests;WITH_LSR_BINDINGS" OFF)
 
-option (WITH_DEV_TRACE "Enable developer trace output." OFF)
-mark_as_advanced (WITH_DEV_TRACE)
+option (WITH_DEV_TRACE "Enable developer trace capability." ON)
+option (WITH_DEV_GPROF "Enable developer gprof output." OFF)
+mark_as_advanced (WITH_DEV_TRACE WITH_DEV_GPROF)
 
 
 
@@ -79,15 +99,23 @@ endif ()
 
 if (WITH_OPENMP)
   find_package (OpenMP)
-  if (OPENMP_FOUND)
+  if (OpenMP_FOUND)
     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-    set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
+    set (CMAKE_EXE_LINKER_FLAGS
+        "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+    set (CMAKE_SHARED_LINKER_FLAGS
+        "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
   endif ()
 endif ()
 
-if (WITH_SIMD)
-  find_package (SIMD)
+if (WITH_CR32S)
+  find_package (SIMD32)
+  set (WITH_CR32S ${SIMD32_FOUND}) # Engine stays enabled only if SIMD32 found
+endif ()
+
+if (WITH_CR64S)
+  find_package (SIMD64)
+  set (WITH_CR64S ${SIMD64_FOUND}) # Engine stays enabled only if SIMD64 found
 endif ()
 
 if (WITH_AVFFT)
@@ -98,7 +126,7 @@ if (WITH_AVFFT)
   endif ()
 endif ()
 
-if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD_FOUND))
+if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND))
   find_package (LibAVUtil)
   if (AVUTIL_FOUND)
     include_directories (${AVUTIL_INCLUDE_DIRS})
@@ -117,13 +145,22 @@ test_big_endian (HAVE_BIGENDIAN)
 # Compiler configuration:
 
 if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
-  set (PROJECT_CXX_FLAGS "-Wconversion -Wall -W -pedantic -Wundef -Wcast-align -Wpointer-arith -Wno-long-long")
-  set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} -std=gnu89 -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes")
+  set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wconversion -Wall -Wextra")
+  set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -pedantic -Wundef -Wpointer-arith")
+  set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wno-long-long -Wno-keyword-macro")
+  if (WITH_DEV_GPROF)
+    set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -pg")
+  endif ()
+  # Can use std=c89, but gnu89 should give faster sinf, cosf, etc.:
+  set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} -std=gnu89 -Wnested-externs")
+  set (PROJECT_C_FLAGS "${PROJECT_C_FLAGS} -Wmissing-prototypes -Wstrict-prototypes")
   if (CMAKE_BUILD_TYPE STREQUAL "Release")
     set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s") # strip
   endif ()
-  cmake_dependent_option (VISIBILITY_HIDDEN "Build with -fvisibility=hidden." ON
+  cmake_dependent_option (VISIBILITY_HIDDEN
+    "Build shared libraries with -fvisibility=hidden." ON
     "BUILD_SHARED_LIBS" OFF)
+  mark_as_advanced (VISIBILITY_HIDDEN)
   if (VISIBILITY_HIDDEN)
     add_definitions (-fvisibility=hidden -DSOXR_VISIBILITY)
   endif ()
@@ -131,9 +168,14 @@ endif ()
 
 if (MSVC)
   add_definitions (-D_USE_MATH_DEFINES -D_CRT_SECURE_NO_WARNINGS)
-  option (ENABLE_STATIC_RUNTIME "Visual Studio, link with runtime statically."  OFF)
-  if (ENABLE_STATIC_RUNTIME)
-    foreach (flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+  option (BUILD_SHARED_RUNTIME "MSVC, link with runtime dynamically."  ON)
+  if (NOT BUILD_SHARED_RUNTIME)
+    foreach (flag_var
+        CMAKE_C_FLAGS                CMAKE_CXX_FLAGS
+        CMAKE_C_FLAGS_DEBUG          CMAKE_CXX_FLAGS_DEBUG
+        CMAKE_C_FLAGS_RELEASE        CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL     CMAKE_CXX_FLAGS_MINSIZEREL
+        CMAKE_C_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS_RELWITHDEBINFO)
       string (REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
     endforeach ()
   endif ()
@@ -147,7 +189,8 @@ endif ()
 
 # Build configuration:
 
-if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows) # Allow exes to find dlls:
+if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows)
+  # Allow exes to find dlls:
   set (BIN ${PROJECT_BINARY_DIR}/bin/)
   set (EXAMPLES_BIN ${BIN})
   set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${BIN})
@@ -188,17 +231,16 @@ endif ()
 
 if (APPLE)
   option (BUILD_FRAMEWORK "Build an OS X framework." OFF)
-  set (FRAMEWORK_INSTALL_DIR "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.")
+  set (FRAMEWORK_INSTALL_DIR
+      "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.")
 endif ()
 
 
 
 # Top-level:
 
-set (PROJECT_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
-math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}")
-math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}")
-math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}")
+set (PROJECT_VERSION
+    ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
 set (SO_VERSION ${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH})
 
 configure_file (
@@ -206,7 +248,7 @@ configure_file (
   ${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config.h)
 include_directories (${PROJECT_BINARY_DIR})
 
-if (BUILD_TESTS OR BUILD_LSR_TESTS)
+if (NOT CMAKE_CROSSCOMPILING AND (BUILD_TESTS OR BUILD_LSR_TESTS))
   enable_testing ()
 endif ()
 
diff --git a/INSTALL b/INSTALL
index 3f30772..c429e61 100644
--- a/INSTALL
+++ b/INSTALL
@@ -78,32 +78,27 @@ To list help on the available options, enter:
 
 Options, if given, should be preceded with '-D', e.g.
 
-    cmake -DWITH_SIMD:BOOL=OFF ..
+    cmake -DBUILD_SHARED_LIBS:BOOL=OFF ..
 
 
 
 Resampling engines
 
-As available on a given system, the library may include up to four resampling
-‘engines’, as follows:
+Depending on what a given system supports, up to five resampling ‘engines’
+may be included, each controlled by a cmake option (set as above) as follows:
 
-    cr32: for constant-rate resampling with precision up to 20 bits,
-    cr32s: SIMD variant of cr32,
-    cr64: for constant-rate resampling with precision greater than 20 bits,
-    vr32: for variable-rate resampling.
+    WITH_CR32: for constant-rate HQ resampling,
+    WITH_CR32S: SIMD variant of previous,
+    WITH_CR64: for constant-rate VHQ resampling,
+    WITH_CR64S: SIMD variant of previous,
+    WITH_VR32: for variable-rate HQ resampling.
 
-Engine inclusion is controlled (as above) by the following cmake option
-variables:
+By default, these options are all set to ON.
 
-    cr32: WITH_SINGLE_PRECISION
-    cr32s: WITH_SINGLE_PRECISION, WITH_SIMD
-    cr32: WITH_DOUBLE_PRECISION
-
-By default, these variables are all set to ON.
-
-When both cr32 and cr32s engines are included, run-time selection is automatic
-(based on CPU capability) for x86 CPUs, and can be automatic for ARM CPUs if
-the 3rd-party library `libavutil' is available at libsoxr build-time.
+When both SIMD and non-SIMD engine variants are included, run-time selection
+is automatic (based on CPU capability) for x86 CPUs, and can be automatic for
+ARM CPUs if the 3rd-party library `libavutil' is available at libsoxr
+build-time.
 
 
 
@@ -114,13 +109,13 @@ E.g. targeting a Linux ARM system:
     mkdir build
     cd build
     cmake -DCMAKE_SYSTEM_NAME=Linux \
-          -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc-5 \
+          -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \
           ..
 or, also building the examples (one of which uses C++):
 
     cmake -DCMAKE_SYSTEM_NAME=Linux \
-          -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc-5 \
-          -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-g++-5 \
+          -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \
+          -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-g++ \
           -DBUILD_EXAMPLES=1 \
           ..
 
@@ -158,7 +153,7 @@ Autotools-based systems might find it useful to create a file called
     cmake -DBUILD_SHARED_LIBS=OFF .
 (or with other build options as required).
 
-For MS visual studio, see msvc/README.
+For MS Visual C++, see msvc/README.
 
 
 
@@ -168,7 +163,7 @@ The libsoxr API structure ‘soxr_runtime_spec_t’ allows application developer
 to optimise some aspects of libsoxr’s operation for a particular application.
 However, since optimal performance might depend on an individual end-user’s
 run-time system and the end-user’s preferences, environment variables are
-available to set (override) the run-time parameters:
+available to set (override) run-time parameters as follows:
 
     Env. variable        Equivalent soxr_runtime_spec_t item
     ------------------   -----------------------------------
@@ -178,8 +173,8 @@ available to set (override) the run-time parameters:
     SOXR_MIN_DFT_SIZE    log2_min_dft_size
     SOXR_NUM_THREADS     num_threads
 
-Additionally, the SOXR_USE_SIMD environment variable may be used to override
-automatic selection (or to provide manual selection where automatic selection
-is not available) between the cr32 and cr32s resampling engines. (Which engine
-is selected for a specific configuration of libsoxr can be checked using
-example #3, which reports it.)
+Additionally, the SOXR_USE_SIMD32 and SOXR_USE_SIMD64 environment variables
+may be used to override automatic selection (or to provide manual selection
+where automatic selection is not available) between SIMD and non-SIMD engine
+variants.  (Which engine is selected for a specific configuration of libsoxr
+can be checked using example #3, which reports it.)
diff --git a/LICENCE b/LICENCE
index 1c61878..52f84ee 100644
--- a/LICENCE
+++ b/LICENCE
@@ -1,4 +1,4 @@
-SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
 
 This library is free software; you can redistribute it and/or modify it
 under the terms of the GNU Lesser General Public License as published by
@@ -11,8 +11,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
 General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public License
-along with this library; if not, write to the Free Software Foundation,
-Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+along with this library; if not, see <http://www.gnu.org/licenses/>.
 
 
 Notes
diff --git a/NEWS b/NEWS
index a2eca85..d368fa2 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,8 @@
 Version 0.1.3 (2016-mm-dd)
-  * Better support for clang, ARM+SIMD, and cross-compilation.
+  * SIMD enhancements: SSE, AVX, Neon.
+  * Improved support for clang, ARM, and cross-compilation.
   * Other minor fixes/improvements to build/tests/documentation.
+  * N.B. some cmake configuration variable name changes.
 
 Version 0.1.2 (2015-09-05)
   * Fix conversion failure when I/O types differ but I/O rates don't.
diff --git a/README b/README
index 06f11e6..0070896 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
 
 The SoX Resampler library `libsoxr' performs one-dimensional sample-rate
 conversion -- it may be used, for example, to resample PCM-encoded audio.
diff --git a/TODO b/TODO
index 1c4a31b..c699c0c 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,2 @@
 * SOXR_ALLOW_ALIASING
 * Explicit flush API fn, perhaps.
-* More SIMD.
diff --git a/cmake/Modules/FindCFlags.cmake b/cmake/Modules/FindCFlags.cmake
new file mode 100644
index 0000000..f118727
--- /dev/null
+++ b/cmake/Modules/FindCFlags.cmake
@@ -0,0 +1,35 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Function to find C compiler feature flags
+
+include (CheckCSourceCompiles)
+include (FindPackageHandleStandardArgs)
+
+function (FindCFlags PKG_NAME PKG_DESC TRIAL_C_FLAGS TEST_C_SOURCE)
+
+foreach (TRIAL_C_FLAG ${TRIAL_C_FLAGS})
+  message (STATUS "Trying ${PKG_NAME} C flags: ${TRIAL_C_FLAG}")
+  unset (DETECT_${PKG_NAME}_C_FLAGS CACHE) #displayed by check_c_source_compiles
+
+  set (TMP "${CMAKE_REQUIRED_FLAGS}")
+  set (CMAKE_REQUIRED_FLAGS "${TRIAL_C_FLAG}")
+  check_c_source_compiles ("${TEST_C_SOURCE}" DETECT_${PKG_NAME}_C_FLAGS)
+  set (CMAKE_REQUIRED_FLAGS "${TMP}")
+
+  if (DETECT_${PKG_NAME}_C_FLAGS)
+    set (DETECTED_C_FLAGS "${TRIAL_C_FLAG}")
+    break ()
+  endif ()
+endforeach ()
+
+# N.B. Will not overwrite existing cache variable:
+set (${PKG_NAME}_C_FLAGS "${DETECTED_C_FLAGS}"
+  CACHE STRING "C compiler flags for ${PKG_DESC}")
+
+find_package_handle_standard_args (${PKG_NAME} # FOUND_VAR: for cmake < 3.3
+  FOUND_VAR ${PKG_NAME}_FOUND REQUIRED_VARS ${PKG_NAME}_C_FLAGS)
+mark_as_advanced (${PKG_NAME}_C_FLAGS)
+set (${PKG_NAME}_FOUND ${${PKG_NAME}_FOUND} PARENT_SCOPE)
+
+endfunction ()
diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake
index 74e5bc5..3664eed 100644
--- a/cmake/Modules/FindOpenMP.cmake
+++ b/cmake/Modules/FindOpenMP.cmake
@@ -1,117 +1,38 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
 # - Finds OpenMP support
-# This module can be used to detect OpenMP support in a compiler.
-# If the compiler supports OpenMP, the flags required to compile with
-# openmp support are set.
 #
 # The following variables are set:
-#   OpenMP_C_FLAGS - flags to add to the C compiler for OpenMP support
-#   OPENMP_FOUND - true if openmp is detected
-#
-# Supported compilers can be found at http://openmp.org/wp/openmp-compilers/
-#
-# Modifications for soxr:
-#   * don't rely on presence of C++ compiler
-#   * support MINGW
-#
-#=============================================================================
-# Copyright 2009 Kitware, Inc.
-# Copyright 2008-2009 André Rigland Brodtkorb <Andre.Brodtkorb@ifi.uio.no>
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#  * Redistributions of source code must retain the above copyright notice,
-#    this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-#  * The names of Kitware, Inc., the Insight Consortium, or the names of
-#    any consortium members, or of any contributors, may not be used to
-#    endorse or promote products derived from this software without
-#    specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#   OpenMP_C_FLAGS - flags to add to the C compiler for this package.
+#   OpenMP_FOUND - true if support for this package is found.
 
-include (CheckCSourceCompiles)
-include (FindPackageHandleStandardArgs)
-
-set (OpenMP_C_FLAG_CANDIDATES
-  #Gnu
-  "-fopenmp"
-  #Clang
-  "-fopenmp=libiomp5"
-  #Microsoft Visual Studio
-  "/openmp"
-  #Intel windows
-  "-Qopenmp"
-  #Intel
-  "-openmp"
-  #Empty, if compiler automatically accepts openmp
-  " "
-  #Sun
-  "-xopenmp"
-  #HP
-  "+Oopenmp"
-  #IBM XL C/c++
-  "-qsmp"
-  #Portland Group
-  "-mp"
-)
-
-# sample openmp source code to test
-set (OpenMP_C_TEST_SOURCE
-"
-#include <omp.h>
-int main() {
-#ifdef _OPENMP
-  return 0;
-#else
-  breaks_on_purpose
-#endif
-}
-")
-# if these are set then do not try to find them again,
-# by avoiding any try_compiles for the flags
 if (DEFINED OpenMP_C_FLAGS)
-  set (OpenMP_C_FLAG_CANDIDATES)
-endif (DEFINED OpenMP_C_FLAGS)
+  set (TRIAL_C_FLAGS)
+else ()
+  set (TRIAL_C_FLAGS
+    "-fopenmp"          # Gnu
+    "-fopenmp=libiomp5" # Clang
+    "/openmp"           # MSVC
+    " "                 # Empty, if compiler automatically accepts OpenMP
+  )
 
-# check c compiler
-foreach (FLAG ${OpenMP_C_FLAG_CANDIDATES})
-  set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
-  set (CMAKE_REQUIRED_FLAGS "${FLAG}")
-  unset (OpenMP_FLAG_DETECTED CACHE)
-  message (STATUS "Try OpenMP C flag = [${FLAG}]")
-  check_c_source_compiles ("${OpenMP_C_TEST_SOURCE}" OpenMP_FLAG_DETECTED)
-  set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
-  if (OpenMP_FLAG_DETECTED)
-    set (OpenMP_C_FLAGS_INTERNAL "${FLAG}")
-    break ()
-  endif (OpenMP_FLAG_DETECTED)
-endforeach (FLAG ${OpenMP_C_FLAG_CANDIDATES})
+  set (TEST_C_SOURCE "
+    #ifndef _OPENMP
+      #error
+    #endif
+    #include <omp.h>
+    int main() {return 0;}
+  ")
+endif ()
 
-set (OpenMP_C_FLAGS "${OpenMP_C_FLAGS_INTERNAL}"
-  CACHE STRING "C compiler flags for OpenMP parallization")
+include (FindCFlags)
 
-# handle the standard arguments for find_package
-find_package_handle_standard_args (OpenMP DEFAULT_MSG
-  OpenMP_C_FLAGS)
+FindCFlags ("OpenMP" "OpenMP threading"
+  "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
 
 if (MINGW)
   set (OpenMP_SHARED_LINKER_FLAGS "${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
   set (OpenMP_EXE_LINKER_FLAGS "${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+  mark_as_advanced (OpenMP_SHARED_LINKER_FLAGS OpenMP_EXE_LINKER_FLAGS)
 endif ()
-
-mark_as_advanced (OpenMP_C_FLAGS OpenMP_SHARED_LINKER_FLAGS OpenMP_EXE_LINKER_FLAGS)
diff --git a/cmake/Modules/FindSIMD.cmake b/cmake/Modules/FindSIMD.cmake
deleted file mode 100644
index 30dca70..0000000
--- a/cmake/Modules/FindSIMD.cmake
+++ /dev/null
@@ -1,104 +0,0 @@
-# - Finds SIMD support
-#
-# The following variables are set:
-#   SIMD_C_FLAGS - flags to add to the C compiler for this package.
-#   SIMD_FOUND - true if support for this package is found.
-#
-#=============================================================================
-# Based on FindOpenMP.cmake, which is:
-#
-# Copyright 2009 Kitware, Inc.
-# Copyright 2008-2009 André Rigland Brodtkorb <Andre.Brodtkorb@ifi.uio.no>
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-#  * Redistributions of source code must retain the above copyright notice,
-#    this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-#  * The names of Kitware, Inc., the Insight Consortium, or the names of
-#    any consortium members, or of any contributors, may not be used to
-#    endorse or promote products derived from this software without
-#    specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-include (CheckCSourceCompiles)
-include (FindPackageHandleStandardArgs)
-
-if (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
-  set (SIMD_C_FLAG_CANDIDATES
-    # Gcc
-    "-Wno-cast-align -mfpu=neon-vfpv4 -mcpu=cortex-a7"
-    "-Wno-cast-align -mfpu=neon       -mfloat-abi=hard"
-    "-Wno-cast-align -mfpu=neon       -mfloat-abi=softfp"
-    "-Wno-cast-align -mfpu=neon       -mfloat-abi=soft"
-  )
-  set (SIMD_C_TEST_SOURCE "
-    #include <arm_neon.h>
-    int main() {
-      float32x4_t a = vdupq_n_f32(0), b = a, c = vaddq_f32(a,b);
-      return 0;
-    }
-    ")
-else ()
-  if (WIN32) # Safety for when mixed lib/app compilers (but performance hit)
-    set (GCC_WIN32_SIMD_OPTS "-mincoming-stack-boundary=2")
-  endif ()
-
-  set (SIMD_C_FLAG_CANDIDATES
-    # x64
-    "-Wno-cast-align"
-    " "
-    # Microsoft Visual Studio x86
-    "/arch:SSE /fp:fast -D__SSE__"
-    # Gcc x86
-    "-Wno-cast-align -msse -mfpmath=sse ${GCC_WIN32_SIMD_OPTS}"
-    # Gcc x86 (old versions)
-    "-msse -mfpmath=sse"
-  )
-  set (SIMD_C_TEST_SOURCE "
-    #include <xmmintrin.h>
-    int main() {
-      __m128 a = _mm_setzero_ps(), b = a, c = _mm_add_ps(a,b);
-      return 0;
-    }
-    ")
-endif ()
-
-if (DEFINED SIMD_C_FLAGS)
-  set (SIMD_C_FLAG_CANDIDATES)
-endif ()
-
-foreach (FLAG ${SIMD_C_FLAG_CANDIDATES})
-  set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
-  set (CMAKE_REQUIRED_FLAGS "${FLAG}")
-  unset (SIMD_FLAG_DETECTED CACHE)
-  message (STATUS "Try SIMD C flag = [${FLAG}]")
-  check_c_source_compiles ("${SIMD_C_TEST_SOURCE}" SIMD_FLAG_DETECTED)
-  set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
-  if (SIMD_FLAG_DETECTED)
-    set (SIMD_C_FLAGS_INTERNAL "${FLAG}")
-    break ()
-  endif ()
-endforeach ()
-
-set (SIMD_C_FLAGS "${SIMD_C_FLAGS_INTERNAL}"
-  CACHE STRING "C compiler flags for SIMD vectorization")
-
-find_package_handle_standard_args (SIMD DEFAULT_MSG SIMD_C_FLAGS SIMD_C_FLAGS)
-mark_as_advanced (SIMD_C_FLAGS)
diff --git a/cmake/Modules/FindSIMD32.cmake b/cmake/Modules/FindSIMD32.cmake
new file mode 100644
index 0000000..c215455
--- /dev/null
+++ b/cmake/Modules/FindSIMD32.cmake
@@ -0,0 +1,54 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Finds SIMD32 support
+#
+# The following variables are set:
+#   SIMD32_C_FLAGS - flags to add to the C compiler for this package.
+#   SIMD32_FOUND - true if support for this package is found.
+
+if (DEFINED SIMD32_C_FLAGS)
+  set (TRIAL_C_FLAGS)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+  set (TRIAL_C_FLAGS
+    # Gcc
+    "-mfpu=neon-vfpv4 -mcpu=cortex-a7"
+    "-mfpu=neon       -mfloat-abi=hard"
+    "-mfpu=neon       -mfloat-abi=softfp"
+    "-mfpu=neon       -mfloat-abi=soft"
+  )
+  set (TEST_C_SOURCE "
+    #include <arm_neon.h>
+    int main() {
+      float32x4_t a = vdupq_n_f32(0), b = a, c = vaddq_f32(a,b);
+      return 0;
+    }
+  ")
+else ()
+  if (WIN32) # Safety for when mixed lib/app compilers (but performance hit)
+    set (GCC_WIN32_SIMD32_OPTS "-mincoming-stack-boundary=2")
+  endif ()
+
+  set (TRIAL_C_FLAGS
+    # x64
+    " "
+    # MSVC x86
+    "/arch:SSE /fp:fast -D__SSE__"
+    # Gcc x86
+    "-Wno-cast-align -msse -mfpmath=sse ${GCC_WIN32_SIMD32_OPTS}"
+    # Gcc x86 (old versions)
+    "-msse -mfpmath=sse"
+  )
+  set (TEST_C_SOURCE "
+    #include <xmmintrin.h>
+    int main() {
+      __m128 a = _mm_setzero_ps(), b = a, c = _mm_add_ps(a,b);
+      return 0;
+    }
+  ")
+endif ()
+
+include (FindCFlags)
+
+FindCFlags ("SIMD32" "FLOAT-32 (single-precision) SIMD vectorization"
+  "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
diff --git a/cmake/Modules/FindSIMD64.cmake b/cmake/Modules/FindSIMD64.cmake
new file mode 100644
index 0000000..5bf82c9
--- /dev/null
+++ b/cmake/Modules/FindSIMD64.cmake
@@ -0,0 +1,29 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Finds SIMD64 support
+#
+# The following variables are set:
+#   SIMD64_C_FLAGS - flags to add to the C compiler for this package.
+#   SIMD64_FOUND - true if support for this package is found.
+
+if (DEFINED SIMD64_C_FLAGS OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+  set (TRIAL_C_FLAGS)
+else ()
+  set (TRIAL_C_FLAGS
+    "-mavx" # Gcc
+    "/arch:AVX" # MSVC
+  )
+  set (TEST_C_SOURCE "
+    #include <immintrin.h>
+    int main() {
+      __m256d a = _mm256_setzero_pd(), b = a, c = _mm256_add_pd(a,b);
+      return 0;
+    }
+    ")
+endif ()
+
+include (FindCFlags)
+
+FindCFlags ("SIMD64" "FLOAT-64 (double-precision) SIMD vectorization"
+  "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
diff --git a/cmake/Modules/SetSystemProcessor.cmake b/cmake/Modules/SetSystemProcessor.cmake
index d1973f8..9bafe05 100644
--- a/cmake/Modules/SetSystemProcessor.cmake
+++ b/cmake/Modules/SetSystemProcessor.cmake
@@ -5,13 +5,16 @@
 
 macro (set_system_processor)
   if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "")
+    unset(CMAKE_SYSTEM_PROCESSOR)
+  endif ()
+  if (NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
     include (CheckCSourceCompiles)
-    set (CPU_CANDIDATES
+    set (CPU_LINES
       "#if defined __x86_64__ || defined _M_X64  /*\;x86_64\;*/"
       "#if defined __i386__   || defined _M_IX86 /*\;x86_32\;*/"
       "#if defined __arm__    || defined _M_ARM  /*\;arm\;*/"
       )
-    foreach (CPU_LINE ${CPU_CANDIDATES})
+    foreach (CPU_LINE ${CPU_LINES})
       string (CONCAT CPU_SOURCE "${CPU_LINE}" "
       int main() {return 0;}
       #endif
@@ -19,10 +22,14 @@ macro (set_system_processor)
       unset (SYSTEM_PROCESSOR_DETECTED CACHE)
       check_c_source_compiles ("${CPU_SOURCE}" SYSTEM_PROCESSOR_DETECTED)
       if (SYSTEM_PROCESSOR_DETECTED)
-        list (GET CPU_LINE 1 CMAKE_SYSTEM_PROCESSOR)
-        message (STATUS "CMAKE_SYSTEM_PROCESSOR set to ${CMAKE_SYSTEM_PROCESSOR}")
+        list (GET CPU_LINE 1 DETECTED_SYSTEM_PROCESSOR)
+        message (STATUS "CMAKE_SYSTEM_PROCESSOR is ${DETECTED_SYSTEM_PROCESSOR}")
         break ()
       endif ()
     endforeach ()
   endif ()
+  if (DEFINED DETECTED_SYSTEM_PROCESSOR) # Else would cache "", hiding preset
+    set (CMAKE_SYSTEM_PROCESSOR "${DETECTED_SYSTEM_PROCESSOR}"
+      CACHE STRING "Target system processor") # N.B. keeps existing cache entry
+  endif ()
 endmacro ()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b7b50f8..7a95823 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,16 +1,18 @@
-# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
 # Licence for this file: LGPL v2.1                  See LICENCE for details.
 
+if (${BUILD_EXAMPLES} OR ${BUILD_TESTS})
+  set (SOURCES 3-options-input-fn)
+  if (${WITH_LSR_BINDINGS})
+    set (LSR_SOURCES 1a-lsr)
+  endif ()
+endif ()
+
 if (${BUILD_EXAMPLES})
   project (soxr) # Adds c++ compiler
-  file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/[1-9]-*.[cC])
-elseif (${BUILD_TESTS})
-  file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/3*.c)
-endif ()
-
-if (${BUILD_EXAMPLES} OR ${BUILD_TESTS})
-  if (${WITH_LSR_BINDINGS})
-    set (LSR_SOURCES 1a-lsr.c)
+  list (APPEND SOURCES 1-single-block 2-stream 4-split-channels)
+  if (${WITH_VR32})
+    list (APPEND SOURCES 5-variable-rate)
   endif ()
 endif ()
 
@@ -34,4 +36,5 @@ if (${BUILD_TESTS} AND ${WITH_LSR_BINDINGS})
 endif ()
 
 file (GLOB INSTALL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.[cCh])
-install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README DESTINATION ${DOC_INSTALL_DIR}/examples)
+install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README
+    DESTINATION ${DOC_INSTALL_DIR}/examples)
diff --git a/examples/examples-common.h b/examples/examples-common.h
index 25bd48c..7ebde73 100644
--- a/examples/examples-common.h
+++ b/examples/examples-common.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
 /* Common includes etc. for the examples.  */
@@ -6,7 +6,7 @@
 #include <assert.h>
 #include <errno.h>
 #include <limits.h>
-#include <math.h>
+#include "math-wrap.h"
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -17,10 +17,7 @@
   #include <io.h>
   #include <fcntl.h>
   #define USE_STD_STDIO _setmode(_fileno(stdout), _O_BINARY), \
-                        _setmode(_fileno(stdin ), _O_BINARY);
-  /* Sometimes missing, so ensure that it is defined: */
-  #undef M_PI
-  #define M_PI 3.14159265358979323846
+                        _setmode(_fileno(stdin ), _O_BINARY)
 #else
   #define USE_STD_STDIO
 #endif
diff --git a/lsr-tests/CMakeLists.txt b/lsr-tests/CMakeLists.txt
index 1ac041e..4f718f7 100644
--- a/lsr-tests/CMakeLists.txt
+++ b/lsr-tests/CMakeLists.txt
@@ -35,7 +35,7 @@ set (tests
   callback_hang_test callback_test downsample_test
   float_short_test misc_test multi_channel_test
   reset_test simple_test termination_test varispeed_test)
-if (WITH_DOUBLE_PRECISION)
+if (WITH_CR64 OR WITH_CR64S)
   set (tests ${tests} snr_bw_test)
 endif ()
 
diff --git a/lsr-tests/config.h.in b/lsr-tests/config.h.in
index 3de8799..1095e00 100644
--- a/lsr-tests/config.h.in
+++ b/lsr-tests/config.h.in
@@ -1,4 +1,4 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
 #if !defined soxsrc_lsr_tests_config_included
diff --git a/lsr-tests/snr_bw_test.c b/lsr-tests/snr_bw_test.c
index 26fb279..55130b4 100644
--- a/lsr-tests/snr_bw_test.c
+++ b/lsr-tests/snr_bw_test.c
@@ -118,9 +118,9 @@ main (int argc, char *argv [])
 			BOOLEAN_TRUE,
 			{	{	1,	{ 0.01111111111 },		3.0,		1,	130.0,	1.0 },
 				{	1,	{ 0.01111111111 },		0.6,		1,	132.0,	1.0 },
-				{	1,	{ 0.01111111111 },		0.3,		1,	138.0,	1.0 },
+				{	1,	{ 0.01111111111 },		0.3,		1,	135.0,	1.0 },
 				{	1,	{ 0.01111111111 },		1.0,		1,	155.0,	1.0 },
-				{	1,	{ 0.01111111111 },		1.001,		1,	134.0,	1.0 },
+				{	1,	{ 0.01111111111 },		1.001,		1,	133.0,	1.0 },
 				{	2,	{ 0.011111, 0.324 },	1.9999,		2,	127.0,	1.0 },
 				{	2,	{ 0.012345, 0.457 },	0.456789,	1,	124.0,	0.5 },
 				{	2,	{ 0.011111, 0.45 },		0.6,		1,	126.0,	0.5 },
@@ -135,10 +135,10 @@ main (int argc, char *argv [])
 				{	1,	{ 0.01111111111 },		0.6,		1,	147.0,	1.0 },
 				{	1,	{ 0.01111111111 },		0.3,		1,	147.0,	1.0 },
 				{	1,	{ 0.01111111111 },		1.0,		1,	155.0,	1.0 },
-				{	1,	{ 0.01111111111 },		1.001,		1,	147.0,	1.0 },
+				{	1,	{ 0.01111111111 },		1.001,		1,	146.0,	1.0 },
 				{	2,	{ 0.011111, 0.324 },	1.9999,		2,	147.0,	1.0 },
 				{	2,	{ 0.012345, 0.457 },	0.456789,	1,	148.0,	0.5 },
-				{	2,	{ 0.011111, 0.45 },		0.6,		1,	149.0,	0.5 },
+				{	2,	{ 0.011111, 0.45 },		0.6,		1,	145.0,	0.5 },
 				{	1,	{ 0.43111111111 },		1.33,		1,	145.0,	1.0 }
 				}
 			},
diff --git a/msvc/libsoxr.vcproj b/msvc/libsoxr.vcproj
index b1e1714..2dd7be0 100644
--- a/msvc/libsoxr.vcproj
+++ b/msvc/libsoxr.vcproj
@@ -60,6 +60,12 @@
 	</References>
 	<Files>
 		<Filter Name="Source Files" >
+			<File RelativePath="../src/constructors.c" />
+			<File RelativePath="../src/cr.c" />
+			<File RelativePath="../src/cr32.c" />
+			<File RelativePath="../src/cr32s.c" />
+			<File RelativePath="../src/cr64.c" />
+			<File RelativePath="../src/cr64s.c" />
 			<File RelativePath="../src/data-io.c" />
 			<File RelativePath="../src/dbesi0.c" />
 			<File RelativePath="../src/fft4g32.c" />
@@ -67,10 +73,9 @@
 			<File RelativePath="../src/filter.c" />
 			<File RelativePath="../src/lsr.c" />
 			<File RelativePath="../src/pffft32s.c" />
-			<File RelativePath="../src/rate32.c" />
-			<File RelativePath="../src/rate32s.c" />
-			<File RelativePath="../src/rate64.c" />
-			<File RelativePath="../src/simd.c" />
+			<File RelativePath="../src/pffft64s.c" />
+			<File RelativePath="../src/simd32.c" />
+			<File RelativePath="../src/simd64.c" />
 			<File RelativePath="../src/soxr.c" />
 			<File RelativePath="../src/vr32.c" />
 		</Filter>
diff --git a/msvc/soxr-config.h b/msvc/soxr-config.h
index d6b07ac..a1fcfb4 100644
--- a/msvc/soxr-config.h
+++ b/msvc/soxr-config.h
@@ -7,12 +7,14 @@
 #if !defined soxr_config_included
 #define soxr_config_included
 
-#define WITH_SINGLE_PRECISION 1
-#define WITH_DOUBLE_PRECISION 1
+#define WITH_CR32 1
+#define WITH_CR32S 1
+#define WITH_CR64 1
+#define WITH_CR64S 1
+#define WITH_VR32 1
 
 #define AVCODEC_FOUND 0
 #define AVUTIL_FOUND 0
-#define SIMD_FOUND 1
 
 #define HAVE_FENV_H 0
 #define HAVE_STDBOOL_H 0
diff --git a/multi-arch b/multi-arch
new file mode 100755
index 0000000..174cc4b
--- /dev/null
+++ b/multi-arch
@@ -0,0 +1,35 @@
+#!/bin/sh
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+rm -f CMakeCache.txt             # Prevent interference from any in-tree build
+
+j=-j4
+build=Release
+
+for n in \
+    cc: \
+    clang: \
+    arm-linux-gnueabi-gcc:Linux \
+    x86_64-w64-mingw32-gcc:Windows \
+    i686-w64-mingw32-gcc:Windows \
+    ; do
+  compiler=$(echo $n | sed 's/:.*//')
+  system=$(echo $n | sed 's/.*://')
+  dir=$build-$compiler
+  # Skip any compiler that is not installed.  (N.B. `a || b && c' groups as
+  # `(a || b) && c', so it would still try to build after reporting not found.)
+  if ! command -v $compiler > /dev/null; then
+    echo $compiler not found
+    continue
+  fi
+  (
+    echo "***" $dir
+    mkdir -p $dir
+    cd $dir
+    cmake -DCMAKE_BUILD_TYPE=$build -DCMAKE_C_COMPILER=$compiler -DCMAKE_SYSTEM_NAME="$system" ..
+    make $j && [ /$system = / ] && ctest -j || true   # Tests run for native builds only
+  )
+done
diff --git a/soxr-config.h.in b/soxr-config.h.in
index 3c7dbe9..37ccdb2 100644
--- a/soxr-config.h.in
+++ b/soxr-config.h.in
@@ -4,12 +4,14 @@
 #if !defined soxr_config_included
 #define soxr_config_included
 
-#cmakedefine01 WITH_SINGLE_PRECISION
-#cmakedefine01 WITH_DOUBLE_PRECISION
+#cmakedefine01 WITH_CR32
+#cmakedefine01 WITH_CR32S
+#cmakedefine01 WITH_CR64
+#cmakedefine01 WITH_CR64S
+#cmakedefine01 WITH_VR32
 
 #cmakedefine01 AVCODEC_FOUND
 #cmakedefine01 AVUTIL_FOUND
-#cmakedefine01 SIMD_FOUND
 
 #cmakedefine01 HAVE_FENV_H
 #cmakedefine01 HAVE_STDBOOL_H
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 19c7522..67d34d7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,4 +1,4 @@
-# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
 # Licence for this file: LGPL v2.1                  See LICENCE for details.
 
 
@@ -7,8 +7,10 @@
 
 if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vr-coefs.h)
   include_directories(${CMAKE_CURRENT_BINARY_DIR})
-  set_property(SOURCE vr32.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h)
+  set_property(SOURCE vr32.c
+      APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h)
   add_executable (vr-coefs vr-coefs.c)
+  target_link_libraries (vr-coefs ${LIBM_LIBRARIES})
   ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
     COMMAND vr-coefs > ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
     DEPENDS vr-coefs)
@@ -29,22 +31,41 @@ if (AVCODEC_FOUND)
 elseif (WITH_PFFFT)
   #set (RDFT32 pffft32)
   set (RDFT32S pffft32s)
-elseif (SIMD_FOUND)
-  set (RDFT32S fft4g32s)
+elseif (WITH_CR32S)
+  set (RDFT32S fft4g32s fft4g32)
 endif ()
 
-if (WITH_DOUBLE_PRECISION)
-  set (DP_SOURCES rate64)
+set (SOURCES ${PROJECT_NAME}.c constructors data-io)
+
+if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
+  list (APPEND SOURCES dbesi0 filter fft4g64 cr.c)
 endif ()
 
-if (WITH_SINGLE_PRECISION)
-  set (SP_SOURCES rate32 ${RDFT32})
+if (WITH_CR32)
+  list (APPEND SOURCES cr32 ${RDFT32})
 endif ()
 
-if (SIMD_FOUND)
-  set (SIMD_SOURCES rate32s ${RDFT32S} simd)
-  foreach (source ${SIMD_SOURCES})
-    set_property (SOURCE ${source} PROPERTY COMPILE_FLAGS ${SIMD_C_FLAGS})
+if (WITH_CR64)
+  list (APPEND SOURCES cr64)
+endif ()
+
+if (WITH_VR32)
+  list (APPEND SOURCES vr32)
+endif ()
+
+if (WITH_CR32S)
+  foreach (source cr32s ${RDFT32S} simd32)
+    list (APPEND SOURCES ${source})
+    set_property (SOURCE ${source}
+        APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD32_C_FLAGS})
+  endforeach ()
+endif ()
+
+if (WITH_CR64S)
+  foreach (source cr64s pffft64s simd64)
+    list (APPEND SOURCES ${source})
+    set_property (SOURCE ${source}
+        APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD64_C_FLAGS})
   endforeach ()
 endif ()
 
@@ -52,8 +73,7 @@ endif ()
 
 # Libsoxr:
 
-add_library (${PROJECT_NAME} ${LIB_TYPE} ${PROJECT_NAME}.c data-io dbesi0 filter fft4g64
-  ${SP_SOURCES} vr32 ${DP_SOURCES} ${SIMD_SOURCES})
+add_library (${PROJECT_NAME} ${LIB_TYPE} ${SOURCES})
 target_link_libraries (${PROJECT_NAME} PRIVATE ${LIBS} ${LIBM_LIBRARIES})
 set_target_properties (${PROJECT_NAME} PROPERTIES
   VERSION "${SO_VERSION}"
diff --git a/src/aliases.h b/src/aliases.h
index eb42bdc..ebcce41 100644
--- a/src/aliases.h
+++ b/src/aliases.h
@@ -18,8 +18,10 @@
 #define lsx_dfst_f                     _soxr_dfst_f
 #define lsx_dfst                       _soxr_dfst
 #define lsx_fir_to_phase               _soxr_fir_to_phase
+#define lsx_f_resp                     _soxr_f_resp
 #define lsx_init_fft_cache_f           _soxr_init_fft_cache_f
 #define lsx_init_fft_cache             _soxr_init_fft_cache
+#define lsx_inv_f_resp                 _soxr_inv_f_resp
 #define lsx_kaiser_beta                _soxr_kaiser_beta
 #define lsx_kaiser_params              _soxr_kaiser_params
 #define lsx_make_lpf                   _soxr_make_lpf
diff --git a/src/avfft32.c b/src/avfft32.c
index 5be13d2..fe651f5 100644
--- a/src/avfft32.c
+++ b/src/avfft32.c
@@ -1,17 +1,19 @@
 /* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
+#include <stdlib.h>
 #include <math.h>
 #include <libavcodec/avfft.h>
 #include "filter.h"
+#include "rdft_t.h"
 
 static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
 static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
 static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
 static int multiplier(void) {return 2;}
 static void nothing(void) {}
+static int flags(void) {return 0;}
 
-typedef void (* fn_t)(void);
 fn_t _soxr_rdft32_cb[] = {
   (fn_t)forward_setup,
   (fn_t)backward_setup,
@@ -24,4 +26,8 @@ fn_t _soxr_rdft32_cb[] = {
   (fn_t)_soxr_ordered_partial_convolve_f,
   (fn_t)multiplier,
   (fn_t)nothing,
+  (fn_t)malloc,
+  (fn_t)calloc,
+  (fn_t)free,
+  (fn_t)flags,
 };
diff --git a/src/avfft32s.c b/src/avfft32s.c
index 75e485e..8b1da80 100644
--- a/src/avfft32s.c
+++ b/src/avfft32s.c
@@ -3,15 +3,16 @@
 
 #include <math.h>
 #include <libavcodec/avfft.h>
-#include "simd.h"
+#include "simd32.h"
+#include "rdft_t.h"
 
 static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
 static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
 static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
 static int multiplier(void) {return 2;}
 static void nothing(void) {}
+static int flags(void) {return RDFT_IS_SIMD;}
 
-typedef void (* fn_t)(void);
 fn_t _soxr_rdft32s_cb[] = {
   (fn_t)forward_setup,
   (fn_t)backward_setup,
@@ -20,8 +21,12 @@ fn_t _soxr_rdft32s_cb[] = {
   (fn_t)rdft,
   (fn_t)rdft,
   (fn_t)rdft,
-  (fn_t)_soxr_ordered_convolve_simd,
-  (fn_t)_soxr_ordered_partial_convolve_simd,
+  (fn_t)ORDERED_CONVOLVE_SIMD,
+  (fn_t)ORDERED_PARTIAL_CONVOLVE_SIMD,
   (fn_t)multiplier,
   (fn_t)nothing,
+  (fn_t)SIMD_ALIGNED_MALLOC,
+  (fn_t)SIMD_ALIGNED_CALLOC,
+  (fn_t)SIMD_ALIGNED_FREE,
+  (fn_t)flags,
 };
diff --git a/src/avx.h b/src/avx.h
new file mode 100644
index 0000000..ace19b5
--- /dev/null
+++ b/src/avx.h
@@ -0,0 +1,48 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* AVX support macros: 4-lane double-precision vector primitives built on the
+ * 256-bit _mm256_*_pd intrinsics. */
+
+#if !defined soxr_avx_included
+#define soxr_avx_included
+
+#include <stddef.h>
+#include <immintrin.h>
+
+typedef __m256d v4sf; /* N.B. holds 4 doubles, despite the `sf' in the name. */
+#define VZERO() _mm256_setzero_pd()
+#define VMUL(a,b) _mm256_mul_pd(a,b)
+#define VADD(a,b) _mm256_add_pd(a,b)
+#define VMADD(a,b,c) VADD(VMUL(a,b),c) /* Note: gcc -mfma will `fuse' these */
+#define VSUB(a,b) _mm256_sub_pd(a,b)
+#define LD_PS1(p) _mm256_set1_pd(p)
+/* in1=(a0,a1,a2,a3), in2=(b0,b1,b2,b3) -> out1=(a0,b0,a1,b1), out2=(a2,b2,a3,b3) */
+#define INTERLEAVE2(in1, in2, out1, out2) {v4sf \
+  t1 = _mm256_unpacklo_pd(in1, in2), \
+  t2 = _mm256_unpackhi_pd(in1, in2); \
+  out1 = _mm256_permute2f128_pd(t1,t2,0x20); \
+  out2 = _mm256_permute2f128_pd(t1,t2,0x31); }
+/* Inverse of INTERLEAVE2: out1 = even-indexed, out2 = odd-indexed elements of in1,in2 */
+#define UNINTERLEAVE2(in1, in2, out1, out2) {v4sf \
+  t1 = _mm256_permute2f128_pd(in1,in2,0x20), \
+  t2 = _mm256_permute2f128_pd(in1,in2,0x31); \
+  out1 = _mm256_unpacklo_pd(t1, t2); \
+  out2 = _mm256_unpackhi_pd(t1, t2);}
+/* In-place 4x4 transpose: afterwards, xN holds element N of each original x0..x3 */
+#define VTRANSPOSE4(x0,x1,x2,x3) {v4sf \
+  t0 = _mm256_shuffle_pd(x0,x1, 0x0), \
+  t2 = _mm256_shuffle_pd(x0,x1, 0xf), \
+  t1 = _mm256_shuffle_pd(x2,x3, 0x0), \
+  t3 = _mm256_shuffle_pd(x2,x3, 0xf); \
+  x0 = _mm256_permute2f128_pd(t0,t1, 0x20); \
+  x1 = _mm256_permute2f128_pd(t2,t3, 0x20); \
+  x2 = _mm256_permute2f128_pd(t0,t1, 0x31); \
+  x3 = _mm256_permute2f128_pd(t2,t3, 0x31);}
+/* Result = (low 128 bits of b, high 128 bits of a) */
+#define VSWAPHL(a,b) _mm256_permute2f128_pd(b, a, 0x30)
+/* True if ptr is 32-byte aligned.  The cast is to size_t, not long: long is
+ * only 32 bits wide on LLP64 targets (e.g. 64-bit Windows), so a pointer
+ * would not fit in it there. */
+#define VALIGNED(ptr) ((((size_t)(ptr)) & 0x1F) == 0)
+
+#endif
diff --git a/src/constructors.c b/src/constructors.c
new file mode 100644
index 0000000..0128990
--- /dev/null
+++ b/src/constructors.c
@@ -0,0 +1,85 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "soxr.h"
+#include "filter.h"
+#include "internal.h"
+#include <math.h>
+#include <stdarg.h>
+#include <string.h>
+
+#if !WITH_CR32 && !WITH_CR32S && !WITH_CR64 && !WITH_CR64S
+#undef lsx_to_3dB
+#define lsx_to_3dB(x) ((x)/(x))
+#endif
+
+
+/* Expand recipe (bits 0-3: quality, bits 4-5: phase, SOXR_STEEP_FILTER) and flags into a quality spec; if quality is invalid, only `e' is set. */
+soxr_quality_spec_t soxr_quality_spec(unsigned long recipe, unsigned long flags)
+{
+  soxr_quality_spec_t spec, * p = &spec;
+  unsigned quality = recipe & 0xf;
+  double rej;
+  memset(p, 0, sizeof(*p));
+  if (quality > SOXR_PRECISIONQ) {
+    p->e = "invalid quality type";
+    return spec;
+  }
+  flags |= quality < SOXR_LSR0Q ? RESET_ON_CLEAR : 0;
+  p->phase_response = "\62\31\144"[(recipe & 0x30)>>4]; /* Octal for 50, 25, 100; index 3 reads the terminating NUL, i.e. 0. */
+  p->stopband_begin = 1;
+  p->precision =
+    quality == SOXR_QQ      ?  0 :
+    quality <= SOXR_16_BITQ ? 16 :
+    quality <= SOXR_32_BITQ ?  4 + quality * 4 :
+    quality <= SOXR_LSR2Q   ? 55 - quality * 4 : /* TODO: move to lsr.c */
+    0;
+  rej = p->precision * linear_to_dB(2.);
+  p->flags = flags;
+  if (quality <= SOXR_32_BITQ || quality == SOXR_PRECISIONQ) {
+    #define LOW_Q_BW0     (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
+    p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / lsx_to_3dB(rej);
+    if (quality <= 2)
+      p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
+  }
+  else { /* TODO: move to lsr.c */
+    static float const bw[] = {.931f, .832f, .663f};
+    p->passband_end = bw[quality - SOXR_LSR0Q];
+    if (quality == SOXR_LSR2Q)
+      p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_LSR2Q | SOXR_PROMOTE_TO_LQ;
+  }
+  if (recipe & SOXR_STEEP_FILTER) /* Overrides whichever passband_end was chosen above. */
+    p->passband_end = 1 - .01 / lsx_to_3dB(rej);
+  return spec;
+}
+
+
+/* Return a zero-initialised runtime spec with the default DFT-size/coef-size settings below and the given number of threads. */
+soxr_runtime_spec_t soxr_runtime_spec(unsigned num_threads)
+{
+  soxr_runtime_spec_t spec, * p = &spec;
+  memset(p, 0, sizeof(*p));
+  p->log2_min_dft_size = 10;
+  p->log2_large_dft_size = 17;
+  p->coef_size_kbytes = 400;
+  p->num_threads = num_threads;
+  return spec;
+}
+
+
+/* Return an I/O spec for the given input/output datatypes with scale 1; if either datatype is out of range, only `e' is set. */
+soxr_io_spec_t soxr_io_spec(
+  soxr_datatype_t itype,
+  soxr_datatype_t otype)
+{
+  soxr_io_spec_t spec, * p = &spec;
+  memset(p, 0, sizeof(*p));
+  if ((itype | otype) >= SOXR_SPLIT * 2)
+    p->e = "invalid io datatype(s)";
+  else {
+    p->itype = itype;
+    p->otype = otype;
+    p->scale = 1;
+  }
+  return spec;
+}
diff --git a/src/cr-core.c b/src/cr-core.c
new file mode 100644
index 0000000..bc6282c
--- /dev/null
+++ b/src/cr-core.c
@@ -0,0 +1,297 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details.
+ *
+ * Constant-rate resampling engine-specific code. */
+
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+  #include "internal.h"
+  #include "cr.h"
+  #if CORE_TYPE & CORE_DBL
+    typedef double sample_t;
+    #if CORE_TYPE & CORE_SIMD_DFT
+      #define RDFT_CB    _soxr_rdft64s_cb
+    #else
+      #define RDFT_CB    _soxr_rdft64_cb
+    #endif
+  #else
+    typedef float sample_t;
+    #if CORE_TYPE & CORE_SIMD_DFT
+      #define RDFT_CB    _soxr_rdft32s_cb
+    #else
+      #define RDFT_CB    _soxr_rdft32_cb
+    #endif
+  #endif
+
+  #if CORE_TYPE & (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+    #if CORE_TYPE & CORE_DBL
+      #include "simd64.h"
+      #include "simd64-dev.h"
+    #else
+      #include "simd32.h"
+      #include "simd32-dev.h"
+    #endif
+  #endif
+
+  extern fn_t RDFT_CB[];
+#else
+  #define RDFT_CB 0
+#endif
+
+
+/* Cubic-interpolation stage: steps p->at through the input, writing one sample per step (computed from s[-1..2]) to output_fifo. */
+static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo)
+{
+  sample_t const * input = stage_read_p(p);
+  int num_in = min(stage_occupancy(p), p->input_size);
+  int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio);
+  sample_t * output = fifo_reserve(output_fifo, max_num_out);
+
+  for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
+    sample_t const * s = input + p->at.integer;
+    double x = p->at.fraction * (1 / MULT32);
+    double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b);
+    double c = s[1]-*s-a-b;
+    output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s));
+  }
+  assert(max_num_out - i >= 0);
+  fifo_trim_by(output_fifo, max_num_out - i); /* Give back the reserved-but-unwritten samples. */
+  fifo_read(&p->fifo, p->at.integer, NULL); /* Discard the input consumed so far. */
+  p->at.integer = 0;
+}
+
+
+
+#if CORE_TYPE & CORE_DBL
+  #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_HALF) && defined __AVX__)
+  #define SIMD_SSE 0
+#else
+  #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_HALF) && (defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86))
+  #define SIMD_AVX 0
+#endif
+
+#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_HALF) && defined __arm__)
+
+
+
+
+#include "half-coefs.h"
+
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+#define FUNCTION_H h7
+#define CONVOLVE ____ __ _
+#include "half-fir.h"
+#endif
+
+#define FUNCTION_H h8
+#define CONVOLVE ____ ____
+#include "half-fir.h"
+
+#define FUNCTION_H h9
+#define CONVOLVE ____ ____ _
+#include "half-fir.h"
+
+#if CORE_TYPE & CORE_DBL
+  #define FUNCTION_H h10
+  #define CONVOLVE ____ ____ __
+  #include "half-fir.h"
+
+  #define FUNCTION_H h11
+  #define CONVOLVE ____ ____ __ _
+  #include "half-fir.h"
+
+  #define FUNCTION_H h12
+  #define CONVOLVE ____ ____ ____
+  #include "half-fir.h"
+
+  #define FUNCTION_H h13
+  #define CONVOLVE ____ ____ ____ _
+  #include "half-fir.h"
+#endif
+
+static half_fir_info_t const half_firs[] = {
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+  { 7, half_fir_coefs_7 , h7 , 0  , 120.65f},
+#endif
+  { 8, half_fir_coefs_8 , h8 , 0  , 136.51f},
+  { 9, half_fir_coefs_9 , h9 , 0  , 152.32f},
+#if CORE_TYPE & CORE_DBL
+  {10, half_fir_coefs_10, h10, 0  , 168.08f},
+  {11, half_fir_coefs_11, h11, 0  , 183.79f},
+  {12, half_fir_coefs_12, h12, 0  , 199.46f},
+  {13, half_fir_coefs_13, h13, 0  , 215.12f},
+#endif
+};
+
+#undef SIMD_AVX
+#undef SIMD_NEON
+#undef SIMD_SSE
+
+
+
+#if CORE_TYPE & CORE_DBL
+  #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_POLY) && defined __AVX__)
+  #define SIMD_SSE 0
+#else
+  #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) && (defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86))
+  #define SIMD_AVX 0
+#endif
+
+#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_POLY) && defined __arm__)
+
+
+
+
+#define HI_PREC_CLOCK
+#define COEFS (sample_t * __restrict)p->shared->poly_fir_coefs
+#define VAR_LENGTH p->n
+#define VAR_CONVOLVE(n) while (j < (n)) _
+#define VAR_POLY_PHASE_BITS p->phase_bits
+
+
+
+#define FUNCTION vpoly0
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir0.h"
+
+#define FUNCTION vpoly1
+#define COEF_INTERP 1
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+#define FUNCTION vpoly2
+#define COEF_INTERP 2
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+#define FUNCTION vpoly3
+#define COEF_INTERP 3
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+
+
+#if !(CORE_TYPE & CORE_SIMD_POLY)
+
+#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
+#define FUNCTION U100_0
+#define FIR_LENGTH U100_l
+#define CONVOLVE(n) poly_fir_convolve_U100
+#include "poly-fir0.h"
+
+#define u100_l 11
+#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _
+#define FUNCTION u100_0
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir0.h"
+
+#define FUNCTION u100_1
+#define COEF_INTERP 1
+#define PHASE_BITS 8
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir.h"
+
+#define FUNCTION u100_2
+#define COEF_INTERP 2
+#define PHASE_BITS 6
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir.h"
+
+#endif
+
+#define u100_1_b 8
+#define u100_2_b 6
+
+
+
+static poly_fir_t const poly_firs[] = {
+  {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}},
+  {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}},
+  {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}},
+  {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}},
+  {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}},
+
+  {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}},
+  {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}},
+  {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}},
+  {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}},
+
+#if CORE_TYPE & CORE_SIMD_POLY
+  {10.62f, {{0, vpoly0}, {0, 0}, {0, 0}}},
+  {-1, {{0, vpoly0}, {u100_1_b, vpoly1}, {u100_2_b, vpoly2}}},
+#else
+  {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}},
+  {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}},
+#endif
+  {-1, {{0, vpoly0}, {   9, vpoly1}, {  6, vpoly2}}},
+  {-1, {{0, vpoly0}, {  11, vpoly1}, {  7, vpoly2}}},
+  {-1, {{0, vpoly0}, {  13, vpoly1}, {  8, vpoly2}}},
+  {-1, {{0, vpoly0}, {  10, vpoly2}, {  8, vpoly3}}},
+  {-1, {{0, vpoly0}, {  12, vpoly2}, {  9, vpoly3}}},
+};
+
+
+
+static cr_core_t const cr_core = {
+
+#if CORE_TYPE & CORE_SIMD_POLY
+  {SIMD_ALIGNED_MALLOC, SIMD_ALIGNED_CALLOC, SIMD_ALIGNED_FREE},
+#else
+  {malloc, calloc, free},
+#endif
+  half_firs, array_length(half_firs),
+  0, 0,
+  cubic_stage_fn,
+  poly_firs, RDFT_CB
+};
+
+
+
+#if defined SOXR_LIB
+
+#include "soxr.h"
+/* Creation callback (see RATE_CB): forwards to _soxr_init, adding this translation unit's cr_core table and CORE_TYPE. */
+static char const * rate_create(void * channel, void * shared, double io_ratio,
+    soxr_quality_spec_t * q_spec, soxr_runtime_spec_t * r_spec, double scale)
+{
+  return _soxr_init(channel, shared, io_ratio, q_spec, r_spec, scale,
+      &cr_core, CORE_TYPE);
+}
+
+
+
+static char const * id(void) {return CORE_STR;}
+
+fn_t RATE_CB[] = {
+  (fn_t)_soxr_input,
+  (fn_t)_soxr_process,
+  (fn_t)_soxr_output,
+  (fn_t)_soxr_flush,
+  (fn_t)_soxr_close,
+  (fn_t)_soxr_delay,
+  (fn_t)_soxr_sizes,
+  (fn_t)rate_create,
+  (fn_t)0,
+  (fn_t)id,
+};
+
+#endif
diff --git a/src/cr.c b/src/cr.c
new file mode 100644
index 0000000..eb65a04
--- /dev/null
+++ b/src/cr.c
@@ -0,0 +1,581 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details.
+ *
+ * Constant-rate resampling common code. */
+
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+  #include "internal.h"
+  #define STATIC
+#endif
+
+#include "cr.h"
+
+#define num_coefs4 ((core_flags&CORE_SIMD_POLY)? ((num_coefs+3)&~3) : num_coefs)
+
+#define coef_coef(C,T,x) \
+  C((T*)result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i)
+
+#define STORE(C,T) { \
+  if (interp_order > 2) coef_coef(C,T,3) = (T)d; \
+  if (interp_order > 1) coef_coef(C,T,2) = (T)c; \
+  if (interp_order > 0) coef_coef(C,T,1) = (T)b; \
+  coef_coef(C,T,0) = (T)f0;}
+/* Build (via mem->calloc) the poly-phase coef. table: per tap & phase, the coef. plus interp_order interpolation terms, typed per core_flags. */
+static real * prepare_poly_fir_coefs(double const * coefs, int num_coefs,
+    int num_phases, int interp_order, double multiplier,
+    core_flags_t core_flags, alloc_t const * mem)
+{
+  int i, j, length = num_coefs4 * num_phases * (interp_order + 1);
+  real * result = mem->calloc(1,(size_t)length << LOG2_SIZEOF_REAL(core_flags)); /* NOTE(review): result is not checked for NULL before use. */
+  double fm1 = coefs[0], f1 = 0, f2 = 0;
+
+  for (i = num_coefs - 1; i >= 0; --i)
+    for (j = num_phases - 1; j >= 0; --j) {
+      double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */
+      int pos = i * num_phases + j - 1;
+      fm1 = pos > 0 ? coefs[pos - 1] * multiplier : 0;
+      switch (interp_order) {
+        case 1: b = f1 - f0; break;
+        case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break;
+        case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break;
+        default: assert(!interp_order);
+      }
+      switch (core_flags & 3) { /* Selects element type (float/double) and layout (coef/coef4). */
+        case 0: if (WITH_CR32 ) STORE(coef , float ); break;
+        case 1: if (WITH_CR64 ) STORE(coef , double); break;
+        case 2: if (WITH_CR32S) STORE(coef4, float ); break;
+        default:if (WITH_CR64S) STORE(coef4, double); break;
+      }
+      f2 = f1, f1 = f0;
+    }
+  return result;
+}
+
+#undef STORE
+#undef coef_coef
+
+#define IS_FLOAT32 (WITH_CR32 || WITH_CR32S) && \
+    (!(WITH_CR64 || WITH_CR64S) || sizeof_real == sizeof(float))
+#define WITH_FLOAT64 WITH_CR64 || WITH_CR64S
+
+static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
+{
+  real * output, * dft_out;
+  int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
+  rate_shared_t const * s = p->shared;
+  dft_filter_t const * f = &s->dft_filter[p->dft_filter_num];
+  int const overlap = f->num_taps - 1;
+
+  if (p->at.integer + p->L * num_in >= f->dft_length) {
+    fn_t const * const RDFT_CB = p->rdft_cb;
+    size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(p->core_flags);
+    div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L);
+    real const * input = fifo_read_ptr(&p->fifo);
+    fifo_read(&p->fifo, divd.quot, NULL);
+    num_in -= divd.quot;
+
+    output = fifo_reserve(output_fifo, f->dft_length);
+    dft_out = (p->core_flags & CORE_SIMD_DFT)? p->dft_out : output;
+
+    if (lsx_is_power_of_2(p->L)) { /* F-domain */
+      int portion = f->dft_length / p->L;
+      memcpy(dft_out, input, (unsigned)portion * sizeof_real);
+      rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch);
+      if (IS_FLOAT32) {
+#define dft_out ((float *)dft_out)
+        for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+          dft_out[i] = dft_out[(portion << 1) - i],
+            dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+        dft_out[portion] = dft_out[1];
+        dft_out[portion + 1] = 0;
+        dft_out[1] = dft_out[0];
+#undef dft_out
+      }
+      else if (WITH_FLOAT64) {
+#define dft_out ((double *)dft_out)
+        for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+          dft_out[i] = dft_out[(portion << 1) - i],
+            dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+        dft_out[portion] = dft_out[1];
+        dft_out[portion + 1] = 0;
+        dft_out[1] = dft_out[0];
+#undef dft_out
+      }
+
+      for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
+        memcpy((char *)dft_out + (size_t)i * sizeof_real, dft_out, (size_t)portion * sizeof_real);
+        memset((char *)dft_out + ((size_t)i + 1) * sizeof_real, 0, sizeof_real);
+      } /* N.B. memset zeroes the whole real at [i+1]; a single char store would clear only 1 of its bytes. */
+      if (p->step.integer > 0)
+        rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+    } else {
+      if (p->L == 1)
+        memcpy(dft_out, input, (size_t)f->dft_length * sizeof_real);
+      else {
+        memset(dft_out, 0, (size_t)f->dft_length * sizeof_real);
+        if (IS_FLOAT32)
+          for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+            ((float *)dft_out)[i] = ((float *)input)[j];
+        else if (WITH_FLOAT64)
+          for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+            ((double *)dft_out)[i] = ((double *)input)[j];
+        p->at.integer = p->L - 1 - divd.rem;
+      }
+      if (p->step.integer > 0)
+        rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+      else
+        rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+    }
+
+    if (p->step.integer > 0) {
+      rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs);
+      rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+      if ((p->core_flags & CORE_SIMD_DFT) && p->step.integer == 1)
+        memcpy(output, dft_out, (size_t)f->dft_length * sizeof_real);
+      if (p->step.integer != 1) {
+        if (IS_FLOAT32)
+          for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+              i += p->step.integer)
+            ((float *)output)[j] = ((float *)dft_out)[i];
+        else if (WITH_FLOAT64)
+          for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+              i += p->step.integer)
+            ((double *)output)[j] = ((double *)dft_out)[i];
+        p->remM = i - (f->dft_length - overlap);
+        fifo_trim_by(output_fifo, f->dft_length - j);
+      }
+      else fifo_trim_by(output_fifo, overlap);
+    }
+    else { /* F-domain */
+      int m = -p->step.integer;
+      rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs);
+      rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch);
+      if (p->core_flags & CORE_SIMD_DFT)
+        memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof_real);
+      fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m);
+    }
+    (void)RDFT_CB;
+  }
+  p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L;
+}
+
+/* Returns a DFT length of 2^k: nominally 4 x nearest power of 2 to num_taps, or */
+/* half of that if danger of causing too many cache misses; min, large are log2. */
+static int set_dft_length(int num_taps, int min, int large)
+{
+  double d = log((double)num_taps) / log(2.);
+  return 1 << range_limit((int)(d + 2.77), min, max((int)(d + 1.77), large));
+}
+
+static void dft_stage_init(
+    unsigned instance, double Fp, double Fs, double Fn, double att,
+    double phase_response, stage_t * p, int L, int M, double * multiplier,
+    unsigned min_dft_size, unsigned large_dft_size, core_flags_t core_flags,
+    fn_t const * RDFT_CB)
+{ /* Set up stage p to run dft_stage_fn with shared DFT filter #instance. */
+  dft_filter_t * f = &p->shared->dft_filter[instance];
+  int num_taps = 0, dft_length = f->dft_length, i, offset;
+  bool f_domain_m = abs(3-M) == 1 && Fs <= 1;         /* Needs M = 2 or 4. */
+  size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags);
+
+  if (!dft_length) {            /* Filter not designed yet, so do that now: */
+    int k = phase_response == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4;
+    double m, * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.);
+
+    if (phase_response != 50)
+      lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase_response);
+    else f->post_peak = num_taps / 2;       /* Not re-phased: middle tap. */
+
+    dft_length = set_dft_length(num_taps, (int)min_dft_size, (int)large_dft_size);
+    f->coefs = rdft_calloc((size_t)dft_length, sizeof_real);
+    offset = dft_length - num_taps + 1;  /* Puts the last tap at index 0. */
+    m = (1. / dft_length) * rdft_multiplier() * L * *multiplier; /* Scaling. */
+    if (IS_FLOAT32) for (i = 0; i < num_taps; ++i)
+        ((float *)f->coefs)[(i + offset) & (dft_length - 1)] =(float)(h[i] * m);
+    else if (WITH_FLOAT64) for (i = 0; i < num_taps; ++i)
+        ((double *)f->coefs)[(i + offset) & (dft_length - 1)] = h[i] * m;
+    free(h);
+  }
+
+  if (rdft_flags() & RDFT_IS_SIMD)               /* Per-stage work buffers: */
+    p->dft_out = rdft_malloc(sizeof_real * (size_t)dft_length);
+  if (rdft_flags() & RDFT_NEEDS_SCRATCH)
+    p->dft_scratch = rdft_malloc(2 * sizeof_real * (size_t)dft_length);
+
+  if (!f->dft_length) {   /* Filter is new: make DFT set-ups, transform coefs: */
+    void * coef_setup = rdft_forward_setup(dft_length);
+    int Lp = lsx_is_power_of_2(L)? L : 1;
+    int Mp = f_domain_m? M : 1;
+    f->dft_forward_setup = rdft_forward_setup(dft_length / Lp);
+    f->dft_backward_setup = rdft_backward_setup(dft_length / Mp);
+    if (Mp == 1)
+      rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+    else
+      rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+    rdft_delete_setup(coef_setup);
+    f->num_taps = num_taps;
+    f->dft_length = dft_length;        /* Non-0 marks the filter as ready. */
+    lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i",
+        num_taps, dft_length, Fp, Fs, Fn, att, L, M);
+  }
+  *multiplier = 1;              /* The gain has been applied via the coefs. */
+  p->out_in_ratio = (double)L / M;
+  p->core_flags = core_flags;
+  p->rdft_cb = RDFT_CB;
+  p->fn = dft_stage_fn;
+  p->preload = f->post_peak / L;
+  p->at.integer = f->post_peak % L;
+  p->L = L;
+  p->step.integer = f_domain_m? -M/2 : M;     /* F-domain: -log2(M), so <0. */
+  p->dft_filter_num = instance;
+  p->block_len = f->dft_length - (f->num_taps - 1);
+  p->phase0 = p->at.integer / p->L;
+  p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L;
+}
+
+static struct half_fir_info const * find_half_fir(
+    struct half_fir_info const * firs, size_t len, double att)
+{ /* Yields the first entry with att >= that wanted; failing that, the last. */
+  size_t k = 0;
+  while (k + 1 < len && firs[k].att < att) ++k;
+  return firs + k;
+}
+
+#define have_pre_stage  (preM  * preL  != 1)
+#define have_arb_stage  (arbM  * arbL  != 1)
+#define have_post_stage (postM * postL != 1)
+
+#include "soxr.h"
+
+STATIC char const * _soxr_init(
+  rate_t * const p,             /* Per audio channel. */
+  rate_shared_t * const shared, /* By channels undergoing same rate change. */
+  double const io_ratio,        /* Input rate divided by output rate. */
+  soxr_quality_spec_t const * const q_spec,
+  soxr_runtime_spec_t const * const r_spec,
+  double multiplier,            /* Linear gain to apply during conversion. */
+  cr_core_t const * const core,
+  core_flags_t const core_flags)
+{ /* Plans, then builds, the stage pipeline for one channel; returns 0 if OK. */
+  size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags);
+  double const tolerance = 1 + 1e-5;       /* Slack for the range checks. */
+
+  double       bits = q_spec->precision;
+  rolloff_t const rolloff = (rolloff_t)(q_spec->flags & 3);
+  int interpolator = (int)(r_spec->flags & 3) - 1;
+  double const Fp0 = q_spec->passband_end, Fs0 = q_spec->stopband_begin;
+  double const phase_response = q_spec->phase_response, tbw0 = Fs0-Fp0;
+
+  bool const maintain_3dB_pt = !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT);
+  double tbw_tighten = 1, alpha;
+  #define tighten(x) (Fs0-(Fs0-(x))*tbw_tighten)
+
+  double arbM = io_ratio, Fn1, Fp1 = Fp0, Fs1 = Fs0, bits1 = min(bits,33);
+  double att = (bits1 + 1) * linear_to_dB(2.), attArb = att; /* +1: pass+stop */
+  int preL = 1, preM = 1, shr = 0, arbL = 1, postL = 1, postM = 1;
+  bool upsample=false, rational=false, iOpt=!(r_spec->flags&SOXR_NOSMALLINTOPT);
+  bool lq_bits= (q_spec->flags & SOXR_PROMOTE_TO_LQ)? bits <= 16 : bits == 16;
+  bool lq_Fp0 = (q_spec->flags & SOXR_PROMOTE_TO_LQ)? Fp0<=lq_bw0 : Fp0==lq_bw0;
+  int n = 0, i, mode = lq_bits && rolloff == rolloff_medium? io_ratio > 1 ||
+    phase_response != 50 || !lq_Fp0 || Fs0 != 1 : ((int)ceil(bits1) - 6) / 4;
+  struct half_fir_info const * half_fir_info;
+  stage_t * s;
+
+  if (io_ratio < 1 && Fs0 - 1 > 1 - Fp0 / tolerance)
+    return "imaging greater than rolloff";
+  if (.002 / tolerance > tbw0 || tbw0 > .5 * tolerance)
+    return "transition bandwidth not in [0.2,50] % of nyquist";
+  if (.5 / tolerance > Fp0 || Fs0 > 1.5 * tolerance)
+    return "transition band not within [50,150] % of nyquist";
+  if (bits!=0 && (15 > bits || bits > 33))
+    return "precision not in [15,33] bits";
+  if (io_ratio <= 0)
+    return "resampling factor not positive";
+  if (0 > phase_response || phase_response > 100)
+    return "phase response not in [0=min-phase,100=max-phase] %";
+
+  p->core = core;
+  p->io_ratio = io_ratio;
+  if (bits!=0) while (!n++) {                            /* Determine stages: */
+    int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 :
+      (int)ceil(r_spec->coef_size_kbytes * 1000. / (U100_l * (int)sizeof_real));
+    double d, epsilon = 0, frac;
+    upsample = arbM < 1;
+    for (i = (int)(.5 * arbM), shr = 0; i >>= 1; arbM *= .5, ++shr);
+    preM = upsample || (arbM > 1.5 && arbM < 2);
+    postM = 1 + (arbM > 1 && preM), arbM /= postM;
+    preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL;
+    if ((frac = arbM - (int)arbM)!=0)
+      epsilon = fabs(floor(frac * MULT32 + .5) / (frac * MULT32) - 1);
+    for (i = 1, rational = frac==0; i <= maxL && !rational; ++i) {
+      d = frac * i, try = (int)(d + .5);
+      if ((rational = fabs(try / d - 1) <= epsilon)) {    /* No long doubles! */
+        if (try == i)
+          arbM = ceil(arbM), shr += x = arbM > 3, arbM /= 1 + x;
+        else arbM = i * (int)arbM + try, arbL = i;
+      }
+    }
+    L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x;
+    if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) {
+      for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1);
+      arbM = arbM * postL / arbL / preL, arbL = 1, n = 0;
+    } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt))
+      preL = L, preM = M, arbM = arbL = postM = 1;
+    if (!mode && (!rational || !n))
+      ++mode, n = 0;
+  }
+
+  p->num_stages = shr + have_pre_stage + have_arb_stage + have_post_stage;
+  if (!p->num_stages && multiplier != 1) {
+    bits = arbL = 0;                         /* Use cubic_stage in this case. */
+    ++p->num_stages;
+  }
+  p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages));
+  if (!p->stages)           /* N.B. the extra entry holds the output fifo. */
+    return "out of memory";
+  for (i = 0; i < p->num_stages; ++i) {
+    p->stages[i].num = i;
+    p->stages[i].shared = shared;
+    p->stages[i].input_size = 4096;
+  }
+  p->stages[0].is_input = true;
+
+  alpha = postM / (io_ratio * (postL << 0));
+
+  if ((n = p->num_stages) > 1) {                              /* Att. budget: */
+    if (have_arb_stage)
+      att += linear_to_dB(2.), attArb = att, --n;
+    att += linear_to_dB((double)n);
+  }
+
+  half_fir_info = find_half_fir(core->half_firs, core->half_firs_len, att);
+  for (i = 0, s = p->stages; i < shr; ++i, ++s) {     /* half_fir stages: */
+    s->fn = half_fir_info->fn;
+    s->coefs = half_fir_info->coefs;
+    s->n = half_fir_info->num_coefs;
+    s->pre_post = 4 * s->n;
+    s->preload = s->pre = s->pre_post >> 1;
+  }
+
+  if (have_pre_stage) {                            /* Uses DFT filter 0: */
+    if (maintain_3dB_pt && have_post_stage) {    /* Trans. bands overlapping. */
+      double x = tbw0 * lsx_inv_f_resp(-3., att);
+      x = -lsx_f_resp(x / (max(2 * alpha - Fs0, alpha) - Fp0), att);
+      if (x > .035) {
+        tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014;
+        lsx_debug("tbw_tighten=%g (%gdB)", tbw_tighten, x);
+      }
+    }
+    Fn1 = preM? max(preL, preM) : arbM / arbL;
+    dft_stage_init(0, tighten(Fp1), Fs1, Fn1, att, phase_response, s++, preL,
+        max(preM, 1), &multiplier, r_spec->log2_min_dft_size,
+        r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
+    Fp1 /= Fn1, Fs1 /= Fn1;
+  }
+
+  if (bits==0 && have_arb_stage) {                /* `Quick' cubic arb stage: */
+    s->fn = core->cubic_stage_fn;
+    s->mult = multiplier, multiplier = 1;
+    s->step.whole = (int64_t)(arbM * MULT32 + .5);
+    s->pre_post = max(3, s->step.integer);
+    s->preload = s->pre = 1;
+    s->out_in_ratio = MULT32 / (double)s->step.whole;
+  }
+  else if (have_arb_stage) {                     /* Higher quality arb stage: */
+    static const float rolloffs[] = {-.01f, -.3f, 0, -.103f};
+    poly_fir_t const * f = &core->poly_firs[6*(upsample+!!preM)+mode-!upsample];
+    int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases;
+    size_t coefs_size;
+    double at, Fp = Fp1, Fs, Fn, mult = upsample? 1 : arbM / arbL;
+    poly_fir1_t const * f1;
+
+    if (!upsample && preM)
+      Fn = 2 * mult, Fs = 3 + fabs(Fs1 - 1);
+    else Fn = 1, Fs = 2 - (mode? Fp1 + (Fs1 - Fp1) * .7 : Fs1);
+
+    if (mode)
+      Fp = Fs - (Fs - Fp) / (1 - lsx_inv_f_resp(rolloffs[rolloff], attArb));
+
+    i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1;
+    do {
+      f1 = &f->interp[++i];
+      assert(f1->fn);
+      if (i)
+        arbM /= arbL, arbL = 1, rational = false;
+      phase_bits = (int)ceil(f1->scalar - log(mult)/log(2.));
+      phases = !rational? (1 << phase_bits) : arbL;
+      if (f->interp[0].scalar==0) {
+        int phases0 = max(phases, 19), n0 = 0;
+        lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta);
+        num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM;
+      }
+      if ((num_coefs & 1) && rational && (arbL & 1))
+        phases <<= 1, arbL <<= 1, arbM *= 2;
+      at = arbL * (s->phase0 = .5 * (num_coefs & 1));
+      order = i + (i && mode > 4);
+      coefs_size = (size_t)(num_coefs4 * phases * (order+1)) * sizeof_real;
+    } while (interpolator < 0 && i < 2 && f->interp[i+1].fn &&
+        coefs_size / 1000 > r_spec->coef_size_kbytes);
+
+    if (!s->shared->poly_fir_coefs) {
+      int num_taps = num_coefs * phases - 1;
+      double * coefs = lsx_design_lpf(
+          Fp, Fs, Fn, attArb, &num_taps, phases, f->beta);
+      s->shared->poly_fir_coefs = prepare_poly_fir_coefs(
+          coefs, num_coefs, phases, order, multiplier, core_flags, &core->mem);
+      lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk",
+          num_coefs, phases, order, (double)coefs_size / 1000.);
+      free(coefs);
+    }
+    multiplier = 1;
+    s->fn = f1->fn;
+    s->pre_post = num_coefs4 - 1;
+    s->preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs);
+    s->n = num_coefs4;
+    s->phase_bits = phase_bits;
+    s->L = arbL;
+    s->use_hi_prec_clock =
+      mode>1 && (q_spec->flags & SOXR_HI_PREC_CLOCK) && !rational;
+#if FLOAT_HI_PREC_CLOCK
+    if (s->use_hi_prec_clock) {
+      s->at.flt = at;
+      s->step.flt = arbM;
+      s->out_in_ratio = (double)(arbL / s->step.flt);
+    } else
+#endif
+    {
+      s->at.whole = (int64_t)(at * MULT32 + .5);
+#if !FLOAT_HI_PREC_CLOCK
+      if (s->use_hi_prec_clock) {
+        double M = arbM * MULT32;
+        s->at.fix.ls.parts.ms = 0x80000000ul;
+        s->step.whole = (int64_t)M;
+        M -= (double)s->step.whole;
+        M *= MULT32 * MULT32;
+        s->step.fix.ls.all = (uint64_t)M;
+      } else
+#endif
+        s->step.whole = (int64_t)(arbM * MULT32 + .5);
+      s->out_in_ratio = MULT32 * arbL / (double)s->step.whole;
+    }
+    ++s;
+  }
+
+  if (have_post_stage)                             /* Uses DFT filter 1: */
+    dft_stage_init(1, tighten(Fp0 / (upsample? alpha : 1)), upsample? max(2 -
+        Fs0 / alpha, 1) : Fs0, (double)max(postL, postM), att, phase_response,
+        s++, postL, postM, &multiplier, r_spec->log2_min_dft_size,
+        r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
+
+  lsx_debug("%g: »%i⋅%i/%i⋅%i/%g⋅%i/%i %x", 1/io_ratio,
+      shr, preL, preM, arbL, arbM, postL, postM, core_flags);
+
+  for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) { /* Stage fifos: */
+    fifo_create(&s->fifo, (int)sizeof_real);
+    memset(fifo_reserve(&s->fifo, s->preload), 0,
+        sizeof_real * (size_t)s->preload);
+    lsx_debug_more("%5i|%-5i preload=%i remL=%i",
+        s->pre, s->pre_post-s->pre, s->preload, s->at.integer);
+  }
+  fifo_create(&s->fifo, (int)sizeof_real);   /* The pipeline's output fifo. */
+  return 0;                                                  /* No error. */
+}
+
+static bool stage_process(stage_t * stage, bool flushing)
+{ /* Top up this stage's input fifo (recursing upstream), then run the stage. */
+  fifo_t * const in = &stage->fifo;
+  bool starved = false;
+  int need;
+  while (!starved && (need = stage->input_size - fifo_occupancy(in)) > 0) {
+    if (!stage->is_input)                  /* Pull from the previous stage. */
+      starved = stage_process(stage - 1, flushing);
+    else if (!flushing)                     /* Must await more input data. */
+      starved = true;
+    else memset(fifo_reserve(in, need), 0, in->item_size * (size_t)need);
+  }
+  stage->fn(stage, &stage[1].fifo);
+  /* True only if upstream is exhausted and this fifo is still short: */
+  return starved && fifo_occupancy(in) < stage->input_size;
+}
+
+STATIC void _soxr_process(rate_t * p, size_t olen)
+{ /* Run stages until the output fifo holds the wanted amount, or input dries. */
+  stage_t * const out = &p->stages[p->num_stages];  /* Holds the output fifo. */
+  int wanted = (int)olen;
+  if (p->flushing)  /* When flushing, samples_out is minus the amount still due. */
+    wanted = min(-(int)p->samples_out, wanted);
+  while (fifo_occupancy(&out->fifo) < wanted)
+    if (out->is_input || stage_process(out - 1, p->flushing)) break;
+}
+
+STATIC real * _soxr_input(rate_t * p, real const * samples, size_t n)
+{
+  /* No more input is accepted once flushing has begun: */
+  if (p->flushing) return 0;
+  p->samples_in += (int64_t)n;
+  return fifo_write(&p->stages->fifo, (int)n, samples);
+}
+
+STATIC real const * _soxr_output(rate_t * p, real * samples, size_t * n0)
+{ /* Read up to *n0 samples from the output fifo; *n0 is set to the actual. */
+  fifo_t * const out = &p->stages[p->num_stages].fifo;
+  int const cap = p->flushing? min(-(int)p->samples_out, (int)*n0) : (int)*n0;
+  int const n = min(cap, fifo_occupancy(out));
+  p->samples_out += n, *n0 = (size_t)n;
+  return fifo_read(out, n, samples);
+}
+
+STATIC void _soxr_flush(rate_t * p)
+{ /* Enter flush mode (once): samples_out becomes minus the output still due. */
+  if (!p->flushing) {
+    p->samples_out -= (int64_t)((double)p->samples_in / p->io_ratio + .5);
+    p->samples_in = 0, p->flushing = true;
+  }
+}
+
+STATIC void _soxr_close(rate_t * p)
+{ /* Frees what _soxr_init allocated; does nothing if there are no stages. */
+  if (p->stages) {
+    fn_t const * const RDFT_CB = p->core->rdft_cb; /* Presumably for rdft_*(). */
+    rate_shared_t * shared = p->stages[0].shared;
+    int i;
+
+    for (i = 0; i <= p->num_stages; ++i) {   /* <=: incl. the output entry. */
+      stage_t * s = &p->stages[i];
+      rdft_free(s->dft_scratch);
+      rdft_free(s->dft_out);
+      fifo_delete(&s->fifo);
+    }
+    if (shared) {
+      for (i = 0; i < 2; ++i) {                  /* Both shared DFT filters. */
+        dft_filter_t * f= &shared->dft_filter[i];
+        rdft_free(f->coefs);
+        rdft_delete_setup(f->dft_forward_setup);
+        rdft_delete_setup(f->dft_backward_setup);
+      }
+      p->core->mem.free(shared->poly_fir_coefs);
+      memset(shared, 0, sizeof(*shared));  /* So a repeat finds only nulls. */
+    }
+    free(p->stages); /* NOTE(review): p->stages is left dangling--confirm no caller closes twice. */
+    (void)RDFT_CB;
+  }
+}
+
+#if defined SOXR_LIB
+STATIC double _soxr_delay(rate_t * p)
+{ /* Output samples expected for the input so far, less those already read. */
+  return (double)p->samples_in / p->io_ratio - (double)p->samples_out;
+}
+
+STATIC void _soxr_sizes(size_t * shared, size_t * channel)
+{ /* Reports the sizes of the shared & per-channel state structs. */
+  *shared = sizeof(rate_shared_t);
+  *channel = sizeof(rate_t);
+}
+#endif
diff --git a/src/cr.h b/src/cr.h
new file mode 100644
index 0000000..7e20327
--- /dev/null
+++ b/src/cr.h
@@ -0,0 +1,175 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_rate1_included
+#define soxr_rate1_included
+
+#define  FIFO_SIZE_T int
+#include "fifo.h"
+
+typedef void real; /* float or double */
+struct stage;
+typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
+typedef struct half_fir_info {int num_coefs; real const * coefs; stage_fn_t fn, dfn; float att;} half_fir_info_t;
+typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
+typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
+
+#define U100_l 42
+#define MULT32 (65536. * 65536.)
+
+/* Conceptually: coef_p is &coefs[num_phases][fir_len][interp_order+1]: */
+#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\
+  (fir_len) * ((interp_order) + 1) * (phase_num) + \
+  ((interp_order) + 1) * (fir_coef_num) + \
+  ((interp_order) - (coef_interp_num))]
+
+/* Conceptually: coef_p is &coefs[num_phases][fir_len/4][interp_order+1][4]: */
+#define coef4(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\
+  (fir_len) * ((interp_order) + 1) * (phase_num) + \
+  ((interp_order) + 1) * ((fir_coef_num) & ~3) + \
+  4 * ((interp_order) - (coef_interp_num)) + \
+  ((fir_coef_num) & 3)]
+
+typedef union { /* Int64 in parts */
+  #if HAVE_BIGENDIAN
+  struct {int32_t ms; uint32_t ls;} parts;
+  #else
+  struct {uint32_t ls; int32_t ms;} parts;
+  #endif
+  int64_t all;
+} int64p_t;
+
+typedef union { /* Uint64 in parts */
+  #if HAVE_BIGENDIAN
+  struct {uint32_t ms, ls;} parts;
+  #else
+  struct {uint32_t ls, ms;} parts;
+  #endif
+  uint64_t all;
+} uint64p_t;
+
+#define FLOAT_HI_PREC_CLOCK 0    /* Non-float hi-prec has ~96 bits. */
+#define float_step_t long double /* __float128 is also a (slow) option */
+
+typedef struct {
+  int        dft_length, num_taps, post_peak; /* dft_length 0: not yet set up */
+  void       * dft_forward_setup, * dft_backward_setup;
+  real   * coefs;                  /* Filter coefs, as used by the DFT stage */
+} dft_filter_t;
+
+typedef struct { /* So generated filter coefs may be shared between channels */
+  real   * poly_fir_coefs;      /* For the arb. stage; 0 until first prepared */
+  dft_filter_t dft_filter[2];   /* [0]: for the pre-stage; [1]: post-stage */
+} rate_shared_t;
+
+typedef union { /* Fixed point arithmetic */
+  struct {uint64p_t ls; int64p_t ms;} fix; /* ms: 32.32; ls: extra low bits */
+  float_step_t flt;                        /* Used if FLOAT_HI_PREC_CLOCK */
+} step_t;
+
+#define CORE_DBL       1  /* `real' is double (otherwise float) */
+#define CORE_SIMD_POLY 2  /* NOTE(review): presumably SIMD poly-FIR stages */
+#define CORE_SIMD_HALF 4  /* NOTE(review): presumably SIMD half-FIR stages */
+#define CORE_SIMD_DFT  8  /* Tested by the DFT stage (see cr.c) */
+#define LOG2_SIZEOF_REAL(core_flags) (2 + ((core_flags) & 1)) /* 2 or 3 */
+
+typedef int core_flags_t; /* Bit-wise OR of the CORE_ values above */
+
+#if defined SOXR_LIB
+#include "rdft_t.h"
+#else
+typedef void fn_t;
+#endif
+
+typedef struct stage {
+  int        num;       /* Index of this stage within rate_t.stages */
+
+  /* Common to all stage types: */
+  core_flags_t   core_flags;
+  stage_fn_t fn;        /* Processes this fifo into the next stage's fifo */
+  fifo_t     fifo;      /* Holds this stage's input samples */
+  int        pre;       /* Number of past samples to store */
+  int        pre_post;  /* pre + number of future samples to store */
+  int        preload;   /* Number of zero samples to pre-load the fifo */
+  double     out_in_ratio; /* For buffer management. */
+  int        input_size;   /* Fifo is topped up to this before fn is run */
+  bool       is_input;     /* Set for the first stage only */
+
+  /* For a stage with variable (run-time generated) filter coefs: */
+  fn_t const * rdft_cb;
+  rate_shared_t * shared;
+  unsigned   dft_filter_num; /* Which, if any, of the 2 DFT filters to use */
+  real       * dft_scratch;
+  float      * dft_out;
+  real const * coefs;
+
+  /* For a stage with variable L/M: */
+  step_t     at, step;
+  bool       use_hi_prec_clock;
+  int        L, remM;
+  int        n, phase_bits, block_len;
+  double     mult, phase0;
+} stage_t;
+
+#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
+#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
+#define integer  fix.ms.parts.ms  /* Of a step_t: the whole-number part */
+#define fraction fix.ms.parts.ls  /* Of a step_t: the 32-bit fractional part */
+#define whole    fix.ms.all       /* Of a step_t: the two above as one int64 */
+
+
+#define lq_bw0  (1385/2048.) /* ~.67625, FP exact. */
+
+typedef enum {rolloff_small, rolloff_medium, rolloff_none} rolloff_t;
+
+
+typedef struct { /* Allocator call-backs; signatures as malloc/calloc/free. */
+  void * (* alloc)(size_t);
+  void * (* calloc)(size_t, size_t);
+  void (* free)(void *);   /* Used to free rate_shared_t.poly_fir_coefs */
+} alloc_t;
+
+typedef struct { /* Per-core (sample type/SIMD variant) tables & call-backs. */
+  alloc_t mem;
+  half_fir_info_t  const * half_firs;   /* Searched by required attenuation */
+  size_t half_firs_len;
+  half_fir_info_t  const * doub_firs;
+  size_t doub_firs_len;
+  stage_fn_t cubic_stage_fn;            /* The `quick' cubic arb. stage */
+  poly_fir_t const * poly_firs;         /* For higher quality arb. stages */
+  fn_t * rdft_cb;                       /* DFT call-backs for the DFT stages */
+} cr_core_t;
+
+typedef struct rate rate_t;  /* Resampler state for one audio channel. */
+struct rate {
+  cr_core_t const * core;
+  double     io_ratio;                  /* Input rate divided by output rate */
+  int64_t    samples_in, samples_out;   /* Counts; re-based on flushing */
+  int        num_stages, flushing;
+  stage_t    * stages;                  /* num_stages + 1 entries */
+};
+
+#if defined SOXR_LIB
+
+#include "soxr.h"
+
+char const * _soxr_init(            /* Returns 0 if OK, else an error message. */
+  rate_t * const p,                /* Per audio channel.                            */
+  rate_shared_t * const shared,    /* Between channels (undergoing same rate change)*/
+  double const io_ratio,           /* Input rate divided by output rate.            */
+  soxr_quality_spec_t const * const q_spec,
+  soxr_runtime_spec_t const * const r_spec,
+  double multiplier,               /* Linear gain to apply during conversion; 1=none*/
+  cr_core_t const * const core,
+  core_flags_t const);
+
+void _soxr_process(struct rate * p, size_t olen);
+real * _soxr_input(struct rate * p, real const * samples, size_t n);
+real const * _soxr_output(struct rate * p, real * samples, size_t * n0);
+void _soxr_flush(struct rate * p);
+void _soxr_close(struct rate * p);
+double _soxr_delay(struct rate * p);
+void _soxr_sizes(size_t * shared, size_t * channel);
+#endif
+
+#endif
diff --git a/src/cr32.c b/src/cr32.c
new file mode 100644
index 0000000..b9eb264
--- /dev/null
+++ b/src/cr32.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define RATE_CB    _soxr_rate32_cb
+#define CORE_STR   "cr32"
+
+#define CORE_TYPE  0 /* Neither CORE_DBL nor any CORE_SIMD_ flag (see cr.h). */
+#include "cr-core.c" /* NOTE(review): presumably configured by the above. */
diff --git a/src/cr32s.c b/src/cr32s.c
new file mode 100644
index 0000000..5de2a43
--- /dev/null
+++ b/src/cr32s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define RATE_CB    _soxr_rate32s_cb
+#define CORE_STR   "cr32s"
+
+#define CORE_TYPE  (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT) /* No CORE_DBL */
+#include "cr-core.c" /* NOTE(review): presumably configured by the above. */
diff --git a/src/cr64.c b/src/cr64.c
new file mode 100644
index 0000000..518cdd7
--- /dev/null
+++ b/src/cr64.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define RATE_CB    _soxr_rate64_cb
+#define CORE_STR   "cr64"
+
+#define CORE_TYPE  CORE_DBL /* Double reals; no CORE_SIMD_ flag (see cr.h). */
+#include "cr-core.c" /* NOTE(review): presumably configured by the above. */
diff --git a/src/cr64s.c b/src/cr64s.c
new file mode 100644
index 0000000..5dcd6f1
--- /dev/null
+++ b/src/cr64s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define RATE_CB    _soxr_rate64s_cb
+#define CORE_STR   "cr64s"
+
+#define CORE_TYPE  (CORE_DBL|CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT) /* All */
+#include "cr-core.c" /* NOTE(review): presumably configured by the above. */
diff --git a/src/data-io.c b/src/data-io.c
index 1081000..52144c2 100644
--- a/src/data-io.c
+++ b/src/data-io.c
@@ -23,7 +23,7 @@
 
 
 
-#if WITH_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
 void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */
     soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
 {
@@ -40,7 +40,7 @@ void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */
 
 
 
-#if WITH_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
 void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
     soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
 {
@@ -97,7 +97,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
   #endif
 #endif
 
-#if WITH_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
 #define FLOATX double
 
 #define LSX_RINT_CLIP_2 lsx_rint32_clip_2
@@ -139,7 +139,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
 
 
 
-#if WITH_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
 #define FLOATX float
 
 #define LSX_RINT_CLIP_2 lsx_rint32_clip_2_f
@@ -199,7 +199,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
   return 0; \
 } while (0)
 
-#if WITH_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
 size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0,
   double const * const * src, size_t n, unsigned ch, unsigned long * seed)
 {
@@ -225,7 +225,7 @@ size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0,
 }
 #endif
 
-#if WITH_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
 size_t /* clips */ _soxr_interleave_f(soxr_datatype_t data_type, void * * dest0,
   float const * const * src, size_t n, unsigned ch, unsigned long * seed)
 {
diff --git a/src/fft4g.c b/src/fft4g.c
index 5fae8a6..cf6293a 100644
--- a/src/fft4g.c
+++ b/src/fft4g.c
@@ -282,22 +282,16 @@ Appendix :
 */
 
 
-#include <math.h>
+#include "math-wrap.h"
 #include "fft4g.h"
 
 #ifdef FFT4G_FLOAT
   #define double float
   #define one_half 0.5f
 
-#if defined _MSC_VER
-  #define sin   (float)sin
-  #define cos   (float)cos
-  #define atan  (float)atan
-#else
-  #define sin   sinf
-  #define cos   cosf
-  #define atan  atanf
-#endif
+  #define sin(x)   sinf(x)
+  #define cos(x)   cosf(x)
+  #define atan(x)  atanf(x)
 
   #define cdft  lsx_cdft_f
   #define rdft  lsx_rdft_f
@@ -818,7 +812,7 @@ static void bitrv2(int n, int *ip0, double *a)
 
 static void bitrv2conj(int n, int *ip0, double *a)
 {
-    int j, j1, k, k1, l, m, m2, ip[256];
+    int j, j1, k, k1, l, m, m2, ip[512];
     double xr, xi, yr, yi;
 
     (void)ip0;
diff --git a/src/fft4g32.c b/src/fft4g32.c
index 8741394..5dcf34d 100644
--- a/src/fft4g32.c
+++ b/src/fft4g32.c
@@ -1,17 +1,19 @@
 /* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
+#include <stdlib.h>
 #include "filter.h"
 #define FFT4G_FLOAT
 #include "fft4g.c"
+#include "rdft_t.h"
 
 static void * null(void) {return 0;}
 static void forward (int length, void * setup, double * H) {lsx_safe_rdft_f(length,  1, H); (void)setup;}
 static void backward(int length, void * setup, double * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
 static int multiplier(void) {return 2;}
 static void nothing(void) {}
+static int flags(void) {return 0;}
 
-typedef void (* fn_t)(void);
 fn_t _soxr_rdft32_cb[] = {
   (fn_t)null,
   (fn_t)null,
@@ -24,4 +26,8 @@ fn_t _soxr_rdft32_cb[] = {
   (fn_t)_soxr_ordered_partial_convolve_f,
   (fn_t)multiplier,
   (fn_t)nothing,
+  (fn_t)malloc,
+  (fn_t)calloc,
+  (fn_t)free,
+  (fn_t)flags,
 };
diff --git a/src/fft4g32s.c b/src/fft4g32s.c
index 4a95a7d..34dae4b 100644
--- a/src/fft4g32s.c
+++ b/src/fft4g32s.c
@@ -3,14 +3,15 @@
 
 #include "filter.h"
 #include "simd.h"
+#include "rdft_t.h"
 
 static void * null(void) {return 0;}
 static void nothing(void) {}
 static void forward (int length, void * setup, float * H) {lsx_safe_rdft_f(length,  1, H); (void)setup;}
 static void backward(int length, void * setup, float * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
 static int multiplier(void) {return 2;}
+static int flags(void) {return RDFT_IS_SIMD;}
 
-typedef void (* fn_t)(void);
 fn_t _soxr_rdft32s_cb[] = {
   (fn_t)null,
   (fn_t)null,
@@ -19,8 +20,12 @@ fn_t _soxr_rdft32s_cb[] = {
   (fn_t)forward,
   (fn_t)backward,
   (fn_t)backward,
-  (fn_t)_soxr_ordered_convolve_simd,
-  (fn_t)_soxr_ordered_partial_convolve_simd,
+  (fn_t)ORDERED_CONVOLVE_SIMD,
+  (fn_t)ORDERED_PARTIAL_CONVOLVE_SIMD,
   (fn_t)multiplier,
   (fn_t)nothing,
+  (fn_t)SIMD_ALIGNED_MALLOC,
+  (fn_t)SIMD_ALIGNED_CALLOC,
+  (fn_t)SIMD_ALIGNED_FREE,
+  (fn_t)flags,
 };
diff --git a/src/fft4g64.c b/src/fft4g64.c
index 4acb33b..0018516 100644
--- a/src/fft4g64.c
+++ b/src/fft4g64.c
@@ -1,16 +1,18 @@
 /* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
+#include <stdlib.h>
 #include "filter.h"
 #include "fft4g.c"
 #include "soxr-config.h"
 
-#if WITH_DOUBLE_PRECISION
+#if WITH_CR64
 static void * null(void) {return 0;}
 static void nothing(void) {}
 static void forward (int length, void * setup, double * H) {lsx_safe_rdft(length,  1, H); (void)setup;}
 static void backward(int length, void * setup, double * H) {lsx_safe_rdft(length, -1, H); (void)setup;}
 static int multiplier(void) {return 2;}
+static int flags(void) {return 0;}
 
 typedef void (* fn_t)(void);
 fn_t _soxr_rdft64_cb[] = {
@@ -25,5 +27,9 @@ fn_t _soxr_rdft64_cb[] = {
   (fn_t)_soxr_ordered_partial_convolve,
   (fn_t)multiplier,
   (fn_t)nothing,
+  (fn_t)malloc,
+  (fn_t)calloc,
+  (fn_t)free,
+  (fn_t)flags,
 };
 #endif
diff --git a/src/fifo.h b/src/fifo.h
index b2bda43..19f6c1d 100644
--- a/src/fifo.h
+++ b/src/fifo.h
@@ -9,6 +9,7 @@
 #endif
 
 #if !defined FIFO_REALLOC
+#include <stdlib.h>
   #define FIFO_REALLOC(a,b,c) realloc(a,b)
   #undef FIFO_FREE
   #define FIFO_FREE free
diff --git a/src/filter.c b/src/filter.c
index 482302e..aec0b6e 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -1,12 +1,9 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
 #include "filter.h"
 
-#include <math.h>
-#if !defined M_PI
-#define M_PI    3.14159265358979323846
-#endif
+#include "math-wrap.h"
 #include <assert.h>
 #include <string.h>
 #include <stdlib.h>
@@ -14,7 +11,7 @@
 #include "fft4g.h"
 #include "ccrw2.h"
 
-#if 1 || WITH_DOUBLE_PRECISION /* Always need this, for lsx_fir_to_phase. */
+#if 1 || WITH_CR64 || WITH_CR64S /* Always need this, for lsx_fir_to_phase. */
 #define DFT_FLOAT double
 #define DONE_WITH_FFT_CACHE done_with_fft_cache
 #define FFT_CACHE_CCRW fft_cache_ccrw
@@ -31,7 +28,7 @@
 #include "fft4g_cache.h"
 #endif
 
-#if WITH_SINGLE_PRECISION && !AVCODEC_FOUND
+#if WITH_CR32 && !AVCODEC_FOUND
 #define DFT_FLOAT float
 #define DONE_WITH_FFT_CACHE done_with_fft_cache_f
 #define FFT_CACHE_CCRW fft_cache_ccrw_f
@@ -48,14 +45,14 @@
 #include "fft4g_cache.h"
 #endif
 
-#if WITH_DOUBLE_PRECISION || !SOXR_LIB
+#if WITH_CR64 || WITH_CR64S || !SOXR_LIB
 #define DFT_FLOAT double
 #define ORDERED_CONVOLVE lsx_ordered_convolve
 #define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve
 #include "rdft.h"
 #endif
 
-#if WITH_SINGLE_PRECISION
+#if WITH_CR32
 #define DFT_FLOAT float
 #define ORDERED_CONVOLVE lsx_ordered_convolve_f
 #define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve_f
@@ -129,6 +126,9 @@ double * lsx_design_lpf(
   int n = *num_taps, phases = max(k, 1), modulo = max(-k, 1);
   double tr_bw, Fc, rho = phases == 1? .5 : att < 120? .63 : .75;
 
+  lsx_debug_more("./sinctest %-12.7g %-12.7g %g 0 %-5g %i %i 50 %g %g -4 >1",
+      Fp, Fs, Fn, att, *num_taps, k, beta, rho);
+
   Fp /= fabs(Fn), Fs /= fabs(Fn);        /* Normalise to Fn = 1 */
   tr_bw = .5 * (Fs - Fp); /* Transition band-width: 6dB to stop points */
   tr_bw /= phases, Fs /= phases;
@@ -243,3 +243,35 @@ void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase)
       work[imp_peak], *len, *post_len, 100 - 100. * *post_len / (*len - 1));
   free(pi_wraps), free(work);
 }
+
+#define F_x(F,expr) static double F(double x) {return expr;} /* Defines F(x) */
+F_x(sinePhi, ((2.0517e-07*x-1.1303e-04)*x+.023154)*x+.55924 ) /* Cubic in x */
+F_x(sinePsi, ((9.0667e-08*x-5.6114e-05)*x+.013658)*x+1.0977 ) /* Cubic in x */
+F_x(sinePow, log(.5)/log(sin(x*.5)) )     /* p such that sin(x/2)^p = 1/2 */
+#define dB_to_linear(x) exp((x) * (M_LN10 * 0.05))            /* 10^(x/20) */
+
+double lsx_f_resp(double t, double a)
+{ /* NOTE(review): looks like the dB response at transition-band fraction t. */
+  double x;
+  if (t > (a <= 160? .8 : .82)) {        /* Top of the range: own formula. */
+    double a1 = a+15;
+    double p = .00035*a+.375;
+    double w = 1/(1-.597)*asin(pow((a1-10.6)/a1,1/p));
+    double c = 1+asin(pow(1-a/a1,1/p))/w;
+    return a1*(pow(sin((c-t)*w),p)-1);
+  }
+  if (t > .5)                             /* Upper part: uses sinePsi(a). */
+    x = sinePsi(a), x = pow(sin((1-t) * x), sinePow(x));
+  else                                    /* Lower part: uses sinePhi(a). */
+    x = sinePhi(a), x = 1 - pow(sin(t * x), sinePow(x));
+  return linear_to_dB(x);
+}
+
+double lsx_inv_f_resp(double drop, double a)
+{ /* Given a drop in dB, inverts the t <= .5 (sinePhi) form of lsx_f_resp. */
+  double x = sinePhi(a), s;
+  drop = dB_to_linear(drop);                    /* Now a linear ratio. */
+  s = drop > .5 ? 1 - drop : drop;
+  x = asin(pow(s, 1/sinePow(x))) / x;
+  return drop > .5? x : 1 -x; /* NOTE(review): 1-x case uses sinePhi, but lsx_f_resp uses sinePsi for t > .5--confirm. */
+}
diff --git a/src/filter.h b/src/filter.h
index 435303b..56333ff 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -33,7 +33,12 @@ double * lsx_design_lpf(
     int * num_taps, /* 0: value will be estimated */
     int k,          /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
     double beta);   /* <0: value will be estimated */
+
 void lsx_fir_to_phase(double * * h, int * len,
     int * post_len, double phase0);
 
+double lsx_f_resp(double t, double a); /* Response curve at position t for parameter a. */
+double lsx_inv_f_resp(double drop, double a); /* Position at which the level is `drop' (dB), for parameter a. */
+#define lsx_to_3dB(a) (1 - lsx_inv_f_resp(-3., a)) /* 1 minus the position returned for a -3 dB drop. */
+
 #endif
diff --git a/src/filters.h b/src/filters.h
deleted file mode 100644
index e9a8011..0000000
--- a/src/filters.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1                  See LICENCE for details. */
-
-#include "half_coefs.h"
-
-#define FUNCTION h8
-#define CONVOLVE _ _ _ _ _ _ _ _
-#define h8_l 8
-#define COEFS half_fir_coefs_8
-#include "half-fir.h"
-
-#define FUNCTION h9
-#define CONVOLVE _ _ _ _ _ _ _ _ _
-#define h9_l 9
-#define COEFS half_fir_coefs_9
-#include "half-fir.h"
-
-#define FUNCTION h10
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _
-#define h10_l 10
-#define COEFS half_fir_coefs_10
-#include "half-fir.h"
-
-#define FUNCTION h11
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _
-#define h11_l 11
-#define COEFS half_fir_coefs_11
-#include "half-fir.h"
-
-#define FUNCTION h12
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _
-#define h12_l 12
-#define COEFS half_fir_coefs_12
-#include "half-fir.h"
-
-#define FUNCTION h13
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _ _
-#define h13_l 13
-#define COEFS half_fir_coefs_13
-#include "half-fir.h"
-
-static struct {int num_coefs; stage_fn_t fn; float att;} const half_firs[] = {
-  { 8, h8 , 136.51f},
-  { 9, h9 , 152.32f},
-  {10, h10, 168.07f},
-  {11, h11, 183.78f},
-  {12, h12, 199.44f},
-  {13, h13, 212.75f},
-};
-
-#define HI_PREC_CLOCK
-
-#define VAR_LENGTH p->n
-#define VAR_CONVOLVE while (j < FIR_LENGTH) _
-#define VAR_POLY_PHASE_BITS p->phase_bits
-
-#define FUNCTION vpoly0
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir0.h"
-
-#define FUNCTION vpoly1
-#define COEF_INTERP 1
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#define FUNCTION vpoly2
-#define COEF_INTERP 2
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#define FUNCTION vpoly3
-#define COEF_INTERP 3
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#undef HI_PREC_CLOCK
-
-#define U100_l 42
-#if RATE_SIMD_POLY
-  #define U100_l_EXTRA _ _
-  #define u100_l_EXTRA _
-  #define U100_l_EXTRA_LENGTH 2
-  #define u100_l_EXTRA_LENGTH 1
-#else
-  #define U100_l_EXTRA
-  #define u100_l_EXTRA
-  #define U100_l_EXTRA_LENGTH 0
-  #define u100_l_EXTRA_LENGTH 0
-#endif
-#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ U100_l_EXTRA
-#define FUNCTION U100_0
-#define FIR_LENGTH (U100_l + U100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_U100
-#include "poly-fir0.h"
-
-#define u100_l 11
-#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _ u100_l_EXTRA
-#define FUNCTION u100_0
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir0.h"
-
-#define FUNCTION u100_1
-#define COEF_INTERP 1
-#define PHASE_BITS 8
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir.h"
-#define u100_1_b 8
-
-#define FUNCTION u100_2
-#define COEF_INTERP 2
-#define PHASE_BITS 6
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir.h"
-#define u100_2_b 6
-
-typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
-typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
-
-static poly_fir_t const poly_firs[] = {
-  {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}},
-  {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}},
-  {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}},
-  {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}},
-  {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}},
-  {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}},
-
-  {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}},
-  {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}},
-  {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}},
-  {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}},
-  {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}},
-  {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}},
-
-  {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}},
-  {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}},
-  {-1, {{0, vpoly0}, {   9, vpoly1}, {  6, vpoly2}}},
-  {-1, {{0, vpoly0}, {  11, vpoly1}, {  7, vpoly2}}},
-  {-1, {{0, vpoly0}, {  13, vpoly1}, {  8, vpoly2}}},
-  {-1, {{0, vpoly0}, {  10, vpoly2}, {  8, vpoly3}}},
-  {-1, {{0, vpoly0}, {  12, vpoly2}, {  9, vpoly3}}},
-};
diff --git a/src/half-coefs.h b/src/half-coefs.h
new file mode 100644
index 0000000..a5a0882
--- /dev/null
+++ b/src/half-coefs.h
@@ -0,0 +1,75 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if defined __GNUC__ /* Per-compiler pragmas that change warning behaviour for the rest of this header. */
+  #pragma GCC system_header
+#elif defined __SUNPRO_C /* Re-enabled by `#pragma enable_warn' at the end of this header. */
+  #pragma disable_warn
+#elif defined _MSC_VER /* The pushed state is popped at the end of this header. */
+  #pragma warning(push, 1)
+#endif
+
+#if CORE_TYPE & CORE_SIMD_HALF /* VALIGN qualifies every table below; it is #undef'd again after the last table. */
+  #define VALIGN vAlign /* vAlign is defined elsewhere (presumably an alignment attribute -- TODO confirm). */
+#else
+  #define VALIGN /* Expands to nothing. */
+#endif
+
+#if !(CORE_TYPE & CORE_SIMD_HALF) /* The 7-entry table is compiled only when the CORE_SIMD_HALF bit is clear. */
+static VALIGN const sample_t half_fir_coefs_7[] = { /* 7 coefficients. */
+ 3.1062656496657370e-01, -8.4998810699955796e-02,  3.4007044621123500e-02,
+-1.2839903789829387e-02,  3.9899380181723145e-03, -8.9355202017945374e-04,
+ 1.0918292424806546e-04,
+};
+#endif /* !(CORE_TYPE & CORE_SIMD_HALF) */
+
+static VALIGN const sample_t half_fir_coefs_8[] = { /* 8 coefficients; compiled unconditionally. */
+ 3.1154652365332069e-01, -8.7344917685739543e-02,  3.6814458353637280e-02,
+-1.5189204581464479e-02,  5.4540855610738801e-03, -1.5643862626630416e-03,
+ 3.1816575906323303e-04, -3.4799449225005688e-05,
+};
+
+static VALIGN const sample_t half_fir_coefs_9[] = { /* 9 coefficients; compiled unconditionally. */
+ 3.1227034755311189e-01, -8.9221517147969526e-02,  3.9139704015071934e-02,
+-1.7250558515852023e-02,  6.8589440230476112e-03, -2.3045049636430419e-03,
+ 6.0963740543348963e-04, -1.1323803957431231e-04,  1.1197769991000046e-05,
+};
+
+#if CORE_TYPE & CORE_DBL /* The 10- to 13-entry tables are compiled only when the CORE_DBL bit is set. */
+static VALIGN const sample_t half_fir_coefs_10[] = { /* 10 coefficients. */
+ 3.1285456012000523e-01, -9.0756740799292787e-02,  4.1096398104193160e-02,
+-1.9066319572525220e-02,  8.1840569787684902e-03, -3.0766876176359834e-03,
+ 9.6396524429277980e-04, -2.3585679989922018e-04,  4.0252189026627833e-05,
+-3.6298196342497932e-06,
+};
+
+static VALIGN const sample_t half_fir_coefs_11[] = { /* 11 coefficients. */
+ 3.1333588822574199e-01, -9.2035898673019811e-02,  4.2765169698406408e-02,
+-2.0673580894964429e-02,  9.4225426824512421e-03, -3.8563379950013192e-03,
+ 1.3634742159642453e-03, -3.9874150714431009e-04,  9.0586723632664806e-05,
+-1.4285617244076783e-05,  1.1834642946400529e-06,
+};
+
+static VALIGN const sample_t half_fir_coefs_12[] = { /* 12 coefficients. */
+ 3.1373928463345568e-01, -9.3118180335301962e-02,  4.4205005881659098e-02,
+-2.2103860986973051e-02,  1.0574689371162864e-02, -4.6276428065385065e-03,
+ 1.7936153397572132e-03, -5.9617527051353237e-04,  1.6314517495669067e-04,
+-3.4555126770115446e-05,  5.0617615610782593e-06, -3.8768958592971409e-07,
+};
+
+static VALIGN const sample_t half_fir_coefs_13[] = { /* 13 coefficients. */
+ 3.1408224847888910e-01, -9.4045836332667387e-02,  4.5459878763259978e-02,
+-2.3383369012219993e-02,  1.1644273044890753e-02, -5.3806714579057013e-03,
+ 2.2429072878264022e-03, -8.2204347506606424e-04,  2.5724946477840893e-04,
+-6.6072709864248668e-05,  1.3099163296288644e-05, -1.7907147069136000e-06,
+ 1.2750825595240592e-07,
+};
+#endif /* CORE_TYPE & CORE_DBL */
+
+#undef VALIGN
+
+#if defined __SUNPRO_C /* Undo the warning pragmas issued at the top of this header. */
+  #pragma enable_warn
+#elif defined _MSC_VER
+  #pragma warning(pop)
+#endif /* No __GNUC__ branch: nothing is issued here to undo `#pragma GCC system_header'. */
diff --git a/src/half-fir.h b/src/half-fir.h
index 0a8ee97..782be1b 100644
--- a/src/half-fir.h
+++ b/src/half-fir.h
@@ -1,25 +1,61 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
-/* Down-sample by a factor of 2 using a FIR with odd length (LEN).*/
+/* Decimate by 2 using a FIR with odd length (LEN). */
 /* Input must be preceded and followed by LEN >> 1 samples. */
 
-#define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j;
-static void FUNCTION(stage_t * p, fifo_t * output_fifo)
+#define COEFS ((sample_t const *)p->coefs) /* Coefficients are taken from the stage (p->coefs), viewed as sample_t. */
+
+#if SIMD_SSE /* SSE variant: one use of ____, __ or _ advances j by 4, 2 or 1 respectively. */
+  #define BEGINNING v4_t sum, q1, q2, t /* Declared without initialisers; ____ seeds sum when j == 0. */
+  #define ____ \
+    q1 = _mm_shuffle_ps(t=vLdu(input+2*j),vLdu(input+2*j+4),_MM_SHUFFLE(3,1,3,1)); \
+    q2 = _mm_shuffle_ps(vLdu(input-2*j-4),vLdu(input-2*j-8),_MM_SHUFFLE(1,3,1,3)); \
+    sum = vAdd(j? sum : vMul(vSet1(.5), t), vMul(vAdd(q1, q2), vLd(COEFS+j))); \
+    j += 4; /* NOTE(review): sum is only seeded here (j == 0), so presumably every SSE CONVOLVE starts with ____ -- confirm. */
+  #define __ \
+    q1 = _mm_shuffle_ps(vLdu(input+2*j), vLdu(input-2*j-4), _MM_SHUFFLE(1,3,3,1)); \
+    q2 = _mm_loadl_pi(q2, (__m64*)(COEFS+j)), q2 = _mm_movelh_ps(q2, q2); \
+    sum = vAdd(sum, vMul(q1, q2)); \
+    j += 2;
+  #define _ \
+    q1 = _mm_add_ss(_mm_load_ss(input+2*j+1), _mm_load_ss(input-2*j-1)); \
+    sum = _mm_add_ss(sum, _mm_mul_ss(q1, _mm_load_ss(COEFS+j))); \
+    ++j;
+  #define END vStorSum(output+i, sum)
+/* #elif SIMD_AVX; No good solution found. */
+/* #elif SIMD_NEON; No need: gcc -O3 does a good job by itself. */
+#else /* Scalar variant. */
+  #define BEGINNING sample_t sum = input[0] * .5f /* Start from half of the centre sample. */
+  #define ____ __ __ /* Four single steps. */
+  #define __ _ _ /* Two single steps. */
+  #define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j; /* One coefficient applied to the sample pair at offsets -(2j+1) and +(2j+1). */
+  #define END output[i] = sum
+#endif /* SIMD_SSE */
+
+
+
+static void FUNCTION_H(stage_t * p, fifo_t * output_fifo) /* Writes one output sample for every two input samples consumed. */
 {
-  sample_t const * input = stage_read_p(p);
-  int i, num_out = (stage_occupancy(p) + 1) / 2;
-  sample_t * output = fifo_reserve(output_fifo, num_out);
+  sample_t const * __restrict input = stage_read_p(p);
+  int num_in = min(stage_occupancy(p), p->input_size); /* Take at most p->input_size samples per call. */
+  int i, num_out = (num_in + 1) >> 1; /* num_in / 2 rounded up (num_in assumed non-negative). */
+  sample_t * __restrict output = fifo_reserve(output_fifo, num_out);
 
   for (i = 0; i < num_out; ++i, input += 2) {
     int j = 0;
-    sample_t sum = input[0] * .5f;
-    CONVOLVE
-    output[i] = sum;
+    BEGINNING; CONVOLVE; END; /* Declare the accumulator, apply the taps, store output[i]; CONVOLVE is supplied by the including file. */
   }
   fifo_read(&p->fifo, 2 * num_out, NULL);
 }
+
+
+
 #undef _
+#undef __
+#undef ____
+#undef BEGINNING
+#undef END
 #undef COEFS
 #undef CONVOLVE
-#undef FUNCTION
+#undef FUNCTION_H
diff --git a/src/half_coefs.h b/src/half_coefs.h
deleted file mode 100644
index aac7769..0000000
--- a/src/half_coefs.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1                  See LICENCE for details. */
-
-#if defined __GNUC__
-  #pragma GCC system_header
-#elif defined __SUNPRO_C
-  #pragma disable_warn
-#elif defined _MSC_VER
-  #pragma warning(push, 1)
-#endif
-
-static const sample_t half_fir_coefs_8[] = {
-  0.3115465451887802, -0.08734497241282892, 0.03681452335604365,
-  -0.01518925831569441, 0.005454118437408876, -0.001564400922162005,
-  0.0003181701445034203, -3.48001341225749e-5,
-};
-
-static const sample_t half_fir_coefs_9[] = {
-  0.3122703613711853, -0.08922155288172305, 0.03913974805854332,
-  -0.01725059723447163, 0.006858970092378141, -0.002304518467568703,
-  0.0006096426006051062, -0.0001132393923815236, 1.119795386287666e-5,
-};
-
-static const sample_t half_fir_coefs_10[] = {
-  0.3128545521327376, -0.09075671986104322, 0.04109637155154835,
-  -0.01906629512749895, 0.008184039342054333, -0.0030766775017262,
-  0.0009639607022414314, -0.0002358552746579827, 4.025184282444155e-5,
-  -3.629779111541012e-6,
-};
-
-static const sample_t half_fir_coefs_11[] = {
-  0.3133358837508807, -0.09203588680609488, 0.04276515428384758,
-  -0.02067356614745591, 0.00942253142371517, -0.003856330993895144,
-  0.001363470684892284, -0.0003987400965541919, 9.058629923971627e-5,
-  -1.428553070915318e-5, 1.183455238783835e-6,
-};
-
-static const sample_t half_fir_coefs_12[] = {
-  0.3137392991811407, -0.0931182192961332, 0.0442050575271454,
-  -0.02210391200618091, 0.01057473015666001, -0.00462766983973885,
-  0.001793630226239453, -0.0005961819959665878, 0.0001631475979359577,
-  -3.45557865639653e-5, 5.06188341942088e-6, -3.877010943315563e-7,
-};
-
-static const sample_t half_fir_coefs_13[] = {
-  0.3140822554324578, -0.0940458550886253, 0.04545990399121566,
-  -0.02338339450796002, 0.01164429409071052, -0.005380686021429845,
-  0.002242915773871009, -0.000822047600000082, 0.0002572510962395222,
-  -6.607320708956279e-5, 1.309926399120154e-5, -1.790719575255006e-6,
-  1.27504961098836e-7,
-};
-
-#if defined __SUNPRO_C
-  #pragma enable_warn
-#elif defined _MSC_VER
-  #pragma warning(pop)
-#endif
diff --git a/src/internal.h b/src/internal.h
index 6b4fb24..ee691a0 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -6,11 +6,15 @@
 
 #include "std-types.h"
 
+
+
 #undef min
 #undef max
 #define min(a, b) ((a) <= (b) ? (a) : (b))
 #define max(a, b) ((a) >= (b) ? (a) : (b))
 
+
+
 #define range_limit(x, lower, upper) (min(max(x, lower), upper))
 #define linear_to_dB(x) (log10(x) * 20)
 #define array_length(a) (sizeof(a)/sizeof(a[0]))
@@ -20,12 +24,16 @@
 #define iAL(a) (int)AL(a)
 #define sqr(a) ((a) * (a))
 
+
+
 #if defined __GNUC__
   #define UNUSED __attribute__ ((unused))
 #else
   #define UNUSED
 #endif
 
+
+
 #if !WITH_DEV_TRACE
   #ifdef __GNUC__
     void lsx_dummy(char const *, ...);
@@ -35,10 +43,41 @@
   #define lsx_debug if(0) lsx_dummy
   #define lsx_debug_more lsx_debug
 #else
-  int _soxr_trace_level(void);
+  extern int _soxr_trace_level; /* Read directly as a variable by the macros below. */
   void _soxr_trace(char const * fmt, ...);
-  #define lsx_debug      if (_soxr_trace_level() >= 4) _soxr_trace
-  #define lsx_debug_more if (_soxr_trace_level() >= 5) _soxr_trace
+  #define lsx_debug      if (_soxr_trace_level > 0) _soxr_trace /* Traces at level 1 and above; expands to an unbraced `if', so use it only as a complete statement. */
+  #define lsx_debug_more if (_soxr_trace_level > 1) _soxr_trace /* Traces at level 2 and above; same unbraced-`if' caveat. */
 #endif
 
+
+
+/* soxr_quality_spec_t.flags: */
+
+#define SOXR_ROLLOFF_LSR2Q     3u    /* Reserved for internal use. */
+#define SOXR_ROLLOFF_MASK      3u    /* For masking these bits. */
+#define SOXR_PROMOTE_TO_LQ    64u    /* Reserved for internal use. */
+
+
+
+/* soxr_runtime_spec_t.flags: */
+
+#define SOXR_STRICT_BUFFERING  4u    /* Reserved for future use. */
+#define SOXR_NOSMALLINTOPT     8u    /* For test purposes only. */
+
+
+
+/* soxr_quality_spec recipe: */
+
+#define SOXR_PRECISIONQ         11   /* Quality specified by the precision parameter. */
+
+#define SOXR_PHASE_MASK         0x30 /* For masking these bits. */
+
+
+
+/* soxr_quality_spec flags: */
+
+#define RESET_ON_CLEAR   (1u<<31)
+
+
+
 #endif
diff --git a/src/math-wrap.h b/src/math-wrap.h
new file mode 100644
index 0000000..8a526f1
--- /dev/null
+++ b/src/math-wrap.h
@@ -0,0 +1,31 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_math_wrap_included /* Include guard. */
+#define soxr_math_wrap_included
+
+#include <math.h>
+
+#if defined __STRICT_ANSI__ /* Route the float functions through their double counterparts. */
+  #define sinf(x)  (float)sin ((double)(x))
+  #define cosf(x)  (float)cos ((double)(x))
+  #define atanf(x) (float)atan((double)(x))
+#endif
+
+#if !defined M_PI /* Each M_* constant below is defined only if <math.h> did not already provide it. */
+  #define M_PI    3.141592653589793238462643383279502884
+#endif
+
+#if !defined M_LN10
+  #define M_LN10  2.302585092994045684017991454684364208
+#endif
+
+#if !defined M_SQRT2
+  #define M_SQRT2 1.414213562373095048801688724209698079
+#endif
+
+#if !defined M_LN2
+  #define M_LN2   0.693147180559945309417232121458176568
+#endif
+
+#endif /* soxr_math_wrap_included */
diff --git a/src/pffft-wrap.c b/src/pffft-wrap.c
index f410892..806547c 100644
--- a/src/pffft-wrap.c
+++ b/src/pffft-wrap.c
@@ -3,24 +3,23 @@
 
 #if !defined PFFT_MACROS_ONLY
 
-#include "simd.h"
-#include <math.h>
+#include "math-wrap.h"
 
-#define pffft_aligned_free    _soxr_simd_aligned_free
-#define pffft_aligned_malloc  _soxr_simd_aligned_malloc
-#define pffft_aligned_calloc  _soxr_simd_aligned_calloc
+#if PFFFT_DOUBLE /* The including file sets PFFFT_DOUBLE to 1 or 0 before including this wrapper. */
+  #include "simd64.h"
+#else
+  #include "simd32.h"
+  #define sin(x) sinf(x) /* Single-precision build: use the float trig functions. */
+  #define cos(x) cosf(x)
+#endif
+
+#define pffft_aligned_free    SIMD_ALIGNED_FREE /* SIMD_ALIGNED_* come from the including file or, presumably, the simd header above -- TODO confirm. */
+#define pffft_aligned_malloc  SIMD_ALIGNED_MALLOC
+#define pffft_aligned_calloc  SIMD_ALIGNED_CALLOC
 
 #undef inline
 #define inline __inline
 
-#if defined _MSC_VER
-  #define sin (float)sin
-  #define cos (float)cos
-#else
-  #define sin(x) sinf((float)(x))
-  #define cos(x) cosf((float)(x))
-#endif
-
 #endif
 
 
diff --git a/src/pffft.c b/src/pffft.c
index b68f86c..5729274 100644
--- a/src/pffft.c
+++ b/src/pffft.c
@@ -133,9 +133,11 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
 */
 #elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
 
+#  define SIMD_SZ 4 /* 4 floats per SIMD vector -- this is pretty much hard-coded in the preprocess/finalize functions anyway, so enabling AVX with its 256-bit vectors would take additional work. */
+
+#if !PFFFT_DOUBLE
 #include <xmmintrin.h>
 typedef __m128 v4sf;
-#  define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
 #  define VZERO() _mm_setzero_ps()
 #  define VMUL(a,b) _mm_mul_ps(a,b)
 #  define VADD(a,b) _mm_add_ps(a,b)
@@ -148,6 +150,10 @@ typedef __m128 v4sf;
 #  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
 #  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
 
+#else
+#include "avx.h"
+#endif
+
 /*
   ARM NEON support macros
 */
@@ -181,6 +187,10 @@ typedef float32x4_t v4sf;
 #  endif
 #endif
 
+#if PFFFT_DOUBLE /* Double-precision build of this file. */
+#define float double /* From here on, every `float' token in this file is compiled as double. */
+#endif /* PFFFT_DOUBLE */
+
 /* fallback mode for situations where SSE/Altivec are not available, use scalar mode instead */
 #ifdef PFFFT_SIMD_DISABLE
 typedef float v4sf;
@@ -202,6 +212,8 @@ typedef float v4sf;
 #define SVMUL(f,v) VMUL(LD_PS1(f),v)
 #endif
 
+#if !defined PFFT_MACROS_ONLY
+
 #if !defined(PFFFT_SIMD_DISABLE)
 typedef union v4sf_union {
   v4sf  v;
@@ -214,7 +226,8 @@ typedef union v4sf_union {
 #define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
 
 /* detect bugs with the vector support macros */
-void validate_pffft_simd() {
+void validate_pffft_simd(void);
+void validate_pffft_simd(void) {
   float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
   v4sf_union a0, a1, a2, a3, t, u;
   memcpy(a0.f, f, 4*sizeof(float));
@@ -230,7 +243,6 @@ void validate_pffft_simd() {
   printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
   t.v = VMADD(a1.v, a2.v,a0.v);
   printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
-
   INTERLEAVE2(a1.v,a2.v,t.v,u.v);
   printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
   assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
@@ -271,8 +283,6 @@ void pffft_aligned_free(void *p) {
 int pffft_simd_size() { return SIMD_SZ; }
 #endif
 
-#if !defined PFFT_MACROS_ONLY
-
 /*
   passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
 */
diff --git a/src/pffft.h b/src/pffft.h
index 5ff8466..63522ca 100644
--- a/src/pffft.h
+++ b/src/pffft.h
@@ -86,6 +86,10 @@
 
 #ifdef __cplusplus
 extern "C" {
+#endif
+
+#if PFFFT_DOUBLE
+#define float double
 #endif
 
   /* opaque struct holding internal stuff (precomputed twiddle factors)
@@ -182,6 +186,8 @@ extern "C" {
   int pffft_simd_size();
 #endif
 
+#undef float
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/pffft32.c b/src/pffft32.c
index fb7400f..f480809 100644
--- a/src/pffft32.c
+++ b/src/pffft32.c
@@ -1,11 +1,14 @@
 /* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
-#define _soxr_simd_aligned_free free
-#define _soxr_simd_aligned_malloc malloc
+#define SIMD_ALIGNED_FREE free
+#define SIMD_ALIGNED_MALLOC malloc
 #define PFFFT_SIMD_DISABLE
+#define PFFFT_DOUBLE 0
 #include "pffft-wrap.c"
+
 #include "filter.h"
+#include "rdft_t.h"
 
 static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
 static void delete_setup(void * setup) {pffft_destroy_setup(setup);}
@@ -15,8 +18,8 @@ static void backward (int length, void * setup, float * H, float * scratch) {pff
 static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
 static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H);  (void)length;}
 static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_NEEDS_SCRATCH;}
 
-typedef void (* fn_t)(void);
 fn_t _soxr_rdft32_cb[] = {
   (fn_t)setup,
   (fn_t)setup,
@@ -29,4 +32,8 @@ fn_t _soxr_rdft32_cb[] = {
   (fn_t)_soxr_ordered_partial_convolve_f,
   (fn_t)multiplier,
   (fn_t)pffft_reorder_back,
+  (fn_t)malloc,
+  (fn_t)calloc,
+  (fn_t)free,
+  (fn_t)flags,
 };
diff --git a/src/pffft32s.c b/src/pffft32s.c
index b067ad2..7798a45 100644
--- a/src/pffft32s.c
+++ b/src/pffft32s.c
@@ -1,17 +1,20 @@
 /* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
+#define PFFFT_DOUBLE 0
 #include "pffft-wrap.c"
 
+#include "rdft_t.h"
+
 static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
 static void forward  (int length, void * setup, float * h, float * scratch) {pffft_transform        (setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
 static void oforward (int length, void * setup, float * h, float * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
 static void backward (int length, void * setup, float * H, float * scratch) {pffft_transform        (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
 static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
-static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H);                  (void)length;}
+static void convolve(int length, void * setup, float * H, float const * with) {pffft_zconvolve(setup, H, with, H); (void)length;}
 static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH;}
 
-typedef void (* fn_t)(void);
 fn_t _soxr_rdft32s_cb[] = {
   (fn_t)setup,
   (fn_t)setup,
@@ -21,7 +24,11 @@ fn_t _soxr_rdft32s_cb[] = {
   (fn_t)backward,
   (fn_t)obackward,
   (fn_t)convolve,
-  (fn_t)_soxr_ordered_partial_convolve_simd,
+  (fn_t)ORDERED_PARTIAL_CONVOLVE_SIMD,
   (fn_t)multiplier,
   (fn_t)pffft_reorder_back,
+  (fn_t)SIMD_ALIGNED_MALLOC,
+  (fn_t)SIMD_ALIGNED_CALLOC,
+  (fn_t)SIMD_ALIGNED_FREE,
+  (fn_t)flags,
 };
diff --git a/src/pffft64s.c b/src/pffft64s.c
new file mode 100644
index 0000000..7c37c9d
--- /dev/null
+++ b/src/pffft64s.c
@@ -0,0 +1,34 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define PFFFT_DOUBLE 1
+#include "pffft-wrap.c"
+
+#include "rdft_t.h"
+
+static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);} /* Always requests a PFFFT_REAL setup. */
+static void forward  (int length, void * setup, double * h, double * scratch) {pffft_transform        (setup, h, h, scratch, PFFFT_FORWARD); (void)length;} /* Forward transform with h as both input and output; length is unused. */
+static void oforward (int length, void * setup, double * h, double * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;} /* As forward, via pffft_transform_ordered. */
+static void backward (int length, void * setup, double * H, double * scratch) {pffft_transform        (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;} /* Backward transform with H as both input and output. */
+static void obackward(int length, void * setup, double * H, double * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;} /* As backward, via pffft_transform_ordered. */
+static void convolve(int length, void * setup, double * H, double const * with) {pffft_zconvolve(setup, H, with, H); (void)length;} /* H is passed as both the first operand and the destination. */
+static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH;} /* Same flag set as the flags() added to pffft32s.c in this patch. */
+
+fn_t _soxr_rdft64s_cb[] = { /* Callback table; entry order mirrors _soxr_rdft32s_cb in this patch. */
+  (fn_t)setup, /* The first two slots share the same setup function. */
+  (fn_t)setup,
+  (fn_t)pffft_destroy_setup,
+  (fn_t)forward,
+  (fn_t)oforward,
+  (fn_t)backward,
+  (fn_t)obackward,
+  (fn_t)convolve,
+  (fn_t)ORDERED_PARTIAL_CONVOLVE_SIMD,
+  (fn_t)multiplier,
+  (fn_t)pffft_reorder_back,
+  (fn_t)SIMD_ALIGNED_MALLOC, /* Allocator trio in malloc, calloc, free order. */
+  (fn_t)SIMD_ALIGNED_CALLOC,
+  (fn_t)SIMD_ALIGNED_FREE,
+  (fn_t)flags, /* Last slot: the flags() query. */
+};
diff --git a/src/poly-fir.h b/src/poly-fir.h
index f7b4261..94db90e 100644
--- a/src/poly-fir.h
+++ b/src/poly-fir.h
@@ -1,97 +1,138 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
-/* Resample using an interpolated poly-phase FIR with length LEN.*/
-/* Input must be followed by LEN-1 samples. */
+/* Resample using an interpolated poly-phase FIR with length FIR_LENGTH. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
 
-#define a (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 0,j))
-#define b (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 1,j))
-#define c (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 2,j))
-#define d (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 3,j))
-#if COEF_INTERP == 0
-  #define _ sum += a *in[j], ++j;
-#elif COEF_INTERP == 1
-  #define _ sum += (b *x + a)*in[j], ++j;
-#elif COEF_INTERP == 2
-  #define _ sum += ((c *x + b)*x + a)*in[j], ++j;
-#elif COEF_INTERP == 3
-  #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j;
-#else
+#if COEF_INTERP != 1 && COEF_INTERP != 2 && COEF_INTERP != 3
   #error COEF_INTERP
 #endif
 
+#if SIMD_AVX || SIMD_SSE || SIMD_NEON /* Vector build: coefficients and input are handled four samples at a time. */
+  #define N (FIR_LENGTH>>2) /* FIR_LENGTH / 4: each use of _ loads from in+j*4. */
+
+  #if COEF_INTERP == 1
+    #define _ sum=vMac(vMac(b,X,a),vLdu(in+j*4),sum), ++j;
+  #elif COEF_INTERP == 2
+    #define _ sum=vMac(vMac(vMac(c,X,b),X,a),vLdu(in+j*4),sum), ++j;
+  #else
+    #define _ sum=vMac(vMac(vMac(vMac(d,X,c),X,b),X,a),vLdu(in+j*4),sum), ++j;
+  #endif
+
+  #define a coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-0)] /* a..d: the COEF_INTERP+1 vectors stored for one (phase, j) pair; a is the last of the group. */
+  #define b coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-1)]
+  #define c coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-2)]
+  #define d coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-3)]
+
+  #define BEGINNING v4_t X = vLds(x), sum = vZero(); \
+      v4_t const * const __restrict coefs = (v4_t *)COEFS
+  #define MIDDLE switch (N) {case 3: CONVOLVE(3); break; case 4: CONVOLVE(4); \
+      break; case 5: CONVOLVE(5); break;  default: CONVOLVE(N); } /* NOTE(review): MIDDLE is not referenced by the core macros visible in this header. */
+  #define END vStorSum(output+i, sum)
+  #define cc(n) case n: core(n); break
+  #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);} /* Values 2..6 reach core() as literal constants; any other value uses the general form. */
+#else /* Scalar build. */
+  #define N FIR_LENGTH
+
+  #if COEF_INTERP == 1
+    #define _ sum += (b*x + a)*in[j], ++j; /* Each form of _ evaluates a polynomial in x by Horner's rule (a is the constant term) and scales in[j] by it. */
+  #elif COEF_INTERP == 2
+    #define _ sum += ((c*x + b)*x + a)*in[j], ++j;
+  #else
+    #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j;
+  #endif
+
+  #define a (coef(COEFS, COEF_INTERP, N, phase, 0,j)) /* a..d differ only in the fifth coef() argument (0..3). */
+  #define b (coef(COEFS, COEF_INTERP, N, phase, 1,j))
+  #define c (coef(COEFS, COEF_INTERP, N, phase, 2,j))
+  #define d (coef(COEFS, COEF_INTERP, N, phase, 3,j))
+
+  #define BEGINNING sample_t sum = 0
+  #define MIDDLE CONVOLVE(N)
+  #define END output[i] = sum
+  #define CORE(n) core(n)
+#endif /* SIMD_AVX || SIMD_SSE || SIMD_NEON */
+
+#define fphpCore(n) /* High-precision clock path that tracks the position as a floating-point value (p->at.flt). */ \
+  if (p->use_hi_prec_clock) { \
+    float_step_t at = p->at.flt; \
+    for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \
+      sample_t const * const __restrict in = input + (int)at; \
+      float_step_t frac = at - (int)at; \
+      int phase = (int)(frac * (1 << PHASE_BITS)); \
+      sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); \
+      int j = 0; \
+      BEGINNING; CONVOLVE(n); END; \
+    } \
+    fifo_read(&p->fifo, (int)at, NULL); \
+    p->at.flt = at - (int)at; \
+  } else /* Left open on purpose: core(n) appends spCore(n) as the else-branch. */
+
+#define hpCore(n) /* High-precision clock path in fixed point; the comparison in the loop step carries out of the extra low-order word (fix.ls.all). */ \
+  if (p->use_hi_prec_clock) { \
+    for (i = 0; p->at.integer < num_in; ++i, \
+        p->at.fix.ls.all += p->step.fix.ls.all, \
+        p->at.whole += p->step.whole + (p->at.fix.ls.all < p->step.fix.ls.all)) { \
+      sample_t const * const __restrict in = input + p->at.integer; \
+      uint32_t frac = p->at.fraction; \
+      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
+      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); /* low-order bits, scaled to [0,1) */ \
+      int j = 0; \
+      BEGINNING; CONVOLVE(n); END; \
+    } \
+    fifo_read(&p->fifo, p->at.integer, NULL); \
+    p->at.integer = 0; \
+  } else /* Left open on purpose: core(n) appends spCore(n) as the else-branch. */
+
+#define spCore(n) { /* Standard path: the position advances by p->step.whole only. */ \
+    for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) { \
+      sample_t const * const __restrict in = input + p->at.integer; \
+      uint32_t frac = p->at.fraction; \
+      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
+      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); /* low-order bits, scaled to [0,1) */ \
+      int j = 0; \
+      BEGINNING; CONVOLVE(n); END; \
+    } \
+    fifo_read(&p->fifo, p->at.integer, NULL); \
+    p->at.integer = 0; } /* Consume the whole input samples stepped over and reset the integer part of the position. */
+
+#if defined HI_PREC_CLOCK && FLOAT_HI_PREC_CLOCK /* core(n): an optional high-precision branch followed by the standard path. */
+  #define core(n) fphpCore(n) spCore(n)
+#elif defined HI_PREC_CLOCK
+  #define core(n) hpCore(n) spCore(n)
+#else /* No high-precision clock: standard path only. */
+  #define core(n) spCore(n)
+#endif
+
+
+
 static void FUNCTION(stage_t * p, fifo_t * output_fifo)
 {
   sample_t const * input = stage_read_p(p);
-  int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
-  sample_t * output = fifo_reserve(output_fifo, max_num_out);
+  int num_in = min(stage_occupancy(p), p->input_size);
+  int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio);
+  sample_t * const __restrict output = fifo_reserve(output_fifo, max_num_out);
 
-#if defined HI_PREC_CLOCK
-#if FLOAT_HI_PREC_CLOCK
-  if (p->use_hi_prec_clock) {
-    float_step_t at = p->at.flt;
-    for (i = 0; (int)at < num_in; ++i, at += p->step.flt) {
-      sample_t const * in = input + (int)at;
-      float_step_t frac = at - (int)at;
-      int phase = (int)(frac * (1 << PHASE_BITS));
-#if COEF_INTERP > 0
-      sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase);
-#endif
-      sample_t sum = 0;
-      int j = 0;
-      CONVOLVE
-      output[i] = sum;
-    }
-    fifo_read(&p->fifo, (int)at, NULL);
-    p->at.flt = at - (int)at;
-  } else
-#else
-  if (p->use_hi_prec_clock) {
-    for (i = 0; p->at.integer < num_in; ++i,
-        p->at.fix.ls.all += p->step.fix.ls.all,
-        p->at.whole += p->step.whole + (p->at.fix.ls.all < p->step.fix.ls.all)) {
-      sample_t const * in = input + p->at.integer;
-      uint32_t frac = p->at.fraction;
-      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */
-#if COEF_INTERP > 0              /* low-order bits, scaled to [0,1) */
-      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32));
-#endif
-      sample_t sum = 0;
-      int j = 0;
-      CONVOLVE
-      output[i] = sum;
-    }
-    fifo_read(&p->fifo, p->at.integer, NULL);
-    p->at.integer = 0;
-  } else
-#endif
-#endif
-  {
-    for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
-      sample_t const * in = input + p->at.integer;
-      uint32_t frac = p->at.fraction;
-      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */
-#if COEF_INTERP > 0              /* low-order bits, scaled to [0,1) */
-      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32));
-#endif
-      sample_t sum = 0;
-      int j = 0;
-      CONVOLVE
-      output[i] = sum;
-    }
-    fifo_read(&p->fifo, p->at.integer, NULL);
-    p->at.integer = 0;
-  }
+  CORE(N);
   assert(max_num_out - i >= 0);
   fifo_trim_by(output_fifo, max_num_out - i);
 }
 
+
+
 #undef _
 #undef a
 #undef b
 #undef c
 #undef d
+#undef CORE
+#undef cc
+#undef core
 #undef COEF_INTERP
+#undef N
+#undef BEGINNING
+#undef MIDDLE
+#undef END
 #undef CONVOLVE
 #undef FIR_LENGTH
 #undef FUNCTION
diff --git a/src/poly-fir0.h b/src/poly-fir0.h
index 52d85b3..0f28c69 100644
--- a/src/poly-fir0.h
+++ b/src/poly-fir0.h
@@ -1,32 +1,56 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
-/* Resample using a non-interpolated poly-phase FIR with length LEN.*/
-/* Input must be followed by LEN-1 samples. */
+/* Resample using a non-interpolated poly-phase FIR with length FIR_LENGTH. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
 
-#define _ sum += (coef(p->shared->poly_fir_coefs, 0, FIR_LENGTH, rem, 0, j)) *at[j], ++j;
+#if SIMD_AVX || SIMD_SSE || SIMD_NEON
+  #define N (FIR_LENGTH>>2)
+  #define BEGINNING v4_t sum = vZero(); \
+      v4_t const * const __restrict coefs = (v4_t *)COEFS + N * rem;
+  #define _ sum = vMac(vLdu(at+j*4), coefs[j], sum), ++j;
+  #define END vStorSum(output+i, sum)
+  #define cc(n) case n: core(n); break
+  #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);}
+#else
+  #define N FIR_LENGTH
+  #define BEGINNING sample_t sum = 0; \
+      sample_t const * const __restrict coefs = (sample_t *)COEFS + N * rem;
+  #define _ sum += coefs[j]*at[j], ++j;
+  #define END output[i] = sum
+  #define CORE(n) core(n)
+#endif
+
+#define core(n) \
+  for (i = 0; p->at.integer < num_in * p->L; ++i, \
+      p->at.integer += p->step.integer) { \
+    int const div = p->at.integer / p->L, rem = p->at.integer % p->L; \
+    sample_t const * const __restrict at = input + div; \
+    int j = 0; BEGINNING; CONVOLVE(n); END;}
 
 static void FUNCTION(stage_t * p, fifo_t * output_fifo)
 {
-  sample_t const * input = stage_read_p(p);
-  int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
-  sample_t * output = fifo_reserve(output_fifo, max_num_out);
+  int num_in = min(stage_occupancy(p), p->input_size);
+  if (num_in) {
+    sample_t const * input = stage_read_p(p);
+    int i, num_out = (num_in * p->L - p->at.integer + p->step.integer - 1) / p->step.integer;
+    sample_t * __restrict output = fifo_reserve(output_fifo, num_out);
 
-  for (i = 0; p->at.integer < num_in * p->L; ++i, p->at.integer += p->step.integer) {
-    int div = p->at.integer / p->L, rem = p->at.integer % p->L;
-    sample_t const * at = input + div;
-    sample_t sum = 0;
-    int j = 0;
-    CONVOLVE
-    output[i] = sum;
+    CORE(N);
+    assert(i == num_out);
+    fifo_read(&p->fifo, p->at.integer / p->L, NULL);
+    p->at.integer = p->at.integer % p->L;
   }
-  assert(max_num_out - i >= 0);
-  fifo_trim_by(output_fifo, max_num_out - i);
-  fifo_read(&p->fifo, p->at.integer / p->L, NULL);
-  p->at.integer = p->at.integer % p->L;
 }
 
 #undef _
+#undef CORE
+#undef cc
+#undef core
+#undef N
+#undef BEGINNING
+#undef MIDDLE
+#undef END
 #undef CONVOLVE
 #undef FIR_LENGTH
 #undef FUNCTION
diff --git a/src/rate.h b/src/rate.h
deleted file mode 100644
index b0598f7..0000000
--- a/src/rate.h
+++ /dev/null
@@ -1,726 +0,0 @@
-/* SoX Resampler Library      Copyright (c) 2007-14 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1                  See LICENCE for details. */
-
-#include <math.h>
-#include <assert.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include "filter.h"
-
-#if defined SOXR_LIB
-#include "internal.h"
-
-typedef void (* fn_t)(void);
-extern fn_t RDFT_CB[11];
-
-#define rdft_forward_setup    (*(void * (*)(int))RDFT_CB[0])
-#define rdft_backward_setup   (*(void * (*)(int))RDFT_CB[1])
-#define rdft_delete_setup     (*(void (*)(void *))RDFT_CB[2])
-#define rdft_forward          (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[3])
-#define rdft_oforward         (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[4])
-#define rdft_backward         (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[5])
-#define rdft_obackward        (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[6])
-#define rdft_convolve         (*(void (*)(int, void *, sample_t *, sample_t const *))RDFT_CB[7])
-#define rdft_convolve_portion (*(void (*)(int, sample_t *, sample_t const *))RDFT_CB[8])
-#define rdft_multiplier       (*(int (*)(void))RDFT_CB[9])
-#define rdft_reorder_back     (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[10])
-
-#endif
-
-#if RATE_SIMD /* Align for SIMD: */
-  #include "simd.h"
-#if 0 /* Not using this yet. */
-  #define RATE_SIMD_POLY 1
-  #define num_coefs4 ((num_coefs + 3) & ~3)
-  #define coefs4_check(i) ((i) < num_coefs)
-#else
-  #define RATE_SIMD_POLY 0
-  #define num_coefs4 num_coefs
-  #define coefs4_check(i) 1
-#endif
-
-  #define aligned_free    _soxr_simd_aligned_free
-  #define aligned_malloc  _soxr_simd_aligned_malloc
-  #define aligned_calloc  _soxr_simd_aligned_calloc
-#if 0
-  #define FIFO_REALLOC    aligned_realloc
-  #define FIFO_MALLOC     aligned_malloc
-  #define FIFO_FREE       aligned_free
-
-  static void * aligned_realloc(void * q, size_t nb_bytes, size_t copy_bytes) {
-    void * p = aligned_malloc(nb_bytes);
-    if (p) memcpy(p, q, copy_bytes);
-    aligned_free(q);
-    return p;
-  }
-#endif
-#else
-  #define RATE_SIMD_POLY 0
-  #define num_coefs4 num_coefs
-  #define coefs4_check(i) 1
-
-  #define aligned_free    free
-  #define aligned_malloc  malloc
-  #define aligned_calloc  calloc
-#endif
-
-#define  FIFO_SIZE_T int
-#include "fifo.h"
-
-typedef union { /* Int64 in parts */
-  #if HAVE_BIGENDIAN
-  struct {int32_t ms; uint32_t ls;} parts;
-  #else
-  struct {uint32_t ls; int32_t ms;} parts;
-  #endif
-  int64_t all;
-} int64p_t;
-
-typedef union { /* Uint64 in parts */
-  #if HAVE_BIGENDIAN
-  struct {uint32_t ms, ls;} parts;
-  #else
-  struct {uint32_t ls, ms;} parts;
-  #endif
-  uint64_t all;
-} uint64p_t;
-
-#define FLOAT_HI_PREC_CLOCK 0    /* Non-float hi-prec has ~96 bits. */
-#define float_step_t long double /* __float128 is also a (slow) option */
-
-#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) coef_p[(fir_len) * ((interp_order) + 1) * (phase_num) + ((interp_order) + 1) * (fir_coef_num) + (interp_order - coef_interp_num)]
-
-#define raw_coef_t double
-
-static sample_t * prepare_coefs(raw_coef_t const * coefs, int num_coefs,
-    int num_phases, int interp_order, double multiplier)
-{
-  int i, j, length = num_coefs4 * num_phases;
-  sample_t * result = malloc((size_t)(length * (interp_order + 1)) * sizeof(*result));
-  double fm1 = coefs[0], f1 = 0, f2 = 0;
-
-  for (i = num_coefs4 - 1; i >= 0; --i)
-    for (j = num_phases - 1; j >= 0; --j) {
-      double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */
-      int pos = i * num_phases + j - 1;
-      fm1 = coefs4_check(i) && pos > 0 ? coefs[pos - 1] * multiplier : 0;
-      switch (interp_order) {
-        case 1: b = f1 - f0; break;
-        case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break;
-        case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break;
-        default: if (interp_order) assert(0);
-      }
-      #define coef_coef(x) \
-        coef(result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i)
-      coef_coef(0) = (sample_t)f0;
-      if (interp_order > 0) coef_coef(1) = (sample_t)b;
-      if (interp_order > 1) coef_coef(2) = (sample_t)c;
-      if (interp_order > 2) coef_coef(3) = (sample_t)d;
-      #undef coef_coef
-      f2 = f1, f1 = f0;
-    }
-  return result;
-}
-
-typedef struct {
-  int        dft_length, num_taps, post_peak;
-  void       * dft_forward_setup, * dft_backward_setup;
-  sample_t   * coefs;
-} dft_filter_t;
-
-typedef struct { /* So generated filter coefs may be shared between channels */
-  sample_t   * poly_fir_coefs;
-  dft_filter_t dft_filter[2];
-} rate_shared_t;
-
-typedef enum {
-  irrational_stage = 1,
-  cubic_stage,
-  dft_stage,
-  half_stage,
-  rational_stage
-} stage_type_t;
-
-struct stage;
-typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
-#define MULT32 (65536. * 65536.)
-
-typedef union { /* Fixed point arithmetic */
-  struct {uint64p_t ls; int64p_t ms;} fix;
-  float_step_t flt;
-} step_t;
-
-typedef struct stage {
-  /* Common to all stage types: */
-  stage_type_t type;
-  stage_fn_t fn;
-  fifo_t     fifo;
-  int        pre;       /* Number of past samples to store */
-  int        pre_post;  /* pre + number of future samples to store */
-  int        preload;   /* Number of zero samples to pre-load the fifo */
-  double     out_in_ratio; /* For buffer management. */
-
-  /* For a stage with variable (run-time generated) filter coefs: */
-  rate_shared_t * shared;
-  unsigned   dft_filter_num; /* Which, if any, of the 2 DFT filters to use */
-  sample_t   * dft_scratch, * dft_out;
-
-  /* For a stage with variable L/M: */
-  step_t     at, step;
-  bool       use_hi_prec_clock;
-  int        L, remM;
-  int        n, phase_bits, block_len;
-  double     mult, phase0;
-} stage_t;
-
-#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
-#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
-
-static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo)
-{
-  int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
-  sample_t const * input = stage_read_p(p);
-  sample_t * output = fifo_reserve(output_fifo, max_num_out);
-
-#define integer  fix.ms.parts.ms
-#define fraction fix.ms.parts.ls
-#define whole    fix.ms.all
-  for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
-    sample_t const * s = input + p->at.integer;
-    double x = p->at.fraction * (1 / MULT32);
-    double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b);
-    double c = s[1]-*s-a-b;
-    output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s));
-  }
-  assert(max_num_out - i >= 0);
-  fifo_trim_by(output_fifo, max_num_out - i);
-  fifo_read(&p->fifo, p->at.integer, NULL);
-  p->at.integer = 0;
-}
-
-#if RATE_SIMD
-  #define dft_out p->dft_out
-#else
-  #define dft_out output
-#endif
-
-static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
-{
-  sample_t * output;
-  int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
-  rate_shared_t const * s = p->shared;
-  dft_filter_t const * f = &s->dft_filter[p->dft_filter_num];
-  int const overlap = f->num_taps - 1;
-
-  while (p->at.integer + p->L * num_in >= f->dft_length) {
-    div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L);
-    sample_t const * input = fifo_read_ptr(&p->fifo);
-    fifo_read(&p->fifo, divd.quot, NULL);
-    num_in -= divd.quot;
-
-    output = fifo_reserve(output_fifo, f->dft_length);
-
-    if (lsx_is_power_of_2(p->L)) { /* F-domain */
-      int portion = f->dft_length / p->L;
-      memcpy(dft_out, input, (unsigned)portion * sizeof(*dft_out));
-      rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch);
-      for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
-        dft_out[i] = dft_out[(portion << 1) - i],
-        dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
-      dft_out[portion] = dft_out[1];
-      dft_out[portion + 1] = 0;
-      dft_out[1] = dft_out[0];
-
-      for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
-        memcpy(dft_out + i, dft_out, (size_t)portion * sizeof(*dft_out));
-        dft_out[i + 1] = 0;
-      }
-      if (p->step.integer > 0)
-        rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
-    } else {
-      if (p->L == 1)
-        memcpy(dft_out, input, (size_t)f->dft_length * sizeof(*dft_out));
-      else {
-        memset(dft_out, 0, (size_t)f->dft_length * sizeof(*dft_out));
-        for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
-          dft_out[i] = input[j];
-        p->at.integer = p->L - 1 - divd.rem;
-      }
-      if (p->step.integer > 0)
-        rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
-      else
-        rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
-    }
-
-    if (p->step.integer > 0) {
-      rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs);
-      rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
-#if RATE_SIMD
-      if (p->step.integer == 1)
-        memcpy(output, dft_out, (size_t)f->dft_length * sizeof(sample_t));
-#endif
-      if (p->step.integer != 1) {
-        for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
-            i += p->step.integer)
-          output[j] = dft_out[i];
-        p->remM = i - (f->dft_length - overlap);
-        fifo_trim_by(output_fifo, f->dft_length - j);
-      }
-      else fifo_trim_by(output_fifo, overlap);
-    }
-    else { /* F-domain */
-      int m = -p->step.integer;
-      rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs);
-      rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch);
-#if RATE_SIMD
-      memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof(sample_t));
-#endif
-      fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m);
-    }
-  }
-}
-
-#undef dft_out
-
-/* Set to 4 x nearest power of 2 */
-/* or half of that if danger of causing too many cache misses. */
-static int set_dft_length(int num_taps, int min, int large)
-{
-  double d = log((double)num_taps) / log(2.);
-  return 1 << range_limit((int)(d + 2.77), min, max((int)(d + 1.77), large));
-}
-
-static void dft_stage_init(
-    unsigned instance, double Fp, double Fs, double Fn, double att,
-    double phase, stage_t * p, int L, int M, double * multiplier,
-    int min_dft_size, int large_dft_size)
-{
-  dft_filter_t * f = &p->shared->dft_filter[instance];
-  int num_taps = 0, dft_length = f->dft_length, i;
-  bool f_domain_m = abs(3-M) == 1 && Fs <= 1;
-
-  if (!dft_length) {
-    int k = phase == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4;
-    double * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.);
-
-    if (phase != 50)
-      lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase);
-    else f->post_peak = num_taps / 2;
-
-    dft_length = set_dft_length(num_taps, min_dft_size, large_dft_size);
-    f->coefs = aligned_calloc((size_t)dft_length, sizeof(*f->coefs));
-    for (i = 0; i < num_taps; ++i)
-      f->coefs[(i + dft_length - num_taps + 1) & (dft_length - 1)]
-        = (sample_t)(h[i] * ((1. / dft_length) * rdft_multiplier() * L * *multiplier));
-    free(h);
-  }
-
-#if RATE_SIMD
-  p->dft_out = aligned_malloc(sizeof(sample_t) * (size_t)dft_length);
-#endif
-#if 1 /* In fact, currently, only pffft needs this. */
-  p->dft_scratch = aligned_malloc(2 * sizeof(sample_t) * (size_t)dft_length);
-#endif
-
-  if (!f->dft_length) {
-    void * coef_setup = rdft_forward_setup(dft_length);
-    int Lp = lsx_is_power_of_2(L)? L : 1;
-    int Mp = f_domain_m? M : 1;
-    f->dft_forward_setup = rdft_forward_setup(dft_length / Lp);
-    f->dft_backward_setup = rdft_backward_setup(dft_length / Mp);
-    if (Mp == 1)
-      rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch);
-    else
-      rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch);
-    rdft_delete_setup(coef_setup);
-    f->num_taps = num_taps;
-    f->dft_length = dft_length;
-    lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i",
-        num_taps, dft_length, Fp, Fs, Fn, att, L, M);
-  }
-  *multiplier = 1;
-  p->out_in_ratio = (double)L / M;
-  p->type = dft_stage;
-  p->fn = dft_stage_fn;
-  p->preload = f->post_peak / L;
-  p->at.integer = f->post_peak % L;
-  p->L = L;
-  p->step.integer = f_domain_m? -M/2 : M;
-  p->dft_filter_num = instance;
-  p->block_len = f->dft_length - (f->num_taps - 1);
-  p->phase0 = p->at.integer / p->L;
-}
-
-#include "filters.h"
-
-typedef struct {
-  double     factor;
-  uint64_t   samples_in, samples_out;
-  int        num_stages;
-  stage_t    * stages;
-} rate_t;
-
-#define pre_stage       p->stages[shift]
-#define arb_stage       p->stages[shift + have_pre_stage]
-#define post_stage      p->stages[shift + have_pre_stage + have_arb_stage]
-#define have_pre_stage  (preM  * preL  != 1)
-#define have_arb_stage  (arbM  * arbL  != 1)
-#define have_post_stage (postM * postL != 1)
-
-#define TO_3dB(a)       ((1.6e-6*a-7.5e-4)*a+.646)
-#define LOW_Q_BW0       (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
-
-typedef enum {
-  rolloff_none, rolloff_small /* <= 0.01 dB */, rolloff_medium /* <= 0.35 dB */
-} rolloff_t;
-
-
-static char const * rate_init(
-  /* Private work areas (to be supplied by the client):                       */
-  rate_t * p,                /* Per audio channel.                            */
-  rate_shared_t * shared,    /* Between channels (undergoing same rate change)*/
-
-  /* Public parameters:                                             Typically */
-  double factor,             /* Input rate divided by output rate.            */
-  double bits,               /* Required bit-accuracy (pass + stop)  16|20|28 */
-  double phase,              /* Linear/minimum etc. filter phase.       50    */
-  double passband_end,       /* 0dB pt. bandwidth to preserve; nyquist=1 0.913*/
-  double stopband_begin,     /* Aliasing/imaging control; > passband_end  1   */
-  rolloff_t rolloff,         /* Pass-band roll-off                    small   */
-  bool maintain_3dB_pt,      /*                                        true   */
-  double multiplier,         /* Linear gain to apply during conversion.   1   */
-
-  /* Primarily for test/development purposes:                                 */
-  bool use_hi_prec_clock,    /* Increase irrational ratio accuracy.   false   */
-  int interpolator,          /* Force a particular coef interpolator.   -1    */
-  size_t max_coefs_size,     /* k bytes of coefs to try to keep below.  400   */
-  bool noSmallIntOpt,        /* Disable small integer optimisations.  false   */
-  int log2_min_dft_size,
-  int log2_large_dft_size)
-{
-  double att = (bits + 1) * linear_to_dB(2.), attArb = att;    /* pass + stop */
-  double tbw0 = 1 - passband_end, Fs_a = stopband_begin;
-  double arbM = factor, tbw_tighten = 1;
-  int n = 0, i, preL = 1, preM = 1, shift = 0, arbL = 1, postL = 1, postM = 1;
-  bool upsample = false, rational = false, iOpt = !noSmallIntOpt;
-  int mode = rolloff > rolloff_small? factor > 1 || passband_end > LOW_Q_BW0:
-    (int)ceil(2 + (bits - 17) / 4);
-  stage_t * s;
-
-  assert(factor > 0);
-  assert(!bits || (15 <= bits && bits <= 33));
-  assert(0 <= phase && phase <= 100);
-  assert(.53 <= passband_end);
-  assert(stopband_begin <= 1.2);
-  assert(passband_end + .005 < stopband_begin);
-
-  p->factor = factor;
-  if (bits!=0) while (!n++) {                            /* Determine stages: */
-    int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 :
-      (int)ceil((double)max_coefs_size * 1000. / (U100_l * sizeof(sample_t)));
-    double d, epsilon = 0, frac;
-    upsample = arbM < 1;
-    for (i = (int)(arbM * .5), shift = 0; i >>= 1; arbM *= .5, ++shift);
-    preM = upsample || (arbM > 1.5 && arbM < 2);
-    postM = 1 + (arbM > 1 && preM), arbM /= postM;
-    preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL;
-    if ((frac = arbM - (int)arbM)!=0)
-      epsilon = fabs(floor(frac * MULT32 + .5) / (frac * MULT32) - 1);
-    for (i = 1, rational = frac==0; i <= maxL && !rational; ++i) {
-      d = frac * i, try = (int)(d + .5);
-      if ((rational = fabs(try / d - 1) <= epsilon)) {    /* No long doubles! */
-        if (try == i)
-          arbM = ceil(arbM), shift += x = arbM > 3, arbM /= 1 + x;                                                         
-        else arbM = i * (int)arbM + try, arbL = i;
-      }
-    }
-    L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x;
-    if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) {
-      for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1);
-      arbM = arbM * postL / arbL / preL, arbL = 1, n = 0;
-    } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt))
-      preL = L, preM = M, arbM = arbL = postM = 1;
-    if (!mode && (!rational || !n))
-      ++mode, n = 0;
-  }
-
-  p->num_stages = shift + have_pre_stage + have_arb_stage + have_post_stage;
-  if (!p->num_stages && multiplier != 1) {
-    bits = arbL = 0;                         /* Use cubic_stage in this case. */
-    ++p->num_stages;
-  }
-  p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages));
-  for (i = 0; i < p->num_stages; ++i)
-    p->stages[i].shared = shared;
-
-  if ((n = p->num_stages) > 1) {                              /* Att. budget: */
-    if (have_arb_stage)
-      att += linear_to_dB(2.), attArb = att, --n;
-    att += linear_to_dB((double)n);
-  }
-
-  for (n = 0; (size_t)n + 1 < array_length(half_firs) && att > half_firs[n].att; ++n);
-  for (i = 0, s = p->stages; i < shift; ++i, ++s) {
-    s->type = half_stage;
-    s->fn = half_firs[n].fn;
-    s->pre_post = 4 * half_firs[n].num_coefs;
-    s->preload = s->pre = s->pre_post >> 1;
-  }
-
-  if (have_pre_stage) {
-    if (maintain_3dB_pt && have_post_stage) {    /* Trans. bands overlapping. */
-      double tbw3 = tbw0 * TO_3dB(att);                /* FFS: consider Fs_a. */
-      double x = ((2.1429e-4 - 5.2083e-7 * att) * att - .015863) * att + 3.95;
-      x = att * pow((tbw0 - tbw3) / (postM / (factor * postL) - 1 + tbw0), x);
-      if (x > .035) {
-        tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014;
-        lsx_debug("x=%g tbw_tighten=%g", x, tbw_tighten);
-      }
-    }
-    dft_stage_init(0, 1 - tbw0 * tbw_tighten, Fs_a, preM? max(preL, preM) :
-        arbM / arbL, att, phase, &pre_stage, preL, max(preM, 1), &multiplier,
-        log2_min_dft_size, log2_large_dft_size);
-  }
-
-  if (bits==0 && have_arb_stage) {                  /* `Quick' cubic arb stage: */
-    arb_stage.type = cubic_stage;
-    arb_stage.fn = cubic_stage_fn;
-    arb_stage.mult = multiplier, multiplier = 1;
-    arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5);
-    arb_stage.pre_post = max(3, arb_stage.step.integer);
-    arb_stage.preload = arb_stage.pre = 1;
-    arb_stage.out_in_ratio = MULT32 / (double)arb_stage.step.whole;
-  }
-  else if (have_arb_stage) {                     /* Higher quality arb stage: */
-    poly_fir_t const * f = &poly_firs[6*(upsample + !!preM) + mode - !upsample];
-    int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases;
-    size_t coefs_size;
-    double x = .5, at, Fp, Fs, Fn, mult = upsample? 1 : arbL / arbM;
-    poly_fir1_t const * f1;
-
-    Fn = !upsample && preM? x = arbM / arbL : 1;
-    Fp = !preM? mult : mode? .5 : 1;
-    Fs = 2 - Fp;           /* Ignore Fs_a; it would have little benefit here. */
-    Fp *= 1 - tbw0;
-    if (rolloff > rolloff_small && mode)
-      Fp = !preM? mult * .5 - .125 : mult * .05 + .1;
-    else if (rolloff == rolloff_small)
-      Fp = Fs - (Fs - .148 * x - Fp * .852) * (.00813 * bits + .973);
-
-    i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1;
-    do {
-      f1 = &f->interp[++i];
-      assert(f1->fn);
-      if (i)
-        arbM /= arbL, arbL = 1, rational = false;
-      phase_bits = (int)ceil(f1->scalar + log(mult)/log(2.));
-      phases = !rational? (1 << phase_bits) : arbL;
-      if (f->interp[0].scalar==0) {
-        int phases0 = max(phases, 19), n0 = 0;
-        lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta);
-        num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM;
-      }
-      if ((num_coefs & 1) && rational && (arbL & 1))
-        phases <<= 1, arbL <<= 1, arbM *= 2;
-      at = arbL * (arb_stage.phase0 = .5 * (num_coefs & 1));
-      order = i + (i && mode > 4);
-      coefs_size = (size_t)(num_coefs4 * phases * (order + 1)) * sizeof(sample_t);
-    } while (interpolator < 0 && i < 2 && f->interp[i+1].fn &&
-        coefs_size / 1000 > max_coefs_size);
-
-    if (!arb_stage.shared->poly_fir_coefs) {
-      int num_taps = num_coefs * phases - 1;
-      raw_coef_t * coefs = lsx_design_lpf(
-          Fp, Fs, Fn, attArb, &num_taps, phases, f->beta);
-      arb_stage.shared->poly_fir_coefs = prepare_coefs(
-          coefs, num_coefs, phases, order, multiplier);
-      lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk",
-          num_coefs, phases, order, (double)coefs_size / 1000.);
-      free(coefs);
-    }
-    multiplier = 1;
-    arb_stage.type = rational? rational_stage : irrational_stage;
-    arb_stage.fn = f1->fn;
-    arb_stage.pre_post = num_coefs4 - 1;
-    arb_stage.preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs);
-    arb_stage.n = num_coefs4;
-    arb_stage.phase_bits = phase_bits;
-    arb_stage.L = arbL;
-    arb_stage.use_hi_prec_clock = mode > 1 && use_hi_prec_clock && !rational;
-#if FLOAT_HI_PREC_CLOCK
-    if (arb_stage.use_hi_prec_clock) {
-      arb_stage.at.flt = at;
-      arb_stage.step.flt = arbM;
-      arb_stage.out_in_ratio = (double)(arbL / arb_stage.step.flt);
-    } else
-#endif
-    {
-      arb_stage.at.whole = (int64_t)(at * MULT32 + .5);
-#if !FLOAT_HI_PREC_CLOCK
-      if (arb_stage.use_hi_prec_clock) {
-        arb_stage.at.fix.ls.parts.ms = 0x80000000ul;
-        arbM *= MULT32;
-        arb_stage.step.whole = (int64_t)arbM;
-        arbM -= (double)arb_stage.step.whole;
-        arbM *= MULT32 * MULT32;
-        arb_stage.step.fix.ls.all = (uint64_t)arbM;
-      } else
-#endif
-        arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5);
-      arb_stage.out_in_ratio = MULT32 * arbL / (double)arb_stage.step.whole;
-    }
-  }
-
-  if (have_post_stage)
-    dft_stage_init(1, 1 - (1 - (1 - tbw0) *
-        (upsample? factor * postL / postM : 1)) * tbw_tighten, Fs_a,
-        (double)max(postL, postM), att, phase, &post_stage, postL, postM,
-        &multiplier, log2_min_dft_size, log2_large_dft_size);
-
-
-  lsx_debug("%g: »%i⋅%i/%i⋅%i/%g⋅%i/%i",
-      1/factor, shift, preL, preM, arbL, arbM, postL, postM);
-  for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
-    fifo_create(&s->fifo, (int)sizeof(sample_t));
-    memset(fifo_reserve(&s->fifo, s->preload), 0, sizeof(sample_t) * (size_t)s->preload);
-    lsx_debug("%5i|%-5i preload=%i remL=%i o/i=%g",
-        s->pre, s->pre_post - s->pre, s->preload, s->at.integer, s->out_in_ratio);
-  }
-  fifo_create(&s->fifo, (int)sizeof(sample_t));
-  return 0;
-}
-
-static void rate_process(rate_t * p)
-{
-  stage_t * stage = p->stages;
-  int i;
-  for (i = 0; i < p->num_stages; ++i, ++stage)
-    stage->fn(stage, &(stage+1)->fifo);
-}
-
-static sample_t * rate_input(rate_t * p, sample_t const * samples, size_t n)
-{
-  p->samples_in += n;
-  return fifo_write(&p->stages[0].fifo, (int)n, samples);
-}
-
-static sample_t const * rate_output(rate_t * p, sample_t * samples, size_t * n)
-{
-  fifo_t * fifo = &p->stages[p->num_stages].fifo;
-  p->samples_out += *n = min(*n, (size_t)fifo_occupancy(fifo));
-  return fifo_read(fifo, (int)*n, samples);
-}
-
-static void rate_flush(rate_t * p)
-{
-  fifo_t * fifo = &p->stages[p->num_stages].fifo;
-#if defined _MSC_VER && _MSC_VER == 1200
-  uint64_t samples_out = (uint64_t)(int64_t)((double)(int64_t)p->samples_in / p->factor + .5);
-#else
-  uint64_t samples_out = (uint64_t)((double)p->samples_in / p->factor + .5);
-#endif
-  size_t remaining = (size_t)(samples_out - p->samples_out);
-
-  if ((size_t)fifo_occupancy(fifo) < remaining) {
-    uint64_t samples_in = p->samples_in;
-    sample_t * buff = calloc(1024, sizeof(*buff));
-
-    while ((size_t)fifo_occupancy(fifo) < remaining) {
-      rate_input(p, buff, 1024);
-      rate_process(p);
-    }
-    fifo_trim_to(fifo, (int)remaining);
-    p->samples_in = samples_in;
-    free(buff);
-  }
-}
-
-static void rate_close(rate_t * p)
-{
-  rate_shared_t * shared = p->stages[0].shared;
-  int i;
-
-  for (i = 0; i <= p->num_stages; ++i) {
-    stage_t * s = &p->stages[i];
-    aligned_free(s->dft_scratch);
-    aligned_free(s->dft_out);
-    fifo_delete(&s->fifo);
-  }
-  if (shared) {
-    for (i = 0; i < 2; ++i) {
-      dft_filter_t * f= &shared->dft_filter[i];
-      aligned_free(f->coefs);
-      rdft_delete_setup(f->dft_forward_setup);
-      rdft_delete_setup(f->dft_backward_setup);
-    }
-    free(shared->poly_fir_coefs);
-    memset(shared, 0, sizeof(*shared));
-  }
-  free(p->stages);
-}
-
-#if defined SOXR_LIB
-static double rate_delay(rate_t * p)
-{
-#if defined _MSC_VER && _MSC_VER == 1200
-  double samples_out = (double)(int64_t)p->samples_in / p->factor;
-  return max(0, samples_out - (double)(int64_t)p->samples_out);
-#else
-  double samples_out = (double)p->samples_in / p->factor;
-  return max(0, samples_out - (double)p->samples_out);
-#endif
-}
-
-static void rate_sizes(size_t * shared, size_t * channel)
-{
-  *shared = sizeof(rate_shared_t);
-  *channel = sizeof(rate_t);
-}
-
-#include "soxr.h"
-
-static char const * rate_create(
-    void * channel,
-    void * shared,
-    double io_ratio,
-    soxr_quality_spec_t * q_spec,
-    soxr_runtime_spec_t * r_spec,
-    double scale)
-{
-  return rate_init(
-      channel, shared,
-      io_ratio,
-      q_spec->precision,
-      q_spec->phase_response,
-      q_spec->passband_end,
-      q_spec->stopband_begin,
-      (rolloff_t)"\1\2\0"[q_spec->flags & 3],
-      !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT),
-      scale,
-      !!(q_spec->flags & SOXR_HI_PREC_CLOCK),
-      (int)(r_spec->flags & 3) - 1,
-      r_spec->coef_size_kbytes,
-      !!(r_spec->flags & SOXR_NOSMALLINTOPT),
-      (int)r_spec->log2_min_dft_size,
-      (int)r_spec->log2_large_dft_size);
-}
-
-static char const * id(void)
-{
-  return RATE_ID;
-}
-
-fn_t RATE_CB[] = {
-  (fn_t)rate_input,
-  (fn_t)rate_process,
-  (fn_t)rate_output,
-  (fn_t)rate_flush,
-  (fn_t)rate_close,
-  (fn_t)rate_delay,
-  (fn_t)rate_sizes,
-  (fn_t)rate_create,
-  (fn_t)0,
-  (fn_t)id,
-};
-#endif
diff --git a/src/rate32.c b/src/rate32.c
deleted file mode 100644
index fe85bae..0000000
--- a/src/rate32.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1                  See LICENCE for details. */
-
-#define sample_t   float
-#define RATE_SIMD  0
-#define RDFT_CB    _soxr_rdft32_cb
-#define RATE_CB    _soxr_rate32_cb
-#define RATE_ID    "cr32"
-#include "rate.h"
diff --git a/src/rate32s.c b/src/rate32s.c
deleted file mode 100644
index 3acfcb4..0000000
--- a/src/rate32s.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1                  See LICENCE for details. */
-
-#define sample_t   float
-#define RATE_SIMD  1
-#define RDFT_CB    _soxr_rdft32s_cb
-#define RATE_CB    _soxr_rate32s_cb
-#define RATE_ID    "cr32s"
-#include "rate.h"
diff --git a/src/rate64.c b/src/rate64.c
deleted file mode 100644
index 6f25143..0000000
--- a/src/rate64.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1                  See LICENCE for details. */
-
-#define sample_t   double
-#define RATE_SIMD  0
-#define RDFT_CB    _soxr_rdft64_cb
-#define RATE_CB    _soxr_rate64_cb
-#define RATE_ID    "cr64"
-#include "rate.h"
diff --git a/src/rdft_t.h b/src/rdft_t.h
new file mode 100644
index 0000000..293d9c3
--- /dev/null
+++ b/src/rdft_t.h
@@ -0,0 +1,24 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+typedef void (* fn_t)(void);
+
+#define rdft_forward_setup    (*(void * (*)(int))RDFT_CB[0])
+#define rdft_backward_setup   (*(void * (*)(int))RDFT_CB[1])
+#define rdft_delete_setup     (*(void (*)(void *))RDFT_CB[2])
+#define rdft_forward          (*(void (*)(int, void *, void *, void *))RDFT_CB[3])
+#define rdft_oforward         (*(void (*)(int, void *, void *, void *))RDFT_CB[4])
+#define rdft_backward         (*(void (*)(int, void *, void *, void *))RDFT_CB[5])
+#define rdft_obackward        (*(void (*)(int, void *, void *, void *))RDFT_CB[6])
+#define rdft_convolve         (*(void (*)(int, void *, void *, void const *))RDFT_CB[7])
+#define rdft_convolve_portion (*(void (*)(int, void *, void const *))RDFT_CB[8])
+#define rdft_multiplier       (*(int (*)(void))RDFT_CB[9])
+#define rdft_reorder_back     (*(void (*)(int, void *, void *, void *))RDFT_CB[10])
+#define rdft_malloc           (*(void * (*)(size_t))RDFT_CB[11])
+#define rdft_calloc           (*(void * (*)(size_t, size_t))RDFT_CB[12])
+#define rdft_free             (*(void (*)(void *))RDFT_CB[13])
+#define rdft_flags            (*(int (*)(void))RDFT_CB[14])
+
+/* Flag templates: */
+#define RDFT_IS_SIMD       1
+#define RDFT_NEEDS_SCRATCH 2
diff --git a/src/simd-dev.h b/src/simd-dev.h
deleted file mode 100644
index 019325c..0000000
--- a/src/simd-dev.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1                  See LICENCE for details. */
-
-#define PFFT_MACROS_ONLY
-#include "pffft.c"
diff --git a/src/simd.c b/src/simd.c
index 48d440f..ec548fd 100644
--- a/src/simd.c
+++ b/src/simd.c
@@ -1,21 +1,15 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
 #include <assert.h>
 #include <string.h>
 #include <stdlib.h>
-#include "simd.h"
-#include "simd-dev.h"
+
 #include "soxr-config.h"
 
-#if AVCODEC_FOUND
-  #define SIMD_ALIGNMENT (sizeof(double) * 4)
-#else
-  #define SIMD_ALIGNMENT (sizeof(float) * 4)
-#endif
+#define SIMD_ALIGNMENT (sizeof(float) * (1 + (PFFFT_DOUBLE|AVCODEC_FOUND)) * 4)
 
-
-void * _soxr_simd_aligned_malloc(size_t size)
+void * SIMD_ALIGNED_MALLOC(size_t size)
 {
   char * p1 = 0, * p = malloc(size + SIMD_ALIGNMENT);
   if (p) {
@@ -27,9 +21,9 @@ void * _soxr_simd_aligned_malloc(size_t size)
 
 
 
-void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size)
+void * SIMD_ALIGNED_CALLOC(size_t nmemb, size_t size)
 {
-  void * p = _soxr_simd_aligned_malloc(nmemb * size);
+  void * p = SIMD_ALIGNED_MALLOC(nmemb * size);
   if (p)
     memset(p, 0, nmemb * size);
   return p;
@@ -37,7 +31,7 @@ void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size)
 
 
 
-void _soxr_simd_aligned_free(void * p1)
+void SIMD_ALIGNED_FREE(void * p1)
 {
   if (p1)
     free(*((void * *)p1 - 1));
@@ -45,11 +39,16 @@ void _soxr_simd_aligned_free(void * p1)
 
 
 
-void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b)
+#define PFFT_MACROS_ONLY
+#include "pffft.c"
+
+
+
+void ORDERED_CONVOLVE_SIMD(int n, void * not_used, float * a, float const * b)
 {
   int i;
   float ab0, ab1;
-  v4sf       * /*RESTRICT*/ va = (v4sf       *)a;
+  v4sf       *   RESTRICT   va = (v4sf       *)a;
   v4sf const *   RESTRICT   vb = (v4sf const *)b;
   assert(VALIGNED(a) && VALIGNED(b));
   ab0 = a[0] * b[0], ab1 = a[1] * b[1];
@@ -68,11 +67,11 @@ void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float
 
 
 
-void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b)
+void ORDERED_PARTIAL_CONVOLVE_SIMD(int n, float * a, float const * b)
 {
   int i;
   float ab0;
-  v4sf       * /*RESTRICT*/ va = (v4sf       *)a;
+  v4sf       *   RESTRICT   va = (v4sf       *)a;
   v4sf const *   RESTRICT   vb = (v4sf const *)b;
   assert(VALIGNED(a) && VALIGNED(b));
   ab0 = a[0] * b[0];
diff --git a/src/simd.h b/src/simd.h
deleted file mode 100644
index 71eefc6..0000000
--- a/src/simd.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1                  See LICENCE for details. */
-
-#if !defined simd_included
-#define simd_included
-
-#include <stddef.h>
-
-void * _soxr_simd_aligned_malloc(size_t);
-void * _soxr_simd_aligned_calloc(size_t, size_t);
-void _soxr_simd_aligned_free(void *);
-
-void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b);
-void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b);
-
-#endif
diff --git a/src/simd32-dev.h b/src/simd32-dev.h
new file mode 100644
index 0000000..0408758
--- /dev/null
+++ b/src/simd32-dev.h
@@ -0,0 +1,54 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_simd_dev_included
+#define soxr_simd_dev_included
+
+#if defined __GNUC__
+  #define SIMD_INLINE(T) static __inline T __attribute__((always_inline))
+  #define vAlign __attribute__((aligned (16)))
+#elif defined _MSC_VER
+  #define SIMD_INLINE(T) static __forceinline T
+  #define vAlign __declspec(align(16))
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined __i386__ || defined _M_IX86
+
+#include <xmmintrin.h>
+
+#define vZero()      _mm_setzero_ps()
+#define vSet1(a)     _mm_set_ss(a)
+#define vMul(a,b)    _mm_mul_ps(a,b)
+#define vAdd(a,b)    _mm_add_ps(a,b)
+#define vMac(a,b,c)  vAdd(vMul(a,b),c)
+#define vLds(a)      _mm_set1_ps(a)
+#define vLd(a)       _mm_load_ps(a)
+#define vLdu(a)      _mm_loadu_ps(a)
+
+typedef __m128 v4_t;
+
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) {
+  v4_t t = vAdd(_mm_movehl_ps(b, b), b);
+  _mm_store_ss(a, vAdd(t, _mm_shuffle_ps(t,t,1)));}
+
+#elif defined __arm__
+
+#include <arm_neon.h>
+
+#define vZero()      vdupq_n_f32(0)
+#define vMul(a,b)    vmulq_f32(a,b)
+#define vAdd(a,b)    vaddq_f32(a,b)
+#define vMac(a,b,c)  vmlaq_f32(c,a,b)
+#define vLds(a)      vld1q_dup_f32(&(a))
+#define vLd(a)       vld1q_f32(a)
+#define vLdu(a)      vld1q_f32(a)
+
+typedef float32x4_t v4_t;
+
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) {
+  float32x2_t t = vadd_f32(vget_high_f32(b), vget_low_f32(b));
+  *a = vget_lane_f32(vpadd_f32(t, t), 0);}
+
+#endif
+
+#endif
diff --git a/src/simd32.c b/src/simd32.c
new file mode 100644
index 0000000..3f6cb81
--- /dev/null
+++ b/src/simd32.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define PFFFT_DOUBLE 0
+
+#include "simd32.h"
+
+#include "simd.c"
diff --git a/src/simd32.h b/src/simd32.h
new file mode 100644
index 0000000..bc185da
--- /dev/null
+++ b/src/simd32.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined simd32_included
+#define simd32_included
+
+#include <stddef.h>
+
+void * _soxr_simd32_aligned_malloc(size_t);
+void * _soxr_simd32_aligned_calloc(size_t, size_t);
+void _soxr_simd32_aligned_free(void *);
+
+#define SIMD_ALIGNED_MALLOC _soxr_simd32_aligned_malloc
+#define SIMD_ALIGNED_CALLOC _soxr_simd32_aligned_calloc
+#define SIMD_ALIGNED_FREE _soxr_simd32_aligned_free
+
+void _soxr_ordered_convolve_simd32(int n, void * not_used, float * a, float const * b);
+void _soxr_ordered_partial_convolve_simd32(int n, float * a, float const * b);
+
+#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd32
+#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd32
+
+#endif
diff --git a/src/simd64-dev.h b/src/simd64-dev.h
new file mode 100644
index 0000000..37484e4
--- /dev/null
+++ b/src/simd64-dev.h
@@ -0,0 +1,42 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_simd64_dev_included
+#define soxr_simd64_dev_included
+
+#if defined __GNUC__
+  #define SIMD_INLINE(T) static __inline T __attribute__((always_inline))
+  #define vAlign __attribute__((aligned (32)))
+#elif defined _MSC_VER
+  #define SIMD_INLINE(T) static __forceinline T
+  #define vAlign __declspec(align(32))
+#else
+  #define SIMD_INLINE(T) static __inline T
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined __i386__ || defined _M_IX86
+
+#include <immintrin.h>
+
+#if defined __AVX__
+
+#define vZero()      _mm256_setzero_pd()
+#define vSet1(a)     _mm256_set_pd(0,0,0,a)
+#define vMul(a,b)    _mm256_mul_pd(a,b)
+#define vAdd(a,b)    _mm256_add_pd(a,b)
+#define vMac(a,b,c)  vAdd(vMul(a,b),c) /* Note: gcc -mfma will `fuse' these */
+#define vLds(a)      _mm256_set1_pd(a)
+#define vLd(a)       _mm256_load_pd(a)
+#define vLdu(a)      _mm256_loadu_pd(a)
+
+typedef __m256d v4_t;
+
+SIMD_INLINE(void) vStorSum(double * a, v4_t b) {
+  b = _mm256_hadd_pd(b, _mm256_permute2f128_pd(b,b,1));
+  _mm_store_sd(a, _mm256_castpd256_pd128(_mm256_hadd_pd(b,b)));}
+
+#endif
+
+#endif
+
+#endif
diff --git a/src/simd64.c b/src/simd64.c
new file mode 100644
index 0000000..b601750
--- /dev/null
+++ b/src/simd64.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define PFFFT_DOUBLE 1
+
+#include "simd64.h"
+
+#include "simd.c"
diff --git a/src/simd64.h b/src/simd64.h
new file mode 100644
index 0000000..0ebc439
--- /dev/null
+++ b/src/simd64.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined simd64_included
+#define simd64_included
+
+#include <stddef.h>
+
+void * _soxr_simd64_aligned_malloc(size_t);
+void * _soxr_simd64_aligned_calloc(size_t, size_t);
+void _soxr_simd64_aligned_free(void *);
+
+#define SIMD_ALIGNED_MALLOC _soxr_simd64_aligned_malloc
+#define SIMD_ALIGNED_CALLOC _soxr_simd64_aligned_calloc
+#define SIMD_ALIGNED_FREE _soxr_simd64_aligned_free
+
+void _soxr_ordered_convolve_simd64(int n, void * not_used, double * a, double const * b);
+void _soxr_ordered_partial_convolve_simd64(int n, double * a, double const * b);
+
+#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd64
+#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd64
+
+#endif
diff --git a/src/soxr-lsr.h b/src/soxr-lsr.h
index c0923aa..2acd138 100644
--- a/src/soxr-lsr.h
+++ b/src/soxr-lsr.h
@@ -37,9 +37,9 @@
 #endif
 
 typedef float   SRC_SAMPLE;
-#if !defined SOXR_LIB
 enum SRC_SRCTYPE_e {SRC_SINC_BEST_QUALITY, SRC_SINC_MEDIUM_QUALITY,
                     SRC_SINC_FASTEST, SRC_ZERO_ORDER_HOLD, SRC_LINEAR};
+#if !defined SOXR_LIB
 typedef int     SRC_SRCTYPE;
 typedef int     SRC_ERROR;
 typedef long    (* src_callback_t)(void *, SRC_SAMPLE * *);
diff --git a/src/soxr.c b/src/soxr.c
index d36891a..d963bec 100644
--- a/src/soxr.c
+++ b/src/soxr.c
@@ -1,10 +1,8 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
 #include <math.h>
-#include <stdarg.h>
 #include <stdlib.h>
-#include <stdio.h>
 #include <string.h>
 #include <time.h>
 
@@ -13,20 +11,17 @@
 #include "internal.h"
 
 #if AVUTIL_FOUND
-#include <libavutil/cpu.h>
+  #include <libavutil/cpu.h>
 #endif
 
 
 
 #if WITH_DEV_TRACE
 
-int _soxr_trace_level(void)
-{
-  char const * e = getenv("SOXR_TRACE");
-  return e? atoi(e) : 4;
-}
-
+#include <stdarg.h>
+#include <stdio.h>
 
+int _soxr_trace_level;
 
 void _soxr_trace(char const * fmt, ...)
 {
@@ -96,52 +91,6 @@ struct soxr {
 
 
 
-#define RESET_ON_CLEAR   (1u<<31)
-
-/* TODO: these should not be here. */
-#define TO_3dB(a)       ((1.6e-6*a-7.5e-4)*a+.646)
-#define LOW_Q_BW0       (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
-
-
-
-soxr_quality_spec_t soxr_quality_spec(unsigned long recipe, unsigned long flags)
-{
-  soxr_quality_spec_t spec, * p = &spec;
-  unsigned quality = recipe & 0xf;
-  double rej;
-  memset(p, 0, sizeof(*p));
-  if (quality > 13) {
-    p->e = "invalid quality type";
-    return spec;
-  }
-  flags |= quality < SOXR_LSR0Q? RESET_ON_CLEAR : 0;
-  if (quality == 13)
-    quality = 6;
-  else if (quality > 10)
-    quality = 0;
-  p->phase_response = "\62\31\144"[(recipe & 0x30) >> 4];
-  p->stopband_begin = 1;
-  p->precision = !quality? 0: quality < 3? 16 : quality < 8? 4 + quality * 4 : 55 - quality * 4;
-  rej = p->precision * linear_to_dB(2.);
-  p->flags = flags;
-  if (quality < 8) {
-    p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / TO_3dB(rej);
-    if (quality <= 2)
-      p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
-  }
-  else {
-    static float const bw[] = {.931f, .832f, .663f};
-    p->passband_end = bw[quality - 8];
-    if (quality - 8 == 2)
-      p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
-  }
-  if (recipe & SOXR_STEEP_FILTER)
-    p->passband_end = 1 - .01 / TO_3dB(rej);
-  return spec;
-}
-
-
-
 char const * soxr_engine(soxr_t p)
 {
   return resampler_id();
@@ -163,82 +112,132 @@ soxr_error_t soxr_error(soxr_t p)
 
 
 
-soxr_runtime_spec_t soxr_runtime_spec(unsigned num_threads)
-{
-  soxr_runtime_spec_t spec, * p = &spec;
-  memset(p, 0, sizeof(*p));
-  p->log2_min_dft_size = 10;
-  p->log2_large_dft_size = 17;
-  p->coef_size_kbytes = 400;
-  p->num_threads = num_threads;
-  return spec;
-}
-
-
-
-soxr_io_spec_t soxr_io_spec(
-  soxr_datatype_t itype,
-  soxr_datatype_t otype)
-{
-  soxr_io_spec_t spec, * p = &spec;
-  memset(p, 0, sizeof(*p));
-  if ((itype | otype) >= SOXR_SPLIT * 2)
-    p->e = "invalid io datatype(s)";
-  else {
-    p->itype = itype;
-    p->otype = otype;
-    p->scale = 1;
-  }
-  return spec;
-}
-
-
-
-#if SIMD_FOUND
-static bool cpu_has_simd(void)
-{
-#if defined __x86_64__ || defined _M_X64
-  return true;
-#elif defined __GNUC__ && defined i386
-  uint32_t eax, ebx, ecx, edx;
-  __asm__ __volatile__ (
-      "pushl %%ebx   \n\t"
-      "cpuid         \n\t"
-      "movl %%ebx, %1\n\t"
-      "popl %%ebx    \n\t"
-      : "=a"(eax), "=r"(ebx), "=c"(ecx), "=d"(edx)
-      : "a"(1)
-      : "cc" );
-  return !!(edx & 0x06000000);
-#elif defined _MSC_VER && defined _M_IX86
-  uint32_t d;
-  __asm {
-    xor     eax, eax
-    inc     eax
-    push    ebx
-    cpuid
-    pop     ebx
-    mov     d, edx
-  }
-  return !!(d & 0x06000000);
-#elif defined AV_CPU_FLAG_NEON
-  return !!(av_get_cpu_flags() & AV_CPU_FLAG_NEON);
-#endif
-  return false;
-}
-
-
-
-static bool should_use_simd(void)
-{
-    char const * e = getenv("SOXR_USE_SIMD");
-    return e? !!atoi(e) : cpu_has_simd();
-}
+#if WITH_CR32S || WITH_CR64S
+  #if defined __GNUC__ && defined __x86_64__
+    #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+      __asm__ __volatile__ ( \
+        "cpuid \n\t" \
+        : "=a" (eax_), "=b" (ebx_), "=c" (ecx_), "=d" (edx_) \
+        : "a" (type), "c" (0));
+  #elif defined __GNUC__ && defined __i386__
+    #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+      __asm__ __volatile__ ( \
+        "mov %%ebx, %%edi \n\t" \
+        "cpuid \n\t" \
+        "xchg %%edi, %%ebx \n\t" \
+        : "=a" (eax_), "=D" (ebx_), "=c" (ecx_), "=d" (edx_) \
+        : "a" (type), "c" (0));
+  #elif defined _M_X64 && defined _MSC_VER && _MSC_VER > 1500
+     void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
+     #pragma intrinsic(__cpuidex)
+     #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \
+       int regs[4]; \
+       __cpuidex(regs, type, 0); \
+       eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \
+     } while(0)
+  #elif defined _M_X64 && defined _MSC_VER /* Older MSVC: has no __cpuidex. */
+     void __cpuid(int CPUInfo[4], int info_type);
+     #pragma intrinsic(__cpuid)
+     #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \
+       int regs[4]; \
+       __cpuid(regs, type); /* 2-arg intrinsic; takes no sub-leaf (ECX). */ \
+       eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \
+     } while(0)
+  #elif defined _M_IX86 && defined _MSC_VER
+    #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+      __asm pushad \
+      __asm mov eax, type \
+      __asm xor ecx, ecx \
+      __asm cpuid \
+      __asm mov eax_, eax \
+      __asm mov ebx_, ebx \
+      __asm mov ecx_, ecx \
+      __asm mov edx_, edx \
+      __asm popad
+  #endif
 #endif
 
 
 
-extern control_block_t _soxr_rate32s_cb, _soxr_rate32_cb, _soxr_rate64_cb, _soxr_vr32_cb;
+#if WITH_CR32S
+  static bool cpu_has_simd32(void)
+  {
+  #if defined __x86_64__ || defined _M_X64
+    return true;
+  #elif defined __i386__ || defined _M_IX86
+    enum {SSE = 1 << 25, SSE2 = 1 << 26};
+    unsigned eax_, ebx_, ecx_, edx_;
+    CPUID(1, eax_, ebx_, ecx_, edx_);
+    return (edx_ & (SSE|SSE2)) != 0;
+  #elif defined AV_CPU_FLAG_NEON
+    return !!(av_get_cpu_flags() & AV_CPU_FLAG_NEON);
+  #else
+    return false;
+  #endif
+  }
+
+  static bool should_use_simd32(void)
+  {
+    char const * e = getenv("SOXR_USE_SIMD32");
+    return e? !!atoi(e) : cpu_has_simd32();
+  }
+#endif
+
+
+
+#if WITH_CR64S
+  #if defined __GNUC__
+    #define XGETBV(type, eax_, edx_) \
+      __asm__ __volatile__ ( \
+        ".byte 0x0f, 0x01, 0xd0\n" \
+        : "=a"(eax_), "=d"(edx_) : "c" (type));
+  #elif defined _M_X64 && defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219
+    #include <immintrin.h>
+    #define XGETBV(type, eax_, edx_) do { \
+      union {uint64_t x; uint32_t y[2];} a = {_xgetbv(0)}; \
+      eax_ = a.y[0], edx_ = a.y[1]; \
+     } while(0)
+  #elif defined _M_IX86 && defined _MSC_VER
+    #define XGETBV(type, eax_, edx_) \
+      __asm pushad \
+      __asm mov ecx, type \
+      __asm _emit 0x0f \
+      __asm _emit 0x01 \
+      __asm _emit 0xd0 \
+      __asm mov eax_, eax \
+      __asm mov edx_, edx \
+      __asm popad
+  #else
+    #define XGETBV(type, eax_, edx_) eax_ = edx_ = 0
+  #endif
+
+  static bool cpu_has_simd64(void)
+  {
+    enum {OSXSAVE = 1 << 27, AVX = 1 << 28};
+    unsigned eax_, ebx_, ecx_, edx_;
+    CPUID(1, eax_, ebx_, ecx_, edx_);
+    if ((ecx_ & (OSXSAVE|AVX)) == (OSXSAVE|AVX)) {
+      XGETBV(0, eax_, edx_);
+      return (eax_ & 6) == 6;
+    }
+    return false;
+  }
+
+  static bool should_use_simd64(void)
+  {
+    char const * e = getenv("SOXR_USE_SIMD64");
+    return e? !!atoi(e) : cpu_has_simd64();
+  }
+#endif
+
+
+
+extern control_block_t
+  _soxr_rate32_cb,
+  _soxr_rate32s_cb,
+  _soxr_rate64_cb,
+  _soxr_rate64s_cb,
+  _soxr_vr32_cb;
 
 
 
@@ -280,6 +279,11 @@ soxr_t soxr_create(
   soxr_t p = 0;
   soxr_error_t error = 0;
 
+#if WITH_DEV_TRACE
+  char const * e = getenv("SOXR_TRACE");
+  _soxr_trace_level = e? atoi(e) : 0;
+#endif
+
   if (q_spec && q_spec->e)  error = q_spec->e;
   else if (io_spec && (io_spec->itype | io_spec->otype) >= SOXR_SPLIT * 2)
     error = "invalid io datatype(s)";
@@ -319,27 +323,39 @@ soxr_t soxr_create(
 
     p->seed = (unsigned long)time(0) ^ (unsigned long)(size_t)p;
 
-#if WITH_SINGLE_PRECISION
-    if (!WITH_DOUBLE_PRECISION || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION))
-        || (p->q_spec.flags & SOXR_VR)) {
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
+    if (0
+#if WITH_VR32
+        || ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))
+#endif
+#if WITH_CR32 || WITH_CR32S
+        || !(WITH_CR64 || WITH_CR64S) || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION))
+#endif
+        ) {
       p->deinterleave = (deinterleave_t)_soxr_deinterleave_f;
       p->interleave = (interleave_t)_soxr_interleave_f;
       memcpy(&p->control_block,
-          (p->q_spec.flags & SOXR_VR)? &_soxr_vr32_cb :
-#if SIMD_FOUND
-          should_use_simd()? &_soxr_rate32s_cb :
+#if WITH_VR32
+          ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))? &_soxr_vr32_cb :
+#endif
+#if WITH_CR32S
+          !WITH_CR32 || should_use_simd32()? &_soxr_rate32s_cb :
 #endif
           &_soxr_rate32_cb, sizeof(p->control_block));
     }
-#if WITH_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
     else
 #endif
 #endif
-#if WITH_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
     {
       p->deinterleave = (deinterleave_t)_soxr_deinterleave;
       p->interleave = (interleave_t)_soxr_interleave;
-      memcpy(&p->control_block, &_soxr_rate64_cb, sizeof(p->control_block));
+      memcpy(&p->control_block,
+#if WITH_CR64S
+          !WITH_CR64 || should_use_simd64()? &_soxr_rate64s_cb :
+#endif
+          &_soxr_rate64_cb, sizeof(p->control_block));
     }
 #endif
 
diff --git a/src/soxr.h b/src/soxr.h
index 56132cb..640b698 100644
--- a/src/soxr.h
+++ b/src/soxr.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
  *
  * This library is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published by
@@ -269,9 +269,6 @@ struct soxr_runtime_spec {                                       /* Typically */
 #define SOXR_COEF_INTERP_LOW   2u    /* Man. select: less CPU, more memory. */
 #define SOXR_COEF_INTERP_HIGH  3u    /* Man. select: more CPU, less memory. */
 
-#define SOXR_STRICT_BUFFERING  4u  /* Reserved for future use. */
-#define SOXR_NOSMALLINTOPT     8u  /* For test purposes only. */
-
 
 
 /* -------------------------- API type constructors ------------------------- */
@@ -296,7 +293,7 @@ SOXR soxr_quality_spec_t soxr_quality_spec(
 #define SOXR_24_BITQ            5
 #define SOXR_28_BITQ            6
 #define SOXR_32_BITQ            7
-                                    /* Libsamplerate equivalent qualities: */
+                                    /* For internal use only; to be removed: */
 #define SOXR_LSR0Q              8     /* 'Best sinc'. */
 #define SOXR_LSR1Q              9     /* 'Medium sinc'. */
 #define SOXR_LSR2Q              10    /* 'Fast sinc'. */
@@ -304,8 +301,8 @@ SOXR soxr_quality_spec_t soxr_quality_spec(
 #define SOXR_LINEAR_PHASE       0x00
 #define SOXR_INTERMEDIATE_PHASE 0x10
 #define SOXR_MINIMUM_PHASE      0x30
+
 #define SOXR_STEEP_FILTER       0x40
-#define SOXR_ALLOW_ALIASING     0x80  /* Reserved for future use. */
 
 
 
diff --git a/src/vr32.c b/src/vr32.c
index 9ad17d1..8b1a259 100644
--- a/src/vr32.c
+++ b/src/vr32.c
@@ -1,16 +1,10 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
 /* Variable-rate resampling. */
 
 #include <assert.h>
-#include <math.h>
-#if !defined M_PI
-#define M_PI    3.14159265358979323846
-#endif
-#if !defined M_LN2
-#define M_LN2   0.69314718055994530942
-#endif
+#include "math-wrap.h"
 #include <string.h>
 #include <stdlib.h>
 #include "internal.h"
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0eba7e0..3c28f9c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,7 +10,10 @@ foreach (fe ${SOURCES})
   add_executable (${f} ${fe})
 endforeach ()
 
-enable_testing ()
+# Can't use c89 for this file (NB: APPEND_STRING needs the leading space):
+if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+  set_property (SOURCE throughput.c APPEND_STRING PROPERTY COMPILE_FLAGS " -std=gnu89")
+endif ()
 
 set (sweep_to_freq 22050)
 set (leader 1)
@@ -20,33 +23,36 @@ math (EXPR base_rate "${sweep_to_freq} + ${sweep_to_freq}")
 macro (add_vector r)
   set (output ${CMAKE_CURRENT_BINARY_DIR}/ref-${r}.s32)
   add_custom_command (OUTPUT ${output} DEPENDS vector-gen ${CMAKE_CURRENT_LIST_FILE}
-    COMMAND vector-gen ${r} ${leader} ${len} ${sweep_to_freq} 1 ${output})
+    COMMAND vector-gen ${r} ${leader} ${len} 0 ${sweep_to_freq} 1 ${output})
   set (vectors ${output} ${vectors})
 endmacro ()
 
-macro (add_cmp_test from to bits)
-  set (name ${bits}-bit-perfect-${from}-${to})
-  add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN} -DEXAMPLES_BIN=${EXAMPLES_BIN} -Dleader=${leader} -Dto=${to}
-    -Dfrom=${from} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake)
-  add_vector (${from})
-  add_vector (${to})
+macro (add_cmp_test irate orate bits)
+  set (name ${bits}-bit-perfect-${irate}-${orate})
+  add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN}
+    -DEXAMPLES_BIN=${EXAMPLES_BIN} -DlenToSkip=${leader} -Dorate=${orate}
+    -Dirate=${irate} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake)
+  add_vector (${irate})
+  add_vector (${orate})
 endmacro ()
 
 unset (test_bits)
-if (WITH_SINGLE_PRECISION)
+if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
   set (test_bits 20)
 endif ()
-if (WITH_DOUBLE_PRECISION)
-  set (test_bits ${test_bits} 24)
+if (WITH_CR64 OR WITH_CR64S)
+  set (test_bits ${test_bits} 28)
 endif ()
 
 foreach (b ${test_bits})
-  foreach (r 96000 65537)
+  foreach (r 192000 65537)
     add_cmp_test (${base_rate} ${r} ${b})
     add_cmp_test (${r} ${base_rate} ${b})
   endforeach ()
 endforeach ()
 
-add_custom_target (test-vectors ALL DEPENDS ${vectors})
+if (NOT CMAKE_CROSSCOMPILING)
+  add_custom_target (test-vectors ALL DEPENDS ${vectors})
+endif ()
 
 add_test (1-delay-clear ${BIN}1-delay-clear)
diff --git a/tests/cmp-test.cmake b/tests/cmp-test.cmake
index 8db76c5..a836322 100644
--- a/tests/cmp-test.cmake
+++ b/tests/cmp-test.cmake
@@ -1,17 +1,13 @@
 # SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
 # Licence for this file: LGPL v2.1                  See LICENCE for details.
 
-if (${bits} STREQUAL 24)
-  set (quality 45)
-else ()
-  set (quality 44)
-endif ()
+math (EXPR quality "43 + (${bits} - 13) / 4")
+set (ofile ${irate}-${orate}-${quality}.s32)
+#message (STATUS "Output file = [${ofile}]")
 
-set (output ${from}-${to}-${quality}.s32)
-
-execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${from} ${to} 1 2 2 ${quality} a
-  INPUT_FILE ref-${from}.s32
-  OUTPUT_FILE ${output}
+execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${irate} ${orate} 1 2 2 ${quality} a
+  INPUT_FILE ref-${irate}.s32
+  OUTPUT_FILE ${ofile}
   ERROR_VARIABLE test_error
   RESULT_VARIABLE test_result)
 
@@ -19,7 +15,11 @@ if (test_result)
   message (FATAL_ERROR "Resampling failure: ${test_error}")
 endif ()
 
-execute_process(COMMAND ${BIN}vector-cmp ref-${to}.s32 ${output} ${to} ${leader} ${len} ${bits} 98
+set (percentageToCheck 98)
+math (EXPR lenToCheck "${len} * ${percentageToCheck}")
+string (REGEX REPLACE "(..)$" ".\\1" lenToCheck "${lenToCheck}") # Divide by 100
+
+execute_process(COMMAND ${BIN}vector-cmp ref-${orate}.s32 ${ofile} ${orate} ${lenToSkip} ${lenToCheck} ${bits}
   OUTPUT_VARIABLE test_output
   RESULT_VARIABLE test_result)
 
diff --git a/tests/io-test b/tests/io-test
index 4205c71..608bc9a 100755
--- a/tests/io-test
+++ b/tests/io-test
@@ -35,7 +35,7 @@ test z$1 != z && j=$1 || j=1
 
 for c in `seq 1 $j`; do
   for n in `seq 0 3`; do
-    sox -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1
+    sox -R -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1
   done
 
   n=0
diff --git a/tests/large-ratio-test b/tests/large-ratio-test
index 74cd0a4..540c5df 100755
--- a/tests/large-ratio-test
+++ b/tests/large-ratio-test
@@ -1,22 +1,22 @@
 #!/usr/bin/env bash
 set -e
 
-# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
 # Licence for this file: LGPL v2.1                  See LICENCE for details.
 
 # Tests interpolating then decimating by the same, large ratio.
 
 tool=../examples/3-options-input-fn
 w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
-q=6
-ratio=2e4
-srate=8000
-nrate=`expr $srate / 2`
+q=4
+test x$1 = x && ratio=1e5 || ratio=$1
+test x$2 = x && rate=8000 || rate=$2
 
-../tests/vector-gen $srate 0 8 $nrate .9375 1.s32
+sox -r$rate -n 1.s32 synth 10 sin 0:`expr $rate / 2` vol .9375
+sync
 
-$tool 1 $ratio 1 2 1 $q < 1.s32 | $tool $ratio 1 1 1 2 $q > 2.s32
+time { $tool 1 $ratio 1 2 1 $q a < 1.s32 | $tool $ratio 1 1 1 2 $q a > 2.s32;}
 
-sox -M -r $srate -c1 1.s32 -r $srate -c1 2.s32 -n spectrogram -hw$w -Z-10 -z180 -o lr-$w.png -c "large-ratio-test q:$q ratio:$ratio"
+sox -mv-1 -r$rate -c1 1.s32 -r$rate -c1 2.s32 -n spectrogram -hw$w -z150 -o lr-$w.png -c "large-ratio-test q:$q ratio:$ratio"
 
 rm [12].s32
diff --git a/tests/throughput-test b/tests/throughput-test
new file mode 100755
index 0000000..544c620
--- /dev/null
+++ b/tests/throughput-test
@@ -0,0 +1,4 @@
+#!/bin/sh
+set -e
+
+for n in `seq 0 3`; do ./throughput 44.1 48 1 0 $n; done
diff --git a/tests/throughput-test.bat b/tests/throughput-test.bat
new file mode 100644
index 0000000..a183881
--- /dev/null
+++ b/tests/throughput-test.bat
@@ -0,0 +1 @@
+for /L %%i in (0,1,3) DO throughput 44.1 48 1 0 %%i
diff --git a/tests/throughput.c b/tests/throughput.c
new file mode 100644
index 0000000..21a4c32
--- /dev/null
+++ b/tests/throughput.c
@@ -0,0 +1,111 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <soxr.h>
+#include "rint.h"
+#include "../examples/examples-common.h"
+
+#define k 1000
+#if defined _WIN32
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+  #define timerDecl LARGE_INTEGER start, stop, tmp
+  #define timerStart(msecs) QueryPerformanceCounter(&start), \
+      QueryPerformanceFrequency(&tmp), \
+      stop.QuadPart = (msecs * tmp.QuadPart + k/2) / k
+  #define timerRunning() (QueryPerformanceCounter(&tmp), \
+      (tmp.QuadPart-start.QuadPart < stop.QuadPart))
+#else
+  #include <time.h>
+  #define timerDecl struct timespec stop, tmp
+  #define timerStart(msecs) clock_gettime(CLOCK_MONOTONIC, &stop), \
+      stop.tv_nsec += (msecs%k)*(k*k), \
+      stop.tv_sec  += msecs/k + stop.tv_nsec/(k*k*k), \
+      stop.tv_nsec %= k*k*k
+  #define timerRunning() (clock_gettime(CLOCK_MONOTONIC, &tmp), tmp.tv_sec < \
+      stop.tv_sec || (tmp.tv_sec == stop.tv_sec && tmp.tv_nsec < stop.tv_nsec))
+#endif
+
+int main(int n, char const * arg[])
+{
+  char const *     const arg0 = n? --n, *arg++ : "", * engine = "";
+  double          const irate = n? --n, atof(*arg++) : 96000.;
+  double          const orate = n? --n, atof(*arg++) : 44100.;
+  unsigned        const chans = n? --n, (unsigned)atoi(*arg++) : 1;
+  soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned        const ospec = n? --n, (unsigned)atoi(*arg++) : 0;
+  unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
+  unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+  double   const passband_end = n? --n, atof(*arg++) : 0;
+  double const stopband_begin = n? --n, atof(*arg++) : 0;
+  double const phase_response = n? --n, atof(*arg++) : -1;
+  int       const use_threads = n? --n, atoi(*arg++) : 1;
+  soxr_datatype_t const otype = ospec & 3;
+
+  soxr_quality_spec_t       q_spec = soxr_quality_spec(q_recipe, q_flags);
+  soxr_io_spec_t            io_spec = soxr_io_spec(itype, otype);
+  soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
+
+  /* Allocate resampling input and output buffers in proportion to the input
+   * and output rates: */
+  #define buf_total_len 15000  /* In samples per channel. */
+  size_t const osize = soxr_datatype_size(otype) * chans;
+  size_t const isize = soxr_datatype_size(itype) * chans;
+  size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+  size_t const olen = min(max(olen0, 1), buf_total_len - 1);
+  size_t const ilen = buf_total_len - olen;
+  void * const obuf = malloc(osize * olen);
+  void * const ibuf = malloc(isize * ilen);
+
+  size_t odone = 0, clips = 0, omax = 0, i;
+  soxr_error_t error;
+  soxr_t soxr;
+
+
+  /* Overrides (if given): */
+  if (passband_end   > 0) q_spec.passband_end   = passband_end / 100;
+  if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+  if (phase_response >=0) q_spec.phase_response = phase_response;
+  io_spec.flags = ospec & ~7u;
+
+  /* Create a stream resampler: */
+  soxr = soxr_create(
+      irate, orate, chans,         /* Input rate, output rate, # of channels. */
+      &error,                         /* To report any error during creation. */
+      &io_spec, &q_spec, &runtime_spec);
+
+  if (!error) {                         /* If all is well, run the resampler: */
+    engine = soxr_engine(soxr);
+#define RAND ((rand()*(1./RAND_MAX)-.5)*1)
+    switch (itype & 3) {
+      case 0: for (i=0;i<ilen*chans; ((float   *)ibuf)[i]=(float  )RAND, ++i); break;
+      case 1: for (i=0;i<ilen*chans; ((double  *)ibuf)[i]=(double )RAND, ++i); break;
+      case 2: for (i=0;i<ilen*chans; ((int32_t *)ibuf)[i]=rint32(65536.*32768*RAND), ++i); break;
+      case 3: for (i=0;i<ilen*chans; ((int16_t *)ibuf)[i]=rint16(    1.*32768*RAND), ++i); break;
+    }
+                                                       /* Resample in blocks: */
+    for (i=0; i<8; ++i) {            /* 8 timed runs; omax keeps the best. */
+      size_t itotal = 0, ototal = 0;
+      timerDecl;
+#define MSECS 125
+      timerStart(MSECS);
+      do {
+        size_t const ilen1 = odone < olen? ilen : 0;
+        error = soxr_process(soxr, ibuf, ilen1, NULL, obuf, olen, &odone);
+        itotal += ilen1;
+        ototal += odone;
+      } while (!error && timerRunning());
+      omax = max(omax, ototal);
+    }
+  }
+                                                                  /* Tidy up: */
+  if (soxr) clips = *soxr_num_clips(soxr);  /* Skip if there is no resampler. */
+  soxr_delete(soxr);
+  free(obuf), free(ibuf);
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; %lu clips; I/O: %s (%s) %.2fMs/s\n",
+      arg0, soxr_strerror(error), (long unsigned)clips,
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error", engine,
+      1e-6*k/MSECS*chans*(double)omax);
+  return !!error;
+}
diff --git a/tests/vector-cmp.c b/tests/vector-cmp.c
index 6edd2d5..f90cc7f 100644
--- a/tests/vector-cmp.c
+++ b/tests/vector-cmp.c
@@ -1,53 +1,56 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
 /* Utility used to help test the library; not for general consumption.
  *
- * Compare two swept-sine files.  */
+ * Measure the peak bit difference between two files.  */
 
 #include <stdlib.h>
 #include <stdio.h>
-#include <math.h>
 #include "../src/rint.h"
+#include "../examples/examples-common.h"
 
-int main(int bit, char const * arg[])
+#define TYPE 0 /* As vector-gen */
+
+#if TYPE
+  #define sample_t double
+  #define N 50
+  #define DIFF(s1,s2) abs(rint32((s1-s2)*ldexp(1,N-1)))
+#else
+  #define sample_t int32_t
+  #define N 32
+  #define DIFF(s1,s2) abs((int)(s1-s2))
+#endif
+
+int main(int argc, char const * arg[])
 {
-  FILE    * f1       = fopen(arg[1], "rb"),
-          * f2       = fopen(arg[2], "rb");
-  double  rate       = atof (arg[3]), /* Rate for this vector */
-          leader_len = atof (arg[4]), /* Leader length in seconds */
-          len        = atof (arg[5]), /* Sweep length (excl. leader_len) */
-          expect_bits= atof (arg[6]),
-          expect_bw  = atof (arg[7]);
+  int     two      = !!arg[2][0];
+  FILE    * f1 = fopen(arg[1], "rb"), * f2 = two? fopen(arg[2], "rb") : 0;
+  double  rate     = atof (arg[3]), /* Sample-rate */
+          skip_len = atof (arg[4]), /* Skip length in seconds */
+          len      = atof (arg[5]), /* Compare length in seconds */ r;
+  int i = 0, count = rint32(rate * len), max = 0, diff;
+  sample_t s1, s2;
 
-  int32_t s1, s2;
-  long count = 0;
-  static long thresh[32];
-  double bw, prev = 0;
-
-  for (; fread(&s1, sizeof(s1), 1, f1) == 1 &&
-         fread(&s2, sizeof(s2), 1, f2) == 1; ++count) {
-    long diff = abs((int)(s1 - s2));
-    for (bit = 0; diff && bit < 32; bit++, diff >>= 1)
-      if ((diff & 1) && !thresh[bit])
-        thresh[bit] = count + 1;
-  }
-
-  if (count != (long)((leader_len + len) * rate + .5)) {
-    printf("incorrect file length\n");
-    exit(1);
-  }
-
-  for (bit = 0; bit < 32; ++bit) {
-    bw = ((double)thresh[bit] - 1) / rate - leader_len;
-    if (bit && bw >= 0 && (bw - prev) * 100 / len < .08) {
-      --bit;
-      break;
+  fseek(f1, rint32(rate * skip_len) * (int)sizeof(s1), SEEK_CUR);
+  if (two) {
+    fseek(f2, rint32(rate * skip_len) * (int)sizeof(s2), SEEK_CUR);
+    for (; i < count &&
+        fread(&s1, sizeof(s1), 1, f1) &&
+        fread(&s2, sizeof(s2), 1, f2); ++i) {
+      diff = DIFF(s1, s2);
+      max = max(max, diff);
     }
-    prev = bw;
   }
-  bit = 32 - bit;
-  bw = bw * 100 / len;
-  printf("Bit perfect to %i bits, from DC to %.2f%% nyquist.\n", bit, bw);
-  return !(bit >= expect_bits && bw >= expect_bw);
+  else for (; i < count && fread(&s1, sizeof(s1), 1, f1); ++i) {
+    diff = DIFF(s1, 0);
+    max = max(max, diff);
+  }
+
+  if (i != count) {
+    fprintf(stderr, "incorrect file length\n");
+    return 1;
+  }
+  printf("%f\n", r = N-log(max)/log(2));
+  return argc>6? r<atof(arg[6]) : 0;
 }
diff --git a/tests/vector-gen.c b/tests/vector-gen.c
index f3920d4..0446ec9 100644
--- a/tests/vector-gen.c
+++ b/tests/vector-gen.c
@@ -1,61 +1,61 @@
-/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
  * Licence for this file: LGPL v2.1                  See LICENCE for details. */
 
 /* Utility used to help test the library; not for general consumption.
  *
- * Generate a swept sine to a file, with faded `lead-in' section.  */
+ * Generate a swept sine to a file, with `lead-in' section.  */
 
-#define QUAD 0
+#define TYPE 0 /* calc/store: 0:flt64/int32 1:flt80/flt64 2:flt128/flt64 */
 
-#if QUAD
+#if TYPE > 1
   #include <quadmath.h>
 #endif
 
-#include <math.h>
+#include "math-wrap.h"
 #include <stdlib.h>
 #include <stdio.h>
-#if !defined M_PI
-  #define M_PI 3.14159265358979323846
-#endif
 
-#if QUAD
-  #define modf modfq
-  #define cos cosq
-  #define sin sinq
-  #undef M_PI
-  #define M_PI M_PIq
-  #define real __float128
-  #define atof(x) strtoflt128(x, 0)
+#if TYPE
+  #if TYPE > 1
+    #define modf modfq
+    #define cos cosq
+    #define sin sinq
+    #define PI M_PIq
+    #define real __float128
+    #define atof(x) strtoflt128(x, 0)
+  #else
+    #define modf modfl
+    #define cos cosl
+    #define sin sinl
+    #define PI M_PIl
+    #define real long double
+  #endif
+  #define MULT 1
+  #define OUT(d) double output = d
 #else
+  #define PI M_PI
   #define real double
   #include "rint.h"
+  #define MULT (32768. * 65536 - 1/scale)
+  #define OUT(d) int32_t output = rint32(d)
 #endif
 
-int main(int i, char const * argv[])
+int main(int argc, char const * argv[])
 {
-  real rate           = atof(argv[1]), /* Rate for this vector */
-       lead_in_len    = atof(argv[2]), /* Lead-in length in seconds */
-       len            = atof(argv[3]), /* Sweep length (excl. lead_in_len) */
-       sweep_to_freq  = atof(argv[4]), /* Sweep from DC to this freq. */
-       multiplier     = atof(argv[5]), /* For headroom */
-       f1 = -sweep_to_freq / len * lead_in_len, f2 = sweep_to_freq,
-       n1 = rate * -lead_in_len, n2 = rate * len,
-       m = (f2 - f1) / (n2 - n1) / 2, dummy;
-  FILE * file = fopen(argv[6], "wb");
-  i = (int)n1;
-  if (!file || i != n1)
-    exit(1);
-  for (; i < (int)(n2 + .5); ++i) {
-    double d1 = multiplier * sin(2 * M_PI * modf(i * m * i / rate, &dummy));
-    double d = i < 0? d1 * (1 - cos(M_PI * (i + n1) / n1)) * .5 : d1;
-#if QUAD
-    size_t actual = fwrite(&d, sizeof(d), 1, file);
-#else
-    int32_t out = rint32(d * (32768. * 65536 - 1));
-    size_t actual = fwrite(&out, sizeof(out), 1, file);
-#endif
-    if (actual != 1)
-      return 1;
+  real rate         = atof(argv[1]), /* Rate for this vector */
+       lead_in_len  = atof(argv[2]), /* Lead-in length in seconds */
+       len          = atof(argv[3]), /* Sweep length (excl. lead_in_len) */
+       f1           = atof(argv[4]),
+       f2           = atof(argv[5]),
+       scale        = atof(argv[6]), /* For headroom */
+       n1 = rate * -lead_in_len,
+       m = (f2 - f1) / (rate * len * 2), dummy;
+  FILE * file = fopen(argv[7], "wb");
+  int i = (int)n1, err = !file || i != n1;
+  for (; !err && i < (int)(rate*(len+lead_in_len)+.5); ++i) {
+    real d = sin(2 * PI * modf((f1 + i * m) * i / rate, &dummy));
+    OUT((double)(scale * MULT * d));
+    err = fwrite(&output, sizeof(output), 1, file) != 1;
   }
-  return 0;
+  return err |!argc;
 }