From 5e9699704bc4327eb2523276cce6f9c90ed618db Mon Sep 17 00:00:00 2001
From: "zone117x@gmail.com" <root@cryppit.com>
Date: Sun, 30 Mar 2014 04:04:48 -0400
Subject: [PATCH] Init

---
 README.md                                  |   32 +
 bcrypt.c                                   |  566 ++++
 bcrypt.h                                   |   14 +
 binding.gyp                                |   29 +
 build/Makefile                             |  332 +++
 build/binding.Makefile                     |    6 +
 build/config.gypi                          |   38 +
 build/multihashing.target.mk               |  158 +
 index.js                                   |    1 +
 keccak.c                                   |   18 +
 keccak.h                                   |   14 +
 multihashing.cc                            |  260 ++
 package.json                               |   30 +
 quark.c                                    |  211 ++
 quark.h                                    |    6 +
 scrypt-jane/scrypt-jane-chacha.h           |  132 +
 scrypt-jane/scrypt-jane-hash.h             |   48 +
 scrypt-jane/scrypt-jane-hash_keccak.h      |  168 ++
 scrypt-jane/scrypt-jane-hash_sha256.h      |  135 +
 scrypt-jane/scrypt-jane-mix_chacha-avx.h   |  340 +++
 scrypt-jane/scrypt-jane-mix_chacha-sse2.h  |  371 +++
 scrypt-jane/scrypt-jane-mix_chacha-ssse3.h |  348 +++
 scrypt-jane/scrypt-jane-mix_chacha.h       |   69 +
 scrypt-jane/scrypt-jane-mix_salsa-avx.h    |  381 +++
 scrypt-jane/scrypt-jane-mix_salsa-sse2.h   |  443 +++
 scrypt-jane/scrypt-jane-mix_salsa.h        |   70 +
 scrypt-jane/scrypt-jane-pbkdf2.h           |  112 +
 scrypt-jane/scrypt-jane-portable-x86.h     |  364 +++
 scrypt-jane/scrypt-jane-portable.h         |  281 ++
 scrypt-jane/scrypt-jane-romix-basic.h      |   67 +
 scrypt-jane/scrypt-jane-romix-template.h   |  118 +
 scrypt-jane/scrypt-jane-romix.h            |   27 +
 scrypt-jane/scrypt-jane-salsa.h            |  106 +
 scrypt-jane/scrypt-jane-test-vectors.h     |  261 ++
 scrypt.c                                   |  686 +++++
 scrypt.h                                   |    8 +
 scryptjane.c                               |  182 ++
 scryptjane.h                               |   32 +
 scryptn.c                                  |  687 +++++
 scryptn.h                                  |   16 +
 sha3/aes_helper.c                          |  392 +++
 sha3/blake.c                               | 1120 +++++++
 sha3/bmw.c                                 |  965 ++++++
 sha3/cubehash.c                            |  723 +++++
 sha3/echo.c                                | 1031 +++++++
 sha3/groestl.c                             | 3123 ++++++++++++++++++++
 sha3/jh.c                                  | 1116 +++++++
 sha3/keccak.c                              | 1824 ++++++++++++
 sha3/luffa.c                               | 1426 +++++++++
 sha3/shavite.c                             | 1764 +++++++++++
 sha3/simd.c                                | 1799 +++++++++++
 sha3/skein.c                               | 1254 ++++++++
 sha3/sph_blake.h                           |  327 ++
 sha3/sph_bmw.h                             |  328 ++
 sha3/sph_cubehash.h                        |  292 ++
 sha3/sph_echo.h                            |  320 ++
 sha3/sph_groestl.h                         |  329 +++
 sha3/sph_jh.h                              |  298 ++
 sha3/sph_keccak.h                          |  293 ++
 sha3/sph_luffa.h                           |  296 ++
 sha3/sph_shavite.h                         |  314 ++
 sha3/sph_simd.h                            |  309 ++
 sha3/sph_skein.h                           |  298 ++
 sha3/sph_types.h                           | 1976 +++++++++++++
 skein.c                                    |   18 +
 skein.h                                    |   14 +
 stdint.h                                   |  259 ++
 x11.c                                      |   85 +
 x11.h                                      |   14 +
 xcoin.c                                    |    0
 70 files changed, 29474 insertions(+)
 create mode 100644 README.md
 create mode 100644 bcrypt.c
 create mode 100644 bcrypt.h
 create mode 100644 binding.gyp
 create mode 100644 build/Makefile
 create mode 100644 build/binding.Makefile
 create mode 100644 build/config.gypi
 create mode 100644 build/multihashing.target.mk
 create mode 100644 index.js
 create mode 100644 keccak.c
 create mode 100644 keccak.h
 create mode 100644 multihashing.cc
 create mode 100644 package.json
 create mode 100644 quark.c
 create mode 100644 quark.h
 create mode 100644 scrypt-jane/scrypt-jane-chacha.h
 create mode 100644 scrypt-jane/scrypt-jane-hash.h
 create mode 100644 scrypt-jane/scrypt-jane-hash_keccak.h
 create mode 100644 scrypt-jane/scrypt-jane-hash_sha256.h
 create mode 100644 scrypt-jane/scrypt-jane-mix_chacha-avx.h
 create mode 100644 scrypt-jane/scrypt-jane-mix_chacha-sse2.h
 create mode 100644 scrypt-jane/scrypt-jane-mix_chacha-ssse3.h
 create mode 100644 scrypt-jane/scrypt-jane-mix_chacha.h
 create mode 100644 scrypt-jane/scrypt-jane-mix_salsa-avx.h
 create mode 100644 scrypt-jane/scrypt-jane-mix_salsa-sse2.h
 create mode 100644 scrypt-jane/scrypt-jane-mix_salsa.h
 create mode 100644 scrypt-jane/scrypt-jane-pbkdf2.h
 create mode 100644 scrypt-jane/scrypt-jane-portable-x86.h
 create mode 100644 scrypt-jane/scrypt-jane-portable.h
 create mode 100644 scrypt-jane/scrypt-jane-romix-basic.h
 create mode 100644 scrypt-jane/scrypt-jane-romix-template.h
 create mode 100644 scrypt-jane/scrypt-jane-romix.h
 create mode 100644 scrypt-jane/scrypt-jane-salsa.h
 create mode 100644 scrypt-jane/scrypt-jane-test-vectors.h
 create mode 100644 scrypt.c
 create mode 100644 scrypt.h
 create mode 100644 scryptjane.c
 create mode 100644 scryptjane.h
 create mode 100644 scryptn.c
 create mode 100644 scryptn.h
 create mode 100644 sha3/aes_helper.c
 create mode 100644 sha3/blake.c
 create mode 100644 sha3/bmw.c
 create mode 100644 sha3/cubehash.c
 create mode 100644 sha3/echo.c
 create mode 100644 sha3/groestl.c
 create mode 100644 sha3/jh.c
 create mode 100644 sha3/keccak.c
 create mode 100644 sha3/luffa.c
 create mode 100644 sha3/shavite.c
 create mode 100644 sha3/simd.c
 create mode 100644 sha3/skein.c
 create mode 100644 sha3/sph_blake.h
 create mode 100644 sha3/sph_bmw.h
 create mode 100644 sha3/sph_cubehash.h
 create mode 100644 sha3/sph_echo.h
 create mode 100644 sha3/sph_groestl.h
 create mode 100644 sha3/sph_jh.h
 create mode 100644 sha3/sph_keccak.h
 create mode 100644 sha3/sph_luffa.h
 create mode 100644 sha3/sph_shavite.h
 create mode 100644 sha3/sph_simd.h
 create mode 100644 sha3/sph_skein.h
 create mode 100644 sha3/sph_types.h
 create mode 100644 skein.c
 create mode 100644 skein.h
 create mode 100644 stdint.h
 create mode 100644 x11.c
 create mode 100644 x11.h
 create mode 100644 xcoin.c

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5abaed8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,32 @@
+node-multi-hashing
+===============
+
+Cryptocurrency hashing functions for node.js.
+
+Usage
+-----
+
+Install
+
+```bash
+npm install multi-hashing
+```
+
+
+Hash your data
+
+```javascript
+var multiHashing = require('multi-hashing');
+
+var data = new Buffer("hash me good bro");
+var hashed = multiHashing.x11(data); //returns a 32 byte buffer
+
+console.log(hashed);
+//<SlowBuffer 0b de 16 ef 2d 92 e4 35 65 c6 6c d8 92 d9 66 b4 3d 65 ..... >
+```
+
+Credits
+-------
+
+* Creators of the SHA2 and SHA3 hashing algorithms used here
+* X11 & Quark creators
\ No newline at end of file
diff --git a/bcrypt.c b/bcrypt.c
new file mode 100644
index 0000000..068bcb9
--- /dev/null
+++ b/bcrypt.c
@@ -0,0 +1,566 @@
+/*
+ * This code comes from John the Ripper password cracker, with reentrant
+ * and crypt(3) interfaces added, but optimizations specific to password
+ * cracking removed.
+ *
+ * Written by Solar Designer <solar at openwall.com> in 1998-2002 and
+ * placed in the public domain.
+ *
+ * There's absolutely no warranty.
+ *
+ * It is my intent that you should be able to use this on your system,
+ * as a part of a software package, or anywhere else to improve security,
+ * ensure compatibility, or for any other purpose.  I would appreciate
+ * it if you give credit where it is due and keep your modifications in
+ * the public domain as well, but I don't require that in order to let
+ * you place this code and any modifications you make under a license
+ * of your choice.
+ *
+ * This implementation is compatible with OpenBSD bcrypt.c (version 2a)
+ * by Niels Provos <provos at citi.umich.edu>, and uses some of his
+ * ideas.  The password hashing algorithm was designed by David Mazieres
+ * <dm at lcs.mit.edu>.
+ *
+ * There's a paper on the algorithm that explains its design decisions:
+ *
+ *	http://www.usenix.org/events/usenix99/provos.html
+ *
+ * Some of the tricks in BF_ROUND might be inspired by Eric Young's
+ * Blowfish library (I can't be sure if I would think of something if I
+ * hadn't seen his code).
+ */
+// Copyright (c) 2009-2015 The Tjcoin  developers
+/*
+ * Modified 2011/9/4 by Brendan Younger
+ * removed unneeded code, changed test for endianness, added wrappers for _crypt_blowfish_rn worker function
+ */
+
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "bcrypt.h"
+
+#ifndef __set_errno
+#define __set_errno(val) errno = (val)
+#endif
+
+typedef uint32_t BF_word;
+
+/* Number of Blowfish rounds, this is also hardcoded into a few places */
+#define BF_N 16
+
+typedef BF_word BF_key[BF_N + 2];
+
+typedef struct {
+	BF_word S[4][0x100]; /* four S-boxes of 256 words each */
+	BF_key P; /* P-array: BF_N + 2 subkeys */
+} BF_ctx;
+
+/*
+ * Magic IV for 64 Blowfish encryptions that we do at the end.
+ * The string is "OrpheanBeholderS" on big-endian.
+ */
+static BF_word BF_magic_w[4] = { //---------------:-)
+	0x4F727068, 0x65616E42,
+	0x65686F6C, 0x64657253
+};
+
+/*
+ * P-box and S-box tables initialized with digits of Pi.
+ */
+static BF_ctx BF_init_state = {
+	{
+		{
+			0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7,
+			0xb8e1afed, 0x6a267e96, 0xba7c9045, 0xf12c7f99,
+			0x24a19947, 0xb3916cf7, 0x0801f2e2, 0x858efc16,
+			0x636920d8, 0x71574e69, 0xa458fea3, 0xf4933d7e,
+			0x0d95748f, 0x728eb658, 0x718bcd58, 0x82154aee,
+			0x7b54a41d, 0xc25a59b5, 0x9c30d539, 0x2af26013,
+			0xc5d1b023, 0x286085f0, 0xca417918, 0xb8db38ef,
+			0x8e79dcb0, 0x603a180e, 0x6c9e0e8b, 0xb01e8a3e,
+			0xd71577c1, 0xbd314b27, 0x78af2fda, 0x55605c60,
+			0xe65525f3, 0xaa55ab94, 0x57489862, 0x63e81440,
+			0x55ca396a, 0x2aab10b6, 0xb4cc5c34, 0x1141e8ce,
+			0xa15486af, 0x7c72e993, 0xb3ee1411, 0x636fbc2a,
+			0x2ba9c55d, 0x741831f6, 0xce5c3e16, 0x9b87931e,
+			0xafd6ba33, 0x6c24cf5c, 0x7a325381, 0x28958677,
+			0x3b8f4898, 0x6b4bb9af, 0xc4bfe81b, 0x66282193,
+			0x61d809cc, 0xfb21a991, 0x487cac60, 0x5dec8032,
+			0xef845d5d, 0xe98575b1, 0xdc262302, 0xeb651b88,
+			0x23893e81, 0xd396acc5, 0x0f6d6ff3, 0x83f44239,
+			0x2e0b4482, 0xa4842004, 0x69c8f04a, 0x9e1f9b5e,
+			0x21c66842, 0xf6e96c9a, 0x670c9c61, 0xabd388f0,
+			0x6a51a0d2, 0xd8542f68, 0x960fa728, 0xab5133a3,
+			0x6eef0b6c, 0x137a3be4, 0xba3bf050, 0x7efb2a98,
+			0xa1f1651d, 0x39af0176, 0x66ca593e, 0x82430e88,
+			0x8cee8619, 0x456f9fb4, 0x7d84a5c3, 0x3b8b5ebe,
+			0xe06f75d8, 0x85c12073, 0x401a449f, 0x56c16aa6,
+			0x4ed3aa62, 0x363f7706, 0x1bfedf72, 0x429b023d,
+			0x37d0d724, 0xd00a1248, 0xdb0fead3, 0x49f1c09b,
+			0x075372c9, 0x80991b7b, 0x25d479d8, 0xf6e8def7,
+			0xe3fe501a, 0xb6794c3b, 0x976ce0bd, 0x04c006ba,
+			0xc1a94fb6, 0x409f60c4, 0x5e5c9ec2, 0x196a2463,
+			0x68fb6faf, 0x3e6c53b5, 0x1339b2eb, 0x3b52ec6f,
+			0x6dfc511f, 0x9b30952c, 0xcc814544, 0xaf5ebd09,
+			0xbee3d004, 0xde334afd, 0x660f2807, 0x192e4bb3,
+			0xc0cba857, 0x45c8740f, 0xd20b5f39, 0xb9d3fbdb,
+			0x5579c0bd, 0x1a60320a, 0xd6a100c6, 0x402c7279,
+			0x679f25fe, 0xfb1fa3cc, 0x8ea5e9f8, 0xdb3222f8,
+			0x3c7516df, 0xfd616b15, 0x2f501ec8, 0xad0552ab,
+			0x323db5fa, 0xfd238760, 0x53317b48, 0x3e00df82,
+			0x9e5c57bb, 0xca6f8ca0, 0x1a87562e, 0xdf1769db,
+			0xd542a8f6, 0x287effc3, 0xac6732c6, 0x8c4f5573,
+			0x695b27b0, 0xbbca58c8, 0xe1ffa35d, 0xb8f011a0,
+			0x10fa3d98, 0xfd2183b8, 0x4afcb56c, 0x2dd1d35b,
+			0x9a53e479, 0xb6f84565, 0xd28e49bc, 0x4bfb9790,
+			0xe1ddf2da, 0xa4cb7e33, 0x62fb1341, 0xcee4c6e8,
+			0xef20cada, 0x36774c01, 0xd07e9efe, 0x2bf11fb4,
+			0x95dbda4d, 0xae909198, 0xeaad8e71, 0x6b93d5a0,
+			0xd08ed1d0, 0xafc725e0, 0x8e3c5b2f, 0x8e7594b7,
+			0x8ff6e2fb, 0xf2122b64, 0x8888b812, 0x900df01c,
+			0x4fad5ea0, 0x688fc31c, 0xd1cff191, 0xb3a8c1ad,
+			0x2f2f2218, 0xbe0e1777, 0xea752dfe, 0x8b021fa1,
+			0xe5a0cc0f, 0xb56f74e8, 0x18acf3d6, 0xce89e299,
+			0xb4a84fe0, 0xfd13e0b7, 0x7cc43b81, 0xd2ada8d9,
+			0x165fa266, 0x80957705, 0x93cc7314, 0x211a1477,
+			0xe6ad2065, 0x77b5fa86, 0xc75442f5, 0xfb9d35cf,
+			0xebcdaf0c, 0x7b3e89a0, 0xd6411bd3, 0xae1e7e49,
+			0x00250e2d, 0x2071b35e, 0x226800bb, 0x57b8e0af,
+			0x2464369b, 0xf009b91e, 0x5563911d, 0x59dfa6aa,
+			0x78c14389, 0xd95a537f, 0x207d5ba2, 0x02e5b9c5,
+			0x83260376, 0x6295cfa9, 0x11c81968, 0x4e734a41,
+			0xb3472dca, 0x7b14a94a, 0x1b510052, 0x9a532915,
+			0xd60f573f, 0xbc9bc6e4, 0x2b60a476, 0x81e67400,
+			0x08ba6fb5, 0x571be91f, 0xf296ec6b, 0x2a0dd915,
+			0xb6636521, 0xe7b9f9b6, 0xff34052e, 0xc5855664,
+			0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a
+		}, {
+			0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623,
+			0xad6ea6b0, 0x49a7df7d, 0x9cee60b8, 0x8fedb266,
+			0xecaa8c71, 0x699a17ff, 0x5664526c, 0xc2b19ee1,
+			0x193602a5, 0x75094c29, 0xa0591340, 0xe4183a3e,
+			0x3f54989a, 0x5b429d65, 0x6b8fe4d6, 0x99f73fd6,
+			0xa1d29c07, 0xefe830f5, 0x4d2d38e6, 0xf0255dc1,
+			0x4cdd2086, 0x8470eb26, 0x6382e9c6, 0x021ecc5e,
+			0x09686b3f, 0x3ebaefc9, 0x3c971814, 0x6b6a70a1,
+			0x687f3584, 0x52a0e286, 0xb79c5305, 0xaa500737,
+			0x3e07841c, 0x7fdeae5c, 0x8e7d44ec, 0x5716f2b8,
+			0xb03ada37, 0xf0500c0d, 0xf01c1f04, 0x0200b3ff,
+			0xae0cf51a, 0x3cb574b2, 0x25837a58, 0xdc0921bd,
+			0xd19113f9, 0x7ca92ff6, 0x94324773, 0x22f54701,
+			0x3ae5e581, 0x37c2dadc, 0xc8b57634, 0x9af3dda7,
+			0xa9446146, 0x0fd0030e, 0xecc8c73e, 0xa4751e41,
+			0xe238cd99, 0x3bea0e2f, 0x3280bba1, 0x183eb331,
+			0x4e548b38, 0x4f6db908, 0x6f420d03, 0xf60a04bf,
+			0x2cb81290, 0x24977c79, 0x5679b072, 0xbcaf89af,
+			0xde9a771f, 0xd9930810, 0xb38bae12, 0xdccf3f2e,
+			0x5512721f, 0x2e6b7124, 0x501adde6, 0x9f84cd87,
+			0x7a584718, 0x7408da17, 0xbc9f9abc, 0xe94b7d8c,
+			0xec7aec3a, 0xdb851dfa, 0x63094366, 0xc464c3d2,
+			0xef1c1847, 0x3215d908, 0xdd433b37, 0x24c2ba16,
+			0x12a14d43, 0x2a65c451, 0x50940002, 0x133ae4dd,
+			0x71dff89e, 0x10314e55, 0x81ac77d6, 0x5f11199b,
+			0x043556f1, 0xd7a3c76b, 0x3c11183b, 0x5924a509,
+			0xf28fe6ed, 0x97f1fbfa, 0x9ebabf2c, 0x1e153c6e,
+			0x86e34570, 0xeae96fb1, 0x860e5e0a, 0x5a3e2ab3,
+			0x771fe71c, 0x4e3d06fa, 0x2965dcb9, 0x99e71d0f,
+			0x803e89d6, 0x5266c825, 0x2e4cc978, 0x9c10b36a,
+			0xc6150eba, 0x94e2ea78, 0xa5fc3c53, 0x1e0a2df4,
+			0xf2f74ea7, 0x361d2b3d, 0x1939260f, 0x19c27960,
+			0x5223a708, 0xf71312b6, 0xebadfe6e, 0xeac31f66,
+			0xe3bc4595, 0xa67bc883, 0xb17f37d1, 0x018cff28,
+			0xc332ddef, 0xbe6c5aa5, 0x65582185, 0x68ab9802,
+			0xeecea50f, 0xdb2f953b, 0x2aef7dad, 0x5b6e2f84,
+			0x1521b628, 0x29076170, 0xecdd4775, 0x619f1510,
+			0x13cca830, 0xeb61bd96, 0x0334fe1e, 0xaa0363cf,
+			0xb5735c90, 0x4c70a239, 0xd59e9e0b, 0xcbaade14,
+			0xeecc86bc, 0x60622ca7, 0x9cab5cab, 0xb2f3846e,
+			0x648b1eaf, 0x19bdf0ca, 0xa02369b9, 0x655abb50,
+			0x40685a32, 0x3c2ab4b3, 0x319ee9d5, 0xc021b8f7,
+			0x9b540b19, 0x875fa099, 0x95f7997e, 0x623d7da8,
+			0xf837889a, 0x97e32d77, 0x11ed935f, 0x16681281,
+			0x0e358829, 0xc7e61fd6, 0x96dedfa1, 0x7858ba99,
+			0x57f584a5, 0x1b227263, 0x9b83c3ff, 0x1ac24696,
+			0xcdb30aeb, 0x532e3054, 0x8fd948e4, 0x6dbc3128,
+			0x58ebf2ef, 0x34c6ffea, 0xfe28ed61, 0xee7c3c73,
+			0x5d4a14d9, 0xe864b7e3, 0x42105d14, 0x203e13e0,
+			0x45eee2b6, 0xa3aaabea, 0xdb6c4f15, 0xfacb4fd0,
+			0xc742f442, 0xef6abbb5, 0x654f3b1d, 0x41cd2105,
+			0xd81e799e, 0x86854dc7, 0xe44b476a, 0x3d816250,
+			0xcf62a1f2, 0x5b8d2646, 0xfc8883a0, 0xc1c7b6a3,
+			0x7f1524c3, 0x69cb7492, 0x47848a0b, 0x5692b285,
+			0x095bbf00, 0xad19489d, 0x1462b174, 0x23820e00,
+			0x58428d2a, 0x0c55f5ea, 0x1dadf43e, 0x233f7061,
+			0x3372f092, 0x8d937e41, 0xd65fecf1, 0x6c223bdb,
+			0x7cde3759, 0xcbee7460, 0x4085f2a7, 0xce77326e,
+			0xa6078084, 0x19f8509e, 0xe8efd855, 0x61d99735,
+			0xa969a7aa, 0xc50c06c2, 0x5a04abfc, 0x800bcadc,
+			0x9e447a2e, 0xc3453484, 0xfdd56705, 0x0e1e9ec9,
+			0xdb73dbd3, 0x105588cd, 0x675fda79, 0xe3674340,
+			0xc5c43465, 0x713e38d8, 0x3d28f89e, 0xf16dff20,
+			0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7
+		}, {
+			0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934,
+			0x411520f7, 0x7602d4f7, 0xbcf46b2e, 0xd4a20068,
+			0xd4082471, 0x3320f46a, 0x43b7d4b7, 0x500061af,
+			0x1e39f62e, 0x97244546, 0x14214f74, 0xbf8b8840,
+			0x4d95fc1d, 0x96b591af, 0x70f4ddd3, 0x66a02f45,
+			0xbfbc09ec, 0x03bd9785, 0x7fac6dd0, 0x31cb8504,
+			0x96eb27b3, 0x55fd3941, 0xda2547e6, 0xabca0a9a,
+			0x28507825, 0x530429f4, 0x0a2c86da, 0xe9b66dfb,
+			0x68dc1462, 0xd7486900, 0x680ec0a4, 0x27a18dee,
+			0x4f3ffea2, 0xe887ad8c, 0xb58ce006, 0x7af4d6b6,
+			0xaace1e7c, 0xd3375fec, 0xce78a399, 0x406b2a42,
+			0x20fe9e35, 0xd9f385b9, 0xee39d7ab, 0x3b124e8b,
+			0x1dc9faf7, 0x4b6d1856, 0x26a36631, 0xeae397b2,
+			0x3a6efa74, 0xdd5b4332, 0x6841e7f7, 0xca7820fb,
+			0xfb0af54e, 0xd8feb397, 0x454056ac, 0xba489527,
+			0x55533a3a, 0x20838d87, 0xfe6ba9b7, 0xd096954b,
+			0x55a867bc, 0xa1159a58, 0xcca92963, 0x99e1db33,
+			0xa62a4a56, 0x3f3125f9, 0x5ef47e1c, 0x9029317c,
+			0xfdf8e802, 0x04272f70, 0x80bb155c, 0x05282ce3,
+			0x95c11548, 0xe4c66d22, 0x48c1133f, 0xc70f86dc,
+			0x07f9c9ee, 0x41041f0f, 0x404779a4, 0x5d886e17,
+			0x325f51eb, 0xd59bc0d1, 0xf2bcc18f, 0x41113564,
+			0x257b7834, 0x602a9c60, 0xdff8e8a3, 0x1f636c1b,
+			0x0e12b4c2, 0x02e1329e, 0xaf664fd1, 0xcad18115,
+			0x6b2395e0, 0x333e92e1, 0x3b240b62, 0xeebeb922,
+			0x85b2a20e, 0xe6ba0d99, 0xde720c8c, 0x2da2f728,
+			0xd0127845, 0x95b794fd, 0x647d0862, 0xe7ccf5f0,
+			0x5449a36f, 0x877d48fa, 0xc39dfd27, 0xf33e8d1e,
+			0x0a476341, 0x992eff74, 0x3a6f6eab, 0xf4f8fd37,
+			0xa812dc60, 0xa1ebddf8, 0x991be14c, 0xdb6e6b0d,
+			0xc67b5510, 0x6d672c37, 0x2765d43b, 0xdcd0e804,
+			0xf1290dc7, 0xcc00ffa3, 0xb5390f92, 0x690fed0b,
+			0x667b9ffb, 0xcedb7d9c, 0xa091cf0b, 0xd9155ea3,
+			0xbb132f88, 0x515bad24, 0x7b9479bf, 0x763bd6eb,
+			0x37392eb3, 0xcc115979, 0x8026e297, 0xf42e312d,
+			0x6842ada7, 0xc66a2b3b, 0x12754ccc, 0x782ef11c,
+			0x6a124237, 0xb79251e7, 0x06a1bbe6, 0x4bfb6350,
+			0x1a6b1018, 0x11caedfa, 0x3d25bdd8, 0xe2e1c3c9,
+			0x44421659, 0x0a121386, 0xd90cec6e, 0xd5abea2a,
+			0x64af674e, 0xda86a85f, 0xbebfe988, 0x64e4c3fe,
+			0x9dbc8057, 0xf0f7c086, 0x60787bf8, 0x6003604d,
+			0xd1fd8346, 0xf6381fb0, 0x7745ae04, 0xd736fccc,
+			0x83426b33, 0xf01eab71, 0xb0804187, 0x3c005e5f,
+			0x77a057be, 0xbde8ae24, 0x55464299, 0xbf582e61,
+			0x4e58f48f, 0xf2ddfda2, 0xf474ef38, 0x8789bdc2,
+			0x5366f9c3, 0xc8b38e74, 0xb475f255, 0x46fcd9b9,
+			0x7aeb2661, 0x8b1ddf84, 0x846a0e79, 0x915f95e2,
+			0x466e598e, 0x20b45770, 0x8cd55591, 0xc902de4c,
+			0xb90bace1, 0xbb8205d0, 0x11a86248, 0x7574a99e,
+			0xb77f19b6, 0xe0a9dc09, 0x662d09a1, 0xc4324633,
+			0xe85a1f02, 0x09f0be8c, 0x4a99a025, 0x1d6efe10,
+			0x1ab93d1d, 0x0ba5a4df, 0xa186f20f, 0x2868f169,
+			0xdcb7da83, 0x573906fe, 0xa1e2ce9b, 0x4fcd7f52,
+			0x50115e01, 0xa70683fa, 0xa002b5c4, 0x0de6d027,
+			0x9af88c27, 0x773f8641, 0xc3604c06, 0x61a806b5,
+			0xf0177a28, 0xc0f586e0, 0x006058aa, 0x30dc7d62,
+			0x11e69ed7, 0x2338ea63, 0x53c2dd94, 0xc2c21634,
+			0xbbcbee56, 0x90bcb6de, 0xebfc7da1, 0xce591d76,
+			0x6f05e409, 0x4b7c0188, 0x39720a3d, 0x7c927c24,
+			0x86e3725f, 0x724d9db9, 0x1ac15bb4, 0xd39eb8fc,
+			0xed545578, 0x08fca5b5, 0xd83d7cd3, 0x4dad0fc4,
+			0x1e50ef5e, 0xb161e6f8, 0xa28514d9, 0x6c51133c,
+			0x6fd5c7e7, 0x56e14ec4, 0x362abfce, 0xddc6c837,
+			0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0
+		}, {
+			0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b,
+			0x5cb0679e, 0x4fa33742, 0xd3822740, 0x99bc9bbe,
+			0xd5118e9d, 0xbf0f7315, 0xd62d1c7e, 0xc700c47b,
+			0xb78c1b6b, 0x21a19045, 0xb26eb1be, 0x6a366eb4,
+			0x5748ab2f, 0xbc946e79, 0xc6a376d2, 0x6549c2c8,
+			0x530ff8ee, 0x468dde7d, 0xd5730a1d, 0x4cd04dc6,
+			0x2939bbdb, 0xa9ba4650, 0xac9526e8, 0xbe5ee304,
+			0xa1fad5f0, 0x6a2d519a, 0x63ef8ce2, 0x9a86ee22,
+			0xc089c2b8, 0x43242ef6, 0xa51e03aa, 0x9cf2d0a4,
+			0x83c061ba, 0x9be96a4d, 0x8fe51550, 0xba645bd6,
+			0x2826a2f9, 0xa73a3ae1, 0x4ba99586, 0xef5562e9,
+			0xc72fefd3, 0xf752f7da, 0x3f046f69, 0x77fa0a59,
+			0x80e4a915, 0x87b08601, 0x9b09e6ad, 0x3b3ee593,
+			0xe990fd5a, 0x9e34d797, 0x2cf0b7d9, 0x022b8b51,
+			0x96d5ac3a, 0x017da67d, 0xd1cf3ed6, 0x7c7d2d28,
+			0x1f9f25cf, 0xadf2b89b, 0x5ad6b472, 0x5a88f54c,
+			0xe029ac71, 0xe019a5e6, 0x47b0acfd, 0xed93fa9b,
+			0xe8d3c48d, 0x283b57cc, 0xf8d56629, 0x79132e28,
+			0x785f0191, 0xed756055, 0xf7960e44, 0xe3d35e8c,
+			0x15056dd4, 0x88f46dba, 0x03a16125, 0x0564f0bd,
+			0xc3eb9e15, 0x3c9057a2, 0x97271aec, 0xa93a072a,
+			0x1b3f6d9b, 0x1e6321f5, 0xf59c66fb, 0x26dcf319,
+			0x7533d928, 0xb155fdf5, 0x03563482, 0x8aba3cbb,
+			0x28517711, 0xc20ad9f8, 0xabcc5167, 0xccad925f,
+			0x4de81751, 0x3830dc8e, 0x379d5862, 0x9320f991,
+			0xea7a90c2, 0xfb3e7bce, 0x5121ce64, 0x774fbe32,
+			0xa8b6e37e, 0xc3293d46, 0x48de5369, 0x6413e680,
+			0xa2ae0810, 0xdd6db224, 0x69852dfd, 0x09072166,
+			0xb39a460a, 0x6445c0dd, 0x586cdecf, 0x1c20c8ae,
+			0x5bbef7dd, 0x1b588d40, 0xccd2017f, 0x6bb4e3bb,
+			0xdda26a7e, 0x3a59ff45, 0x3e350a44, 0xbcb4cdd5,
+			0x72eacea8, 0xfa6484bb, 0x8d6612ae, 0xbf3c6f47,
+			0xd29be463, 0x542f5d9e, 0xaec2771b, 0xf64e6370,
+			0x740e0d8d, 0xe75b1357, 0xf8721671, 0xaf537d5d,
+			0x4040cb08, 0x4eb4e2cc, 0x34d2466a, 0x0115af84,
+			0xe1b00428, 0x95983a1d, 0x06b89fb4, 0xce6ea048,
+			0x6f3f3b82, 0x3520ab82, 0x011a1d4b, 0x277227f8,
+			0x611560b1, 0xe7933fdc, 0xbb3a792b, 0x344525bd,
+			0xa08839e1, 0x51ce794b, 0x2f32c9b7, 0xa01fbac9,
+			0xe01cc87e, 0xbcc7d1f6, 0xcf0111c3, 0xa1e8aac7,
+			0x1a908749, 0xd44fbd9a, 0xd0dadecb, 0xd50ada38,
+			0x0339c32a, 0xc6913667, 0x8df9317c, 0xe0b12b4f,
+			0xf79e59b7, 0x43f5bb3a, 0xf2d519ff, 0x27d9459c,
+			0xbf97222c, 0x15e6fc2a, 0x0f91fc71, 0x9b941525,
+			0xfae59361, 0xceb69ceb, 0xc2a86459, 0x12baa8d1,
+			0xb6c1075e, 0xe3056a0c, 0x10d25065, 0xcb03a442,
+			0xe0ec6e0e, 0x1698db3b, 0x4c98a0be, 0x3278e964,
+			0x9f1f9532, 0xe0d392df, 0xd3a0342b, 0x8971f21e,
+			0x1b0a7441, 0x4ba3348c, 0xc5be7120, 0xc37632d8,
+			0xdf359f8d, 0x9b992f2e, 0xe60b6f47, 0x0fe3f11d,
+			0xe54cda54, 0x1edad891, 0xce6279cf, 0xcd3e7e6f,
+			0x1618b166, 0xfd2c1d05, 0x848fd2c5, 0xf6fb2299,
+			0xf523f357, 0xa6327623, 0x93a83531, 0x56cccd02,
+			0xacf08162, 0x5a75ebb5, 0x6e163697, 0x88d273cc,
+			0xde966292, 0x81b949d0, 0x4c50901b, 0x71c65614,
+			0xe6c6c7bd, 0x327a140a, 0x45e1d006, 0xc3f27b9a,
+			0xc9aa53fd, 0x62a80f00, 0xbb25bfe2, 0x35bdd2f6,
+			0x71126905, 0xb2040222, 0xb6cbcf7c, 0xcd769c2b,
+			0x53113ec0, 0x1640e3d3, 0x38abbd60, 0x2547adf0,
+			0xba38209c, 0xf746ce76, 0x77afa1c5, 0x20756060,
+			0x85cbfe4e, 0x8ae88dd8, 0x7aaaf9b0, 0x4cf9aa7e,
+			0x1948c25c, 0x02fb8a8c, 0x01c36ae4, 0xd6ebe1f9,
+			0x90d4f869, 0xa65cdea0, 0x3f09252d, 0xc208e69f,
+			0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6
+		}
+	}, {
+		0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,
+		0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
+		0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
+		0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917,
+		0x9216d5d9, 0x8979fb1b
+	}
+};
+
+static void clean(void *data, int size) { /* Zero-fill size bytes starting at data. */
+	memset(data, 0, size); /* NOTE(review): a plain memset on a buffer that is never read again may be elided by the optimizer -- confirm the wipe is not relied on */
+}
+
+static void BF_swap(BF_word *x, int count) { /* Byte-reverse count words at x in place; does nothing on big-endian hosts. */
+	static union {
+		uint16_t i;
+		uint8_t  c[2];
+	} is_little = { 0x0001 }; /* c[0] reads back as 1 only when the low byte is stored first (little-endian) */
+	BF_word tmp;
+
+	if(is_little.c[0]) {
+		do {
+			tmp = *x;
+			tmp = (tmp << 16) | (tmp >> 16); /* swap the two 16-bit halves */
+			*x++ = ((tmp & 0x00FF00FF) << 8) | ((tmp >> 8) & 0x00FF00FF); /* then swap the two bytes inside each half */
+		} while(--count); /* body runs before the test, so count is expected to be >= 1 on entry */
+	}
+}
+
+#define BF_INDEX(S, i) /* the BF_word located i BYTES (not words) past the start of S */ \
+	(*((BF_word *)(((unsigned char *)S) + (i))))
+
+#define BF_ROUND(L, R, N) /* one round: R ^= P[N + 1] ^ F(L); relies on tmp1..tmp4 and data at the expansion site */ \
+	tmp1 = L & 0xFF; \
+	tmp1 <<= 2; /* lowest byte of L times 4: byte offset into S[3] */ \
+	tmp2 = L >> 6; \
+	tmp2 &= 0x3FC; /* bits 8..15 of L times 4: byte offset into S[2] */ \
+	tmp3 = L >> 14; \
+	tmp3 &= 0x3FC; /* bits 16..23 of L times 4: byte offset into S[1] */ \
+	tmp4 = L >> 22; \
+	tmp4 &= 0x3FC; /* bits 24..31 of L times 4: byte offset into S[0] */ \
+	tmp1 = BF_INDEX(data.ctx.S[3], tmp1); \
+	tmp2 = BF_INDEX(data.ctx.S[2], tmp2); \
+	tmp3 = BF_INDEX(data.ctx.S[1], tmp3); \
+	tmp3 += BF_INDEX(data.ctx.S[0], tmp4); \
+	tmp3 ^= tmp2; \
+	R ^= data.ctx.P[N + 1]; \
+	tmp3 += tmp1; /* tmp3 is now ((S[0] + S[1]) ^ S[2]) + S[3] over the four looked-up words */ \
+	R ^= tmp3;
+
+/*
+ * Encrypt one block, BF_N is hardcoded here.
+ */
+#define BF_ENCRYPT /* encrypts the pair (L, R) in place using data.ctx; overwrites tmp1..tmp4 */ \
+	L ^= data.ctx.P[0]; \
+	BF_ROUND(L, R, 0); \
+	BF_ROUND(R, L, 1); \
+	BF_ROUND(L, R, 2); \
+	BF_ROUND(R, L, 3); \
+	BF_ROUND(L, R, 4); \
+	BF_ROUND(R, L, 5); \
+	BF_ROUND(L, R, 6); \
+	BF_ROUND(R, L, 7); \
+	BF_ROUND(L, R, 8); \
+	BF_ROUND(R, L, 9); \
+	BF_ROUND(L, R, 10); \
+	BF_ROUND(R, L, 11); \
+	BF_ROUND(L, R, 12); \
+	BF_ROUND(R, L, 13); \
+	BF_ROUND(L, R, 14); \
+	BF_ROUND(R, L, 15); \
+	tmp4 = R; /* final exchange of the halves, folding in the last subkey */ \
+	R = L; \
+	L = tmp4 ^ data.ctx.P[BF_N + 1];
+
+
+#define BF_body() /* regenerate P, then S: each encrypted (L, R) overwrites two words and feeds the next encryption */ \
+	L = R = 0; \
+	ptr = data.ctx.P; \
+	do { \
+		ptr += 2; \
+		BF_ENCRYPT; \
+		*(ptr - 2) = L; \
+		*(ptr - 1) = R; \
+	} while (ptr < &data.ctx.P[BF_N + 2]); \
+\
+	ptr = data.ctx.S[0]; \
+	do { \
+		ptr += 2; \
+		BF_ENCRYPT; \
+		*(ptr - 2) = L; \
+		*(ptr - 1) = R; \
+	} while (ptr < &data.ctx.S[3][0xFF]); /* ptr walks all four S-boxes as one contiguous run of words */
+
+static void BF_set_key(const char *key, BF_key expanded, BF_key initial) { /* Pack key bytes cyclically into BF_N + 2 words (expanded); initial gets the init P-box XOR those words. */
+	const char *ptr = key;
+	int i, j;
+	BF_word tmp;
+
+	for (i = 0; i < BF_N + 2; i++) {
+		tmp = 0;
+		for (j = 0; j < 4; j++) { /* pack four key bytes, most significant first */
+			tmp <<= 8;
+			tmp |= (unsigned char)*ptr;
+
+			if (!*ptr) ptr = key; else ptr++; /* the NUL byte itself is consumed, then reading wraps to the start of key */
+		}
+
+		expanded[i] = tmp;
+		initial[i] = BF_init_state.P[i] ^ tmp;
+	}
+}
+
+static void _crypt_blowfish_rn(const char *key, const char *salt, char *output) { /* Hash key with a 16-byte salt at a fixed cost; writes 16 bytes to output. */
+	struct {
+		BF_ctx ctx;
+		BF_key expanded_key;
+		union {
+			BF_word salt[4];
+			BF_word output[4];
+		} binary; /* salt and result share the same storage */
+	} data;
+	BF_word L, R;
+	BF_word tmp1, tmp2, tmp3, tmp4;
+	BF_word *ptr;
+	BF_word count = (BF_word)1 << 12; // work factor: 2^12 iterations of the key/salt loop below
+	int i;
+
+	memcpy(data.binary.salt, salt, 16);
+	BF_swap(data.binary.salt, 4); /* byte-reversed on little-endian hosts, so each word holds the big-endian reading of its four salt bytes */
+
+	BF_set_key(key, data.expanded_key, data.ctx.P);
+
+	memcpy(data.ctx.S, BF_init_state.S, sizeof(data.ctx.S));
+
+	L = R = 0;
+	for (i = 0; i < BF_N + 2; i += 2) { /* first pass over P, mixing in alternating halves of the salt */
+		L ^= data.binary.salt[i & 2];
+		R ^= data.binary.salt[(i & 2) + 1];
+		BF_ENCRYPT;
+		data.ctx.P[i] = L;
+		data.ctx.P[i + 1] = R;
+	}
+
+	ptr = data.ctx.S[0];
+	do { /* same salted, chained pass over the S-boxes, four words per iteration */
+		ptr += 4;
+		L ^= data.binary.salt[(BF_N + 2) & 3];
+		R ^= data.binary.salt[(BF_N + 3) & 3];
+		BF_ENCRYPT;
+		*(ptr - 4) = L;
+		*(ptr - 3) = R;
+
+		L ^= data.binary.salt[(BF_N + 4) & 3];
+		R ^= data.binary.salt[(BF_N + 5) & 3];
+		BF_ENCRYPT;
+		*(ptr - 2) = L;
+		*(ptr - 1) = R;
+	} while (ptr < &data.ctx.S[3][0xFF]);
+
+	do { /* cost loop: count iterations, each re-keying first with the key and then with the salt */
+		data.ctx.P[0] ^= data.expanded_key[0];
+		data.ctx.P[1] ^= data.expanded_key[1];
+		data.ctx.P[2] ^= data.expanded_key[2];
+		data.ctx.P[3] ^= data.expanded_key[3];
+		data.ctx.P[4] ^= data.expanded_key[4];
+		data.ctx.P[5] ^= data.expanded_key[5];
+		data.ctx.P[6] ^= data.expanded_key[6];
+		data.ctx.P[7] ^= data.expanded_key[7];
+		data.ctx.P[8] ^= data.expanded_key[8];
+		data.ctx.P[9] ^= data.expanded_key[9];
+		data.ctx.P[10] ^= data.expanded_key[10];
+		data.ctx.P[11] ^= data.expanded_key[11];
+		data.ctx.P[12] ^= data.expanded_key[12];
+		data.ctx.P[13] ^= data.expanded_key[13];
+		data.ctx.P[14] ^= data.expanded_key[14];
+		data.ctx.P[15] ^= data.expanded_key[15];
+		data.ctx.P[16] ^= data.expanded_key[16];
+		data.ctx.P[17] ^= data.expanded_key[17];
+
+		BF_body();
+
+		tmp1 = data.binary.salt[0];
+		tmp2 = data.binary.salt[1];
+		tmp3 = data.binary.salt[2];
+		tmp4 = data.binary.salt[3];
+		data.ctx.P[0] ^= tmp1;
+		data.ctx.P[1] ^= tmp2;
+		data.ctx.P[2] ^= tmp3;
+		data.ctx.P[3] ^= tmp4;
+		data.ctx.P[4] ^= tmp1;
+		data.ctx.P[5] ^= tmp2;
+		data.ctx.P[6] ^= tmp3;
+		data.ctx.P[7] ^= tmp4;
+		data.ctx.P[8] ^= tmp1;
+		data.ctx.P[9] ^= tmp2;
+		data.ctx.P[10] ^= tmp3;
+		data.ctx.P[11] ^= tmp4;
+		data.ctx.P[12] ^= tmp1;
+		data.ctx.P[13] ^= tmp2;
+		data.ctx.P[14] ^= tmp3;
+		data.ctx.P[15] ^= tmp4;
+		data.ctx.P[16] ^= tmp1;
+		data.ctx.P[17] ^= tmp2;
+
+		BF_body();
+	} while (--count);
+
+	for (i = 0; i < 4; i += 2) { /* encrypt each two-word half of the magic IV 64 times */
+		L = BF_magic_w[i];
+		R = BF_magic_w[i + 1];
+
+		count = 64;
+		do {
+			BF_ENCRYPT;
+		} while (--count);
+
+		data.binary.output[i] = L;
+		data.binary.output[i + 1] = R;
+	}
+
+	memcpy(output, data.binary.output, 16); /* NOTE(review): words are copied in host byte order (no BF_swap, unlike the salt) -- confirm this is intended */
+
+	clean(&data, sizeof(data));
+}
+
+void bcrypt_hash(const char *in, char *out) /* Chain three _crypt_blowfish_rn calls over in (16-byte strides); writes 32 bytes to out. */
+{ /* NOTE(review): BF_set_key scans each key up to 72 bytes, wrapping only at a NUL, so the third call may read as far as in[135] -- confirm the size the caller guarantees */
+	_crypt_blowfish_rn(&in[0 * BF_N], &in[1 * BF_N], &out[0 * BF_N]); /* key at in+0, salt at in+16, result in out[0..15] */
+	_crypt_blowfish_rn(&in[2 * BF_N], &in[3 * BF_N], &out[1 * BF_N]); /* key at in+32, salt at in+48, result in out[16..31] */
+	_crypt_blowfish_rn(&in[4 * BF_N], &out[1 * BF_N], &out[1 * BF_N]); /* key at in+64, salted with the previous result, which it then overwrites */
+}
\ No newline at end of file
diff --git a/bcrypt.h b/bcrypt.h
new file mode 100644
index 0000000..7a9d8cb
--- /dev/null
+++ b/bcrypt.h
@@ -0,0 +1,14 @@
+#ifndef BCRYPT_H
+#define BCRYPT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void bcrypt_hash(const char *input, char *output); /* writes a 32-byte digest to output; input is consumed from offsets 0, 16, 32, 48 and 64 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/binding.gyp b/binding.gyp
new file mode 100644
index 0000000..dce7a11
--- /dev/null
+++ b/binding.gyp
@@ -0,0 +1,29 @@
+{
+    "targets": [
+        {
+            "target_name": "multihashing",
+            "sources": [
+                "multihashing.cc",
+                "scrypt.c",
+                "scryptjane.c",
+                "scryptn.c",
+                "keccak.c",
+                "skein.c",
+                "x11.c",
+                "quark.c",
+                "sha3/aes_helper.c",
+                "sha3/blake.c",
+                "sha3/bmw.c",
+                "sha3/cubehash.c",
+                "sha3/echo.c",
+                "sha3/groestl.c",
+                "sha3/jh.c",
+                "sha3/keccak.c",
+                "sha3/luffa.c",
+                "sha3/shavite.c",
+                "sha3/simd.c",
+                "sha3/skein.c"
+            ]
+        }
+    ]
+}
diff --git a/build/Makefile b/build/Makefile
new file mode 100644
index 0000000..9f17b5f
--- /dev/null
+++ b/build/Makefile
@@ -0,0 +1,332 @@
+# We borrow heavily from the kernel build setup, though we are simpler since
+# we don't have Kconfig tweaking settings on us.
+
+# The implicit make rules have it looking for RCS files, among other things.
+# We instead explicitly write all the rules we care about.
+# It's even quicker (saves ~200ms) to pass -r on the command line.
+MAKEFLAGS=-r
+
+# The source directory tree.
+srcdir := ..
+abs_srcdir := $(abspath $(srcdir))
+
+# The name of the builddir.
+builddir_name ?= .
+
+# The V=1 flag on command line makes us verbosely print command lines.
+ifdef V
+  quiet=
+else
+  quiet=quiet_
+endif
+
+# Specify BUILDTYPE=Release on the command line for a release build.
+BUILDTYPE ?= Release
+
+# Directory all our build output goes into.
+# Note that this must be two directories beneath src/ for unit tests to pass,
+# as they reach into the src/ directory for data with relative paths.
+builddir ?= $(builddir_name)/$(BUILDTYPE)
+abs_builddir := $(abspath $(builddir))
+depsdir := $(builddir)/.deps
+
+# Object output directory.
+obj := $(builddir)/obj
+abs_obj := $(abspath $(obj))
+
+# We build up a list of every single one of the targets so we can slurp in the
+# generated dependency rule Makefiles in one pass.
+all_deps :=
+
+
+
+CC.target ?= $(CC)
+CFLAGS.target ?= $(CFLAGS)
+CXX.target ?= $(CXX)
+CXXFLAGS.target ?= $(CXXFLAGS)
+LINK.target ?= $(LINK)
+LDFLAGS.target ?= $(LDFLAGS)
+AR.target ?= $(AR)
+
+# C++ apps need to be linked with g++.
+#
+# Note: flock is used to serialize linking. Linking is a memory-intensive
+# process so running parallel links can often lead to thrashing.  To disable
+# the serialization, override LINK via an environment variable as follows:
+#
+#   export LINK=g++
+#
+# This will allow make to invoke N linker processes as specified in -jN.
+LINK ?= flock $(builddir)/linker.lock $(CXX.target)
+
+# TODO(evan): move all cross-compilation logic to gyp-time so we don't need
+# to replicate this environment fallback in make as well.
+CC.host ?= gcc
+CFLAGS.host ?=
+CXX.host ?= g++
+CXXFLAGS.host ?=
+LINK.host ?= $(CXX.host)
+LDFLAGS.host ?=
+AR.host ?= ar
+
+# Define a dir function that can handle spaces.
+# http://www.gnu.org/software/make/manual/make.html#Syntax-of-Functions
+# "leading spaces cannot appear in the text of the first argument as written.
+# These characters can be put into the argument value by variable substitution."
+empty :=
+space := $(empty) $(empty)
+
+# http://stackoverflow.com/questions/1189781/using-make-dir-or-notdir-on-a-path-with-spaces
+replace_spaces = $(subst $(space),?,$1)
+unreplace_spaces = $(subst ?,$(space),$1)
+dirx = $(call unreplace_spaces,$(dir $(call replace_spaces,$1)))
+
+# Flags to make gcc output dependency info.  Note that you need to be
+# careful here to use the flags that ccache and distcc can understand.
+# We write to a dep file on the side first and then rename at the end
+# so we can't end up with a broken dep file.
+depfile = $(depsdir)/$(call replace_spaces,$@).d
+DEPFLAGS = -MMD -MF $(depfile).raw
+
+# We have to fixup the deps output in a few ways.
+# (1) the file output should mention the proper .o file.
+# ccache or distcc lose the path to the target, so we convert a rule of
+# the form:
+#   foobar.o: DEP1 DEP2
+# into
+#   path/to/foobar.o: DEP1 DEP2
+# (2) we want missing files not to cause us to fail to build.
+# We want to rewrite
+#   foobar.o: DEP1 DEP2 \
+#               DEP3
+# to
+#   DEP1:
+#   DEP2:
+#   DEP3:
+# so if the files are missing, they're just considered phony rules.
+# We have to do some pretty insane escaping to get those backslashes
+# and dollar signs past make, the shell, and sed at the same time.
+# Doesn't work with spaces, but that's fine: .d files have spaces in
+# their names replaced with other characters.
+define fixup_dep
+# The depfile may not exist if the input file didn't have any #includes.
+touch $(depfile).raw
+# Fixup path as in (1).
+sed -e "s|^$(notdir $@)|$@|" $(depfile).raw >> $(depfile)
+# Add extra rules as in (2).
+# We remove slashes and replace spaces with new lines;
+# remove blank lines;
+# delete the first line and append a colon to the remaining lines.
+sed -e 's|\\||' -e 'y| |\n|' $(depfile).raw |\
+  grep -v '^$$'                             |\
+  sed -e 1d -e 's|$$|:|'                     \
+    >> $(depfile)
+rm $(depfile).raw
+endef
+
+# Command definitions:
+# - cmd_foo is the actual command to run;
+# - quiet_cmd_foo is the brief-output summary of the command.
+
+quiet_cmd_cc = CC($(TOOLSET)) $@
+cmd_cc = $(CC.$(TOOLSET)) $(GYP_CFLAGS) $(DEPFLAGS) $(CFLAGS.$(TOOLSET)) -c -o $@ $<
+
+quiet_cmd_cxx = CXX($(TOOLSET)) $@
+cmd_cxx = $(CXX.$(TOOLSET)) $(GYP_CXXFLAGS) $(DEPFLAGS) $(CXXFLAGS.$(TOOLSET)) -c -o $@ $<
+
+quiet_cmd_touch = TOUCH $@
+cmd_touch = touch $@
+
+quiet_cmd_copy = COPY $@
+# send stderr to /dev/null to ignore messages when linking directories.
+cmd_copy = rm -rf "$@" && cp -af "$<" "$@"
+
+quiet_cmd_alink = AR($(TOOLSET)) $@
+cmd_alink = rm -f $@ && $(AR.$(TOOLSET)) crs $@ $(filter %.o,$^)
+
+quiet_cmd_alink_thin = AR($(TOOLSET)) $@
+cmd_alink_thin = rm -f $@ && $(AR.$(TOOLSET)) crsT $@ $(filter %.o,$^)
+
+# Due to circular dependencies between libraries :(, we wrap the
+# special "figure out circular dependencies" flags around the entire
+# input list during linking.
+quiet_cmd_link = LINK($(TOOLSET)) $@
+cmd_link = $(LINK.$(TOOLSET)) $(GYP_LDFLAGS) $(LDFLAGS.$(TOOLSET)) -o $@ -Wl,--start-group $(LD_INPUTS) -Wl,--end-group $(LIBS)
+
+# We support two kinds of shared objects (.so):
+# 1) shared_library, which is just bundling together many dependent libraries
+# into a link line.
+# 2) loadable_module, which is generating a module intended for dlopen().
+#
+# They differ only slightly:
+# In the former case, we want to package all dependent code into the .so.
+# In the latter case, we want to package just the API exposed by the
+# outermost module.
+# This means shared_library uses --whole-archive, while loadable_module doesn't.
+# (Note that --whole-archive is incompatible with the --start-group used in
+# normal linking.)
+
+# Other shared-object link notes:
+# - Set SONAME to the library filename so our binaries don't reference
+# the local, absolute paths used on the link command-line.
+quiet_cmd_solink = SOLINK($(TOOLSET)) $@
+cmd_solink = $(LINK.$(TOOLSET)) -shared $(GYP_LDFLAGS) $(LDFLAGS.$(TOOLSET)) -Wl,-soname=$(@F) -o $@ -Wl,--whole-archive $(LD_INPUTS) -Wl,--no-whole-archive $(LIBS)
+
+quiet_cmd_solink_module = SOLINK_MODULE($(TOOLSET)) $@
+cmd_solink_module = $(LINK.$(TOOLSET)) -shared $(GYP_LDFLAGS) $(LDFLAGS.$(TOOLSET)) -Wl,-soname=$(@F) -o $@ -Wl,--start-group $(filter-out FORCE_DO_CMD, $^) -Wl,--end-group $(LIBS)
+
+
+# Define an escape_quotes function to escape single quotes.
+# This allows us to handle quotes properly as long as we always use
+# single quotes and escape_quotes.
+escape_quotes = $(subst ','\'',$(1))
+# This comment is here just to include a ' to unconfuse syntax highlighting.
+# Define an escape_vars function to escape '$' variable syntax.
+# This allows us to read/write command lines with shell variables (e.g.
+# $LD_LIBRARY_PATH), without triggering make substitution.
+escape_vars = $(subst $$,$$$$,$(1))
+# Helper that expands to a shell command to echo a string exactly as it is in
+# make. This uses printf instead of echo because printf's behaviour with respect
+# to escape sequences is more portable than echo's across different shells
+# (e.g., dash, bash).
+exact_echo = printf '%s\n' '$(call escape_quotes,$(1))'
+
+# Helper to compare the command we're about to run against the command
+# we logged the last time we ran the command.  Produces an empty
+# string (false) when the commands match.
+# Tricky point: Make has no string-equality test function.
+# The kernel uses the following, but it seems like it would have false
+# positives, where one string reordered its arguments.
+#   arg_check = $(strip $(filter-out $(cmd_$(1)), $(cmd_$@)) \
+#                       $(filter-out $(cmd_$@), $(cmd_$(1))))
+# We instead substitute each for the empty string into the other, and
+# say they're equal if both substitutions produce the empty string.
+# .d files contain ? instead of spaces, take that into account.
+command_changed = $(or $(subst $(cmd_$(1)),,$(cmd_$(call replace_spaces,$@))),\
+                       $(subst $(cmd_$(call replace_spaces,$@)),,$(cmd_$(1))))
+
+# Helper that is non-empty when a prerequisite changes.
+# Normally make does this implicitly, but we force rules to always run
+# so we can check their command lines.
+#   $? -- new prerequisites
+#   $| -- order-only dependencies
+prereq_changed = $(filter-out FORCE_DO_CMD,$(filter-out $|,$?))
+
+# Helper that executes all postbuilds until one fails.
+define do_postbuilds
+  @E=0;\
+  for p in $(POSTBUILDS); do\
+    eval $$p;\
+    E=$$?;\
+    if [ $$E -ne 0 ]; then\
+      break;\
+    fi;\
+  done;\
+  if [ $$E -ne 0 ]; then\
+    rm -rf "$@";\
+    exit $$E;\
+  fi
+endef
+
+# do_cmd: run a command via the above cmd_foo names, if necessary.
+# Should always run for a given target to handle command-line changes.
+# Second argument, if non-zero, makes it do asm/C/C++ dependency munging.
+# Third argument, if non-zero, makes it do POSTBUILDS processing.
+# Note: We intentionally do NOT call dirx for depfile, since it contains ? for
+# spaces already and dirx strips the ? characters.
+define do_cmd
+$(if $(or $(command_changed),$(prereq_changed)),
+  @$(call exact_echo,  $($(quiet)cmd_$(1)))
+  @mkdir -p "$(call dirx,$@)" "$(dir $(depfile))"
+  $(if $(findstring flock,$(word 1,$(cmd_$1))),
+    @$(cmd_$(1))
+    @echo "  $(quiet_cmd_$(1)): Finished",
+    @$(cmd_$(1))
+  )
+  @$(call exact_echo,$(call escape_vars,cmd_$(call replace_spaces,$@) := $(cmd_$(1)))) > $(depfile)
+  @$(if $(2),$(fixup_dep))
+  $(if $(and $(3), $(POSTBUILDS)),
+    $(call do_postbuilds)
+  )
+)
+endef
+
+# Declare the "all" target first so it is the default,
+# even though we don't have the deps yet.
+.PHONY: all
+all:
+
+# make looks for ways to re-generate included makefiles, but in our case, we
+# don't have a direct way. Explicitly telling make that it has nothing to do
+# for them makes it go faster.
+%.d: ;
+
+# Use FORCE_DO_CMD to force a target to run.  Should be coupled with
+# do_cmd.
+.PHONY: FORCE_DO_CMD
+FORCE_DO_CMD:
+
+TOOLSET := target
+# Suffix rules, putting all outputs into $(obj).
+$(obj).$(TOOLSET)/%.o: $(srcdir)/%.c FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+$(obj).$(TOOLSET)/%.o: $(srcdir)/%.cc FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(srcdir)/%.cpp FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(srcdir)/%.cxx FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(srcdir)/%.S FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+$(obj).$(TOOLSET)/%.o: $(srcdir)/%.s FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+
+# Try building from generated source, too.
+$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.c FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.cc FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.cpp FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.cxx FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.S FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.s FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+
+$(obj).$(TOOLSET)/%.o: $(obj)/%.c FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+$(obj).$(TOOLSET)/%.o: $(obj)/%.cc FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(obj)/%.cpp FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(obj)/%.cxx FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+$(obj).$(TOOLSET)/%.o: $(obj)/%.S FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+$(obj).$(TOOLSET)/%.o: $(obj)/%.s FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+
+
+ifeq ($(strip $(foreach prefix,$(NO_LOAD),\
+    $(findstring $(join ^,$(prefix)),\
+                 $(join ^,multihashing.target.mk)))),)
+  include multihashing.target.mk
+endif
+
+quiet_cmd_regen_makefile = ACTION Regenerating $@
+cmd_regen_makefile = cd $(srcdir); /usr/lib/node_modules/node-gyp/gyp/gyp_main.py -fmake --ignore-environment "--toplevel-dir=." -I/root/multi-hashing/build/config.gypi -I/usr/lib/node_modules/node-gyp/addon.gypi -I/root/.node-gyp/0.10.26/common.gypi "--depth=." "-Goutput_dir=." "--generator-output=build" "-Dlibrary=shared_library" "-Dvisibility=default" "-Dnode_root_dir=/root/.node-gyp/0.10.26" "-Dmodule_root_dir=/root/multi-hashing" binding.gyp
+Makefile: $(srcdir)/../../usr/lib/node_modules/node-gyp/addon.gypi $(srcdir)/../.node-gyp/0.10.26/common.gypi $(srcdir)/build/config.gypi $(srcdir)/binding.gyp
+	$(call do_cmd,regen_makefile)
+
+# "all" is a concatenation of the "all" targets from all the included
+# sub-makefiles. This is just here to clarify.
+all:
+
+# Add in dependency-tracking rules.  $(all_deps) is the list of every single
+# target in our tree. Only consider the ones with .d (dependency) info:
+d_files := $(wildcard $(foreach f,$(all_deps),$(depsdir)/$(f).d))
+ifneq ($(d_files),)
+  include $(d_files)
+endif
diff --git a/build/binding.Makefile b/build/binding.Makefile
new file mode 100644
index 0000000..3dfc8e4
--- /dev/null
+++ b/build/binding.Makefile
@@ -0,0 +1,6 @@
+# This file is generated by gyp; do not edit.
+
+export builddir_name ?= ./build/.
+.PHONY: all
+all:
+	$(MAKE) multihashing
diff --git a/build/config.gypi b/build/config.gypi
new file mode 100644
index 0000000..9f4ebd4
--- /dev/null
+++ b/build/config.gypi
@@ -0,0 +1,38 @@
+# Do not edit. File was generated by node-gyp's "configure" step
+{
+  "target_defaults": {
+    "cflags": [],
+    "default_configuration": "Release",
+    "defines": [],
+    "include_dirs": [],
+    "libraries": []
+  },
+  "variables": {
+    "clang": 0,
+    "gcc_version": 48,
+    "host_arch": "x64",
+    "node_install_npm": "true",
+    "node_prefix": "/usr",
+    "node_shared_cares": "false",
+    "node_shared_http_parser": "false",
+    "node_shared_libuv": "false",
+    "node_shared_openssl": "false",
+    "node_shared_v8": "false",
+    "node_shared_zlib": "false",
+    "node_tag": "",
+    "node_unsafe_optimizations": 0,
+    "node_use_dtrace": "false",
+    "node_use_etw": "false",
+    "node_use_openssl": "true",
+    "node_use_perfctr": "false",
+    "node_use_systemtap": "false",
+    "python": "/usr/bin/python",
+    "target_arch": "x64",
+    "v8_enable_gdbjit": 0,
+    "v8_no_strict_aliasing": 1,
+    "v8_use_snapshot": "false",
+    "nodedir": "/root/.node-gyp/0.10.26",
+    "copy_dev_lib": "true",
+    "standalone_static_library": 1
+  }
+}
diff --git a/build/multihashing.target.mk b/build/multihashing.target.mk
new file mode 100644
index 0000000..de2a674
--- /dev/null
+++ b/build/multihashing.target.mk
@@ -0,0 +1,158 @@
+# This file is generated by gyp; do not edit.
+
+TOOLSET := target
+TARGET := multihashing
+DEFS_Debug := \
+	'-D_LARGEFILE_SOURCE' \
+	'-D_FILE_OFFSET_BITS=64' \
+	'-DBUILDING_NODE_EXTENSION' \
+	'-DDEBUG' \
+	'-D_DEBUG'
+
+# Flags passed to all source files.
+CFLAGS_Debug := \
+	-fPIC \
+	-Wall \
+	-Wextra \
+	-Wno-unused-parameter \
+	-pthread \
+	-m64 \
+	-g \
+	-O0
+
+# Flags passed to only C files.
+CFLAGS_C_Debug :=
+
+# Flags passed to only C++ files.
+CFLAGS_CC_Debug := \
+	-fno-rtti \
+	-fno-exceptions
+
+INCS_Debug := \
+	-I/root/.node-gyp/0.10.26/src \
+	-I/root/.node-gyp/0.10.26/deps/uv/include \
+	-I/root/.node-gyp/0.10.26/deps/v8/include
+
+DEFS_Release := \
+	'-D_LARGEFILE_SOURCE' \
+	'-D_FILE_OFFSET_BITS=64' \
+	'-DBUILDING_NODE_EXTENSION'
+
+# Flags passed to all source files.
+CFLAGS_Release := \
+	-fPIC \
+	-Wall \
+	-Wextra \
+	-Wno-unused-parameter \
+	-pthread \
+	-m64 \
+	-O2 \
+	-fno-strict-aliasing \
+	-fno-tree-vrp \
+	-fno-omit-frame-pointer
+
+# Flags passed to only C files.
+CFLAGS_C_Release :=
+
+# Flags passed to only C++ files.
+CFLAGS_CC_Release := \
+	-fno-rtti \
+	-fno-exceptions
+
+INCS_Release := \
+	-I/root/.node-gyp/0.10.26/src \
+	-I/root/.node-gyp/0.10.26/deps/uv/include \
+	-I/root/.node-gyp/0.10.26/deps/v8/include
+
+OBJS := \
+	$(obj).target/$(TARGET)/multihashing.o \
+	$(obj).target/$(TARGET)/scrypt.o \
+	$(obj).target/$(TARGET)/scryptjane.o \
+	$(obj).target/$(TARGET)/scryptn.o \
+	$(obj).target/$(TARGET)/keccak.o \
+	$(obj).target/$(TARGET)/skein.o \
+	$(obj).target/$(TARGET)/x11.o \
+	$(obj).target/$(TARGET)/quark.o \
+	$(obj).target/$(TARGET)/sha3/aes_helper.o \
+	$(obj).target/$(TARGET)/sha3/blake.o \
+	$(obj).target/$(TARGET)/sha3/bmw.o \
+	$(obj).target/$(TARGET)/sha3/cubehash.o \
+	$(obj).target/$(TARGET)/sha3/echo.o \
+	$(obj).target/$(TARGET)/sha3/groestl.o \
+	$(obj).target/$(TARGET)/sha3/jh.o \
+	$(obj).target/$(TARGET)/sha3/keccak.o \
+	$(obj).target/$(TARGET)/sha3/luffa.o \
+	$(obj).target/$(TARGET)/sha3/shavite.o \
+	$(obj).target/$(TARGET)/sha3/simd.o \
+	$(obj).target/$(TARGET)/sha3/skein.o
+
+# Add to the list of files we specially track dependencies for.
+all_deps += $(OBJS)
+
+# CFLAGS et al overrides must be target-local.
+# See "Target-specific Variable Values" in the GNU Make manual.
+$(OBJS): TOOLSET := $(TOOLSET)
+$(OBJS): GYP_CFLAGS := $(DEFS_$(BUILDTYPE)) $(INCS_$(BUILDTYPE))  $(CFLAGS_$(BUILDTYPE)) $(CFLAGS_C_$(BUILDTYPE))
+$(OBJS): GYP_CXXFLAGS := $(DEFS_$(BUILDTYPE)) $(INCS_$(BUILDTYPE))  $(CFLAGS_$(BUILDTYPE)) $(CFLAGS_CC_$(BUILDTYPE))
+
+# Suffix rules, putting all outputs into $(obj).
+
+$(obj).$(TOOLSET)/$(TARGET)/%.o: $(srcdir)/%.cc FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+
+$(obj).$(TOOLSET)/$(TARGET)/%.o: $(srcdir)/%.c FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+
+# Try building from generated source, too.
+
+$(obj).$(TOOLSET)/$(TARGET)/%.o: $(obj).$(TOOLSET)/%.cc FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+
+$(obj).$(TOOLSET)/$(TARGET)/%.o: $(obj).$(TOOLSET)/%.c FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+
+$(obj).$(TOOLSET)/$(TARGET)/%.o: $(obj)/%.cc FORCE_DO_CMD
+	@$(call do_cmd,cxx,1)
+
+$(obj).$(TOOLSET)/$(TARGET)/%.o: $(obj)/%.c FORCE_DO_CMD
+	@$(call do_cmd,cc,1)
+
+# End of this set of suffix rules
+### Rules for final target.
+LDFLAGS_Debug := \
+	-pthread \
+	-rdynamic \
+	-m64
+
+LDFLAGS_Release := \
+	-pthread \
+	-rdynamic \
+	-m64
+
+LIBS :=
+
+$(obj).target/multihashing.node: GYP_LDFLAGS := $(LDFLAGS_$(BUILDTYPE))
+$(obj).target/multihashing.node: LIBS := $(LIBS)
+$(obj).target/multihashing.node: TOOLSET := $(TOOLSET)
+$(obj).target/multihashing.node: $(OBJS) FORCE_DO_CMD
+	$(call do_cmd,solink_module)
+
+all_deps += $(obj).target/multihashing.node
+# Add target alias
+.PHONY: multihashing
+multihashing: $(builddir)/multihashing.node
+
+# Copy this to the executable output path.
+$(builddir)/multihashing.node: TOOLSET := $(TOOLSET)
+$(builddir)/multihashing.node: $(obj).target/multihashing.node FORCE_DO_CMD
+	$(call do_cmd,copy)
+
+all_deps += $(builddir)/multihashing.node
+# Short alias for building this executable.
+.PHONY: multihashing.node
+multihashing.node: $(obj).target/multihashing.node $(builddir)/multihashing.node
+
+# Add executable to "all" target.
+.PHONY: all
+all: $(builddir)/multihashing.node
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..be19772
--- /dev/null
+++ b/index.js
@@ -0,0 +1 @@
+module.exports = require('bindings')('multihashing.node'); // export whatever the 'bindings' helper loads for multihashing.node
\ No newline at end of file
diff --git a/keccak.c b/keccak.c
new file mode 100644
index 0000000..f41312f
--- /dev/null
+++ b/keccak.c
@@ -0,0 +1,18 @@
+#include "keccak.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "sha3/sph_keccak.h"
+
+
+/* Runs the sph_keccak256 init/update/close sequence over the first 80 bytes of input; close stores the result in output. */
+void keccak_hash(const char* input, char* output)
+{
+    sph_keccak256_context state;
+    sph_keccak256_init(&state);
+    sph_keccak256(&state, input, 80);
+    sph_keccak256_close(&state, output);
+}
+
diff --git a/keccak.h b/keccak.h
new file mode 100644
index 0000000..4442ae9
--- /dev/null
+++ b/keccak.h
@@ -0,0 +1,14 @@
+#ifndef KECCAK_H
+#define KECCAK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* keccak_hash (keccak.c) feeds the first 80 bytes of input to sph_keccak256 and stores the result in output. */
+void keccak_hash(const char* input, char* output);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/multihashing.cc b/multihashing.cc
new file mode 100644
index 0000000..fad8023
--- /dev/null
+++ b/multihashing.cc
@@ -0,0 +1,260 @@
+#include <node.h>
+#include <node_buffer.h>
+#include <v8.h>
+
+extern "C" {
+    #include "bcrypt.h"
+    #include "keccak.h"
+    #include "quark.h"
+    #include "scrypt.h"
+    #include "scryptjane.h"
+    #include "scryptn.h"
+    #include "skein.h"
+    #include "x11.h"
+
+
+    // Derives the scrypt N-factor from the 32-bit timestamp stored at bytes
+    // 68..71 of blockheader. Timestamps up to 1389306217 give 10; later ones grow
+    // the factor with elapsed time, clamped to [10, 30].
+    static unsigned char getNfactor(char* blockheader) {
+        const unsigned char minNfactor = 10;
+        const unsigned char maxNfactor = 30;
+        // Byte-wise little-endian read: same value as a direct 32-bit load on
+        // little-endian hosts, without the unaligned, type-punned access.
+        const unsigned char* ts = (const unsigned char*)&blockheader[68];
+        unsigned long nTimestamp = (unsigned long)ts[0] | ((unsigned long)ts[1] << 8) |
+                                   ((unsigned long)ts[2] << 16) | ((unsigned long)ts[3] << 24);
+
+        if (nTimestamp <= 1389306217)
+            return minNfactor;
+
+        // Signed, so the "- 2670" below can go negative and be clamped instead of
+        // wrapping around as it would in unsigned arithmetic.
+        int64_t s = (int64_t)(nTimestamp - 1389306217);
+        int l = 0;
+        while ((s >> 1) > 3) {
+            l += 1;
+            s >>= 1;
+        }
+        s &= 3;
+
+        int n = (int)((l * 158 + s * 28 - 2670) / 100);
+        if (n < minNfactor) n = minNfactor;
+        if (n > maxNfactor) n = maxNfactor;
+        return (unsigned char)n;
+    }
+
+    #define max(a,b)            (((a) > (b)) ? (a) : (b))
+    #define min(a,b)            (((a) < (b)) ? (a) : (b))
+    // Maps a timestamp to the scrypt-jane N-factor: 4 at or before
+    // nChainStartTime, afterwards a value stepped from the elapsed seconds and
+    // clamped to [4, 30].
+    unsigned char GetNfactorJane(int nTimestamp, int nChainStartTime) {
+            const unsigned char floorFactor = 4;
+            const unsigned char ceilFactor = 30;
+
+            if (nTimestamp <= nChainStartTime)
+                    return 4;
+
+            // Halve the elapsed time until it is at most 7, counting the halvings.
+            int elapsed = nTimestamp - nChainStartTime;
+            int halvings = 0;
+            for (; (elapsed >> 1) > 3; elapsed >>= 1)
+                    ++halvings;
+            elapsed &= 3;
+
+            int raw = (halvings * 170 + elapsed * 25 - 2320) / 100;
+            if (raw < 0)
+                    raw = 0;
+            if (raw > 255)
+                    printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, raw);
+
+            // Narrowing keeps only the low 8 bits of raw before the clamp is applied.
+            const unsigned char factor = (unsigned char)raw;
+            if (factor < floorFactor)
+                    return floorFactor;
+            if (factor > ceilFactor)
+                    return ceilFactor;
+            return factor;
+    }
+
+    // Hands input to scrypt() as both of its leading (pointer, length) pairs; 32 is presumably res's byte length.
+    void scryptjane_hash(const void* input, size_t inputlen, uint32_t *res, unsigned char Nfactor)
+    {
+            const unsigned char* bytes = (const unsigned char*)input;
+            scrypt(bytes, inputlen, bytes, inputlen, Nfactor, 0, 0, (unsigned char*)res, 32);
+    }
+}
+
+using namespace node;
+using namespace v8;
+
+Handle<Value> except(const char* msg) {  // throws a JS Error whose message is msg
+    return ThrowException(Exception::Error(String::New(msg)));  // returns ThrowException's handle
+}
+
+// quark(buffer): runs quark_hash over the Buffer in args[0] and returns the
+// 32-byte output as a new Buffer.
+Handle<Value> quark(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+    if (Buffer::Length(target) < 80)  // quark_hash always reads 80 bytes
+        return except("Argument should be a buffer of at least 80 bytes.");
+
+    // Stack buffer: Buffer::New(data, len) copies the bytes, so nothing is left to free.
+    char output[32];
+    quark_hash(Buffer::Data(target), output);
+    return scope.Close(Buffer::New(output, 32)->handle_);
+}
+
+// x11(buffer): runs x11_hash over the Buffer in args[0] and returns the 32-byte
+// output as a new Buffer.
+// NOTE(review): x11_hash receives no length; confirm how many input bytes it
+// reads and reject shorter Buffers here.
+Handle<Value> x11(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+
+    // Stack buffer: Buffer::New(data, len) copies the bytes, so nothing is left to free.
+    char output[32];
+    x11_hash(Buffer::Data(target), output);
+    return scope.Close(Buffer::New(output, 32)->handle_);
+}
+
+// scrypt(buffer): runs scrypt_1024_1_1_256 over the Buffer in args[0] and returns
+// the 32-byte output as a new Buffer.
+// NOTE(review): scrypt_1024_1_1_256 receives no length; confirm how many input
+// bytes it reads and reject shorter Buffers here.
+Handle<Value> scrypt(const Arguments& args) {
+   HandleScope scope;
+
+   if (args.Length() < 1)
+       return except("You must provide one argument.");
+
+   Local<Object> target = args[0]->ToObject();
+   if(!Buffer::HasInstance(target))
+       return except("Argument should be a buffer object.");
+
+   // Stack buffer: Buffer::New(data, len) copies the bytes, so nothing is left to free.
+   char output[32];
+   scrypt_1024_1_1_256(Buffer::Data(target), output);
+   return scope.Close(Buffer::New(output, 32)->handle_);
+}
+
+
+
+// scryptn(buffer): picks N via getNfactor, runs scrypt_N_1_1_256 over the Buffer in
+// args[0] and returns the 32-byte output as a new Buffer.
+Handle<Value> scryptn(const Arguments& args) {
+   HandleScope scope;
+
+   if (args.Length() < 1)
+       return except("You must provide one argument.");
+
+   Local<Object> target = args[0]->ToObject();
+   if(!Buffer::HasInstance(target))
+       return except("Argument should be a buffer object.");
+   // getNfactor reads bytes 68..71; 80 is the input size the other hashes here use.
+   if (Buffer::Length(target) < 80)
+       return except("Argument should be a buffer of at least 80 bytes.");
+
+   char * input = Buffer::Data(target);
+   // Stack buffer: Buffer::New(data, len) copies the bytes, so nothing is left to free.
+   char output[32];
+   // 1u: getNfactor may return 30, and 1 << 31 overflows a signed int.
+   unsigned int N = 1u << (getNfactor(input) + 1);
+   scrypt_N_1_1_256(input, output, N);
+   return scope.Close(Buffer::New(output, 32)->handle_);
+}
+
+// scryptjane(buffer, timestamp, nChainStartTime): hashes the first 80 bytes of the
+// Buffer with scryptjane_hash, using the N-factor GetNfactorJane derives from the
+// two numbers, and returns the 32-byte result as a new Buffer.
+Handle<Value> scryptjane(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 3)
+        return except("You must provide three arguments: buffer, timestamp as number, and nChainStartTime as number");
+
+    Local<Object> target = args[0]->ToObject();
+    if(!Buffer::HasInstance(target))
+        return except("First should be a buffer object.");
+    if (Buffer::Length(target) < 80)  // scryptjane_hash is told to read 80 bytes
+        return except("First should be a buffer of at least 80 bytes.");
+
+    // Each numeric argument gets its own handle and is truncated to int.
+    Local<Number> timestampArg = args[1]->ToNumber();
+    int timestamp = timestampArg->Value();
+    Local<Number> chainStartArg = args[2]->ToNumber();
+    int nChainStartTime = chainStartArg->Value();
+
+    // Stack buffer of eight 32-bit words: Buffer::New(data, len) copies the bytes.
+    uint32_t output[8];
+    scryptjane_hash(Buffer::Data(target), 80, output, GetNfactorJane(timestamp, nChainStartTime));
+    return scope.Close(Buffer::New((char *)output, sizeof(output))->handle_);
+}
+
+// keccak(buffer): runs keccak_hash over the Buffer in args[0] and returns the
+// 32-byte output as a new Buffer.
+Handle<Value> keccak(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+    if (Buffer::Length(target) < 80)  // keccak_hash always reads 80 bytes
+        return except("Argument should be a buffer of at least 80 bytes.");
+
+    // Stack buffer: Buffer::New(data, len) copies the bytes, so nothing is left to free.
+    char output[32];
+    keccak_hash(Buffer::Data(target), output);
+    return scope.Close(Buffer::New(output, 32)->handle_);
+}
+
+
+// bcrypt(buffer): runs bcrypt_hash over the Buffer in args[0] and returns the
+// 32-byte output as a new Buffer.
+// NOTE(review): bcrypt_hash receives no length and is handed input offsets up to
+// 4 * BF_N; confirm the size it needs and reject shorter Buffers here.
+Handle<Value> bcrypt(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+
+    // Stack buffer: Buffer::New(data, len) copies the bytes, so nothing is left to free.
+    char output[32];
+    bcrypt_hash(Buffer::Data(target), output);
+    return scope.Close(Buffer::New(output, 32)->handle_);
+}
+
+// Module entry point: exports each binding under its own name (bcrypt_hash is defined in bcrypt.c).
+void init(Handle<Object> exports) {
+    exports->Set(String::NewSymbol("quark"), FunctionTemplate::New(quark)->GetFunction());
+    exports->Set(String::NewSymbol("x11"), FunctionTemplate::New(x11)->GetFunction());
+    exports->Set(String::NewSymbol("scrypt"), FunctionTemplate::New(scrypt)->GetFunction());
+    exports->Set(String::NewSymbol("scryptn"), FunctionTemplate::New(scryptn)->GetFunction());
+    exports->Set(String::NewSymbol("scryptjane"), FunctionTemplate::New(scryptjane)->GetFunction());
+    exports->Set(String::NewSymbol("keccak"), FunctionTemplate::New(keccak)->GetFunction());
+    exports->Set(String::NewSymbol("bcrypt"), FunctionTemplate::New(bcrypt)->GetFunction());
+}
+NODE_MODULE(multihashing, init)
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..6160b2f
--- /dev/null
+++ b/package.json
@@ -0,0 +1,30 @@
+{
+    "name": "multi-hashing",
+    "version": "0.0.1",
+    "main": "multihashing",
+    "author": {
+        "name": "Matthew Little",
+        "email": "zone117x@gmail.com"
+    },
+    "repository": {
+        "type": "git",
+        "url": "https://github.com/zone117x/node-multi-hashing.git"
+    },
+    "dependencies" : {
+        "bindings" : "*"
+    },
+    "keywords": [
+        "scrypt",
+        "scrypt-jane",
+        "scrypt-n",
+        "x11",
+        "quark",
+        "keccak_hash",
+        "skein",
+        "bcrypt",
+        "keccak",
+        "blake",
+        "shavite",
+        "fugue"
+    ]
+}
\ No newline at end of file
diff --git a/quark.c b/quark.c
new file mode 100644
index 0000000..fc6a022
--- /dev/null
+++ b/quark.c
@@ -0,0 +1,211 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2013 Neisklar,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "quark.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "sha3/sph_blake.h"
+#include "sha3/sph_bmw.h"
+#include "sha3/sph_groestl.h"
+#include "sha3/sph_jh.h"
+#include "sha3/sph_keccak.h"
+#include "sha3/sph_skein.h"
+
+
+/* Reads the four bytes at pp as one big-endian 32-bit value (pp[0] is most significant). */
+static __inline uint32_t
+be32dec(const void *pp)
+{
+	const uint8_t *b = (const uint8_t *)pp;
+	uint32_t hi = ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16);
+	return hi | ((uint32_t)b[2] << 8) | (uint32_t)b[3];
+}
+
+/* Stores x at pp as four bytes, most significant byte first. */
+static __inline void
+be32enc(void *pp, uint32_t x)
+{
+	uint8_t *out = (uint8_t *)pp;
+	int i;
+
+	for (i = 3; i >= 0; i--, x >>= 8)
+		out[i] = (uint8_t)(x & 0xff);
+}
+
+/* Reads the four bytes at pp as one little-endian 32-bit value (pp[0] is least significant). */
+static __inline uint32_t
+le32dec(const void *pp)
+{
+	const uint8_t *b = (const uint8_t *)pp;
+	uint32_t hi = ((uint32_t)b[3] << 24) | ((uint32_t)b[2] << 16);
+	return hi | ((uint32_t)b[1] << 8) | (uint32_t)b[0];
+}
+
+/* Stores x at pp as four bytes, least significant byte first. */
+static __inline void
+le32enc(void *pp, uint32_t x)
+{
+	uint8_t *out = (uint8_t *)pp;
+	int i;
+
+	for (i = 0; i < 4; i++, x >>= 8)
+		out[i] = (uint8_t)(x & 0xff);
+}
+
+/*
+ * Serialise len/4 words from src into dst, each as four big-endian bytes.
+ * len is a byte count and is assumed to be a multiple of 4.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+	const uint32_t *end = src + len / 4;
+
+	for (; src != end; src++, dst += 4)
+		be32enc(dst, *src);
+}
+
+/*
+ * Parse len bytes from src as big-endian 32-bit words and store the len/4
+ * results in dst.  len is assumed to be a multiple of 4.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	uint32_t *end = dst + len / 4;
+
+	for (; dst != end; dst++, src += 4)
+		*dst = be32dec(src);
+}
+
+/*
+ * Chains nine sph_*512 hash rounds, starting from the first 80 bytes of input,
+ * and copies the first 32 bytes of the last digest buffer to output.
+ *
+ * Six rounds are fixed.  At three points the function applied next is chosen
+ * by testing the value-8 bit of the first 32-bit word of the previous digest.
+ * Each round after the first hashes 64 bytes of the previous round's output
+ * buffer.
+ */
+void quark_hash(const char* input, char* output)
+{
+    /* One context per primitive; each is initialised again before every use. */
+    sph_blake512_context     blake;
+    sph_bmw512_context       bmw;
+    sph_groestl512_context   groestl;
+    sph_jh512_context        jh;
+    sph_keccak512_context    keccak;
+    sph_skein512_context     skein;
+
+    /* Bit examined at the three branch points. */
+    const uint32_t selector = 8;
+
+    /* Two 64-byte digest buffers that alternate as source and destination. */
+    uint32_t ping[16];
+    uint32_t pong[16];
+
+    /* Round 1: blake over the 80 input bytes. */
+    sph_blake512_init(&blake);
+    sph_blake512(&blake, input, 80);
+    sph_blake512_close(&blake, ping);
+
+    /* Round 2: bmw. */
+    sph_bmw512_init(&bmw);
+    sph_bmw512(&bmw, ping, 64);
+    sph_bmw512_close(&bmw, pong);
+
+    /* Round 3: groestl when the selector bit is set, skein otherwise. */
+    if (pong[0] & selector)
+    {
+        sph_groestl512_init(&groestl);
+        sph_groestl512(&groestl, pong, 64);
+        sph_groestl512_close(&groestl, ping);
+    }
+    else
+    {
+        sph_skein512_init(&skein);
+        sph_skein512(&skein, pong, 64);
+        sph_skein512_close(&skein, ping);
+    }
+
+    /* Round 4: groestl. */
+    sph_groestl512_init(&groestl);
+    sph_groestl512(&groestl, ping, 64);
+    sph_groestl512_close(&groestl, pong);
+
+    /* Round 5: jh. */
+    sph_jh512_init(&jh);
+    sph_jh512(&jh, pong, 64);
+    sph_jh512_close(&jh, ping);
+
+    /* Round 6: blake when the selector bit is set, bmw otherwise. */
+    if (ping[0] & selector)
+    {
+        sph_blake512_init(&blake);
+        sph_blake512(&blake, ping, 64);
+        sph_blake512_close(&blake, pong);
+    }
+    else
+    {
+        sph_bmw512_init(&bmw);
+        sph_bmw512(&bmw, ping, 64);
+        sph_bmw512_close(&bmw, pong);
+    }
+
+    /* Round 7: keccak. */
+    sph_keccak512_init(&keccak);
+    sph_keccak512(&keccak, pong, 64);
+    sph_keccak512_close(&keccak, ping);
+
+    /* Round 8: skein. */
+    sph_skein512_init(&skein);
+    sph_skein512(&skein, ping, 64);
+    sph_skein512_close(&skein, pong);
+
+    /* Round 9: keccak when the selector bit is set, jh otherwise. */
+    if (pong[0] & selector)
+    {
+        sph_keccak512_init(&keccak);
+        sph_keccak512(&keccak, pong, 64);
+        sph_keccak512_close(&keccak, ping);
+    }
+    else
+    {
+        sph_jh512_init(&jh);
+        sph_jh512(&jh, pong, 64);
+        sph_jh512_close(&jh, ping);
+    }
+
+    /* Only the first 32 bytes of the final buffer are returned. */
+    memcpy(output, ping, 32);
+}
+
+
diff --git a/quark.h b/quark.h
new file mode 100644
index 0000000..5b76749
--- /dev/null
+++ b/quark.h
@@ -0,0 +1,6 @@
+#ifndef QUARK_H
+#define QUARK_H
+
+void quark_hash(const char* input, char* output); /* quark.c copies 32 bytes of the final chained digest into output; input length is fixed inside quark.c - TODO confirm */
+
+#endif
diff --git a/scrypt-jane/scrypt-jane-chacha.h b/scrypt-jane/scrypt-jane-chacha.h
new file mode 100644
index 0000000..41d96e5
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-chacha.h
@@ -0,0 +1,132 @@
+#define SCRYPT_MIX_BASE "ChaCha20/8"
+
+typedef uint32_t scrypt_mix_word_t; /* the mix function works on 32-bit words */
+
+#define SCRYPT_WORDTO8_LE U32TO8_LE
+#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
+
+#define SCRYPT_BLOCK_BYTES 64
+#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) /* 64 / 4 = 16 words per block */
+
+/* must have these here in case block bytes is ever != 64 */
+#include "scrypt-jane-romix-basic.h"
+
+#include "scrypt-jane-mix_chacha-avx.h" /* include order matters: under SCRYPT_CHOOSE_COMPILETIME a variant header is skipped once an earlier one has set SCRYPT_CHACHA_INCLUDED */
+#include "scrypt-jane-mix_chacha-ssse3.h"
+#include "scrypt-jane-mix_chacha-sse2.h"
+#include "scrypt-jane-mix_chacha.h"
+
+#if defined(SCRYPT_CHACHA_AVX) /* SCRYPT_CHACHA_<variant> is defined by the variant header only when that variant was compiled in */
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
+	#define SCRYPT_MIX_FN chacha_core_avx
+	#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
+	#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
+	#include "scrypt-jane-romix-template.h" /* instantiates the ROMix code for the names above; presumably also #undefs them so the next variant can redefine - verify in the template */
+#endif
+
+#if defined(SCRYPT_CHACHA_SSSE3)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
+	#define SCRYPT_MIX_FN chacha_core_ssse3
+	#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
+	#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_CHACHA_SSE2)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
+	#define SCRYPT_MIX_FN chacha_core_sse2
+	#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
+	#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+/* cpu agnostic: always instantiated; no SCRYPT_CHUNKMIX_FN is set here, so the template presumably falls back to its own default - verify */
+#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
+#define SCRYPT_MIX_FN chacha_core_basic
+#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
+#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
+#include "scrypt-jane-romix-template.h"
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+static scrypt_ROMixfn /* returns the ROMix implementation to run, chosen from the flags reported by detect_cpu() */
+scrypt_getROMix() {
+	size_t cpuflags = detect_cpu();
+
+#if defined(SCRYPT_CHACHA_AVX) /* variants are tried in the order AVX, SSSE3, SSE2; only compiled-in ones are considered */
+	if (cpuflags & cpu_avx)
+		return scrypt_ROMix_avx;
+	else /* this else binds to whatever statement survives preprocessing below (next if, or the final return) */
+#endif
+
+#if defined(SCRYPT_CHACHA_SSSE3)
+	if (cpuflags & cpu_ssse3)
+		return scrypt_ROMix_ssse3;
+	else
+#endif
+
+#if defined(SCRYPT_CHACHA_SSE2)
+	if (cpuflags & cpu_sse2)
+		return scrypt_ROMix_sse2;
+	else
+#endif
+
+	return scrypt_ROMix_basic; /* portable fallback when no compiled-in SIMD variant matches the CPU */
+}
+#endif
+
+
+#if defined(SCRYPT_TEST_SPEED)
+static size_t /* returns the OR of cpu_* flags for every SIMD variant compiled into this build (independent of the running CPU) */
+available_implementations() {
+	size_t flags = 0;
+
+#if defined(SCRYPT_CHACHA_AVX)
+	flags |= cpu_avx;
+#endif
+
+#if defined(SCRYPT_CHACHA_SSSE3)
+	flags |= cpu_ssse3;
+#endif
+
+#if defined(SCRYPT_CHACHA_SSE2)
+		flags |= cpu_sse2;
+#endif
+
+	return flags; /* 0 when only the basic implementation was built */
+}
+#endif
+
+static int /* self-test: runs every compiled-in ChunkMix variant the CPU supports against one shared expected vector */
+scrypt_test_mix() {
+	static const uint8_t expected[16] = {
+		0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a,
+	};
+
+	int ret = 1; /* ANDed with each instance result, so it stays 1 only if every tested variant passes (assumes instances return 1 on success - confirm) */
+	size_t cpuflags = detect_cpu();
+
+#if defined(SCRYPT_CHACHA_AVX)
+	if (cpuflags & cpu_avx)
+		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected);
+#endif
+
+#if defined(SCRYPT_CHACHA_SSSE3)
+	if (cpuflags & cpu_ssse3)
+		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, scrypt_romix_nop, scrypt_romix_nop, expected);
+#endif
+
+#if defined(SCRYPT_CHACHA_SSE2)
+	if (cpuflags & cpu_sse2)
+		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, scrypt_romix_nop, scrypt_romix_nop, expected);
+#endif
+
+#if defined(SCRYPT_CHACHA_BASIC) /* the basic variant is tested without a CPU-flag check and uses the endian-converting tangle functions */
+	ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
+#endif
+
+	return ret;
+}
+
diff --git a/scrypt-jane/scrypt-jane-hash.h b/scrypt-jane/scrypt-jane-hash.h
new file mode 100644
index 0000000..db5c1db
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-hash.h
@@ -0,0 +1,48 @@
+#if defined(SCRYPT_BLAKE512) /* exactly one hash backend is included: the first SCRYPT_<hash> macro found in this chain wins */
+#include "scrypt-jane-hash_blake512.h"
+#elif defined(SCRYPT_BLAKE256)
+#include "scrypt-jane-hash_blake256.h"
+#elif defined(SCRYPT_SHA512)
+#include "scrypt-jane-hash_sha512.h"
+#elif defined(SCRYPT_SHA256)
+#include "scrypt-jane-hash_sha256.h"
+#elif defined(SCRYPT_SKEIN512)
+#include "scrypt-jane-hash_skein512.h"
+#elif defined(SCRYPT_KECCAK512) || defined(SCRYPT_KECCAK256)
+#include "scrypt-jane-hash_keccak.h" /* one header serves both Keccak sizes; it selects on SCRYPT_KECCAK256 */
+#else
+	#define SCRYPT_HASH "ERROR" /* placeholder definitions; apparently here so later code still parses and the #error below is the only diagnostic */
+	#define SCRYPT_HASH_BLOCK_SIZE 64
+	#define SCRYPT_HASH_DIGEST_SIZE 64
+	typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state;
+	typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+	static void scrypt_hash_init(scrypt_hash_state *S) {}
+	static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {}
+	static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {}
+	static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0};
+	#error must define a hash function!
+#endif
+
+#include "scrypt-jane-pbkdf2.h"
+
+#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
+
+static int /* self-test: hashes every prefix of a counting message, hashes the concatenated digests, and checks the result */
+scrypt_test_hash() {
+	scrypt_hash_state st;
+	scrypt_hash_digest hash, final;
+	uint8_t msg[SCRYPT_TEST_HASH_LEN];
+	size_t i;
+
+	for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++)
+		msg[i] = (uint8_t)i; /* byte values wrap at 256, so msg[256] == 0 */
+
+	scrypt_hash_init(&st);
+	for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) { /* i is a length, 0..257 inclusive; 257 == sizeof(msg), so no read is out of bounds */
+		scrypt_hash(hash, msg, i); /* one-shot hash of the first i bytes; scrypt_hash is defined outside this chunk */
+		scrypt_hash_update(&st, hash, sizeof(hash));
+	}
+	scrypt_hash_finish(&st, final);
+	return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); /* presumably nonzero on match - confirm against scrypt_verify */
+}
+
diff --git a/scrypt-jane/scrypt-jane-hash_keccak.h b/scrypt-jane/scrypt-jane-hash_keccak.h
new file mode 100644
index 0000000..7ed5574
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-hash_keccak.h
@@ -0,0 +1,168 @@
+#if defined(SCRYPT_KECCAK256) /* Keccak-512 is the default when SCRYPT_KECCAK256 is not defined */
+	#define SCRYPT_HASH "Keccak-256"
+	#define SCRYPT_HASH_DIGEST_SIZE 32
+#else
+	#define SCRYPT_HASH "Keccak-512"
+	#define SCRYPT_HASH_DIGEST_SIZE 64
+#endif
+#define SCRYPT_KECCAK_F 1600 /* permutation width in bits */
+#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 256=512, 512=1024 */
+#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 256=1088, 512=576 */
+#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) /* rate in bytes: 136 for Keccak-256, 72 for Keccak-512 */
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+	uint64_t state[SCRYPT_KECCAK_F / 64]; /* 25 lanes of 64 bits */
+	uint32_t leftover; /* bytes currently held in buffer; always < SCRYPT_HASH_BLOCK_SIZE between calls */
+	uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; /* partial input block awaiting more data */
+} scrypt_hash_state;
+
+static const uint64_t keccak_round_constants[24] = { /* one constant per round; XORed into lane 0 by the iota step of keccak_block */
+	0x0000000000000001ull, 0x0000000000008082ull,
+	0x800000000000808aull, 0x8000000080008000ull,
+	0x000000000000808bull, 0x0000000080000001ull,
+	0x8000000080008081ull, 0x8000000000008009ull,
+	0x000000000000008aull, 0x0000000000000088ull,
+	0x0000000080008009ull, 0x000000008000000aull,
+	0x000000008000808bull, 0x800000000000008bull,
+	0x8000000000008089ull, 0x8000000000008003ull,
+	0x8000000000008002ull, 0x8000000000000080ull,
+	0x000000000000800aull, 0x800000008000000aull,
+	0x8000000080008081ull, 0x8000000000008080ull,
+	0x0000000080000001ull, 0x8000000080008008ull
+};
+
+static void /* absorbs one SCRYPT_HASH_BLOCK_SIZE-byte block from in into S->state, then runs 24 permutation rounds in place */
+keccak_block(scrypt_hash_state *S, const uint8_t *in) {
+	size_t i;
+	uint64_t *s = S->state, t[5], u[5], v, w;
+
+	/* absorb input */
+	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8) /* only the first rate/64 lanes receive input; the rest are untouched here */
+		s[i] ^= U8TO64_LE(in);
+	
+	for (i = 0; i < 24; i++) {
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		u[0] = t[4] ^ ROTL64(t[1], 1);
+		u[1] = t[0] ^ ROTL64(t[2], 1);
+		u[2] = t[1] ^ ROTL64(t[3], 1);
+		u[3] = t[2] ^ ROTL64(t[4], 1);
+		u[4] = t[3] ^ ROTL64(t[0], 1);
+
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
+		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
+		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
+		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
+		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		v = s[ 1]; /* done in place as one chain: each lane is overwritten right after it was read; v keeps s[1] for the last link; s[0] is not moved */
+		s[ 1] = ROTL64(s[ 6], 44);
+		s[ 6] = ROTL64(s[ 9], 20);
+		s[ 9] = ROTL64(s[22], 61);
+		s[22] = ROTL64(s[14], 39);
+		s[14] = ROTL64(s[20], 18);
+		s[20] = ROTL64(s[ 2], 62);
+		s[ 2] = ROTL64(s[12], 43);
+		s[12] = ROTL64(s[13], 25);
+		s[13] = ROTL64(s[19],  8);
+		s[19] = ROTL64(s[23], 56);
+		s[23] = ROTL64(s[15], 41);
+		s[15] = ROTL64(s[ 4], 27);
+		s[ 4] = ROTL64(s[24], 14);
+		s[24] = ROTL64(s[21],  2);
+		s[21] = ROTL64(s[ 8], 55);
+		s[ 8] = ROTL64(s[16], 45);
+		s[16] = ROTL64(s[ 5], 36);
+		s[ 5] = ROTL64(s[ 3], 28);
+		s[ 3] = ROTL64(s[18], 21);
+		s[18] = ROTL64(s[17], 15);
+		s[17] = ROTL64(s[11], 10);
+		s[11] = ROTL64(s[ 7],  6);
+		s[ 7] = ROTL64(s[10],  3);
+		s[10] = ROTL64(    v,  1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2]; v and w cache a row's first two lanes because they are overwritten before its last two lanes are computed */
+		v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
+		v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
+		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
+		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
+		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= keccak_round_constants[i];
+	}
+}
+
+static void /* resets S for a new message: state lanes, leftover count and buffer are all zeroed */
+scrypt_hash_init(scrypt_hash_state *S) {
+	memset(S, 0, sizeof(*S));
+}
+
+static void /* feeds inlen bytes from in into the hash; full blocks are absorbed immediately, any remainder is kept in S->buffer */
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+	size_t want;
+
+	/* handle the previous data */
+	if (S->leftover) {
+		want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); /* bytes needed to complete the buffered block */
+		want = (want < inlen) ? want : inlen;
+		memcpy(S->buffer + S->leftover, in, want);
+		S->leftover += (uint32_t)want;
+		if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+			return; /* still not a full block: all input has been consumed into the buffer */
+		in += want;
+		inlen -= want;
+		keccak_block(S, S->buffer);
+	}
+
+	/* handle the current data */
+	while (inlen >= SCRYPT_HASH_BLOCK_SIZE) { /* absorb whole blocks straight from the caller's memory, no copy */
+		keccak_block(S, in);
+		in += SCRYPT_HASH_BLOCK_SIZE;
+		inlen -= SCRYPT_HASH_BLOCK_SIZE;
+	}
+
+	/* handle leftover data */
+	S->leftover = (uint32_t)inlen; /* here inlen < SCRYPT_HASH_BLOCK_SIZE, so the cast cannot truncate */
+	if (S->leftover)
+		memcpy(S->buffer, in, S->leftover);
+}
+
+static void /* pads and absorbs the final block, then writes SCRYPT_HASH_DIGEST_SIZE bytes to hash; S is not reset afterwards */
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+	size_t i;
+
+	S->buffer[S->leftover] = 0x01; /* padding: 0x01 right after the data, zero fill, 0x80 in the block's last byte */
+	memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1));
+	S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80; /* OR, not assignment: when leftover == BLOCK_SIZE - 1 both pad bits share this byte (0x81) */
+	keccak_block(S, S->buffer);
+
+	for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) { /* digest = leading state lanes serialized little-endian */
+		U64TO8_LE(&hash[i], S->state[i / 8]);
+	}
+}
+
+#if defined(SCRYPT_KECCAK256) /* expected digest that scrypt_test_hash compares against, one vector per Keccak size */
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+	0x26,0xb7,0x10,0xb3,0x66,0xb1,0xd1,0xb1,0x25,0xfc,0x3e,0xe3,0x1e,0x33,0x1d,0x19,
+	0x94,0xaa,0x63,0x7a,0xd5,0x77,0x29,0xb4,0x27,0xe9,0xe0,0xf4,0x19,0xba,0x68,0xea,
+};
+#else
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+	0x17,0xc7,0x8c,0xa0,0xd9,0x08,0x1d,0xba,0x8a,0xc8,0x3e,0x07,0x90,0xda,0x91,0x88,
+	0x25,0xbd,0xd3,0xf8,0x78,0x4a,0x8d,0x5e,0xe4,0x96,0x9c,0x01,0xf3,0xeb,0xdc,0x12,
+	0xea,0x35,0x57,0xba,0x94,0xb8,0xe9,0xb9,0x27,0x45,0x0a,0x48,0x5c,0x3d,0x69,0xf0,
+	0xdb,0x22,0x38,0xb5,0x52,0x22,0x29,0xea,0x7a,0xb2,0xe6,0x07,0xaa,0x37,0x4d,0xe6,
+};
+#endif
+
diff --git a/scrypt-jane/scrypt-jane-hash_sha256.h b/scrypt-jane/scrypt-jane-hash_sha256.h
new file mode 100644
index 0000000..d06d3e1
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-hash_sha256.h
@@ -0,0 +1,135 @@
+#define SCRYPT_HASH "SHA-2-256"
+#define SCRYPT_HASH_BLOCK_SIZE 64 /* bytes per compression-function input block */
+#define SCRYPT_HASH_DIGEST_SIZE 32
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+	uint32_t H[8]; /* running chaining value */
+	uint64_t T; /* message bits compressed so far; sha256_blocks adds 512 per block */
+	uint32_t leftover; /* bytes currently held in buffer; always < SCRYPT_HASH_BLOCK_SIZE between calls */
+	uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; /* partial input block awaiting more data */
+} scrypt_hash_state;
+
+static const uint32_t sha256_constants[64] = { /* per-round additive constants; STEP(i) adds sha256_constants[i] */
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define Ch(x,y,z)  (z ^ (x & (y ^ z))) /* bitwise choose: y where x is 1, z where x is 0; arguments are unparenthesized, pass simple expressions only */
+#define Maj(x,y,z) (((x | y) & z) | (x & y)) /* bitwise majority of x, y, z */
+#define S0(x)      (ROTR32(x,  2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) /* S0/S1 are applied to the working registers in STEP */
+#define S1(x)      (ROTR32(x,  6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
+#define G0(x)      (ROTR32(x,  7) ^ ROTR32(x, 18) ^ (x >>  3)) /* G0/G1 are used by W1 to expand the message schedule */
+#define G1(x)      (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10))
+#define W0(in,i)   (U8TO32_BE(&in[i * 4])) /* schedule words 0..15: big-endian loads from the input block */
+#define W1(i)      (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) /* schedule words 16..63; reads the local array w */
+#define STEP(i) \
+	t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
+	t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \
+	r[7] = r[6]; \
+	r[6] = r[5]; \
+	r[5] = r[4]; \
+	r[4] = r[3] + t0; \
+	r[3] = r[2]; \
+	r[2] = r[1]; \
+	r[1] = r[0]; \
+	r[0] = t0 + t1;
+
+static void /* compresses `blocks` consecutive 64-byte blocks starting at in into S->H and advances the bit counter S->T */
+sha256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
+	uint32_t r[8], w[64], t0, t1;
+	size_t i;
+
+	for (i = 0; i < 8; i++) r[i] = S->H[i]; /* loaded once: the feed-forward below leaves r equal to S->H after every block */
+
+	while (blocks--) {
+		for (i =  0; i < 16; i++) { w[i] = W0(in, i); }
+		for (i = 16; i < 64; i++) { w[i] = W1(i); }
+		for (i =  0; i < 64; i++) { STEP(i); }
+		for (i =  0; i <  8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } /* feed-forward: add the previous chaining value and store it back */
+		S->T += SCRYPT_HASH_BLOCK_SIZE * 8;
+		in += SCRYPT_HASH_BLOCK_SIZE;
+	}
+}
+
+static void /* resets S for a new message: initial chaining value, zero bit count, empty buffer (buffer bytes themselves are not cleared) */
+scrypt_hash_init(scrypt_hash_state *S) {
+	S->H[0] = 0x6a09e667;
+	S->H[1] = 0xbb67ae85;
+	S->H[2] = 0x3c6ef372;
+	S->H[3] = 0xa54ff53a;
+	S->H[4] = 0x510e527f;
+	S->H[5] = 0x9b05688c;
+	S->H[6] = 0x1f83d9ab;
+	S->H[7] = 0x5be0cd19;
+	S->T = 0;
+	S->leftover = 0;
+}
+
+static void /* feeds inlen bytes from in into the hash; full blocks are compressed immediately, any remainder is kept in S->buffer */
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+	size_t blocks, want;
+
+	/* handle the previous data */
+	if (S->leftover) {
+		want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); /* bytes needed to complete the buffered block */
+		want = (want < inlen) ? want : inlen;
+		memcpy(S->buffer + S->leftover, in, want);
+		S->leftover += (uint32_t)want;
+		if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
+			return; /* still not a full block: all input has been consumed into the buffer */
+		in += want;
+		inlen -= want;
+		sha256_blocks(S, S->buffer, 1);
+	}
+
+	/* handle the current data */
+	blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); /* despite the name this is a BYTE count: inlen rounded down to a multiple of the block size */
+	S->leftover = (uint32_t)(inlen - blocks);
+	if (blocks) {
+		sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); /* compress straight from the caller's memory, no copy */
+		in += blocks;
+	}
+
+	/* handle leftover data */
+	if (S->leftover)
+		memcpy(S->buffer, in, S->leftover);
+}
+
+static void /* pads the buffered tail, compresses the final block(s), and writes the 32-byte big-endian digest to hash; S is not reset */
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+	uint64_t t = S->T + (S->leftover * 8); /* total message length in bits, captured before the padding blocks bump S->T */
+
+	S->buffer[S->leftover] = 0x80; /* padding starts with a single 1 bit */
+	if (S->leftover <= 55) { /* 0x80 plus the 8-byte length still fit in this block */
+		memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
+	} else { /* no room for the length: zero-fill and compress this block, then use an extra all-zero block */
+		memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
+		sha256_blocks(S, S->buffer, 1);
+		memset(S->buffer, 0, 56);
+	}
+
+	U64TO8_BE(S->buffer + 56, t); /* length goes in the last 8 bytes of the final block */
+	sha256_blocks(S, S->buffer, 1);
+
+	U32TO8_BE(&hash[ 0], S->H[0]);
+	U32TO8_BE(&hash[ 4], S->H[1]);
+	U32TO8_BE(&hash[ 8], S->H[2]);
+	U32TO8_BE(&hash[12], S->H[3]);
+	U32TO8_BE(&hash[16], S->H[4]);
+	U32TO8_BE(&hash[20], S->H[5]);
+	U32TO8_BE(&hash[24], S->H[6]);
+	U32TO8_BE(&hash[28], S->H[7]);
+}
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { /* expected digest that scrypt_test_hash compares against for this backend */
+	0xee,0x36,0xae,0xa6,0x65,0xf0,0x28,0x7d,0xc9,0xde,0xd8,0xad,0x48,0x33,0x7d,0xbf,
+	0xcb,0xc0,0x48,0xfa,0x5f,0x92,0xfd,0x0a,0x95,0x6f,0x34,0x8e,0x8c,0x1e,0x73,0xad,
+};
diff --git a/scrypt-jane/scrypt-jane-mix_chacha-avx.h b/scrypt-jane/scrypt-jane-mix_chacha-avx.h
new file mode 100644
index 0000000..50d6e2d
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-mix_chacha-avx.h
@@ -0,0 +1,340 @@
+/* x86 */
+#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_AVX
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) /* 32-bit x86 AVX ChunkMix: Bout = mix of (Bin, optionally XORed with Bxor); Bxor may be NULL */
+asm_naked_fn(scrypt_ChunkMix_avx)
+	a1(push ebx)
+	a1(push edi)
+	a1(push esi)
+	a1(push ebp)
+	a2(mov ebp,esp) /* after four pushes the arguments sit at [ebp+20..32] */
+	a2(mov edi,[ebp+20]) /* edi = Bout */
+	a2(mov esi,[ebp+24]) /* esi = Bin */
+	a2(mov eax,[ebp+28]) /* eax = Bxor (may be 0) */
+	a2(mov ebx,[ebp+32]) /* ebx = r */
+	a2(sub esp,64) /* 64-byte scratch for the pre-round copy of X ... */
+	a2(and esp,~63) /* ... aligned so vmovdqa can be used on it */
+	a2(lea edx,[ebx*2])
+	a2(shl edx,6) /* edx = 2*r*64 = chunk size in bytes */
+	a2(lea ecx,[edx-64]) /* ecx = byte offset of the last block */
+	a2(and eax, eax) /* sets ZF iff Bxor == NULL; the vector loads below leave flags alone */
+	a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) /* xmm4/xmm5: vpshufb masks, per their names rotate-left by 16 and by 8 */
+	a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
+	a2(vmovdqa xmm0,[ecx+esi+0]) /* X = last block of Bin */
+	a2(vmovdqa xmm1,[ecx+esi+16])
+	a2(vmovdqa xmm2,[ecx+esi+32])
+	a2(vmovdqa xmm3,[ecx+esi+48])
+	a1(jz scrypt_ChunkMix_avx_no_xor1)
+	a3(vpxor xmm0,xmm0,[ecx+eax+0]) /* X ^= last block of Bxor */
+	a3(vpxor xmm1,xmm1,[ecx+eax+16])
+	a3(vpxor xmm2,xmm2,[ecx+eax+32])
+	a3(vpxor xmm3,xmm3,[ecx+eax+48])
+	a1(scrypt_ChunkMix_avx_no_xor1:)
+	a2(xor ecx,ecx) /* ecx = byte offset of current input block (i*64) */
+	a2(xor ebx,ebx) /* ebx toggles 0 <-> chunk bytes to select the output half */
+	a1(scrypt_ChunkMix_avx_loop:)
+		a2(and eax, eax) /* ZF iff Bxor == NULL, consumed by the jz after the vpxor group */
+		a3(vpxor xmm0,xmm0,[esi+ecx+0]) /* X ^= Bin[i] */
+		a3(vpxor xmm1,xmm1,[esi+ecx+16])
+		a3(vpxor xmm2,xmm2,[esi+ecx+32])
+		a3(vpxor xmm3,xmm3,[esi+ecx+48])
+		a1(jz scrypt_ChunkMix_avx_no_xor2)
+		a3(vpxor xmm0,xmm0,[eax+ecx+0]) /* X ^= Bxor[i] */
+		a3(vpxor xmm1,xmm1,[eax+ecx+16])
+		a3(vpxor xmm2,xmm2,[eax+ecx+32])
+		a3(vpxor xmm3,xmm3,[eax+ecx+48])
+		a1(scrypt_ChunkMix_avx_no_xor2:)
+		a2(vmovdqa [esp+0],xmm0) /* keep the round input for the final add */
+		a2(vmovdqa [esp+16],xmm1)
+		a2(vmovdqa [esp+32],xmm2)
+		a2(vmovdqa [esp+48],xmm3)
+		a2(mov eax,8) /* eax = remaining rounds; each loop pass does two */
+		a1(scrypt_chacha_avx_loop: )
+			a3(vpaddd xmm0,xmm0,xmm1)
+			a3(vpxor  xmm3,xmm3,xmm0)
+			a3(vpshufb xmm3,xmm3,xmm4)
+			a3(vpaddd xmm2,xmm2,xmm3)
+			a3(vpxor  xmm1,xmm1,xmm2)
+			a3(vpsrld xmm6,xmm1,20) /* rotate-left by 12 built from two shifts; xmm6 is the temporary */
+			a3(vpslld xmm1,xmm1,12)
+			a3(vpxor  xmm1,xmm1,xmm6)
+			a3(vpaddd xmm0,xmm0,xmm1)
+			a3(vpxor  xmm3,xmm3,xmm0)
+			a3(vpshufb xmm3,xmm3,xmm5)
+			a3(vpshufd xmm0,xmm0,0x93) /* lane rotations 0x93/0x4e/0x39 set up the second half-round */
+			a3(vpaddd xmm2,xmm2,xmm3)
+			a3(vpshufd xmm3,xmm3,0x4e)
+			a3(vpxor xmm1,xmm1,xmm2)
+			a3(vpshufd xmm2,xmm2,0x39)
+			a3(vpsrld xmm6,xmm1,25)
+			a3(vpslld xmm1,xmm1,7)
+			a3(vpxor xmm1,xmm1,xmm6)
+			a2(sub eax,2) /* flags from this sub survive the vector ops and drive the ja at the bottom */
+			a3(vpaddd xmm0,xmm0,xmm1)
+			a3(vpxor  xmm3,xmm3,xmm0)
+			a3(vpshufb xmm3,xmm3,xmm4)
+			a3(vpaddd xmm2,xmm2,xmm3)
+			a3(vpxor  xmm1,xmm1,xmm2)
+			a3(vpsrld xmm6,xmm1,20)
+			a3(vpslld xmm1,xmm1,12)
+			a3(vpxor xmm1,xmm1,xmm6)
+			a3(vpaddd xmm0,xmm0,xmm1)
+			a3(vpxor  xmm3,xmm3,xmm0)
+			a3(vpshufb xmm3,xmm3,xmm5)
+			a3(vpshufd xmm0,xmm0,0x39) /* inverse lane rotations restore the original layout */
+			a3(vpaddd xmm2,xmm2,xmm3)
+			a3(vpshufd xmm3,xmm3,0x4e) /* VEX form (was legacy pshufd): same xmm result, no SSE/AVX mixing */
+			a3(vpxor  xmm1,xmm1,xmm2)
+			a3(vpshufd xmm2,xmm2,0x93) /* VEX form (was legacy pshufd) */
+			a3(vpsrld xmm6,xmm1,25)
+			a3(vpslld xmm1,xmm1,7)
+			a3(vpxor  xmm1,xmm1,xmm6)
+			a1(ja scrypt_chacha_avx_loop)
+		a3(vpaddd xmm0,xmm0,[esp+0]) /* X += saved round input */
+		a3(vpaddd xmm1,xmm1,[esp+16])
+		a3(vpaddd xmm2,xmm2,[esp+32])
+		a3(vpaddd xmm3,xmm3,[esp+48])
+		a2(lea eax,[ebx+ecx]) /* output offset = ((half_toggle + i*64) & ~127) / 2: even blocks fill the first half of Bout, odd blocks the second */
+		a2(xor ebx,edx)
+		a2(and eax,~0x7f)
+		a2(add ecx,64)
+		a2(shr eax,1)
+		a2(add eax, edi)
+		a2(cmp ecx,edx) /* loop condition; the stores and mov below do not touch flags */
+		a2(vmovdqa [eax+0],xmm0)
+		a2(vmovdqa [eax+16],xmm1)
+		a2(vmovdqa [eax+32],xmm2)
+		a2(vmovdqa [eax+48],xmm3)
+		a2(mov eax,[ebp+28]) /* reload Bxor: eax was reused as round counter and output pointer */
+		a1(jne scrypt_ChunkMix_avx_loop)
+	a2(mov esp,ebp) /* drop the aligned scratch area */
+	a1(pop ebp)
+	a1(pop esi)
+	a1(pop edi)
+	a1(pop ebx)
+	a1(ret 16) /* callee pops the four 4-byte arguments */
+asm_naked_fn_end(scrypt_ChunkMix_avx)
+
+#endif
+
+
+
+/* x64 */
+#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_AVX
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) /* x86-64 AVX ChunkMix; the code reads Bout, Bin, Bxor, r from rdi, rsi, rdx, rcx; Bxor may be NULL */
+asm_naked_fn(scrypt_ChunkMix_avx)
+	a2(lea rcx,[rcx*2])
+	a2(shl rcx,6) /* rcx = 2*r*64 = chunk size in bytes */
+	a2(lea r9,[rcx-64]) /* r9 = byte offset of the last block */
+	a2(lea rax,[rsi+r9]) /* rax = &Bin[last block] */
+	a2(lea r9,[rdx+r9]) /* r9 = &Bxor[last block]; only dereferenced when Bxor != NULL */
+	a2(and rdx, rdx) /* sets ZF iff Bxor == NULL; the vector loads below leave flags alone */
+	a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) /* xmm4/xmm5: vpshufb masks, per their names rotate-left by 16 and by 8 */
+	a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
+	a2(vmovdqa xmm0,[rax+0]) /* X = last block of Bin */
+	a2(vmovdqa xmm1,[rax+16])
+	a2(vmovdqa xmm2,[rax+32])
+	a2(vmovdqa xmm3,[rax+48])
+	a1(jz scrypt_ChunkMix_avx_no_xor1)
+	a3(vpxor xmm0,xmm0,[r9+0]) /* X ^= last block of Bxor */
+	a3(vpxor xmm1,xmm1,[r9+16])
+	a3(vpxor xmm2,xmm2,[r9+32])
+	a3(vpxor xmm3,xmm3,[r9+48])
+	a1(scrypt_ChunkMix_avx_no_xor1:)
+	a2(xor r8,r8) /* r8 toggles 0 <-> chunk bytes to select the output half */
+	a2(xor r9,r9) /* r9 = byte offset of current input block (i*64) */
+	a1(scrypt_ChunkMix_avx_loop:)
+		a2(and rdx, rdx) /* ZF iff Bxor == NULL, consumed by the jz after the vpxor group */
+		a3(vpxor xmm0,xmm0,[rsi+r9+0]) /* X ^= Bin[i] */
+		a3(vpxor xmm1,xmm1,[rsi+r9+16])
+		a3(vpxor xmm2,xmm2,[rsi+r9+32])
+		a3(vpxor xmm3,xmm3,[rsi+r9+48])
+		a1(jz scrypt_ChunkMix_avx_no_xor2)
+		a3(vpxor xmm0,xmm0,[rdx+r9+0]) /* X ^= Bxor[i] */
+		a3(vpxor xmm1,xmm1,[rdx+r9+16])
+		a3(vpxor xmm2,xmm2,[rdx+r9+32])
+		a3(vpxor xmm3,xmm3,[rdx+r9+48])
+		a1(scrypt_ChunkMix_avx_no_xor2:)
+		a2(vmovdqa xmm8,xmm0) /* keep the round input in xmm8..xmm11 for the final add */
+		a2(vmovdqa xmm9,xmm1)
+		a2(vmovdqa xmm10,xmm2)
+		a2(vmovdqa xmm11,xmm3)
+		a2(mov rax,8) /* rax = remaining rounds; each loop pass does two */
+		a1(scrypt_chacha_avx_loop: )
+			a3(vpaddd xmm0,xmm0,xmm1)
+			a3(vpxor  xmm3,xmm3,xmm0)
+			a3(vpshufb xmm3,xmm3,xmm4)
+			a3(vpaddd xmm2,xmm2,xmm3)
+			a3(vpxor  xmm1,xmm1,xmm2)
+			a3(vpsrld xmm12,xmm1,20) /* rotate-left by 12 built from two shifts; xmm12 is the temporary */
+			a3(vpslld xmm1,xmm1,12)
+			a3(vpxor  xmm1,xmm1,xmm12)
+			a3(vpaddd xmm0,xmm0,xmm1)
+			a3(vpxor  xmm3,xmm3,xmm0)
+			a3(vpshufb xmm3,xmm3,xmm5)
+			a3(vpshufd xmm0,xmm0,0x93) /* lane rotations 0x93/0x4e/0x39 set up the second half-round */
+			a3(vpaddd xmm2,xmm2,xmm3)
+			a3(vpshufd xmm3,xmm3,0x4e)
+			a3(vpxor xmm1,xmm1,xmm2)
+			a3(vpshufd xmm2,xmm2,0x39)
+			a3(vpsrld xmm12,xmm1,25)
+			a3(vpslld xmm1,xmm1,7)
+			a3(vpxor xmm1,xmm1,xmm12)
+			a2(sub rax,2) /* flags from this sub survive the vector ops and drive the ja at the bottom */
+			a3(vpaddd xmm0,xmm0,xmm1)
+			a3(vpxor  xmm3,xmm3,xmm0)
+			a3(vpshufb xmm3,xmm3,xmm4)
+			a3(vpaddd xmm2,xmm2,xmm3)
+			a3(vpxor  xmm1,xmm1,xmm2)
+			a3(vpsrld xmm12,xmm1,20)
+			a3(vpslld xmm1,xmm1,12)
+			a3(vpxor xmm1,xmm1,xmm12)
+			a3(vpaddd xmm0,xmm0,xmm1)
+			a3(vpxor  xmm3,xmm3,xmm0)
+			a3(vpshufb xmm3,xmm3,xmm5)
+			a3(vpshufd xmm0,xmm0,0x39) /* inverse lane rotations restore the original layout */
+			a3(vpaddd xmm2,xmm2,xmm3)
+			a3(vpshufd xmm3,xmm3,0x4e) /* VEX form (was legacy pshufd): same xmm result, no SSE/AVX mixing */
+			a3(vpxor  xmm1,xmm1,xmm2)
+			a3(vpshufd xmm2,xmm2,0x93) /* VEX form (was legacy pshufd) */
+			a3(vpsrld xmm12,xmm1,25)
+			a3(vpslld xmm1,xmm1,7)
+			a3(vpxor  xmm1,xmm1,xmm12)
+			a1(ja scrypt_chacha_avx_loop)
+		a3(vpaddd xmm0,xmm0,xmm8) /* X += saved round input */
+		a3(vpaddd xmm1,xmm1,xmm9)
+		a3(vpaddd xmm2,xmm2,xmm10)
+		a3(vpaddd xmm3,xmm3,xmm11)
+		a2(lea rax,[r8+r9]) /* output offset = ((half_toggle + i*64) & ~127) / 2: even blocks fill the first half of Bout, odd blocks the second */
+		a2(xor r8,rcx)
+		a2(and rax,~0x7f)
+		a2(add r9,64)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		a2(cmp r9,rcx) /* loop condition; the stores below do not touch flags */
+		a2(vmovdqa [rax+0],xmm0)
+		a2(vmovdqa [rax+16],xmm1)
+		a2(vmovdqa [rax+32],xmm2)
+		a2(vmovdqa [rax+48],xmm3)
+		a1(jne scrypt_ChunkMix_avx_loop)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_avx)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_AVX
+
+static void NOINLINE /* intrinsics ChunkMix: Bout = mix of the 2*r blocks of Bin, each optionally XORed with the matching block of Bxor (Bxor may be NULL) */
+scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	uint32_t i, blocksPerChunk = r * 2, half = 0; /* half toggles 0 <-> r and selects which half of Bout receives the block */
+	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; /* x0..x3: the four 16-byte rows of X; t0..t3: copy of the round input; x6: rotate temporary */
+	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; /* byte-shuffle masks; per their names, rotate-left by 16 and by 8 */
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+		}
+
+		t0 = x0; /* keep the round input; it is added back after the rounds */
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* 8 rounds, two per pass */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x3 = _mm_shuffle_epi8(x3, x4);
+			x2 = _mm_add_epi32(x2, x3);
+			x1 = _mm_xor_si128(x1, x2);
+			x6 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); /* rotate-left by 12 */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x3 = _mm_shuffle_epi8(x3, x5);
+			x0 = _mm_shuffle_epi32(x0, 0x93); /* lane rotations 0x93/0x4e/0x39 set up the second half-round */
+			x2 = _mm_add_epi32(x2, x3);
+			x3 = _mm_shuffle_epi32(x3, 0x4e);
+			x1 = _mm_xor_si128(x1, x2);
+			x2 = _mm_shuffle_epi32(x2, 0x39);
+			x6 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); /* rotate-left by 7 */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x3 = _mm_shuffle_epi8(x3, x4);
+			x2 = _mm_add_epi32(x2, x3);
+			x1 = _mm_xor_si128(x1, x2);
+			x6 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x3 = _mm_shuffle_epi8(x3, x5);
+			x0 = _mm_shuffle_epi32(x0, 0x39); /* inverse lane rotations restore the original layout */
+			x2 = _mm_add_epi32(x2, x3);
+			x3 = _mm_shuffle_epi32(x3, 0x4e);
+			x1 = _mm_xor_si128(x1, x2);
+			x2 = _mm_shuffle_epi32(x2, 0x93);
+			x6 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+		}
+
+		x0 = _mm_add_epi32(x0, t0); /* X += round input */
+		x1 = _mm_add_epi32(x1, t1);
+		x2 = _mm_add_epi32(x2, t2);
+		x3 = _mm_add_epi32(x3, t3);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); /* half is 0 for even i and r for odd i */
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_CHACHA_AVX) /* set above by whichever AVX variant (x86 asm, x64 asm, intrinsics) was compiled */
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "ChaCha/8-AVX"
+	#undef SCRYPT_CHACHA_INCLUDED
+	#define SCRYPT_CHACHA_INCLUDED /* under SCRYPT_CHOOSE_COMPILETIME this makes the guards of later variant headers false */
+#endif
diff --git a/scrypt-jane/scrypt-jane-mix_chacha-sse2.h b/scrypt-jane/scrypt-jane-mix_chacha-sse2.h
new file mode 100644
index 0000000..d2192c8
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-mix_chacha-sse2.h
@@ -0,0 +1,371 @@
+/* x86 */
+#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_SSE2
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) /* 32-bit x86 SSE2 ChunkMix: Bout = mix of (Bin, optionally XORed with Bxor); Bxor may be NULL */
+asm_naked_fn(scrypt_ChunkMix_sse2)
+	a1(push ebx)
+	a1(push edi)
+	a1(push esi)
+	a1(push ebp)
+	a2(mov ebp,esp) /* after four pushes the arguments sit at [ebp+20..32] */
+	a2(mov edi,[ebp+20]) /* edi = Bout */
+	a2(mov esi,[ebp+24]) /* esi = Bin */
+	a2(mov eax,[ebp+28]) /* eax = Bxor (may be 0) */
+	a2(mov ebx,[ebp+32]) /* ebx = r */
+	a2(sub esp,16) /* 16-byte aligned scratch: only row 0 of the round input is spilled, rows 1..3 stay in xmm4/xmm5/xmm7 */
+	a2(and esp,~15)
+	a2(lea edx,[ebx*2])
+	a2(shl edx,6) /* edx = 2*r*64 = chunk size in bytes */
+	a2(lea ecx,[edx-64]) /* ecx = byte offset of the last block */
+	a2(and eax, eax) /* sets ZF iff Bxor == NULL; the vector loads below leave flags alone */
+	a2(movdqa xmm0,[ecx+esi+0]) /* X = last block of Bin */
+	a2(movdqa xmm1,[ecx+esi+16])
+	a2(movdqa xmm2,[ecx+esi+32])
+	a2(movdqa xmm3,[ecx+esi+48])
+	a1(jz scrypt_ChunkMix_sse2_no_xor1)
+	a2(pxor xmm0,[ecx+eax+0]) /* X ^= last block of Bxor */
+	a2(pxor xmm1,[ecx+eax+16])
+	a2(pxor xmm2,[ecx+eax+32])
+	a2(pxor xmm3,[ecx+eax+48])
+	a1(scrypt_ChunkMix_sse2_no_xor1:)
+	a2(xor ecx,ecx) /* ecx = byte offset of current input block (i*64) */
+	a2(xor ebx,ebx) /* ebx toggles 0 <-> chunk bytes to select the output half */
+	a1(scrypt_ChunkMix_sse2_loop:)
+		a2(and eax, eax) /* ZF iff Bxor == NULL, consumed by the jz after the pxor group */
+		a2(pxor xmm0,[esi+ecx+0]) /* X ^= Bin[i] */
+		a2(pxor xmm1,[esi+ecx+16])
+		a2(pxor xmm2,[esi+ecx+32])
+		a2(pxor xmm3,[esi+ecx+48])
+		a1(jz scrypt_ChunkMix_sse2_no_xor2)
+		a2(pxor xmm0,[eax+ecx+0]) /* X ^= Bxor[i] */
+		a2(pxor xmm1,[eax+ecx+16])
+		a2(pxor xmm2,[eax+ecx+32])
+		a2(pxor xmm3,[eax+ecx+48])
+		a1(scrypt_ChunkMix_sse2_no_xor2:)
+		a2(movdqa [esp+0],xmm0) /* keep the round input for the final add */
+		a2(movdqa xmm4,xmm1)
+		a2(movdqa xmm5,xmm2)
+		a2(movdqa xmm7,xmm3)
+		a2(mov eax,8) /* eax = remaining rounds; each loop pass does two */
+		a1(scrypt_chacha_sse2_loop: )
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(movdqa xmm6,xmm3) /* rotates are built from a copy in xmm6 plus shift-left / shift-right / xor */
+			a2(pslld xmm3,16)
+			a2(psrld xmm6,16)
+			a2(pxor  xmm3,xmm6)
+			a2(paddd xmm2,xmm3)
+			a2(pxor  xmm1,xmm2)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,12)
+			a2(psrld xmm6,20)
+			a2(pxor  xmm1,xmm6)
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(movdqa xmm6,xmm3)
+			a2(pslld xmm3,8)
+			a2(psrld xmm6,24)
+			a2(pxor  xmm3,xmm6)
+			a3(pshufd xmm0,xmm0,0x93) /* lane rotations 0x93/0x4e/0x39 set up the second half-round */
+			a2(paddd xmm2,xmm3)
+			a3(pshufd xmm3,xmm3,0x4e)
+			a2(pxor  xmm1,xmm2)
+			a3(pshufd xmm2,xmm2,0x39)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,7)
+			a2(psrld xmm6,25)
+			a2(pxor  xmm1,xmm6)
+			a2(sub eax,2) /* flags from this sub survive the vector ops and drive the ja at the bottom */
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(movdqa xmm6,xmm3)
+			a2(pslld xmm3,16)
+			a2(psrld xmm6,16)
+			a2(pxor  xmm3,xmm6)
+			a2(paddd xmm2,xmm3)
+			a2(pxor  xmm1,xmm2)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,12)
+			a2(psrld xmm6,20)
+			a2(pxor  xmm1,xmm6)
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(movdqa xmm6,xmm3)
+			a2(pslld xmm3,8)
+			a2(psrld xmm6,24)
+			a2(pxor  xmm3,xmm6)
+			a3(pshufd xmm0,xmm0,0x39) /* inverse lane rotations restore the original layout */
+			a2(paddd xmm2,xmm3)
+			a3(pshufd xmm3,xmm3,0x4e)
+			a2(pxor  xmm1,xmm2)
+			a3(pshufd xmm2,xmm2,0x93)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,7)
+			a2(psrld xmm6,25)
+			a2(pxor  xmm1,xmm6)
+			a1(ja scrypt_chacha_sse2_loop)
+		a2(paddd xmm0,[esp+0]) /* X += saved round input */
+		a2(paddd xmm1,xmm4)
+		a2(paddd xmm2,xmm5)
+		a2(paddd xmm3,xmm7)
+		a2(lea eax,[ebx+ecx]) /* output offset = ((half_toggle + i*64) & ~127) / 2: even blocks fill the first half of Bout, odd blocks the second */
+		a2(xor ebx,edx)
+		a2(and eax,~0x7f)
+		a2(add ecx,64)
+		a2(shr eax,1)
+		a2(add eax, edi)
+		a2(cmp ecx,edx) /* loop condition; the stores and mov below do not touch flags */
+		a2(movdqa [eax+0],xmm0)
+		a2(movdqa [eax+16],xmm1)
+		a2(movdqa [eax+32],xmm2)
+		a2(movdqa [eax+48],xmm3)
+		a2(mov eax,[ebp+28]) /* reload Bxor: eax was reused as round counter and output pointer */
+		a1(jne scrypt_ChunkMix_sse2_loop)
+	a2(mov esp,ebp) /* drop the aligned scratch area */
+	a1(pop ebp)
+	a1(pop esi)
+	a1(pop edi)
+	a1(pop ebx)
+	a1(ret 16) /* callee pops the four 4-byte arguments */
+asm_naked_fn_end(scrypt_ChunkMix_sse2)
+
+#endif
+
+
+
+/* x64 */
+#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_SSE2
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) /* x86-64 SSE2 ChunkMix; the code reads Bout, Bin, Bxor, r from rdi, rsi, rdx, rcx; Bxor may be NULL */
+asm_naked_fn(scrypt_ChunkMix_sse2)
+	a2(lea rcx,[rcx*2])
+	a2(shl rcx,6) /* rcx = 2*r*64 = chunk size in bytes */
+	a2(lea r9,[rcx-64]) /* r9 = byte offset of the last block */
+	a2(lea rax,[rsi+r9]) /* rax = &Bin[last block] */
+	a2(lea r9,[rdx+r9]) /* r9 = &Bxor[last block]; only dereferenced when Bxor != NULL */
+	a2(and rdx, rdx) /* sets ZF iff Bxor == NULL; the vector loads below leave flags alone */
+	a2(movdqa xmm0,[rax+0]) /* X = last block of Bin */
+	a2(movdqa xmm1,[rax+16])
+	a2(movdqa xmm2,[rax+32])
+	a2(movdqa xmm3,[rax+48])
+	a1(jz scrypt_ChunkMix_sse2_no_xor1)
+	a2(pxor xmm0,[r9+0]) /* X ^= last block of Bxor */
+	a2(pxor xmm1,[r9+16])
+	a2(pxor xmm2,[r9+32])
+	a2(pxor xmm3,[r9+48])
+	a1(scrypt_ChunkMix_sse2_no_xor1:)
+	a2(xor r9,r9) /* r9 = byte offset of current input block (i*64) */
+	a2(xor r8,r8) /* r8 toggles 0 <-> chunk bytes to select the output half */
+	a1(scrypt_ChunkMix_sse2_loop:)
+		a2(and rdx, rdx) /* ZF iff Bxor == NULL, consumed by the jz after the pxor group */
+		a2(pxor xmm0,[rsi+r9+0]) /* X ^= Bin[i] */
+		a2(pxor xmm1,[rsi+r9+16])
+		a2(pxor xmm2,[rsi+r9+32])
+		a2(pxor xmm3,[rsi+r9+48])
+		a1(jz scrypt_ChunkMix_sse2_no_xor2)
+		a2(pxor xmm0,[rdx+r9+0]) /* X ^= Bxor[i] */
+		a2(pxor xmm1,[rdx+r9+16])
+		a2(pxor xmm2,[rdx+r9+32])
+		a2(pxor xmm3,[rdx+r9+48])
+		a1(scrypt_ChunkMix_sse2_no_xor2:)
+		a2(movdqa xmm8,xmm0) /* keep the round input in xmm8..xmm11 for the final add */
+		a2(movdqa xmm9,xmm1)
+		a2(movdqa xmm10,xmm2)
+		a2(movdqa xmm11,xmm3)
+		a2(mov rax,8) /* rax = remaining rounds; each loop pass does two */
+		a1(scrypt_chacha_sse2_loop: )
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(movdqa xmm6,xmm3) /* rotates are built from a copy in xmm6 plus shift-left / shift-right / xor */
+			a2(pslld xmm3,16)
+			a2(psrld xmm6,16)
+			a2(pxor  xmm3,xmm6)
+			a2(paddd xmm2,xmm3)
+			a2(pxor  xmm1,xmm2)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,12)
+			a2(psrld xmm6,20)
+			a2(pxor  xmm1,xmm6)
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(movdqa xmm6,xmm3)
+			a2(pslld xmm3,8)
+			a2(psrld xmm6,24)
+			a2(pxor  xmm3,xmm6)
+			a3(pshufd xmm0,xmm0,0x93) /* lane rotations 0x93/0x4e/0x39 set up the second half-round */
+			a2(paddd xmm2,xmm3)
+			a3(pshufd xmm3,xmm3,0x4e)
+			a2(pxor  xmm1,xmm2)
+			a3(pshufd xmm2,xmm2,0x39)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,7)
+			a2(psrld xmm6,25)
+			a2(pxor  xmm1,xmm6)
+			a2(sub rax,2) /* flags from this sub survive the vector ops and drive the ja at the bottom */
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(movdqa xmm6,xmm3)
+			a2(pslld xmm3,16)
+			a2(psrld xmm6,16)
+			a2(pxor  xmm3,xmm6)
+			a2(paddd xmm2,xmm3)
+			a2(pxor  xmm1,xmm2)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,12)
+			a2(psrld xmm6,20)
+			a2(pxor  xmm1,xmm6)
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(movdqa xmm6,xmm3)
+			a2(pslld xmm3,8)
+			a2(psrld xmm6,24)
+			a2(pxor  xmm3,xmm6)
+			a3(pshufd xmm0,xmm0,0x39) /* inverse lane rotations restore the original layout */
+			a2(paddd xmm2,xmm3)
+			a3(pshufd xmm3,xmm3,0x4e)
+			a2(pxor  xmm1,xmm2)
+			a3(pshufd xmm2,xmm2,0x93)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,7)
+			a2(psrld xmm6,25)
+			a2(pxor  xmm1,xmm6)
+			a1(ja scrypt_chacha_sse2_loop)
+		a2(paddd xmm0,xmm8) /* X += saved round input */
+		a2(paddd xmm1,xmm9)
+		a2(paddd xmm2,xmm10)
+		a2(paddd xmm3,xmm11)
+		a2(lea rax,[r8+r9]) /* output offset = ((half_toggle + i*64) & ~127) / 2: even blocks fill the first half of Bout, odd blocks the second */
+		a2(xor r8,rcx)
+		a2(and rax,~0x7f)
+		a2(add r9,64)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		a2(cmp r9,rcx) /* loop condition; the stores below do not touch flags */
+		a2(movdqa [rax+0],xmm0)
+		a2(movdqa [rax+16],xmm1)
+		a2(movdqa [rax+32],xmm2)
+		a2(movdqa [rax+48],xmm3)
+		a1(jne scrypt_ChunkMix_sse2_loop)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_sse2)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_SSE2
+
+static void NOINLINE
+scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { /* SSE2-intrinsic ChaCha/8 chunk mix: mixes the 2*r blocks of Bin (XORed with Bxor when Bxor is non-NULL) into Bout */
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); /* scrypt_block is defined elsewhere; presumably returns the address of the given 64-byte block - confirm */
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) { /* half alternates 0, r, 0, r, ... and selects the output half below */
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+		}
+
+		t0 = x0; /* t0..t3 keep the pre-round state for the feed-forward add after the rounds */
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* 8 rounds, two per pass */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x4 = x3; /* x4 is a scratch copy for the shift-based rotate */
+			x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); /* rotate each 32-bit lane left by 16 */
+			x2 = _mm_add_epi32(x2, x3);
+			x1 = _mm_xor_si128(x1, x2);
+			x4 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); /* rotate left by 12 */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x4 = x3;
+			x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); /* rotate left by 8 */
+			x0 = _mm_shuffle_epi32(x0, 0x93); /* lane rotations of x0/x3/x2 are interleaved with the last quarter-round step */
+			x2 = _mm_add_epi32(x2, x3);
+			x3 = _mm_shuffle_epi32(x3, 0x4e);
+			x1 = _mm_xor_si128(x1, x2);
+			x2 = _mm_shuffle_epi32(x2, 0x39);
+			x4 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); /* rotate left by 7 */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x4 = x3;
+			x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
+			x2 = _mm_add_epi32(x2, x3);
+			x1 = _mm_xor_si128(x1, x2);
+			x4 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20));
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x4 = x3;
+			x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24));
+			x0 = _mm_shuffle_epi32(x0, 0x39); /* inverse lane rotations restore the original lane order */
+			x2 = _mm_add_epi32(x2, x3);
+			x3 = _mm_shuffle_epi32(x3, 0x4e);
+			x1 = _mm_xor_si128(x1, x2);
+			x2 = _mm_shuffle_epi32(x2, 0x93);
+			x4 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25));
+		}
+
+		x0 = _mm_add_epi32(x0, t0); /* feed-forward: add the saved pre-round state */
+		x1 = _mm_add_epi32(x1, t1);
+		x2 = _mm_add_epi32(x2, t2);
+		x3 = _mm_add_epi32(x3, t3);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); /* even i -> block i/2, odd i -> block r + i/2 */
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_CHACHA_SSE2)
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "ChaCha/8-SSE2"
+	#undef SCRYPT_CHACHA_INCLUDED
+	#define SCRYPT_CHACHA_INCLUDED
+#endif
diff --git a/scrypt-jane/scrypt-jane-mix_chacha-ssse3.h b/scrypt-jane/scrypt-jane-mix_chacha-ssse3.h
new file mode 100644
index 0000000..b25e356
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-mix_chacha-ssse3.h
@@ -0,0 +1,348 @@
+/* x86 */
+#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_SSSE3
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_ssse3) /* 32-bit x86 SSSE3 ChaCha/8 chunk mix; the four arguments are read from the stack and popped by the callee (ret 16) */
+	a1(push ebx)
+	a1(push edi)
+	a1(push esi)
+	a1(push ebp)
+	a2(mov ebp,esp)
+	a2(mov edi,[ebp+20]) /* edi = Bout: first argument sits above 16 bytes of saved registers and the 4-byte return address */
+	a2(mov esi,[ebp+24]) /* esi = Bin */
+	a2(mov eax,[ebp+28]) /* eax = Bxor (may be NULL) */
+	a2(mov ebx,[ebp+32]) /* ebx = r */
+	a2(sub esp,64)
+	a2(and esp,~63) /* 64-byte aligned scratch area below the frame; holds three rows of the saved state */
+	a2(lea edx,[ebx*2])
+	a2(shl edx,6) /* edx = chunk size in bytes (2*r blocks of 64 bytes) */
+	a2(lea ecx,[edx-64]) /* ecx = byte offset of the last block */
+	a2(and eax, eax) /* ZF = (Bxor == NULL); consumed by the jz below, the SSE loads in between leave the flags alone */
+	a2(movdqa xmm4,[ssse3_rotl16_32bit]) /* pshufb masks defined elsewhere; by their names, 16-bit and 8-bit left rotates of each 32-bit lane - confirm */
+	a2(movdqa xmm5,[ssse3_rotl8_32bit])
+	a2(movdqa xmm0,[ecx+esi+0])
+	a2(movdqa xmm1,[ecx+esi+16])
+	a2(movdqa xmm2,[ecx+esi+32])
+	a2(movdqa xmm3,[ecx+esi+48])
+	a1(jz scrypt_ChunkMix_ssse3_no_xor1)
+	a2(pxor xmm0,[ecx+eax+0])
+	a2(pxor xmm1,[ecx+eax+16])
+	a2(pxor xmm2,[ecx+eax+32])
+	a2(pxor xmm3,[ecx+eax+48])
+	a1(scrypt_ChunkMix_ssse3_no_xor1:)
+	a2(xor ecx,ecx) /* ecx = byte offset of the current input block */
+	a2(xor ebx,ebx) /* ebx toggles between 0 and edx on every block (see the xor below) */
+	a1(scrypt_ChunkMix_ssse3_loop:)
+		a2(and eax, eax) /* ZF = (Bxor == NULL) again, for the jz after the Bin xor */
+		a2(pxor xmm0,[esi+ecx+0])
+		a2(pxor xmm1,[esi+ecx+16])
+		a2(pxor xmm2,[esi+ecx+32])
+		a2(pxor xmm3,[esi+ecx+48])
+		a1(jz scrypt_ChunkMix_ssse3_no_xor2)
+		a2(pxor xmm0,[eax+ecx+0])
+		a2(pxor xmm1,[eax+ecx+16])
+		a2(pxor xmm2,[eax+ecx+32])
+		a2(pxor xmm3,[eax+ecx+48])
+		a1(scrypt_ChunkMix_ssse3_no_xor2:)
+		a2(movdqa [esp+0],xmm0) /* save the pre-round state: three rows on the stack, the fourth in xmm7 */
+		a2(movdqa [esp+16],xmm1)
+		a2(movdqa [esp+32],xmm2)
+		a2(movdqa xmm7,xmm3)
+		a2(mov eax,8) /* eax is reused as the round counter; Bxor is reloaded from the stack at the end of the outer loop */
+		a1(scrypt_chacha_ssse3_loop: )
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(pshufb xmm3,xmm4) /* byte shuffle with the rotl16 mask */
+			a2(paddd xmm2,xmm3)
+			a2(pxor  xmm1,xmm2)
+			a2(movdqa xmm6,xmm1) /* copy + pslld/psrld + pxor = rotate each 32-bit lane left by 12 */
+			a2(pslld xmm1,12)
+			a2(psrld xmm6,20)
+			a2(pxor  xmm1,xmm6)
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(pshufb xmm3,xmm5) /* byte shuffle with the rotl8 mask */
+			a3(pshufd xmm0,xmm0,0x93) /* lane rotations of xmm0/xmm3/xmm2 are interleaved with the last quarter-round step */
+			a2(paddd xmm2,xmm3)
+			a3(pshufd xmm3,xmm3,0x4e)
+			a2(pxor  xmm1,xmm2)
+			a3(pshufd xmm2,xmm2,0x39)
+			a2(movdqa xmm6,xmm1) /* rotate left by 7 */
+			a2(pslld xmm1,7)
+			a2(psrld xmm6,25)
+			a2(pxor  xmm1,xmm6)
+			a2(sub eax,2) /* sets the flags tested by ja at the bottom; the SSE instructions in between do not modify them */
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(pshufb xmm3,xmm4)
+			a2(paddd xmm2,xmm3)
+			a2(pxor  xmm1,xmm2)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,12)
+			a2(psrld xmm6,20)
+			a2(pxor  xmm1,xmm6)
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(pshufb xmm3,xmm5)
+			a3(pshufd xmm0,xmm0,0x39) /* inverse lane rotations restore the original lane order */
+			a2(paddd xmm2,xmm3)
+			a3(pshufd xmm3,xmm3,0x4e)
+			a2(pxor  xmm1,xmm2)
+			a3(pshufd xmm2,xmm2,0x93)
+			a2(movdqa xmm6,xmm1)
+			a2(pslld xmm1,7)
+			a2(psrld xmm6,25)
+			a2(pxor  xmm1,xmm6)
+			a1(ja scrypt_chacha_ssse3_loop)
+		a2(paddd xmm0,[esp+0]) /* feed-forward: add the saved pre-round state */
+		a2(paddd xmm1,[esp+16])
+		a2(paddd xmm2,[esp+32])
+		a2(paddd xmm3,xmm7)
+		a2(lea eax,[ebx+ecx]) /* output offset = ((ebx + ecx) & ~127) / 2: even blocks fill the first half of Bout, odd blocks the second half */
+		a2(xor ebx,edx)
+		a2(and eax,~0x7f)
+		a2(add ecx,64)
+		a2(shr eax,1)
+		a2(add eax, edi)
+		a2(cmp ecx,edx) /* flags for the jne below; the stores and the mov in between do not modify them */
+		a2(movdqa [eax+0],xmm0)
+		a2(movdqa [eax+16],xmm1)
+		a2(movdqa [eax+32],xmm2)
+		a2(movdqa [eax+48],xmm3)
+		a2(mov eax,[ebp+28]) /* reload Bxor, clobbered by the round counter and the output address */
+		a1(jne scrypt_ChunkMix_ssse3_loop)
+	a2(mov esp,ebp) /* drop the aligned scratch area */
+	a1(pop ebp)
+	a1(pop esi)
+	a1(pop edi)
+	a1(pop ebx)
+	a1(ret 16)
+asm_naked_fn_end(scrypt_ChunkMix_ssse3)
+
+#endif
+
+
+
+/* x64 */
+#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_SSSE3
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_ssse3) /* x64 SSSE3 ChaCha/8 chunk mix over 2*r 64-byte blocks; registers used: rdi=Bout, rsi=Bin, rdx=Bxor (may be NULL), ecx=r */
+	a2(lea rcx,[ecx*2]) /* rcx = 2*r computed from ecx so the result is zero-extended: r is a uint32_t and the upper half of rcx may hold garbage on entry */
+	a2(shl rcx,6) /* rcx = chunk size in bytes (2*r blocks of 64 bytes) */
+	a2(lea r9,[rcx-64]) /* byte offset of the last block */
+	a2(lea rax,[rsi+r9]) /* rax = address of the last block of Bin */
+	a2(lea r9,[rdx+r9]) /* r9 = address of the last block of Bxor; only dereferenced when Bxor != NULL */
+	a2(and rdx, rdx) /* ZF = (Bxor == NULL); consumed by the jz below, the SSE loads in between leave the flags alone */
+	a2(movdqa xmm4,[ssse3_rotl16_32bit]) /* pshufb masks defined elsewhere; NOTE(review): absolute symbol reference, may not link into position-independent code - confirm */
+	a2(movdqa xmm5,[ssse3_rotl8_32bit])
+	a2(movdqa xmm0,[rax+0])
+	a2(movdqa xmm1,[rax+16])
+	a2(movdqa xmm2,[rax+32])
+	a2(movdqa xmm3,[rax+48])
+	a1(jz scrypt_ChunkMix_ssse3_no_xor1)
+	a2(pxor xmm0,[r9+0])
+	a2(pxor xmm1,[r9+16])
+	a2(pxor xmm2,[r9+32])
+	a2(pxor xmm3,[r9+48])
+	a1(scrypt_ChunkMix_ssse3_no_xor1:)
+	a2(xor r8,r8) /* r8 toggles between 0 and rcx on every block (see the xor below) */
+	a2(xor r9,r9) /* r9 = byte offset of the current input block */
+	a1(scrypt_ChunkMix_ssse3_loop:)
+		a2(and rdx, rdx) /* ZF = (Bxor == NULL) again, for the jz after the Bin xor */
+		a2(pxor xmm0,[rsi+r9+0])
+		a2(pxor xmm1,[rsi+r9+16])
+		a2(pxor xmm2,[rsi+r9+32])
+		a2(pxor xmm3,[rsi+r9+48])
+		a1(jz scrypt_ChunkMix_ssse3_no_xor2)
+		a2(pxor xmm0,[rdx+r9+0])
+		a2(pxor xmm1,[rdx+r9+16])
+		a2(pxor xmm2,[rdx+r9+32])
+		a2(pxor xmm3,[rdx+r9+48])
+		a1(scrypt_ChunkMix_ssse3_no_xor2:)
+		a2(movdqa xmm8,xmm0) /* xmm8-xmm11 keep the pre-round state for the feed-forward add after the rounds */
+		a2(movdqa xmm9,xmm1)
+		a2(movdqa xmm10,xmm2)
+		a2(movdqa xmm11,xmm3)
+		a2(mov rax,8) /* round counter: 8 rounds, two per pass of the inner loop */
+		a1(scrypt_chacha_ssse3_loop: )
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(pshufb xmm3,xmm4) /* byte shuffle with the rotl16 mask */
+			a2(paddd xmm2,xmm3)
+			a2(pxor  xmm1,xmm2)
+			a2(movdqa xmm12,xmm1) /* copy + pslld/psrld + pxor = rotate each 32-bit lane left by 12 */
+			a2(pslld xmm1,12)
+			a2(psrld xmm12,20)
+			a2(pxor  xmm1,xmm12)
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(pshufb xmm3,xmm5) /* byte shuffle with the rotl8 mask */
+			a3(pshufd xmm0,xmm0,0x93) /* lane rotations of xmm0/xmm3/xmm2 are interleaved with the last quarter-round step */
+			a2(paddd xmm2,xmm3)
+			a3(pshufd xmm3,xmm3,0x4e)
+			a2(pxor  xmm1,xmm2)
+			a3(pshufd xmm2,xmm2,0x39)
+			a2(movdqa xmm12,xmm1) /* rotate left by 7 */
+			a2(pslld xmm1,7)
+			a2(psrld xmm12,25)
+			a2(pxor  xmm1,xmm12)
+			a2(sub rax,2) /* sets the flags tested by ja at the bottom; the SSE instructions in between do not modify them */
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(pshufb xmm3,xmm4)
+			a2(paddd xmm2,xmm3)
+			a2(pxor  xmm1,xmm2)
+			a2(movdqa xmm12,xmm1)
+			a2(pslld xmm1,12)
+			a2(psrld xmm12,20)
+			a2(pxor  xmm1,xmm12)
+			a2(paddd xmm0,xmm1)
+			a2(pxor  xmm3,xmm0)
+			a2(pshufb xmm3,xmm5)
+			a3(pshufd xmm0,xmm0,0x39) /* inverse lane rotations restore the original lane order */
+			a2(paddd xmm2,xmm3)
+			a3(pshufd xmm3,xmm3,0x4e)
+			a2(pxor  xmm1,xmm2)
+			a3(pshufd xmm2,xmm2,0x93)
+			a2(movdqa xmm12,xmm1)
+			a2(pslld xmm1,7)
+			a2(psrld xmm12,25)
+			a2(pxor  xmm1,xmm12)
+			a1(ja scrypt_chacha_ssse3_loop)
+		a2(paddd xmm0,xmm8) /* feed-forward: add the saved pre-round state */
+		a2(paddd xmm1,xmm9)
+		a2(paddd xmm2,xmm10)
+		a2(paddd xmm3,xmm11)
+		a2(lea rax,[r8+r9]) /* output offset = ((r8 + r9) & ~127) / 2: even blocks fill the first half of Bout, odd blocks the second half */
+		a2(xor r8,rcx)
+		a2(and rax,~0x7f)
+		a2(add r9,64)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		a2(cmp r9,rcx) /* flags for the jne below; the stores in between do not modify them */
+		a2(movdqa [rax+0],xmm0)
+		a2(movdqa [rax+16],xmm1)
+		a2(movdqa [rax+32],xmm2)
+		a2(movdqa [rax+48],xmm3)
+		a1(jne scrypt_ChunkMix_ssse3_loop)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_ssse3)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+
+#define SCRYPT_CHACHA_SSSE3
+
+static void NOINLINE
+scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { /* SSSE3-intrinsic ChaCha/8 chunk mix: mixes the 2*r blocks of Bin (XORed with Bxor when Bxor is non-NULL) into Bout */
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
+	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; /* byte-shuffle masks defined elsewhere; by their names, 16-bit and 8-bit left rotates of each 32-bit lane - confirm */
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); /* scrypt_block is defined elsewhere; presumably returns the address of the given 64-byte block - confirm */
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) { /* half alternates 0, r, 0, r, ... and selects the output half below */
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+		}
+
+		t0 = x0; /* t0..t3 keep the pre-round state for the feed-forward add after the rounds */
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* 8 rounds, two per pass */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x3 = _mm_shuffle_epi8(x3, x4); /* byte shuffle with the rotl16 mask */
+			x2 = _mm_add_epi32(x2, x3);
+			x1 = _mm_xor_si128(x1, x2);
+			x6 = x1; /* x6 is a scratch copy for the shift-based rotate */
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); /* rotate each 32-bit lane left by 12 */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x3 = _mm_shuffle_epi8(x3, x5); /* byte shuffle with the rotl8 mask */
+			x0 = _mm_shuffle_epi32(x0, 0x93); /* lane rotations of x0/x3/x2 are interleaved with the last quarter-round step */
+			x2 = _mm_add_epi32(x2, x3);
+			x3 = _mm_shuffle_epi32(x3, 0x4e);
+			x1 = _mm_xor_si128(x1, x2);
+			x2 = _mm_shuffle_epi32(x2, 0x39);
+			x6 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); /* rotate left by 7 */
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x3 = _mm_shuffle_epi8(x3, x4);
+			x2 = _mm_add_epi32(x2, x3);
+			x1 = _mm_xor_si128(x1, x2);
+			x6 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20));
+			x0 = _mm_add_epi32(x0, x1);
+			x3 = _mm_xor_si128(x3, x0);
+			x3 = _mm_shuffle_epi8(x3, x5);
+			x0 = _mm_shuffle_epi32(x0, 0x39); /* inverse lane rotations restore the original lane order */
+			x2 = _mm_add_epi32(x2, x3);
+			x3 = _mm_shuffle_epi32(x3, 0x4e);
+			x1 = _mm_xor_si128(x1, x2);
+			x2 = _mm_shuffle_epi32(x2, 0x93);
+			x6 = x1;
+			x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25));
+		}
+
+		x0 = _mm_add_epi32(x0, t0); /* feed-forward: add the saved pre-round state */
+		x1 = _mm_add_epi32(x1, t1);
+		x2 = _mm_add_epi32(x2, t2);
+		x3 = _mm_add_epi32(x3, t3);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); /* even i -> block i/2, odd i -> block r + i/2 */
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_CHACHA_SSSE3)
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "ChaCha/8-SSSE3"
+	#undef SCRYPT_CHACHA_INCLUDED
+	#define SCRYPT_CHACHA_INCLUDED
+#endif
diff --git a/scrypt-jane/scrypt-jane-mix_chacha.h b/scrypt-jane/scrypt-jane-mix_chacha.h
new file mode 100644
index 0000000..85ee9c1
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-mix_chacha.h
@@ -0,0 +1,69 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)
+
+#undef SCRYPT_MIX
+#define SCRYPT_MIX "ChaCha20/8 Ref"
+
+#undef SCRYPT_CHACHA_INCLUDED
+#define SCRYPT_CHACHA_INCLUDED
+#define SCRYPT_CHACHA_BASIC
+
+static void
+chacha_core_basic(uint32_t state[16]) { /* in place: runs 8 ChaCha rounds on a copy of state, then adds the result word-wise into state */
+	uint32_t v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15;
+	size_t remaining;
+
+	v0 = state[0];
+	v1 = state[1];
+	v2 = state[2];
+	v3 = state[3];
+	v4 = state[4];
+	v5 = state[5];
+	v6 = state[6];
+	v7 = state[7];
+	v8 = state[8];
+	v9 = state[9];
+	v10 = state[10];
+	v11 = state[11];
+	v12 = state[12];
+	v13 = state[13];
+	v14 = state[14];
+	v15 = state[15];
+
+	#define CHACHA_QUARTER(a,b,c,d) \
+		a += b; d ^= a; d = ROTL32(d,16); \
+		c += d; b ^= c; b = ROTL32(b,12); \
+		a += b; d ^= a; d = ROTL32(d, 8); \
+		c += d; b ^= c; b = ROTL32(b, 7);
+
+	for (remaining = 8; remaining != 0; remaining -= 2) { /* two rounds per pass: four column quarters, then four diagonal quarters */
+		CHACHA_QUARTER( v0, v4, v8,v12)
+		CHACHA_QUARTER( v1, v5, v9,v13)
+		CHACHA_QUARTER( v2, v6,v10,v14)
+		CHACHA_QUARTER( v3, v7,v11,v15)
+		CHACHA_QUARTER( v0, v5,v10,v15)
+		CHACHA_QUARTER( v1, v6,v11,v12)
+		CHACHA_QUARTER( v2, v7, v8,v13)
+		CHACHA_QUARTER( v3, v4, v9,v14)
+	}
+
+	state[0] = state[0] + v0;
+	state[1] = state[1] + v1;
+	state[2] = state[2] + v2;
+	state[3] = state[3] + v3;
+	state[4] = state[4] + v4;
+	state[5] = state[5] + v5;
+	state[6] = state[6] + v6;
+	state[7] = state[7] + v7;
+	state[8] = state[8] + v8;
+	state[9] = state[9] + v9;
+	state[10] = state[10] + v10;
+	state[11] = state[11] + v11;
+	state[12] = state[12] + v12;
+	state[13] = state[13] + v13;
+	state[14] = state[14] + v14;
+	state[15] = state[15] + v15;
+
+	#undef CHACHA_QUARTER
+}
+
+#endif
\ No newline at end of file
diff --git a/scrypt-jane/scrypt-jane-mix_salsa-avx.h b/scrypt-jane/scrypt-jane-mix_salsa-avx.h
new file mode 100644
index 0000000..15fb48e
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-mix_salsa-avx.h
@@ -0,0 +1,381 @@
+/* x86 */
+#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
+
+#define SCRYPT_SALSA_AVX
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_avx) /* 32-bit x86 AVX Salsa/8 chunk mix; the four arguments are read from the stack and popped by the callee (ret 16) */
+	a1(push ebx)
+	a1(push edi)
+	a1(push esi)
+	a1(push ebp)
+	a2(mov ebp,esp)
+	a2(mov edi,[ebp+20]) /* edi = Bout: first argument sits above 16 bytes of saved registers and the 4-byte return address */
+	a2(mov esi,[ebp+24]) /* esi = Bin */
+	a2(mov eax,[ebp+28]) /* eax = Bxor (may be NULL) */
+	a2(mov ebx,[ebp+32]) /* ebx = r */
+	a2(sub esp,32)
+	a2(and esp,~63) /* 64-byte aligned scratch area below the frame; holds two rows of the saved state */
+	a2(lea edx,[ebx*2])
+	a2(shl edx,6) /* edx = chunk size in bytes (2*r blocks of 64 bytes) */
+	a2(lea ecx,[edx-64]) /* ecx = byte offset of the last block */
+	a2(and eax, eax) /* ZF = (Bxor == NULL); consumed by the jz below, the SSE loads in between leave the flags alone */
+	a2(movdqa xmm0,[ecx+esi+0]) /* NOTE(review): legacy-SSE encoding here and for pshufd below, mixed with VEX instructions; the x64 variant uses vmovdqa for these loads */
+	a2(movdqa xmm1,[ecx+esi+16])
+	a2(movdqa xmm2,[ecx+esi+32])
+	a2(movdqa xmm3,[ecx+esi+48])
+	a1(jz scrypt_ChunkMix_avx_no_xor1)
+	a3(vpxor xmm0,xmm0,[ecx+eax+0])
+	a3(vpxor xmm1,xmm1,[ecx+eax+16])
+	a3(vpxor xmm2,xmm2,[ecx+eax+32])
+	a3(vpxor xmm3,xmm3,[ecx+eax+48])
+	a1(scrypt_ChunkMix_avx_no_xor1:)
+	a2(xor ecx,ecx) /* ecx = byte offset of the current input block */
+	a2(xor ebx,ebx) /* ebx toggles between 0 and edx on every block (see the xor below) */
+	a1(scrypt_ChunkMix_avx_loop:)
+		a2(and eax, eax) /* ZF = (Bxor == NULL) again, for the jz after the Bin xor */
+		a3(vpxor xmm0,xmm0,[esi+ecx+0])
+		a3(vpxor xmm1,xmm1,[esi+ecx+16])
+		a3(vpxor xmm2,xmm2,[esi+ecx+32])
+		a3(vpxor xmm3,xmm3,[esi+ecx+48])
+		a1(jz scrypt_ChunkMix_avx_no_xor2)
+		a3(vpxor xmm0,xmm0,[eax+ecx+0])
+		a3(vpxor xmm1,xmm1,[eax+ecx+16])
+		a3(vpxor xmm2,xmm2,[eax+ecx+32])
+		a3(vpxor xmm3,xmm3,[eax+ecx+48])
+		a1(scrypt_ChunkMix_avx_no_xor2:)
+		a2(vmovdqa [esp+0],xmm0) /* save the pre-round state: two rows on the stack, two in xmm6/xmm7 */
+		a2(vmovdqa [esp+16],xmm1)
+		a2(vmovdqa xmm6,xmm2)
+		a2(vmovdqa xmm7,xmm3)
+		a2(mov eax,8) /* eax is reused as the round counter; Bxor is reloaded from the stack at the end of the outer loop */
+		a1(scrypt_salsa_avx_loop: )
+			a3(vpaddd xmm4, xmm1, xmm0) /* xmm4 = sum; the srld/slld pair XORed into the target equals XOR with the sum rotated left by 7 */
+			a3(vpsrld xmm5, xmm4, 25)
+			a3(vpslld xmm4, xmm4, 7)
+			a3(vpxor xmm3, xmm3, xmm5)
+			a3(vpxor xmm3, xmm3, xmm4)
+			a3(vpaddd xmm4, xmm0, xmm3) /* same pattern, rotate left by 9 */
+			a3(vpsrld xmm5, xmm4, 23)
+			a3(vpslld xmm4, xmm4, 9)
+			a3(vpxor xmm2, xmm2, xmm5)
+			a3(vpxor xmm2, xmm2, xmm4)
+			a3(vpaddd xmm4, xmm3, xmm2) /* rotate left by 13 */
+			a3(vpsrld xmm5, xmm4, 19)
+			a3(vpslld xmm4, xmm4, 13)
+			a3(vpxor xmm1, xmm1, xmm5)
+			a3(pshufd xmm3, xmm3, 0x93) /* lane rotations are interleaved with the arithmetic */
+			a3(vpxor xmm1, xmm1, xmm4)
+			a3(vpaddd xmm4, xmm2, xmm1) /* rotate left by 18 */
+			a3(vpsrld xmm5, xmm4, 14)
+			a3(vpslld xmm4, xmm4, 18)
+			a3(vpxor xmm0, xmm0, xmm5)
+			a3(pshufd xmm2, xmm2, 0x4e)
+			a3(vpxor xmm0, xmm0, xmm4)
+			a2(sub eax, 2) /* sets the flags tested by ja at the bottom; the vector instructions in between do not modify them */
+			a3(vpaddd xmm4, xmm3, xmm0)
+			a3(pshufd xmm1, xmm1, 0x39)
+			a3(vpsrld xmm5, xmm4, 25)
+			a3(vpslld xmm4, xmm4, 7)
+			a3(vpxor xmm1, xmm1, xmm5)
+			a3(vpxor xmm1, xmm1, xmm4)
+			a3(vpaddd xmm4, xmm0, xmm1)
+			a3(vpsrld xmm5, xmm4, 23)
+			a3(vpslld xmm4, xmm4, 9)
+			a3(vpxor xmm2, xmm2, xmm5)
+			a3(vpxor xmm2, xmm2, xmm4)
+			a3(vpaddd xmm4, xmm1, xmm2)
+			a3(vpsrld xmm5, xmm4, 19)
+			a3(vpslld xmm4, xmm4, 13)
+			a3(vpxor xmm3, xmm3, xmm5)
+			a3(pshufd xmm1, xmm1, 0x93)
+			a3(vpxor xmm3, xmm3, xmm4)
+			a3(vpaddd xmm4, xmm2, xmm3)
+			a3(vpsrld xmm5, xmm4, 14)
+			a3(vpslld xmm4, xmm4, 18)
+			a3(vpxor xmm0, xmm0, xmm5)
+			a3(pshufd xmm2, xmm2, 0x4e)
+			a3(vpxor xmm0, xmm0, xmm4)
+			a3(pshufd xmm3, xmm3, 0x39)
+			a1(ja scrypt_salsa_avx_loop)
+		a3(vpaddd xmm0,xmm0,[esp+0]) /* feed-forward: add the saved pre-round state */
+		a3(vpaddd xmm1,xmm1,[esp+16])
+		a3(vpaddd xmm2,xmm2,xmm6)
+		a3(vpaddd xmm3,xmm3,xmm7)
+		a2(lea eax,[ebx+ecx]) /* output offset = ((ebx + ecx) & ~127) / 2: even blocks fill the first half of Bout, odd blocks the second half */
+		a2(xor ebx,edx)
+		a2(and eax,~0x7f)
+		a2(add ecx,64)
+		a2(shr eax,1)
+		a2(add eax, edi)
+		a2(cmp ecx,edx) /* flags for the jne below; the stores and the mov in between do not modify them */
+		a2(vmovdqa [eax+0],xmm0)
+		a2(vmovdqa [eax+16],xmm1)
+		a2(vmovdqa [eax+32],xmm2)
+		a2(vmovdqa [eax+48],xmm3)
+		a2(mov eax,[ebp+28]) /* reload Bxor, clobbered by the round counter and the output address */
+		a1(jne scrypt_ChunkMix_avx_loop)
+	a2(mov esp,ebp) /* drop the aligned scratch area */
+	a1(pop ebp)
+	a1(pop esi)
+	a1(pop edi)
+	a1(pop ebx)
+	a1(ret 16)
+asm_naked_fn_end(scrypt_ChunkMix_avx)
+
+#endif
+
+
+
+/* x64 */
+#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
+
+#define SCRYPT_SALSA_AVX
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_avx) /* x64 AVX Salsa/8 chunk mix over 2*r 64-byte blocks; registers used: rdi=Bout, rsi=Bin, rdx=Bxor (may be NULL), ecx=r */
+	a2(lea rcx,[ecx*2]) /* rcx = 2*r computed from ecx so the result is zero-extended: r is a uint32_t and the upper half of rcx may hold garbage on entry */
+	a2(shl rcx,6) /* rcx = chunk size in bytes (2*r blocks of 64 bytes) */
+	a2(lea r9,[rcx-64]) /* byte offset of the last block */
+	a2(lea rax,[rsi+r9]) /* rax = address of the last block of Bin */
+	a2(lea r9,[rdx+r9]) /* r9 = address of the last block of Bxor; only dereferenced when Bxor != NULL */
+	a2(and rdx, rdx) /* ZF = (Bxor == NULL); consumed by the jz below, the vector loads in between leave the flags alone */
+	a2(vmovdqa xmm0,[rax+0])
+	a2(vmovdqa xmm1,[rax+16])
+	a2(vmovdqa xmm2,[rax+32])
+	a2(vmovdqa xmm3,[rax+48])
+	a1(jz scrypt_ChunkMix_avx_no_xor1)
+	a3(vpxor xmm0,xmm0,[r9+0])
+	a3(vpxor xmm1,xmm1,[r9+16])
+	a3(vpxor xmm2,xmm2,[r9+32])
+	a3(vpxor xmm3,xmm3,[r9+48])
+	a1(scrypt_ChunkMix_avx_no_xor1:)
+	a2(xor r9,r9) /* r9 = byte offset of the current input block */
+	a2(xor r8,r8) /* r8 toggles between 0 and rcx on every block (see the xor below) */
+	a1(scrypt_ChunkMix_avx_loop:)
+		a2(and rdx, rdx) /* ZF = (Bxor == NULL) again, for the jz after the Bin xor */
+		a3(vpxor xmm0,xmm0,[rsi+r9+0])
+		a3(vpxor xmm1,xmm1,[rsi+r9+16])
+		a3(vpxor xmm2,xmm2,[rsi+r9+32])
+		a3(vpxor xmm3,xmm3,[rsi+r9+48])
+		a1(jz scrypt_ChunkMix_avx_no_xor2)
+		a3(vpxor xmm0,xmm0,[rdx+r9+0])
+		a3(vpxor xmm1,xmm1,[rdx+r9+16])
+		a3(vpxor xmm2,xmm2,[rdx+r9+32])
+		a3(vpxor xmm3,xmm3,[rdx+r9+48])
+		a1(scrypt_ChunkMix_avx_no_xor2:)
+		a2(vmovdqa xmm8,xmm0) /* xmm8-xmm11 keep the pre-round state for the feed-forward add after the rounds */
+		a2(vmovdqa xmm9,xmm1)
+		a2(vmovdqa xmm10,xmm2)
+		a2(vmovdqa xmm11,xmm3)
+		a2(mov rax,8) /* round counter: 8 rounds, two per pass of the inner loop */
+		a1(scrypt_salsa_avx_loop: )
+			a3(vpaddd xmm4, xmm1, xmm0) /* xmm4 = sum; the srld/slld pair XORed into the target equals XOR with the sum rotated left by 7 */
+			a3(vpsrld xmm5, xmm4, 25)
+			a3(vpslld xmm4, xmm4, 7)
+			a3(vpxor xmm3, xmm3, xmm5)
+			a3(vpxor xmm3, xmm3, xmm4)
+			a3(vpaddd xmm4, xmm0, xmm3) /* same pattern, rotate left by 9 */
+			a3(vpsrld xmm5, xmm4, 23)
+			a3(vpslld xmm4, xmm4, 9)
+			a3(vpxor xmm2, xmm2, xmm5)
+			a3(vpxor xmm2, xmm2, xmm4)
+			a3(vpaddd xmm4, xmm3, xmm2) /* rotate left by 13 */
+			a3(vpsrld xmm5, xmm4, 19)
+			a3(vpslld xmm4, xmm4, 13)
+			a3(vpxor xmm1, xmm1, xmm5)
+			a3(pshufd xmm3, xmm3, 0x93) /* lane rotations are interleaved with the arithmetic */
+			a3(vpxor xmm1, xmm1, xmm4)
+			a3(vpaddd xmm4, xmm2, xmm1) /* rotate left by 18 */
+			a3(vpsrld xmm5, xmm4, 14)
+			a3(vpslld xmm4, xmm4, 18)
+			a3(vpxor xmm0, xmm0, xmm5)
+			a3(pshufd xmm2, xmm2, 0x4e)
+			a3(vpxor xmm0, xmm0, xmm4)
+			a2(sub rax, 2) /* sets the flags tested by ja at the bottom; the vector instructions in between do not modify them */
+			a3(vpaddd xmm4, xmm3, xmm0)
+			a3(pshufd xmm1, xmm1, 0x39)
+			a3(vpsrld xmm5, xmm4, 25)
+			a3(vpslld xmm4, xmm4, 7)
+			a3(vpxor xmm1, xmm1, xmm5)
+			a3(vpxor xmm1, xmm1, xmm4)
+			a3(vpaddd xmm4, xmm0, xmm1)
+			a3(vpsrld xmm5, xmm4, 23)
+			a3(vpslld xmm4, xmm4, 9)
+			a3(vpxor xmm2, xmm2, xmm5)
+			a3(vpxor xmm2, xmm2, xmm4)
+			a3(vpaddd xmm4, xmm1, xmm2)
+			a3(vpsrld xmm5, xmm4, 19)
+			a3(vpslld xmm4, xmm4, 13)
+			a3(vpxor xmm3, xmm3, xmm5)
+			a3(pshufd xmm1, xmm1, 0x93)
+			a3(vpxor xmm3, xmm3, xmm4)
+			a3(vpaddd xmm4, xmm2, xmm3)
+			a3(vpsrld xmm5, xmm4, 14)
+			a3(vpslld xmm4, xmm4, 18)
+			a3(vpxor xmm0, xmm0, xmm5)
+			a3(pshufd xmm2, xmm2, 0x4e)
+			a3(vpxor xmm0, xmm0, xmm4)
+			a3(pshufd xmm3, xmm3, 0x39)
+			a1(ja scrypt_salsa_avx_loop)
+		a3(vpaddd xmm0,xmm0,xmm8) /* feed-forward: add the saved pre-round state */
+		a3(vpaddd xmm1,xmm1,xmm9)
+		a3(vpaddd xmm2,xmm2,xmm10)
+		a3(vpaddd xmm3,xmm3,xmm11)
+		a2(lea rax,[r8+r9]) /* output offset = ((r8 + r9) & ~127) / 2: even blocks fill the first half of Bout, odd blocks the second half */
+		a2(xor r8,rcx)
+		a2(and rax,~0x7f)
+		a2(add r9,64)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		a2(cmp r9,rcx) /* flags for the jne below; the stores in between do not modify them */
+		a2(vmovdqa [rax+0],xmm0)
+		a2(vmovdqa [rax+16],xmm1)
+		a2(vmovdqa [rax+32],xmm2)
+		a2(vmovdqa [rax+48],xmm3)
+		a1(jne scrypt_ChunkMix_avx_loop)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_avx)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
+
+#define SCRYPT_SALSA_AVX
+
+static void NOINLINE
+scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { /* Intrinsic Salsa/8 chunk mix: mixes the 2*r blocks of Bin (XORed with Bxor when Bxor is non-NULL) into Bout; only 128-bit _mm_* intrinsics are used */
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); /* scrypt_block is defined elsewhere; presumably returns the address of the given 64-byte block - confirm */
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) { /* half alternates 0, r, 0, r, ... and selects the output half below */
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+		}
+
+		t0 = x0; /* t0..t3 keep the pre-round state for the feed-forward add after the rounds */
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* 8 rounds, two per pass */
+			x4 = x1; /* x4/x5 are scratch: x4 = sum << n, x5 = sum >> (32 - n); XORing both into the target equals XOR with the sum rotated left by n */
+			x4 = _mm_add_epi32(x4, x0);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 7); /* n = 7 */
+			x5 = _mm_srli_epi32(x5, 25);
+			x3 = _mm_xor_si128(x3, x4);
+			x4 = x0; /* the copy for the next step is interleaved with finishing the current one */
+			x3 = _mm_xor_si128(x3, x5);
+			x4 = _mm_add_epi32(x4, x3);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 9); /* n = 9 */
+			x5 = _mm_srli_epi32(x5, 23);
+			x2 = _mm_xor_si128(x2, x4);
+			x4 = x3;
+			x2 = _mm_xor_si128(x2, x5);
+			x3 = _mm_shuffle_epi32(x3, 0x93); /* lane rotation of x3, done after its value was copied into x4 */
+			x4 = _mm_add_epi32(x4, x2);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 13); /* n = 13 */
+			x5 = _mm_srli_epi32(x5, 19);
+			x1 = _mm_xor_si128(x1, x4);
+			x4 = x2;
+			x1 = _mm_xor_si128(x1, x5);
+			x2 = _mm_shuffle_epi32(x2, 0x4e);
+			x4 = _mm_add_epi32(x4, x1);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 18); /* n = 18 */
+			x5 = _mm_srli_epi32(x5, 14);
+			x0 = _mm_xor_si128(x0, x4);
+			x4 = x3;
+			x0 = _mm_xor_si128(x0, x5);
+			x1 = _mm_shuffle_epi32(x1, 0x39);
+			x4 = _mm_add_epi32(x4, x0); /* second round of the pair starts here, with the roles of x1 and x3 swapped */
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 7);
+			x5 = _mm_srli_epi32(x5, 25);
+			x1 = _mm_xor_si128(x1, x4);
+			x4 = x0;
+			x1 = _mm_xor_si128(x1, x5);
+			x4 = _mm_add_epi32(x4, x1);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 9);
+			x5 = _mm_srli_epi32(x5, 23);
+			x2 = _mm_xor_si128(x2, x4);
+			x4 = x1;
+			x2 = _mm_xor_si128(x2, x5);
+			x1 = _mm_shuffle_epi32(x1, 0x93);
+			x4 = _mm_add_epi32(x4, x2);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 13);
+			x5 = _mm_srli_epi32(x5, 19);
+			x3 = _mm_xor_si128(x3, x4);
+			x4 = x2;
+			x3 = _mm_xor_si128(x3, x5);
+			x2 = _mm_shuffle_epi32(x2, 0x4e);
+			x4 = _mm_add_epi32(x4, x3);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 18);
+			x5 = _mm_srli_epi32(x5, 14);
+			x0 = _mm_xor_si128(x0, x4);
+			x3 = _mm_shuffle_epi32(x3, 0x39);
+			x0 = _mm_xor_si128(x0, x5);
+		}
+
+		x0 = _mm_add_epi32(x0, t0); /* feed-forward: add the saved pre-round state */
+		x1 = _mm_add_epi32(x1, t1);
+		x2 = _mm_add_epi32(x2, t2);
+		x3 = _mm_add_epi32(x3, t3);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); /* even i -> block i/2, odd i -> block r + i/2 */
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA_AVX)
+	/* uses salsa_core_tangle_sse2 */
+
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa/8-AVX"
+	#undef SCRYPT_SALSA_INCLUDED
+	#define SCRYPT_SALSA_INCLUDED
+#endif
diff --git a/scrypt-jane/scrypt-jane-mix_salsa-sse2.h b/scrypt-jane/scrypt-jane-mix_salsa-sse2.h
new file mode 100644
index 0000000..4898659
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-mix_salsa-sse2.h
@@ -0,0 +1,443 @@
+/* x86 */
+#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
+
+#define SCRYPT_SALSA_SSE2
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_sse2) /* 32 bit SSE2 asm ChunkMix: mixes the 2*r 64-byte blocks of Bin (xored with Bxor when it is not NULL) into Bout */
+	a1(push ebx)
+	a1(push edi)
+	a1(push esi)
+	a1(push ebp)
+	a2(mov ebp,esp)
+	a2(mov edi,[ebp+20]) /* edi = Bout: first argument, above 4 saved registers + return address */
+	a2(mov esi,[ebp+24]) /* esi = Bin */
+	a2(mov eax,[ebp+28]) /* eax = Bxor (may be NULL) */
+	a2(mov ebx,[ebp+32]) /* ebx = r */
+	a2(sub esp,32) /* scratch for the saved copy of xmm0/xmm1 ... */
+	a2(and esp,~63) /* ... rounded down to a 64 byte boundary */
+	a2(lea edx,[ebx*2])
+	a2(shl edx,6) /* edx = 2 * r * 64 = size of the chunk in bytes */
+	a2(lea ecx,[edx-64]) /* ecx = byte offset of the last block */
+	a2(and eax, eax) /* ZF = (Bxor == NULL); the loads below do not touch the flags */
+	a2(movdqa xmm0,[ecx+esi+0])
+	a2(movdqa xmm1,[ecx+esi+16])
+	a2(movdqa xmm2,[ecx+esi+32])
+	a2(movdqa xmm3,[ecx+esi+48])
+	a1(jz scrypt_ChunkMix_sse2_no_xor1)
+	a2(pxor xmm0,[ecx+eax+0])
+	a2(pxor xmm1,[ecx+eax+16])
+	a2(pxor xmm2,[ecx+eax+32])
+	a2(pxor xmm3,[ecx+eax+48])
+	a1(scrypt_ChunkMix_sse2_no_xor1:)
+	a2(xor ecx,ecx) /* ecx = byte offset of the current input block */
+	a2(xor ebx,ebx) /* ebx alternates between 0 and edx and picks the output half */
+	a1(scrypt_ChunkMix_sse2_loop:)
+		a2(and eax, eax) /* ZF = (Bxor == NULL), consumed by the jz after the pxor group */
+		a2(pxor xmm0,[esi+ecx+0])
+		a2(pxor xmm1,[esi+ecx+16])
+		a2(pxor xmm2,[esi+ecx+32])
+		a2(pxor xmm3,[esi+ecx+48])
+		a1(jz scrypt_ChunkMix_sse2_no_xor2)
+		a2(pxor xmm0,[eax+ecx+0])
+		a2(pxor xmm1,[eax+ecx+16])
+		a2(pxor xmm2,[eax+ecx+32])
+		a2(pxor xmm3,[eax+ecx+48])
+		a1(scrypt_ChunkMix_sse2_no_xor2:)
+		a2(movdqa [esp+0],xmm0) /* keep the pre-round state for the final add: two rows on the stack ... */
+		a2(movdqa [esp+16],xmm1)
+		a2(movdqa xmm6,xmm2) /* ... and two rows in xmm6/xmm7 */
+		a2(movdqa xmm7,xmm3)
+		a2(mov eax,8) /* eax is reused as the round counter: 8 rounds, 2 per loop pass */
+		a1(scrypt_salsa_sse2_loop: )
+			a2(movdqa xmm4, xmm1)
+			a2(paddd xmm4, xmm0)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 7) /* pslld/psrld pair + two pxor = xor with a 32 bit rotate left by 7 */
+			a2(psrld xmm5, 25)
+			a2(pxor xmm3, xmm4)
+			a2(movdqa xmm4, xmm0)
+			a2(pxor xmm3, xmm5)
+			a2(paddd xmm4, xmm3)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 9)
+			a2(psrld xmm5, 23)
+			a2(pxor xmm2, xmm4)
+			a2(movdqa xmm4, xmm3)
+			a2(pxor xmm2, xmm5)
+			a3(pshufd xmm3, xmm3, 0x93)
+			a2(paddd xmm4, xmm2)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 13)
+			a2(psrld xmm5, 19)
+			a2(pxor xmm1, xmm4)
+			a2(movdqa xmm4, xmm2)
+			a2(pxor xmm1, xmm5)
+			a3(pshufd xmm2, xmm2, 0x4e)
+			a2(paddd xmm4, xmm1)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 18)
+			a2(psrld xmm5, 14)
+			a2(pxor xmm0, xmm4)
+			a2(movdqa xmm4, xmm3)
+			a2(pxor xmm0, xmm5)
+			a3(pshufd xmm1, xmm1, 0x39)
+			a2(paddd xmm4, xmm0)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 7)
+			a2(psrld xmm5, 25)
+			a2(pxor xmm1, xmm4)
+			a2(movdqa xmm4, xmm0)
+			a2(pxor xmm1, xmm5)
+			a2(paddd xmm4, xmm1)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 9)
+			a2(psrld xmm5, 23)
+			a2(pxor xmm2, xmm4)
+			a2(movdqa xmm4, xmm1)
+			a2(pxor xmm2, xmm5)
+			a3(pshufd xmm1, xmm1, 0x93)
+			a2(paddd xmm4, xmm2)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 13)
+			a2(psrld xmm5, 19)
+			a2(pxor xmm3, xmm4)
+			a2(movdqa xmm4, xmm2)
+			a2(pxor xmm3, xmm5)
+			a3(pshufd xmm2, xmm2, 0x4e)
+			a2(paddd xmm4, xmm3)
+			a2(sub eax, 2) /* sets the flags for the ja below; the SSE instructions in between leave them alone */
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 18)
+			a2(psrld xmm5, 14)
+			a2(pxor xmm0, xmm4)
+			a3(pshufd xmm3, xmm3, 0x39)
+			a2(pxor xmm0, xmm5)
+			a1(ja scrypt_salsa_sse2_loop)
+		a2(paddd xmm0,[esp+0]) /* add the saved pre-round state back in */
+		a2(paddd xmm1,[esp+16])
+		a2(paddd xmm2,xmm6)
+		a2(paddd xmm3,xmm7)
+		a2(lea eax,[ebx+ecx]) /* eax = output-half selector + input offset */
+		a2(xor ebx,edx) /* flip the selector for the next block */
+		a2(and eax,~0x7f)
+		a2(add ecx,64) /* advance to the next input block */
+		a2(shr eax,1) /* eax = 64 * (i / 2), plus 64 * r for odd i */
+		a2(add eax, edi)
+		a2(cmp ecx,edx) /* flags for the jne below; movdqa and mov do not change them */
+		a2(movdqa [eax+0],xmm0)
+		a2(movdqa [eax+16],xmm1)
+		a2(movdqa [eax+32],xmm2)
+		a2(movdqa [eax+48],xmm3)
+		a2(mov eax,[ebp+28]) /* eax was used as counter and address: reload Bxor */
+		a1(jne scrypt_ChunkMix_sse2_loop)
+	a2(mov esp,ebp)
+	a1(pop ebp)
+	a1(pop esi)
+	a1(pop edi)
+	a1(pop ebx)
+	a1(ret 16) /* return and pop the 4 dword arguments */
+asm_naked_fn_end(scrypt_ChunkMix_sse2)
+
+#endif
+
+
+
+/* x64 */
+#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
+
+#define SCRYPT_SALSA_SSE2
+
+asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_sse2) /* 64 bit SSE2 asm ChunkMix; register use follows the System V argument order: rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), ecx = r */
+	a2(lea rcx,[ecx*2]) /* r is a uint32_t: compute from ecx so the result is zero extended and garbage in the top half of rcx is ignored */
+	a2(shl rcx,6) /* rcx = 2 * r * 64 = size of the chunk in bytes */
+	a2(lea r9,[rcx-64]) /* r9 = byte offset of the last block */
+	a2(lea rax,[rsi+r9])
+	a2(lea r9,[rdx+r9])
+	a2(and rdx, rdx) /* ZF = (Bxor == NULL); the loads below do not touch the flags */
+	a2(movdqa xmm0,[rax+0])
+	a2(movdqa xmm1,[rax+16])
+	a2(movdqa xmm2,[rax+32])
+	a2(movdqa xmm3,[rax+48])
+	a1(jz scrypt_ChunkMix_sse2_no_xor1)
+	a2(pxor xmm0,[r9+0])
+	a2(pxor xmm1,[r9+16])
+	a2(pxor xmm2,[r9+32])
+	a2(pxor xmm3,[r9+48])
+	a1(scrypt_ChunkMix_sse2_no_xor1:)
+	a2(xor r9,r9) /* r9 = byte offset of the current input block */
+	a2(xor r8,r8) /* r8 alternates between 0 and rcx and picks the output half */
+	a1(scrypt_ChunkMix_sse2_loop:)
+		a2(and rdx, rdx) /* ZF = (Bxor == NULL), consumed by the jz after the pxor group */
+		a2(pxor xmm0,[rsi+r9+0])
+		a2(pxor xmm1,[rsi+r9+16])
+		a2(pxor xmm2,[rsi+r9+32])
+		a2(pxor xmm3,[rsi+r9+48])
+		a1(jz scrypt_ChunkMix_sse2_no_xor2)
+		a2(pxor xmm0,[rdx+r9+0])
+		a2(pxor xmm1,[rdx+r9+16])
+		a2(pxor xmm2,[rdx+r9+32])
+		a2(pxor xmm3,[rdx+r9+48])
+		a1(scrypt_ChunkMix_sse2_no_xor2:)
+		a2(movdqa xmm8,xmm0) /* keep the pre-round state in xmm8..xmm11 for the final add */
+		a2(movdqa xmm9,xmm1)
+		a2(movdqa xmm10,xmm2)
+		a2(movdqa xmm11,xmm3)
+		a2(mov rax,8) /* round counter: 8 rounds, 2 per loop pass */
+		a1(scrypt_salsa_sse2_loop: )
+			a2(movdqa xmm4, xmm1)
+			a2(paddd xmm4, xmm0)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 7) /* pslld/psrld pair + two pxor = xor with a 32 bit rotate left by 7 */
+			a2(psrld xmm5, 25)
+			a2(pxor xmm3, xmm4)
+			a2(movdqa xmm4, xmm0)
+			a2(pxor xmm3, xmm5)
+			a2(paddd xmm4, xmm3)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 9)
+			a2(psrld xmm5, 23)
+			a2(pxor xmm2, xmm4)
+			a2(movdqa xmm4, xmm3)
+			a2(pxor xmm2, xmm5)
+			a3(pshufd xmm3, xmm3, 0x93)
+			a2(paddd xmm4, xmm2)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 13)
+			a2(psrld xmm5, 19)
+			a2(pxor xmm1, xmm4)
+			a2(movdqa xmm4, xmm2)
+			a2(pxor xmm1, xmm5)
+			a3(pshufd xmm2, xmm2, 0x4e)
+			a2(paddd xmm4, xmm1)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 18)
+			a2(psrld xmm5, 14)
+			a2(pxor xmm0, xmm4)
+			a2(movdqa xmm4, xmm3)
+			a2(pxor xmm0, xmm5)
+			a3(pshufd xmm1, xmm1, 0x39)
+			a2(paddd xmm4, xmm0)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 7)
+			a2(psrld xmm5, 25)
+			a2(pxor xmm1, xmm4)
+			a2(movdqa xmm4, xmm0)
+			a2(pxor xmm1, xmm5)
+			a2(paddd xmm4, xmm1)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 9)
+			a2(psrld xmm5, 23)
+			a2(pxor xmm2, xmm4)
+			a2(movdqa xmm4, xmm1)
+			a2(pxor xmm2, xmm5)
+			a3(pshufd xmm1, xmm1, 0x93)
+			a2(paddd xmm4, xmm2)
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 13)
+			a2(psrld xmm5, 19)
+			a2(pxor xmm3, xmm4)
+			a2(movdqa xmm4, xmm2)
+			a2(pxor xmm3, xmm5)
+			a3(pshufd xmm2, xmm2, 0x4e)
+			a2(paddd xmm4, xmm3)
+			a2(sub rax, 2) /* sets the flags for the ja below; the SSE instructions in between leave them alone */
+			a2(movdqa xmm5, xmm4)
+			a2(pslld xmm4, 18)
+			a2(psrld xmm5, 14)
+			a2(pxor xmm0, xmm4)
+			a3(pshufd xmm3, xmm3, 0x39)
+			a2(pxor xmm0, xmm5)
+			a1(ja scrypt_salsa_sse2_loop)
+		a2(paddd xmm0,xmm8) /* add the saved pre-round state back in */
+		a2(paddd xmm1,xmm9)
+		a2(paddd xmm2,xmm10)
+		a2(paddd xmm3,xmm11)
+		a2(lea rax,[r8+r9]) /* rax = output-half selector + input offset */
+		a2(xor r8,rcx) /* flip the selector for the next block */
+		a2(and rax,~0x7f)
+		a2(add r9,64) /* advance to the next input block */
+		a2(shr rax,1) /* rax = 64 * (i / 2), plus 64 * r for odd i */
+		a2(add rax, rdi)
+		a2(cmp r9,rcx) /* flags for the jne below; movdqa does not change them */
+		a2(movdqa [rax+0],xmm0)
+		a2(movdqa [rax+16],xmm1)
+		a2(movdqa [rax+32],xmm2)
+		a2(movdqa [rax+48],xmm3)
+		a1(jne scrypt_ChunkMix_sse2_loop)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_sse2)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
+
+#define SCRYPT_SALSA_SSE2
+
+static void NOINLINE
+scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	uint32_t i, blocksPerChunk = r * 2, half = 0; /* half is 0 for even i and r for odd i (half ^= r per block) */
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; /* x0..x3: working state, x4/x5: temporaries, t0..t3: pre-round copy */
+	size_t rounds;
+	/* SSE2 intrinsic ChunkMix: Bxor is optional (NULL = no xor); blocks are accessed as xmmi, which assumes 16 byte aligned buffers - TODO confirm with callers */
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+		}
+
+		t0 = x0; /* remember the input of the rounds for the add after the loop */
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* 8 rounds, two per pass */
+			x4 = x1;
+			x4 = _mm_add_epi32(x4, x0);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 7); /* slli/srli pair + two xors = xor with a 32 bit rotate left by 7 */
+			x5 = _mm_srli_epi32(x5, 25);
+			x3 = _mm_xor_si128(x3, x4);
+			x4 = x0;
+			x3 = _mm_xor_si128(x3, x5);
+			x4 = _mm_add_epi32(x4, x3);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 9);
+			x5 = _mm_srli_epi32(x5, 23);
+			x2 = _mm_xor_si128(x2, x4);
+			x4 = x3;
+			x2 = _mm_xor_si128(x2, x5);
+			x3 = _mm_shuffle_epi32(x3, 0x93);
+			x4 = _mm_add_epi32(x4, x2);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 13);
+			x5 = _mm_srli_epi32(x5, 19);
+			x1 = _mm_xor_si128(x1, x4);
+			x4 = x2;
+			x1 = _mm_xor_si128(x1, x5);
+			x2 = _mm_shuffle_epi32(x2, 0x4e);
+			x4 = _mm_add_epi32(x4, x1);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 18);
+			x5 = _mm_srli_epi32(x5, 14);
+			x0 = _mm_xor_si128(x0, x4);
+			x4 = x3;
+			x0 = _mm_xor_si128(x0, x5);
+			x1 = _mm_shuffle_epi32(x1, 0x39);
+			x4 = _mm_add_epi32(x4, x0);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 7);
+			x5 = _mm_srli_epi32(x5, 25);
+			x1 = _mm_xor_si128(x1, x4);
+			x4 = x0;
+			x1 = _mm_xor_si128(x1, x5);
+			x4 = _mm_add_epi32(x4, x1);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 9);
+			x5 = _mm_srli_epi32(x5, 23);
+			x2 = _mm_xor_si128(x2, x4);
+			x4 = x1;
+			x2 = _mm_xor_si128(x2, x5);
+			x1 = _mm_shuffle_epi32(x1, 0x93);
+			x4 = _mm_add_epi32(x4, x2);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 13);
+			x5 = _mm_srli_epi32(x5, 19);
+			x3 = _mm_xor_si128(x3, x4);
+			x4 = x2;
+			x3 = _mm_xor_si128(x3, x5);
+			x2 = _mm_shuffle_epi32(x2, 0x4e);
+			x4 = _mm_add_epi32(x4, x3);
+			x5 = x4;
+			x4 = _mm_slli_epi32(x4, 18);
+			x5 = _mm_srli_epi32(x5, 14);
+			x0 = _mm_xor_si128(x0, x4);
+			x3 = _mm_shuffle_epi32(x3, 0x39);
+			x0 = _mm_xor_si128(x0, x5);
+		}
+
+		x0 = _mm_add_epi32(x0, t0); /* add the pre-round state back in */
+		x1 = _mm_add_epi32(x1, t1);
+		x2 = _mm_add_epi32(x2, t2);
+		x3 = _mm_add_epi32(x3, t3);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); /* even i -> block i/2, odd i -> block r + i/2 */
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA_SSE2) /* set by whichever scrypt_ChunkMix_sse2 variant (x86 asm, x64 asm, intrinsic) was compiled above */
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa/8-SSE2"
+	#undef SCRYPT_SALSA_INCLUDED
+	#define SCRYPT_SALSA_INCLUDED /* tested by the other mix headers' include conditions under SCRYPT_CHOOSE_COMPILETIME */
+#endif
+
+/* used by avx,etc as well */
+#if defined(SCRYPT_SALSA_INCLUDED)
+	/*
+		Default layout:
+		 0  1  2  3
+		 4  5  6  7
+		 8  9 10 11
+		12 13 14 15
+
+		SSE2 layout:
+		 0  5 10 15
+		12  1  6 11
+		 8 13  2  7
+		 4  9 14  3
+	*/
+
+	static void STDCALL
+	salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
+		/* word pairs exchanged inside every 16-word block; running it twice restores the input */
+		static const uint8_t swaps[6][2] = {{1,5}, {2,10}, {3,15}, {4,12}, {7,11}, {9,13}};
+		size_t pair;
+		for (; count; count--, blocks += 16) {
+			for (pair = 0; pair < 6; pair++) {
+				uint32_t held = blocks[swaps[pair][0]];
+				blocks[swaps[pair][0]] = blocks[swaps[pair][1]];
+				blocks[swaps[pair][1]] = held;
+			}
+		}
+	}
+#endif
+
diff --git a/scrypt-jane/scrypt-jane-mix_salsa.h b/scrypt-jane/scrypt-jane-mix_salsa.h
new file mode 100644
index 0000000..33f3340
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-mix_salsa.h
@@ -0,0 +1,70 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)
+
+#undef SCRYPT_MIX
+#define SCRYPT_MIX "Salsa20/8 Ref"
+
+#undef SCRYPT_SALSA_INCLUDED
+#define SCRYPT_SALSA_INCLUDED
+#define SCRYPT_SALSA_BASIC
+
+static void
+salsa_core_basic(uint32_t state[16]) {
+	/*
+		Portable Salsa20/8 core.
+
+		The 16 words of state[] are copied into a scratch array, mixed
+		for 8 rounds (4 passes of a column round followed by a row
+		round) and finally added back onto state[], word by word, so
+		state[] is both the input and the output.
+
+		Word indices, seen as a 4x4 matrix:
+			 0  1  2  3
+			 4  5  6  7
+			 8  9 10 11
+			12 13 14 15
+	*/
+	uint32_t w[16], sum;
+	size_t idx, remaining;
+
+	for (idx = 0; idx < 16; idx++)
+		w[idx] = state[idx];
+
+	/*
+		One quarter round on the words at indices a,b,c,d. Every step
+		consumes the word written by the step before it, so the four
+		steps have to stay in this order.
+	*/
+	#define salsa_qr(a,b,c,d) \
+		sum = w[a] + w[d]; \
+		w[b] ^= ROTL32(sum,  7); \
+		sum = w[b] + w[a]; \
+		w[c] ^= ROTL32(sum,  9); \
+		sum = w[c] + w[b]; \
+		w[d] ^= ROTL32(sum, 13); \
+		sum = w[d] + w[c]; \
+		w[a] ^= ROTL32(sum, 18);
+
+	/* each pass of the loop performs two rounds */
+	for (remaining = 8; remaining != 0; remaining -= 2) {
+		/* column round: one quarter round per matrix column */
+		salsa_qr( 0,  4,  8, 12)
+		salsa_qr( 5,  9, 13,  1)
+		salsa_qr(10, 14,  2,  6)
+		salsa_qr(15,  3,  7, 11)
+
+		/* row round: one quarter round per matrix row */
+		salsa_qr( 0,  1,  2,  3)
+		salsa_qr( 5,  6,  7,  4)
+		salsa_qr(10, 11,  8,  9)
+		salsa_qr(15, 12, 13, 14)
+	}
+
+	/* feed-forward: state = input + mixed */
+	for (idx = 0; idx < 16; idx++)
+		state[idx] += w[idx];
+
+	#undef salsa_qr
+}
+
+#endif
+
diff --git a/scrypt-jane/scrypt-jane-pbkdf2.h b/scrypt-jane/scrypt-jane-pbkdf2.h
new file mode 100644
index 0000000..711e3d6
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-pbkdf2.h
@@ -0,0 +1,112 @@
+typedef struct scrypt_hmac_state_t { /* running HMAC: two independent hash states */
+	scrypt_hash_state inner, outer; /* inner absorbs (key ^ 0x36) then the message, outer absorbs (key ^ 0x5c) then the inner digest */
+} scrypt_hmac_state;
+
+
+static void
+scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
+	scrypt_hash_state st; /* one-shot helper: init, absorb m[0..mlen-1], write the digest to hash */
+	scrypt_hash_init(&st);
+	scrypt_hash_update(&st, m, mlen);
+	scrypt_hash_finish(&st, hash);
+}
+
+/* hmac */
+static void
+scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
+	/* prepare both hash states of an HMAC keyed with key[0..keylen-1] */
+	/* block-sized key buffer, zero filled so that a short key ends up zero padded */
+	uint8_t block_key[SCRYPT_HASH_BLOCK_SIZE] = {0};
+	uint8_t *cursor;
+	uint8_t *const stop = block_key + SCRYPT_HASH_BLOCK_SIZE;
+
+	/* a key longer than one block is replaced by its hash, anything else is used as-is */
+	if (keylen > SCRYPT_HASH_BLOCK_SIZE)
+		scrypt_hash(block_key, key, keylen);
+	else
+		memcpy(block_key, key, keylen);
+
+	/* inner = h((key ^ 0x36) || ...) */
+	scrypt_hash_init(&st->inner);
+	for (cursor = block_key; cursor != stop; cursor++)
+		*cursor ^= 0x36;
+	scrypt_hash_update(&st->inner, block_key, SCRYPT_HASH_BLOCK_SIZE);
+
+	/* outer = h((key ^ 0x5c) || ...) */
+	/* the buffer still carries the 0x36 mask, so one xor with (0x36 ^ 0x5c) swaps the masks */
+	scrypt_hash_init(&st->outer);
+	for (cursor = block_key; cursor != stop; cursor++)
+		*cursor ^= (0x36 ^ 0x5c);
+	scrypt_hash_update(&st->outer, block_key, SCRYPT_HASH_BLOCK_SIZE);
+
+	/* do not leave the masked key behind on the stack */
+	scrypt_ensure_zero(block_key, sizeof(block_key));
+}
+
+static void
+scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
+	/* h(inner || m...): message bytes only enter the inner hash; the outer hash is not touched here */
+	scrypt_hash_update(&st->inner, m, mlen);
+}
+
+static void
+scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
+	/* innerhash = h(inner || m): closes the inner hash that scrypt_hmac_update has been feeding */
+	scrypt_hash_digest innerhash;
+	scrypt_hash_finish(&st->inner, innerhash);
+	/* mac = h(outer || h(inner || m)) */
+	scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
+	scrypt_hash_finish(&st->outer, mac);
+	/* wipe the key-derived intermediate digest as well as the state, which must not be reused after this call */
+	scrypt_ensure_zero(innerhash, sizeof(innerhash));
+	scrypt_ensure_zero(st, sizeof(*st));
+}
+
+static void
+scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
+	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
+	scrypt_hash_digest ti, u;
+	uint8_t be[4];
+	uint32_t i, j, blocks;
+	uint64_t c;
+	/* PBKDF2 on top of scrypt_hmac_*: fills out[0..bytes-1], one digest-sized block at a time, N hmac iterations per block */
+	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
+
+	/* hmac(password, ...) */
+	scrypt_hmac_init(&hmac_pw, password, password_len);
+
+	/* hmac(password, salt...) */
+	hmac_pw_salt = hmac_pw;
+	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
+
+	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
+	for (i = 1; i <= blocks; i++) {
+		/* U1 = hmac(password, salt || be(i)) */
+		U32TO8_BE(be, i);
+		work = hmac_pw_salt; /* scrypt_hmac_finish wipes the state it is given, so always work on a copy */
+		scrypt_hmac_update(&work, be, 4);
+		scrypt_hmac_finish(&work, ti);
+		memcpy(u, ti, sizeof(u));
+
+		/* T[i] = U1 ^ U2 ^ U3... (U1 is done, so N - 1 more; counting from 1 keeps N == 0 from wrapping N - 1 to 2^64 - 1) */
+		for (c = 1; c < N; c++) {
+			/* UX = hmac(password, U{X-1}) */
+			work = hmac_pw;
+			scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
+			scrypt_hmac_finish(&work, u);
+
+			/* T[i] ^= UX */
+			for (j = 0; j < sizeof(u); j++)
+				ti[j] ^= u[j];
+		}
+
+		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes); /* the last block may be partial */
+		out += SCRYPT_HASH_DIGEST_SIZE;
+		bytes -= SCRYPT_HASH_DIGEST_SIZE; /* may wrap after a partial last block; the loop ends on i, so it is never read again */
+	}
+
+	scrypt_ensure_zero(ti, sizeof(ti));
+	scrypt_ensure_zero(u, sizeof(u));
+	scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
+	scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
+}
diff --git a/scrypt-jane/scrypt-jane-portable-x86.h b/scrypt-jane/scrypt-jane-portable-x86.h
new file mode 100644
index 0000000..03282fa
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-portable-x86.h
@@ -0,0 +1,364 @@
+#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) /* 32 bit x86: which inline asm code paths may be compiled */
+	#define X86ASM
+	/* gcc 2.95 royally screws up stack alignments on variables */
+	#if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) /* COMPILER_GCC is major * 10000 + minor * 100 + patch */
+		#define X86ASM_SSE
+		#define X86ASM_SSE2
+	#endif
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
+		#define X86ASM_SSSE3
+	#endif
+	#if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
+		#define X86ASM_AVX
+	#endif
+#endif
+
+#if defined(CPU_X86_64) && defined(COMPILER_GCC) /* 64 bit asm paths are enabled for gcc only */
+	#define X86_64ASM
+	#define X86_64ASM_SSE2
+	#if (COMPILER_GCC >= 40102)
+		#define X86_64ASM_SSSE3
+	#endif
+	#if (COMPILER_GCC >= 40400)
+		#define X86_64ASM_AVX
+	#endif
+#endif
+
+#if defined(COMPILER_MSVC) /* MSVC also gets the intrinsic code paths */
+	#define X86_INTRINSIC
+	#if defined(CPU_X86_64) || defined(X86ASM_SSE)
+		#define X86_INTRINSIC_SSE
+	#endif
+	#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
+		#define X86_INTRINSIC_SSE2
+	#endif
+	#if (COMPILER_MSVC >= 1400)
+		#define X86_INTRINSIC_SSSE3
+	#endif
+#endif
+
+/*
+	MSVC targeting x64: X86_64ASM is only ever defined for gcc, so flag
+	that the 64 bit code paths have to come from intrinsics. (This test
+	used to be written out twice; once is enough.)
+*/
+#if defined(COMPILER_MSVC) && defined(CPU_X86_64)
+	#define X86_64USE_INTRINSIC
+#endif
+
+#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) /* gcc: intrinsics are opt-in and follow the compiler's own feature macros */
+	#define X86_INTRINSIC
+	#if defined(__SSE__)
+		#define X86_INTRINSIC_SSE
+	#endif
+	#if defined(__SSE2__)
+		#define X86_INTRINSIC_SSE2
+	#endif
+	#if defined(__SSSE3__)
+		#define X86_INTRINSIC_SSSE3
+	#endif
+	#if defined(__AVX__)
+		#define X86_INTRINSIC_AVX
+	#endif
+#endif
+
+/* only use simd on windows (or SSE2 on gcc)! */
+#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) /* pull in the intrinsic headers and short type names for each enabled level */
+	#if defined(X86_INTRINSIC_SSE)
+		#define X86_INTRINSIC
+		#include <mmintrin.h>
+		#include <xmmintrin.h>
+		typedef __m64 qmm;
+		typedef __m128 xmm;
+		typedef __m128d xmmd;
+	#endif
+	#if defined(X86_INTRINSIC_SSE2)
+		#define X86_INTRINSIC_SSE2 /* repeats the definition the enclosing #if just tested */
+		#include <emmintrin.h>
+		typedef __m128i xmmi;
+	#endif
+	#if defined(X86_INTRINSIC_SSSE3)
+		#define X86_INTRINSIC_SSSE3 /* repeats the definition the enclosing #if just tested */
+		#include <tmmintrin.h>
+	#endif
+#endif
+
+
+#if defined(X86_INTRINSIC_SSE2) /* 16 byte constants viewable as scalars (u) or as one SSE2 vector (v) */
+	typedef union packedelem8_t {
+		uint8_t u[16];
+		xmmi v;	
+	} packedelem8;
+
+	typedef union packedelem32_t {
+		uint32_t u[4];
+		xmmi v;	
+	} packedelem32;
+
+	typedef union packedelem64_t {
+		uint64_t u[2];
+		xmmi v;	
+	} packedelem64;
+#else /* no xmmi type available: same names and size, plain integer views only */
+	typedef union packedelem8_t {
+		uint8_t u[16];
+		uint32_t dw[4];		
+	} packedelem8;
+
+	typedef union packedelem32_t {
+		uint32_t u[4];
+		uint8_t b[16];
+	} packedelem32;
+
+	typedef union packedelem64_t {
+		uint64_t u[2];
+		uint8_t b[16];
+	} packedelem64;
+#endif
+
+#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3) /* 16 byte aligned byte-index tables; presumably pshufb masks, their users are not in this file - TODO confirm */
+	const packedelem8 MM16 ssse3_rotr16_64bit      = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}};
+	const packedelem8 MM16 ssse3_rotl16_32bit      = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
+	const packedelem8 MM16 ssse3_rotl8_32bit       = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
+	const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}};
+#endif
+
+/*
+	x86 inline asm for gcc/msvc. usage:
+
+	asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
+	asm_naked_fn(name)
+		a1(..)
+		a2(.., ..)
+		a3(.., .., ..)
+		a1(ret)
+	asm_naked_fn_end(name)
+*/
+
+#if defined(X86ASM) || defined(X86_64ASM)
+/* aN(...) emits one instruction with N comma separated parts: an __asm block on MSVC, a string fragment on gcc */
+#if defined(COMPILER_MSVC)
+	#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
+	#define a1(x) __asm {x}
+	#define a2(x, y) __asm {x, y}
+	#define a3(x, y, z) __asm {x, y, z}
+	#define a4(x, y, z, w) __asm {x, y, z, w}
+	#define al(x) __asm {label##x:}
+	#define aj(x, y, z) __asm {x label##y}
+	#define asm_align8 a1(ALIGN 8)
+	#define asm_align16 a1(ALIGN 16)
+	/* MSVC: a naked function whose body is the sequence of __asm blocks */
+	#define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn
+	#define asm_naked_fn(fn) {
+	#define asm_naked_fn_end(fn) }
+#elif defined(COMPILER_GCC)
+	#define GNU_AS1(x) #x ";\n"
+	#define GNU_AS2(x, y) #x ", " #y ";\n"
+	#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
+	#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
+	#define GNU_ASL(x) "\n" #x ":\n"
+	#define GNU_ASJ(x, y, z) #x " " #y #z ";"
+	/* the extra macro level lets arguments that are macros themselves expand before GNU_ASn stringifies them */
+	#define a1(x) GNU_AS1(x)
+	#define a2(x, y) GNU_AS2(x, y)
+	#define a3(x, y, z) GNU_AS3(x, y, z)
+	#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
+	#define al(x) GNU_ASL(x)
+	#define aj(x, y, z) GNU_ASJ(x, y, z)
+	#define asm_align8 a1(.align 8)
+	#define asm_align16 a1(.align 16)
+	/* gcc: an extern prototype (closed by the leading ';' of asm_naked_fn) followed by a file-scope asm statement defining the label */
+	#define asm_naked_fn_proto(type, fn) extern type STDCALL fn
+	#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn)
+	#define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type  " #fn ",@function\n.size " #fn ",.-" #fn "\n" );
+	#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
+	#define asm_gcc_parms() ".att_syntax prefix;"
+	#define asm_gcc_trashed() __asm__ __volatile__("" :::
+	#define asm_gcc_end() );
+#else
+	need x86 asm
+#endif
+
+#endif /* X86ASM || X86_64ASM */
+
+
+#if defined(CPU_X86) || defined(CPU_X86_64)
+
+typedef enum cpu_flags_x86_t { /* one bit per instruction set extension, or-ed together by detect_cpu */
+	cpu_mmx = 1 << 0,
+	cpu_sse = 1 << 1,
+	cpu_sse2 = 1 << 2,
+	cpu_sse3 = 1 << 3,
+	cpu_ssse3 = 1 << 4,
+	cpu_sse4_1 = 1 << 5,
+	cpu_sse4_2 = 1 << 6,
+	cpu_avx = 1 << 7
+} cpu_flags_x86;
+
+typedef enum cpu_vendors_x86_t { /* cpu_nobody = vendor string matched neither of the two known ones */
+	cpu_nobody,
+	cpu_intel,
+	cpu_amd
+} cpu_vendors_x86;
+
+typedef struct x86_regs_t { /* cpuid results; get_cpuid stores eax, ebx, ecx, edx at offsets 0, 4, 8, 12, so the member order matters */
+	uint32_t eax, ebx, ecx, edx;
+} x86_regs;
+
+#if defined(X86ASM)
+asm_naked_fn_proto(int, has_cpuid)(void)
+asm_naked_fn(has_cpuid) /* returns 1 when bit 21 of EFLAGS (0x200000) can be toggled, 0 otherwise */
+	a1(pushfd)
+	a1(pop eax)
+	a2(mov ecx, eax) /* ecx = original EFLAGS */
+	a2(xor eax, 0x200000) /* flip bit 21 ... */
+	a1(push eax)
+	a1(popfd) /* ... try to write it back ... */
+	a1(pushfd)
+	a1(pop eax) /* ... and read what the cpu actually kept */
+	a2(xor eax, ecx) /* bit 21 set = the flip stuck */
+	a2(shr eax, 21)
+	a2(and eax, 1)
+	a1(push ecx)
+	a1(popfd) /* restore the original EFLAGS */
+	a1(ret)
+asm_naked_fn_end(has_cpuid)
+#endif /* X86ASM */
+
+
+static void NOINLINE
+get_cpuid(x86_regs *regs, uint32_t flags) {
+#if defined(COMPILER_MSVC)
+	__cpuid((int *)regs, (int)flags);
+#else
+	#if defined(CPU_X86_64)
+		#define cpuid_bx rbx
+	#else
+		#define cpuid_bx ebx
+	#endif
+	/* runs cpuid with eax = flags and stores eax, ebx, ecx, edx into *regs; (e/r)bx is saved by hand instead of being listed as a clobber */
+	asm_gcc()
+		a1(push cpuid_bx)
+		a1(cpuid)
+		a2(mov [%1 + 0], eax)
+		a2(mov [%1 + 4], ebx)
+		a2(mov [%1 + 8], ecx)
+		a2(mov [%1 + 12], edx)
+		a1(pop cpuid_bx)
+		asm_gcc_parms() : "+a"(flags) : "S"(regs)  : "%ecx", "%edx", "cc", "memory" /* "memory": the asm writes through regs, which the compiler cannot see otherwise */
+	asm_gcc_end()
+#endif
+}
+
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+static uint64_t NOINLINE
+get_xgetbv(uint32_t flags) { /* executes xgetbv with ecx = flags and returns edx:eax as one 64 bit value */
+#if defined(COMPILER_MSVC)
+	return _xgetbv(flags);
+#else
+	uint32_t lo, hi;
+	asm_gcc()
+		a1(xgetbv)
+		asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
+	asm_gcc_end()
+	return ((uint64_t)lo | ((uint64_t)hi << 32));
+#endif
+}
+#endif // AVX support
+
+#if defined(SCRYPT_TEST_SPEED)
+size_t cpu_detect_mask = (size_t)-1; /* and-ed into the result of detect_cpu; all ones = no restriction */
+#endif
+
+static size_t
+detect_cpu(void) {
+	union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
+	cpu_vendors_x86 vendor = cpu_nobody;
+	x86_regs regs;
+	uint32_t max_level;
+	size_t cpu_flags = 0;
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+	uint64_t xgetbv_flags;
+#endif
+	/* returns an or-ed set of cpu_flags_x86 bits for the running cpu, 0 when cpuid is not available */
+#if defined(CPU_X86)
+	if (!has_cpuid())
+		return cpu_flags;
+#endif
+	/* leaf 0: highest supported leaf in eax, vendor string in ebx, edx, ecx (in that order) */
+	get_cpuid(&regs, 0);
+	max_level = regs.eax;
+	vendor_string.i[0] = regs.ebx;
+	vendor_string.i[1] = regs.edx;
+	vendor_string.i[2] = regs.ecx;
+	/* the vendor is identified here but not used any further in this function */
+	if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
+		vendor = cpu_intel;
+	else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
+		vendor = cpu_amd;
+	
+	if (max_level & 0x00000500) {
+		/* "Intel P5 pre-B0" */
+		cpu_flags |= cpu_mmx;
+		return cpu_flags;
+	}
+
+	if (max_level < 1)
+		return cpu_flags;
+	/* leaf 1: feature bits in ecx and edx */
+	get_cpuid(&regs, 1);
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+	/* xsave/xrestore (osxsave, bit 27): only then may xgetbv be executed */
+	if (regs.ecx & (1 << 27)) {
+		xgetbv_flags = get_xgetbv(0);
+		if ((regs.ecx & (1 << 28)) && ((xgetbv_flags & 0x6) == 0x6)) cpu_flags |= cpu_avx; /* AVX needs the OS to save both XMM (bit 1) and YMM (bit 2) state */
+	}
+#endif
+	if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
+	if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1; /* bit 19 is SSE4.1; it used to be recorded as cpu_sse4_2 */
+	if (regs.ecx & (1 <<  9)) cpu_flags |= cpu_ssse3;
+	if (regs.ecx & (1      )) cpu_flags |= cpu_sse3;
+	if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
+	if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
+	if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
+	/* speed tests can mask features off to exercise the slower code paths */
+#if defined(SCRYPT_TEST_SPEED)
+	cpu_flags &= cpu_detect_mask;
+#endif
+
+	return cpu_flags;
+}
+
+#if defined(SCRYPT_TEST_SPEED)
+static const char *
+get_top_cpuflag_desc(size_t flag) {
+	/* strongest extension first: the first entry whose bit is set in flag supplies the name */
+	static const struct { size_t bit; const char *name; } ranked[] = {
+		{cpu_avx, "AVX"}, {cpu_sse4_2, "SSE4.2"}, {cpu_sse4_1, "SSE4.1"}, {cpu_ssse3, "SSSE3"}, {cpu_sse2, "SSE2"}, {cpu_sse, "SSE"}, {cpu_mmx, "MMX"}
+	};
+	size_t k;
+	for (k = 0; k < sizeof(ranked) / sizeof(ranked[0]); k++)
+		if (flag & ranked[k].bit) return ranked[k].name;
+	return "Basic";
+}
+#endif
+
+/* enable the highest system-wide option */
+#if defined(SCRYPT_CHOOSE_COMPILETIME) /* compile-time selection: drop every code path the compiler flags (__AVX__, __SSSE3__, __SSE2__) do not allow */
+	#if !defined(__AVX__)
+		#undef X86_64ASM_AVX
+		#undef X86ASM_AVX
+		#undef X86_INTRINSIC_AVX
+	#endif
+	#if !defined(__SSSE3__)
+		#undef X86_64ASM_SSSE3
+		#undef X86ASM_SSSE3
+		#undef X86_INTRINSIC_SSSE3
+	#endif
+	#if !defined(__SSE2__)
+		#undef X86_64ASM_SSE2
+		#undef X86ASM_SSE2
+		#undef X86_INTRINSIC_SSE2
+	#endif
+#endif
+
+#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
\ No newline at end of file
diff --git a/scrypt-jane/scrypt-jane-portable.h b/scrypt-jane/scrypt-jane-portable.h
new file mode 100644
index 0000000..33c8c2c
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-portable.h
@@ -0,0 +1,281 @@
+/* determine os */
+#if defined(_WIN32)	|| defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
+	#include <windows.h>
+	#include <wincrypt.h>
+	#define OS_WINDOWS
+#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
+	#include <sys/mman.h>
+	#include <sys/time.h>
+	#include <fcntl.h>
+
+	#define OS_SOLARIS
+#else
+	#include <sys/mman.h>
+	#include <sys/time.h>
+	#include <sys/param.h> /* need this to define BSD */
+	#include <unistd.h>
+	#include <fcntl.h>
+	/* everything else is treated as unix-like, refined below where possible */
+	#define OS_NIX
+	#if defined(__linux__)
+		#include <endian.h>
+		#define OS_LINUX
+	#elif defined(BSD)
+		#define OS_BSD
+		/* both macros must be defined for OS X (logical &&; this used to be a bitwise &) */
+		#if defined(MACOS_X) || (defined(__APPLE__) && defined(__MACH__))
+			#define OS_OSX
+		#elif defined(macintosh) || defined(Macintosh)
+			#define OS_MAC
+		#elif defined(__OpenBSD__)
+			#define OS_OPENBSD
+		#endif
+	#endif
+#endif
+
+
+/* determine compiler */
+#if defined(_MSC_VER)
+	#define COMPILER_MSVC _MSC_VER
+	#if ((COMPILER_MSVC > 1200) || defined(_mm_free))
+		#define COMPILER_MSVC6PP_AND_LATER
+	#endif
+	#if (COMPILER_MSVC >= 1500)
+		#define COMPILER_HAS_TMMINTRIN
+	#endif
+	/* warnings this code base triggers on purpose */
+	#pragma warning(disable : 4127) /* conditional expression is constant */
+	#pragma warning(disable : 4100) /* unreferenced formal parameter */
+	/* NOTE(review): <windows.h> is included earlier in this header on Windows, so this define comes after it */
+	#define _CRT_SECURE_NO_WARNINGS	
+	#include <float.h>
+	#include <stdlib.h> /* _rotl */
+	#include <intrin.h>
+
+	typedef unsigned char uint8_t;
+	typedef unsigned short uint16_t;
+	typedef unsigned int uint32_t;
+	typedef signed int int32_t;	
+	typedef unsigned __int64 uint64_t;
+	typedef signed __int64 int64_t;
+
+	#define ROTL32(a,b) _rotl(a,b)
+	#define ROTR32(a,b) _rotr(a,b)
+	#define ROTL64(a,b) _rotl64(a,b)
+	#define ROTR64(a,b) _rotr64(a,b)
+	#undef NOINLINE
+	#define NOINLINE __declspec(noinline)
+	#undef INLINE
+	#define INLINE __forceinline
+	#undef FASTCALL
+	#define FASTCALL __fastcall
+	#undef CDECL
+	#define CDECL __cdecl
+	#undef STDCALL
+	#define STDCALL __stdcall
+	#undef NAKED
+	#define NAKED __declspec(naked)
+	#define MM16 __declspec(align(16))
+#endif
+#if defined(__ICC)
+	#define COMPILER_INTEL
+#endif
+#if defined(__GNUC__)
+	#if (__GNUC__ >= 3)
+		#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
+	#else
+		#define COMPILER_GCC_PATCHLEVEL 0
+	#endif
+	#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) /* e.g. 4.1.2 -> 40102 */
+	#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) /* b is parenthesized everywhere so an expression such as n + 1 rotates correctly; valid for 0 < b < 32 */
+	#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
+	#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - (b)))) /* valid for 0 < b < 64 */
+	#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - (b))))
+	#undef NOINLINE
+	#if (COMPILER_GCC >= 30000)
+		#define NOINLINE __attribute__((noinline))
+	#else
+		#define NOINLINE
+	#endif
+	#undef INLINE
+	#if (COMPILER_GCC >= 30000)
+		#define INLINE __attribute__((always_inline))
+	#else
+		#define INLINE inline
+	#endif
+	#undef FASTCALL
+	#if (COMPILER_GCC >= 30400)
+		#define FASTCALL __attribute__((fastcall))
+	#else
+		#define FASTCALL
+	#endif
+	#undef CDECL
+	#define CDECL __attribute__((cdecl))
+	#undef STDCALL
+	#define STDCALL __attribute__((stdcall))
+	#define MM16 __attribute__((aligned(16)))
+	#include <stdint.h>
+#endif
+#if defined(__MINGW32__) || defined(__MINGW64__)
+	#define COMPILER_MINGW
+#endif
+#if defined(__PATHCC__)
+	#define COMPILER_PATHCC
+#endif
+/* OPTIONAL_INLINE is defined right before it is tested, so as written it always ends up as INLINE */
+#define OPTIONAL_INLINE
+#if defined(OPTIONAL_INLINE)
+	#undef OPTIONAL_INLINE
+	#define OPTIONAL_INLINE INLINE
+#else
+	#define OPTIONAL_INLINE
+#endif
+
+#define CRYPTO_FN NOINLINE STDCALL
+
+/* determine cpu */
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
+	#define CPU_X86_64
+#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) /* CPU_X86 carries the family level: 500, 400 or 300 */
+	#define CPU_X86 500
+#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
+	#define CPU_X86 400
+#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
+	#define CPU_X86 300
+#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
+	#define CPU_IA64
+#endif
+
+#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
+	#define CPU_SPARC
+	#if defined(__sparcv9)
+		#define CPU_SPARC64
+	#endif
+#endif
+/* on 64 bit targets the calling convention macros are emptied */
+#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
+	#define CPU_64BITS
+	#undef FASTCALL
+	#define FASTCALL
+	#undef CDECL
+	#define CDECL
+	#undef STDCALL
+	#define STDCALL
+#endif
+
+#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
+	#define CPU_PPC
+	#if defined(_ARCH_PWR7)
+		#define CPU_POWER7
+	#elif defined(__64BIT__)
+		#define CPU_PPC64
+	#else
+		#define CPU_PPC32
+	#endif
+#endif
+
+#if defined(__hppa__) || defined(__hppa)
+	#define CPU_HPPA
+#endif
+
+#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
+	#define CPU_ALPHA
+#endif
+
+/* endian */
+
+#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
+	 (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
+	 (defined(CPU_X86) || defined(CPU_X86_64)) || \
+	 (defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
+#define CPU_LE
+#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
+	   (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
+	   (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
+#define CPU_BE
+#else
+	/* unknown endian! neither CPU_LE nor CPU_BE is defined; scrypt_romix_convert_endian then decides with a runtime test */
+#endif
+
+
+#define U8TO32_BE(p)                                            \
+	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) |  \
+	 ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
+/* U8TO* read an integer from a byte array in the named byte order; they are expressions */
+#define U8TO32_LE(p)                                            \
+	(((uint32_t)((p)[0])      ) | ((uint32_t)((p)[1]) <<  8) |  \
+	 ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
+/* *TO8 write an integer to a byte array; they expand to several statements, so brace them when used as an if/else/loop body */
+#define U32TO8_BE(p, v)                                           \
+	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
+	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
+
+#define U32TO8_LE(p, v)                                           \
+	(p)[0] = (uint8_t)((v)      ); (p)[1] = (uint8_t)((v) >>  8); \
+	(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
+
+#define U8TO64_BE(p)                                                  \
+	(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
+
+#define U8TO64_LE(p)                                                  \
+	(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
+
+#define U64TO8_BE(p, v)                        \
+	U32TO8_BE((p),     (uint32_t)((v) >> 32)); \
+	U32TO8_BE((p) + 4, (uint32_t)((v)      ));
+
+#define U64TO8_LE(p, v)                        \
+	U32TO8_LE((p),     (uint32_t)((v)      )); \
+	U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
+/* in-place byte reversal of an lvalue: swap neighbouring bytes, then exchange the halves */
+#define U32_SWAP(v) {                                             \
+	(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF );  \
+    (v) = ((v) << 16) | ((v) >> 16);                              \
+}
+/* same idea for 64 bits: bytes, then 16 bit groups, then the 32 bit halves */
+#define U64_SWAP(v) {                                                                       \
+	(v) = (((v) <<  8) & 0xFF00FF00FF00FF00ull ) | (((v) >>  8) & 0x00FF00FF00FF00FFull );  \
+	(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull );  \
+    (v) = ((v) << 32) | ((v) >> 32);                                                        \
+}
+
+static int
+scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
+	uint32_t acc = 0; /* OR of all byte differences, so it stays within 0..255 */
+	for (; len != 0; len--, x++, y++)
+		acc |= (uint32_t)(*x ^ *y);
+	return (int)(((acc - 1) >> 8) & 1); /* 1 only when acc == 0: 0 - 1 wraps around and leaves bit 8 set */
+}
+
+void
+scrypt_ensure_zero(void *p, size_t len) {
+	/*
+		Overwrites len bytes at p with zeros, using forms (intrinsic,
+		volatile asm, volatile pointer) meant to keep the stores from
+		being optimized away when the buffer is about to go out of scope.
+	*/
+#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
+	__stosb((unsigned char *)p, 0, len);
+#elif ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_GCC))
+	/*
+		rep stosb advances (e/r)di and counts (e/r)cx down to zero, so
+		both are declared read-write instead of being saved with
+		push/pop: the compiler is told what really changes and nothing
+		is pushed below the stack pointer, where x86-64 keeps its red
+		zone. The same statement now serves 32 and 64 bit builds.
+	*/
+	__asm__ __volatile__(
+		"rep stosb;\n"
+		: "+D"(p), "+c"(len) : "a"(0)
+		: "cc", "memory"
+	);
+#else
+	volatile uint8_t *b = (volatile uint8_t *)p;
+	size_t i;
+	for (i = 0; i < len; i++)
+		b[i] = 0;
+#endif
+}
+
+#include "scrypt-jane-portable-x86.h"
+
diff --git a/scrypt-jane/scrypt-jane-romix-basic.h b/scrypt-jane/scrypt-jane-romix-basic.h
new file mode 100644
index 0000000..ca1df02
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-romix-basic.h
@@ -0,0 +1,67 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+/* pointer to a ROMix implementation, as returned by scrypt_getROMix when the implementation is picked through cpu detection; only declared when SCRYPT_CHOOSE_COMPILETIME is not defined */
+typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
+#endif
+
+/* romix pre/post nop function: same signature as the other pre/post hooks, ignores both arguments and leaves blocks untouched */
+static void STDCALL
+scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
+}
+
+/* romix pre/post hook: byte-swaps every word of the nblocks blocks in place on big-endian hosts; does nothing when CPU_LE is defined or the runtime probe reports little-endian */
+static void STDCALL
+scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
+#if !defined(CPU_LE)
+	/* the bytes {1,0} read back as a uint16_t give 0x0100 only on a big-endian host */
+	static const union { uint8_t b[2]; uint16_t w; } probe = {{1,0}};
+	size_t remaining;
+	if (probe.w != 0x100)
+		return;
+	for (remaining = nblocks * SCRYPT_BLOCK_WORDS; remaining != 0; remaining--, blocks++) {
+		SCRYPT_WORD_ENDIAN_SWAP(*blocks);
+	}
+#endif
+}
+
+/* function pointer types taken by scrypt_test_mix_instance: chunkmixfn is the chunk mixer under test, blockfixfn is a pre/post fixup run over nblocks blocks */
+typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
+typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
+/* Self-test of one chunkmix implementation: runs prefn, mixfn (no xor input) and postfn over a fixed pattern with r = 2 and compares the last 16 output bytes with expected. Returns scrypt_verify's result (1 on match, 0 otherwise). */
+static int
+scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
+	/* r = 2, so a chunk is 2 * r = 4 blocks = 4 * SCRYPT_BLOCK_WORDS words */
+	const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
+	const size_t tail_words = 16 / sizeof(scrypt_mix_word_t);
+	scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], pattern;
+	uint8_t tail[16];
+	size_t w;
+	/* deterministic input: word w holds w, OR-ed with itself shifted left by 8 and then by 16 bits */
+	for (w = 0; w < words; w++) {
+		pattern = (scrypt_mix_word_t)w;
+		pattern |= pattern << 8;
+		pattern |= pattern << 16;
+		chunk[0][w] = pattern;
+	}
+
+	prefn(chunk[0], blocks);
+	mixfn(chunk[1], chunk[0], NULL, r);
+	postfn(chunk[1], blocks);
+
+	/* serialize the last 16 bytes (tail_words words) of the mixed chunk */
+	for (w = 0; w < tail_words; w++) {
+		SCRYPT_WORDTO8_LE(tail + (w * sizeof(scrypt_mix_word_t)), chunk[1][words - tail_words + w]);
+	}
+	return scrypt_verify(expected, tail, 16);
+}
+
+/* returns a pointer to item i, where item is len scrypt_mix_word_t's long; the offset is computed in size_t rather than scrypt_mix_word_t, so a 32-bit word type does not wrap i * len on 64-bit targets */
+static scrypt_mix_word_t *
+scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
+	return base + ((size_t)i * (size_t)len);
+}
+
+/* returns a pointer to block i, i.e. the block that starts i * SCRYPT_BLOCK_WORDS words past base */
+static scrypt_mix_word_t *
+scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
+	return &base[i * SCRYPT_BLOCK_WORDS];
+}
diff --git a/scrypt-jane/scrypt-jane-romix-template.h b/scrypt-jane/scrypt-jane-romix-template.h
new file mode 100644
index 0000000..2fd7674
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-romix-template.h
@@ -0,0 +1,118 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
+
+#if defined(SCRYPT_CHOOSE_COMPILETIME)
+#undef SCRYPT_ROMIX_FN
+#define SCRYPT_ROMIX_FN scrypt_ROMix
+#endif
+
+#undef SCRYPT_HAVE_ROMIX
+#define SCRYPT_HAVE_ROMIX
+
+#if !defined(SCRYPT_CHUNKMIX_FN)
+
+#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
+
+/*
+	Bout = ChunkMix(Bin), or ChunkMix(Bin ^ Bxor) when Bxor is not NULL
+	Bin/Bxor are only read; Bout must not overlap them, since output blocks are stored while later input blocks are still unread
+	2*r: number of blocks in the chunk
+*/
+static void STDCALL
+SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
+	scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block;
+	uint32_t i, j, blocksPerChunk = r * 2, half = 0;
+	/* X: running block state; half: flips between 0 and r every round to select the output half */
+	/* 1: X = B_{2r - 1} */
+	block = scrypt_block(Bin, blocksPerChunk - 1);
+	for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
+		X[i] = block[i];
+	/* fold in the same block of the optional second input */
+	if (Bxor) {
+		block = scrypt_block(Bxor, blocksPerChunk - 1);
+		for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
+			X[i] ^= block[i];
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		block = scrypt_block(Bin, i);
+		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+			X[j] ^= block[j];
+		/* with Bxor present the round consumes Bin_i ^ Bxor_i */
+		if (Bxor) {
+			block = scrypt_block(Bxor, i);
+			for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+				X[j] ^= block[j];
+		}
+		SCRYPT_MIX_FN(X); /* H: the block mix function selected by the including header, applied to X in place */
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		block = scrypt_block(Bout, (i / 2) + half); /* even i -> slot i/2, odd i -> slot r + i/2 */
+		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+			block[j] = X[j];
+	}
+}
+#endif
+
+/*
+	X = ROMix(X)
+	X is mixed in place; Y and all N chunks of V are overwritten as working storage
+	X: chunk to mix
+	Y: scratch chunk
+	N: number of rounds; the index is reduced with & (N - 1), which matches % N only for a power of two, and the second loop does two rounds per pass, so N is expected to be even
+	V[N]: array of chunks to randomly index in to
+	2*r: number of blocks in a chunk
+*/
+
+static void NOINLINE FASTCALL
+SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
+	uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
+	scrypt_mix_word_t *block = V;
+	/* pre-hook over the 2*r blocks of X (endian conversion or SIMD word shuffling, depending on the instantiation) */
+	SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
+
+	/* 1: X = B */
+	/* implicit */
+
+	/* 2: for i = 0 to N - 1 do */
+	memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); /* V_0 = X */
+	for (i = 0; i < N - 1; i++, block += chunkWords) {
+		/* 3: V_i = X (already in place: block points at V_i) */
+		/* 4: X = H(X), written straight into the V_{i+1} slot */
+		SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
+	}
+	SCRYPT_CHUNKMIX_FN(X, block, NULL, r); /* last step of the fill loop: X = H(V_{N-1}) */
+
+	/* 6: for i = 0 to N - 1 do (two rounds per pass, X and Y swapping roles so no copy is needed) */
+	for (i = 0; i < N; i += 2) {
+		/* 7: j = Integerify(X) % N (first word of the last block, masked with N - 1) */
+		j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
+
+		/* 8: Y = H(X ^ V_j) */
+		SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
+
+		/* 7: j = Integerify(Y) % N */
+		j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
+
+		/* 8: X = H(Y ^ V_j) */
+		SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
+	}
+
+	/* 10: B' = X */
+	/* implicit */
+	/* post-hook over X, counterpart of the pre-hook above */
+	SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
+}
+
+#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
+
+
+#undef SCRYPT_CHUNKMIX_FN
+#undef SCRYPT_ROMIX_FN
+#undef SCRYPT_MIX_FN
+#undef SCRYPT_ROMIX_TANGLE_FN
+#undef SCRYPT_ROMIX_UNTANGLE_FN
+
diff --git a/scrypt-jane/scrypt-jane-romix.h b/scrypt-jane/scrypt-jane-romix.h
new file mode 100644
index 0000000..faa655a
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-romix.h
@@ -0,0 +1,27 @@
+#if defined(SCRYPT_CHACHA) /* pull in exactly one mix implementation, chosen by the SCRYPT_CHACHA / SCRYPT_SALSA / SCRYPT_SALSA64 macro */
+#include "scrypt-jane-chacha.h"
+#elif defined(SCRYPT_SALSA)
+#include "scrypt-jane-salsa.h"
+#elif defined(SCRYPT_SALSA64)
+#include "scrypt-jane-salsa64.h"
+#else /* no mix function selected: declare placeholder types and stubs, then stop the build at the #error below */
+	#define SCRYPT_MIX_BASE "ERROR"
+	typedef uint32_t scrypt_mix_word_t;
+	#define SCRYPT_WORDTO8_LE U32TO8_LE
+	#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
+	#define SCRYPT_BLOCK_BYTES 64
+	#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+	#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+		static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
+		static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; }
+	#else
+		static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
+	#endif
+	static int scrypt_test_mix() { return 0; } /* placeholder self-test: always returns 0 (failure) */
+	#error must define a mix function!
+#endif
+/* builds without SCRYPT_CHOOSE_COMPILETIME: SCRYPT_MIX is (re)defined as SCRYPT_MIX_BASE */
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+#undef SCRYPT_MIX
+#define SCRYPT_MIX SCRYPT_MIX_BASE
+#endif
diff --git a/scrypt-jane/scrypt-jane-salsa.h b/scrypt-jane/scrypt-jane-salsa.h
new file mode 100644
index 0000000..0c1604b
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-salsa.h
@@ -0,0 +1,106 @@
+#define SCRYPT_MIX_BASE "Salsa20/8"
+
+typedef uint32_t scrypt_mix_word_t;
+
+#define SCRYPT_WORDTO8_LE U32TO8_LE
+#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
+
+#define SCRYPT_BLOCK_BYTES 64
+#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+
+/* must have these here in case block bytes is ever != 64 */
+#include "scrypt-jane-romix-basic.h"
+
+#include "scrypt-jane-mix_salsa-avx.h"
+#include "scrypt-jane-mix_salsa-sse2.h"
+#include "scrypt-jane-mix_salsa.h"
+/* One romix template instantiation per implementation; the template #undefs SCRYPT_CHUNKMIX_FN, SCRYPT_ROMIX_FN, SCRYPT_MIX_FN and both TANGLE macros at its end, so each section defines its own set. */
+#if defined(SCRYPT_SALSA_AVX)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
+	#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA_SSE2)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
+	#define SCRYPT_MIX_FN salsa_core_sse2
+	#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+/* cpu agnostic: SCRYPT_CHUNKMIX_FN is not defined here, which is the case in which the template defines scrypt_ChunkMix_basic itself */
+#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
+#define SCRYPT_MIX_FN salsa_core_basic
+#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
+#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
+#include "scrypt-jane-romix-template.h"
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+/* returns the ROMix to use on this cpu: AVX if compiled in and detected, otherwise SSE2 under the same conditions, otherwise the basic one (candidates are tested lowest priority first so a later match overrides an earlier one) */
+static scrypt_ROMixfn
+scrypt_getROMix() {
+	scrypt_ROMixfn chosen = scrypt_ROMix_basic;
+	size_t cpuflags = detect_cpu();
+
+#if defined(SCRYPT_SALSA_SSE2)
+	if (cpuflags & cpu_sse2)
+		chosen = scrypt_ROMix_sse2;
+#endif
+
+#if defined(SCRYPT_SALSA_AVX)
+	if (cpuflags & cpu_avx)
+		chosen = scrypt_ROMix_avx;
+#endif
+
+	return chosen;
+}
+#endif
+
+
+#if defined(SCRYPT_TEST_SPEED)
+/* bitmask of the cpu_* flags for which an optimized Salsa20/8 ROMix was compiled in; 0 when only the basic one exists */
+static size_t
+available_implementations() {
+	size_t compiled = 0;
+#if defined(SCRYPT_SALSA_SSE2)
+	compiled |= cpu_sse2;
+#endif
+
+#if defined(SCRYPT_SALSA_AVX)
+	compiled |= cpu_avx;
+#endif
+
+	return compiled;
+}
+#endif
+
+/* runs the chunkmix self-test on every compiled-in implementation this cpu supports; returns 1 only if each one reproduces the expected bytes, 0 otherwise */
+static int
+scrypt_test_mix() {
+	/* last 16 output bytes that scrypt_test_mix_instance must produce for its fixed input */
+	static const uint8_t expected[16] = {
+		0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66,
+	};
+	size_t cpuflags = detect_cpu();
+	int passed = 1;
+
+#if defined(SCRYPT_SALSA_AVX)
+	if (cpuflags & cpu_avx)
+		passed &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA_SSE2)
+	if (cpuflags & cpu_sse2)
+		passed &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA_BASIC)
+	passed &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
+#endif
+
+	return passed;
+}
diff --git a/scrypt-jane/scrypt-jane-test-vectors.h b/scrypt-jane/scrypt-jane-test-vectors.h
new file mode 100644
index 0000000..a1e4c61
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-test-vectors.h
@@ -0,0 +1,261 @@
+typedef struct scrypt_test_setting_t { /* inputs of one self-test case */
+	const char *pw, *salt; /* password and salt, given as C strings */
+	uint8_t Nfactor, rfactor, pfactor; /* the three scrypt cost settings; NOTE(review): how they map to N, r and p is decided by the caller, not visible here */
+} scrypt_test_setting;
+/* self-test cases; the all-zero last entry (pw == NULL) presumably ends the list, and entries presumably pair up by index with the rows of post_vectors - confirm against the test loop */
+static const scrypt_test_setting post_settings[] = {
+	{"", "", 3, 0, 0},
+	{"password", "NaCl", 9, 3, 4},
+	{0}
+};
+
+#if defined(SCRYPT_SHA256)
+	#if defined(SCRYPT_SALSA)
+		/* sha256 + salsa20/8, the only 'official' test vectors! */
+		static const uint8_t post_vectors[][64] = {
+			{0x77,0xd6,0x57,0x62,0x38,0x65,0x7b,0x20,0x3b,0x19,0xca,0x42,0xc1,0x8a,0x04,0x97,
+			 0xf1,0x6b,0x48,0x44,0xe3,0x07,0x4a,0xe8,0xdf,0xdf,0xfa,0x3f,0xed,0xe2,0x14,0x42,
+			 0xfc,0xd0,0x06,0x9d,0xed,0x09,0x48,0xf8,0x32,0x6a,0x75,0x3a,0x0f,0xc8,0x1f,0x17,
+			 0xe8,0xd3,0xe0,0xfb,0x2e,0x0d,0x36,0x28,0xcf,0x35,0xe2,0x0c,0x38,0xd1,0x89,0x06},
+			{0xfd,0xba,0xbe,0x1c,0x9d,0x34,0x72,0x00,0x78,0x56,0xe7,0x19,0x0d,0x01,0xe9,0xfe,
+			 0x7c,0x6a,0xd7,0xcb,0xc8,0x23,0x78,0x30,0xe7,0x73,0x76,0x63,0x4b,0x37,0x31,0x62,
+			 0x2e,0xaf,0x30,0xd9,0x2e,0x22,0xa3,0x88,0x6f,0xf1,0x09,0x27,0x9d,0x98,0x30,0xda,
+			 0xc7,0x27,0xaf,0xb9,0x4a,0x83,0xee,0x6d,0x83,0x60,0xcb,0xdf,0xa2,0xcc,0x06,0x40}
+		};
+	#elif defined(SCRYPT_CHACHA)
+		static const uint8_t post_vectors[][64] = {
+			{0xef,0x8f,0x44,0x8f,0xc3,0xef,0x78,0x13,0xb2,0x26,0xa7,0x2a,0x40,0xa1,0x98,0x7f,
+			 0xc8,0x7f,0x0d,0x5f,0x40,0x66,0xa2,0x05,0x07,0x4f,0xc7,0xac,0x3b,0x47,0x07,0x0c,
+			 0xf5,0x20,0x46,0x76,0x20,0x7b,0xee,0x51,0x6d,0x5f,0xfa,0x9c,0x27,0xac,0xa9,0x36,
+			 0x62,0xbd,0xde,0x0b,0xa3,0xc0,0x66,0x84,0xde,0x82,0xd0,0x1a,0xb4,0xd1,0xb5,0xfe},
+			{0xf1,0x94,0xf7,0x5f,0x15,0x12,0x10,0x4d,0x6e,0xfb,0x04,0x8c,0x35,0xc4,0x51,0xb6,
+			 0x11,0x04,0xa7,0x9b,0xb0,0x46,0xaf,0x7b,0x47,0x39,0xf0,0xac,0xb2,0x8a,0xfa,0x45,
+			 0x09,0x86,0x8f,0x10,0x4b,0xc6,0xee,0x00,0x11,0x38,0x73,0x7a,0x6a,0xd8,0x25,0x67,
+			 0x85,0xa4,0x10,0x4e,0xa9,0x2f,0x15,0xfe,0xcf,0x63,0xe1,0xe8,0xcf,0xab,0xe8,0xbd}
+		};
+	#elif defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0xf4,0x87,0x29,0xf4,0xc3,0x31,0x8c,0xe8,0xdf,0xe5,0xd8,0x73,0xff,0xca,0x32,0xcf,
+			 0xd8,0xac,0xe7,0xf7,0x15,0xda,0x84,0x41,0x60,0x23,0x26,0x4a,0xc8,0x3e,0xee,0xa6,
+			 0xa5,0x6e,0x52,0xd6,0x64,0x55,0x16,0x31,0x3e,0x66,0x7b,0x65,0xd5,0xe2,0xc9,0x95,
+			 0x1b,0xf0,0x81,0x40,0xb7,0x2f,0xff,0xa6,0xe6,0x02,0xcc,0x63,0x08,0x4a,0x74,0x31},
+			{0x7a,0xd8,0xad,0x02,0x9c,0xa5,0xf4,0x42,0x6a,0x29,0xd2,0xb5,0x53,0xf1,0x6d,0x1d,
+			 0x25,0xc8,0x70,0x48,0x80,0xb9,0xa3,0xf6,0x94,0xf8,0xfa,0xb8,0x52,0x42,0xcd,0x14,
+			 0x26,0x46,0x28,0x06,0xc7,0xf6,0x1f,0xa7,0x89,0x6d,0xc5,0xa0,0x36,0xcc,0xde,0xcb,
+			 0x73,0x0b,0xa4,0xe2,0xd3,0xd1,0x44,0x06,0x35,0x08,0xe0,0x35,0x5b,0xf8,0xd7,0xe7}
+		};
+	#endif
+#elif defined(SCRYPT_SHA512)
+	#if defined(SCRYPT_SALSA)
+		static const uint8_t post_vectors[][64] = {
+			{0xae,0x54,0xe7,0x74,0xe4,0x51,0x6b,0x0f,0xe1,0xe7,0x28,0x03,0x17,0xe4,0x8c,0xfa,
+			 0x2f,0x66,0x55,0x7f,0xdc,0x3b,0x40,0xab,0x47,0x84,0xc9,0x63,0x36,0x07,0x9d,0xe5,
+			 0x86,0x43,0x95,0x89,0xb6,0xc0,0x6c,0x72,0x64,0x00,0xc1,0x2a,0xd7,0x69,0x21,0x92,
+			 0x8e,0xba,0xa4,0x59,0x9f,0x00,0x14,0x3a,0x7c,0x12,0x58,0x91,0x09,0xa0,0x32,0xfe},
+			{0xc5,0xb3,0xd6,0xea,0x0a,0x4b,0x1e,0xcc,0x40,0x00,0xe5,0x98,0x5c,0xdc,0x06,0x06,
+			 0x78,0x34,0x92,0x16,0xcf,0xe4,0x9f,0x03,0x96,0x2d,0x41,0x35,0x00,0x9b,0xff,0x74,
+			 0x60,0x19,0x6e,0xe6,0xa6,0x46,0xf7,0x37,0xcb,0xfa,0xd0,0x9f,0x80,0x72,0x2e,0x85,
+			 0x13,0x3e,0x1a,0x91,0x90,0x53,0xa1,0x33,0x85,0x51,0xdc,0x62,0x1c,0x0e,0x4d,0x30}
+		};
+	#elif defined(SCRYPT_CHACHA)
+		static const uint8_t post_vectors[][64] = {
+			{0xe2,0x05,0x7c,0x44,0xf9,0x55,0x9f,0x64,0xbe,0xd5,0x7f,0x85,0x69,0xc7,0x8c,0x7f,
+			 0x2b,0x91,0xd6,0x9a,0x6c,0xf8,0x57,0x55,0x61,0x25,0x3d,0xee,0xb8,0xd5,0x8c,0xdc,
+			 0x2d,0xd5,0x53,0x84,0x8c,0x06,0xaa,0x37,0x77,0xa6,0xf0,0xf1,0x35,0xfe,0xb5,0xcb,
+			 0x61,0xd7,0x2c,0x67,0xf3,0x7e,0x8a,0x1b,0x04,0xa3,0xa3,0x43,0xa2,0xb2,0x29,0xf2},
+			{0x82,0xda,0x29,0xb2,0x08,0x27,0xfc,0x78,0x22,0xc4,0xb8,0x7e,0xbc,0x36,0xcf,0xcd,
+			 0x17,0x4b,0xa1,0x30,0x16,0x4a,0x25,0x70,0xc7,0xcb,0xe0,0x2b,0x56,0xd3,0x16,0x4e,
+			 0x85,0xb6,0x84,0xe7,0x9b,0x7f,0x8b,0xb5,0x94,0x33,0xcf,0x33,0x44,0x65,0xc8,0xa1,
+			 0x46,0xf9,0xf5,0xfc,0x74,0x29,0x7e,0xd5,0x46,0xec,0xbd,0x95,0xc1,0x80,0x24,0xe4}
+		};
+	#elif defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0xa6,0xcb,0x77,0x9a,0x64,0x1f,0x95,0x02,0x53,0xe7,0x5c,0x78,0xdb,0xa3,0x43,0xff,
+			 0xbe,0x10,0x4c,0x7b,0xe4,0xe1,0x91,0xcf,0x67,0x69,0x5a,0x2c,0x12,0xd6,0x99,0x49,
+			 0x92,0xfd,0x5a,0xaa,0x12,0x4c,0x2e,0xf6,0x95,0x46,0x8f,0x5e,0x77,0x62,0x16,0x29,
+			 0xdb,0xe7,0xab,0x02,0x2b,0x9c,0x35,0x03,0xf8,0xd4,0x04,0x7d,0x2d,0x73,0x85,0xf1},
+			{0x54,0xb7,0xca,0xbb,0xaf,0x0f,0xb0,0x5f,0xb7,0x10,0x63,0x48,0xb3,0x15,0xd8,0xb5,
+			 0x62,0x64,0x89,0x6a,0x59,0xc6,0x0f,0x86,0x96,0x38,0xf0,0xcf,0xd4,0x62,0x90,0x61,
+			 0x7d,0xce,0xd6,0x13,0x85,0x67,0x4a,0xf5,0x32,0x03,0x74,0x30,0x0b,0x5a,0x2f,0x86,
+			 0x82,0x6e,0x0c,0x3e,0x40,0x7a,0xde,0xbe,0x42,0x6e,0x80,0x2b,0xaf,0xdb,0xcc,0x94}
+		};
+	#endif
+#elif defined(SCRYPT_BLAKE512)
+	#if defined(SCRYPT_SALSA)
+		static const uint8_t post_vectors[][64] = {
+			{0x4a,0x48,0xb3,0xfa,0xdc,0xb0,0xb8,0xdb,0x54,0xee,0xf3,0x5c,0x27,0x65,0x6c,0x20,
+			 0xab,0x61,0x9a,0x5b,0xd5,0x1d,0xd9,0x95,0xab,0x88,0x0e,0x4d,0x1e,0x71,0x2f,0x11,
+			 0x43,0x2e,0xef,0x23,0xca,0x8a,0x49,0x3b,0x11,0x38,0xa5,0x28,0x61,0x2f,0xb7,0x89,
+			 0x5d,0xef,0x42,0x4c,0xc1,0x74,0xea,0x8a,0x56,0xbe,0x4a,0x82,0x76,0x15,0x1a,0x87},
+			{0x96,0x24,0xbf,0x40,0xeb,0x03,0x8e,0xfe,0xc0,0xd5,0xa4,0x81,0x85,0x7b,0x09,0x88,
+			 0x52,0xb5,0xcb,0xc4,0x48,0xe1,0xb9,0x1d,0x3f,0x8b,0x3a,0xc6,0x38,0x32,0xc7,0x55,
+			 0x30,0x28,0x7a,0x42,0xa9,0x5d,0x54,0x33,0x62,0xf3,0xd9,0x3c,0x96,0x40,0xd1,0x80,
+			 0xe4,0x0e,0x7e,0xf0,0x64,0x53,0xfe,0x7b,0xd7,0x15,0xba,0xad,0x16,0x80,0x01,0xb5}
+		};
+	#elif defined(SCRYPT_CHACHA)
+		static const uint8_t post_vectors[][64] = {
+			{0x45,0x42,0x22,0x31,0x26,0x13,0x5f,0x94,0xa4,0x00,0x04,0x47,0xe8,0x50,0x6d,0xd6,
+			 0xdd,0xd5,0x08,0xd4,0x90,0x64,0xe0,0x59,0x70,0x46,0xff,0xfc,0x29,0xb3,0x6a,0xc9,
+			 0x4d,0x45,0x97,0x95,0xa8,0xf0,0x53,0xe7,0xee,0x4b,0x6b,0x5d,0x1e,0xa5,0xb2,0x58,
+			 0x4b,0x93,0xc9,0x89,0x4c,0xa8,0xab,0x03,0x74,0x38,0xbd,0x54,0x97,0x6b,0xab,0x4a},
+			{0x4b,0x4a,0x63,0x96,0x73,0x34,0x9f,0x39,0x64,0x51,0x0e,0x2e,0x3b,0x07,0xd5,0x1c,
+			 0xd2,0xf7,0xce,0x60,0xab,0xac,0x89,0xa4,0x16,0x0c,0x58,0x82,0xb3,0xd3,0x25,0x5b,
+			 0xd5,0x62,0x32,0xf4,0x86,0x5d,0xb2,0x4b,0xbf,0x8e,0xc6,0xc0,0xac,0x40,0x48,0xb4,
+			 0x69,0x08,0xba,0x40,0x4b,0x07,0x2a,0x13,0x9c,0x98,0x3b,0x8b,0x20,0x0c,0xac,0x9e}
+		};
+	#elif defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0xcb,0x4b,0xc2,0xd1,0xf4,0x77,0x32,0x3c,0x42,0x9d,0xf7,0x7d,0x1f,0x22,0x64,0xa4,
+			 0xe2,0x88,0x30,0x2d,0x54,0x9d,0xb6,0x26,0x89,0x25,0x30,0xc3,0x3d,0xdb,0xba,0x99,
+			 0xe9,0x8e,0x1e,0x5e,0x57,0x66,0x75,0x7c,0x24,0xda,0x00,0x6f,0x79,0xf7,0x47,0xf5,
+			 0xea,0x40,0x70,0x37,0xd2,0x91,0xc7,0x4d,0xdf,0x46,0xb6,0x3e,0x95,0x7d,0xcb,0xc1},
+			{0x25,0xc2,0xcb,0x7f,0xc8,0x50,0xb7,0x0b,0x11,0x9e,0x1d,0x10,0xb2,0xa8,0x35,0x23,
+			 0x91,0x39,0xfb,0x45,0xf2,0xbf,0xe4,0xd0,0x84,0xec,0x72,0x33,0x6d,0x09,0xed,0x41,
+			 0x9a,0x7e,0x4f,0x10,0x73,0x97,0x22,0x76,0x58,0x93,0x39,0x24,0xdf,0xd2,0xaa,0x2f,
+			 0x6b,0x2b,0x64,0x48,0xa5,0xb7,0xf5,0x56,0x77,0x02,0xa7,0x71,0x46,0xe5,0x0e,0x8d},
+		};
+	#endif
+#elif defined(SCRYPT_BLAKE256)
+	#if defined(SCRYPT_SALSA)
+		static const uint8_t post_vectors[][64] = {
+			{0xf1,0xf1,0x91,0x1a,0x81,0xe6,0x9f,0xc1,0xce,0x43,0xab,0xb1,0x1a,0x02,0x1e,0x16,
+			 0x08,0xc6,0xf9,0x00,0x50,0x1b,0x6d,0xf1,0x31,0x06,0x95,0x48,0x5d,0xf7,0x6c,0x00,
+			 0xa2,0x4c,0xb1,0x0e,0x52,0x66,0x94,0x7e,0x84,0xfc,0xa5,0x34,0xfd,0xf0,0xe9,0x57,
+			 0x85,0x2d,0x8c,0x05,0x5c,0x0f,0x04,0xd4,0x8d,0x3e,0x13,0x52,0x3d,0x90,0x2d,0x2c},
+			{0xd5,0x42,0xd2,0x7b,0x06,0xae,0x63,0x90,0x9e,0x30,0x00,0x0e,0xd8,0xa4,0x3a,0x0b,
+			 0xee,0x4a,0xef,0xb2,0xc4,0x95,0x0d,0x72,0x07,0x70,0xcc,0xa3,0xf9,0x1e,0xc2,0x75,
+			 0xcf,0xaf,0xe1,0x44,0x1c,0x8c,0xe2,0x3e,0x0c,0x81,0xf3,0x92,0xe1,0x13,0xe6,0x4f,
+			 0x2d,0x27,0xc3,0x87,0xe5,0xb6,0xf9,0xd7,0x02,0x04,0x37,0x64,0x78,0x36,0x6e,0xb3}
+		};
+	#elif defined(SCRYPT_CHACHA)
+		static const uint8_t post_vectors[][64] = {
+			{0xad,0x1b,0x4b,0xca,0xe3,0x26,0x1a,0xfd,0xb7,0x77,0x8c,0xde,0x8d,0x26,0x14,0xe1,
+			 0x54,0x38,0x42,0xf3,0xb3,0x66,0x29,0xf9,0x90,0x04,0xf1,0x82,0x7c,0x5a,0x6f,0xa8,
+			 0x7d,0xd6,0x08,0x0d,0x8b,0x78,0x04,0xad,0x31,0xea,0xd4,0x87,0x2d,0xf7,0x74,0x9a,
+			 0xe5,0xce,0x97,0xef,0xa3,0xbb,0x90,0x46,0x7c,0xf4,0x51,0x38,0xc7,0x60,0x53,0x21},
+			{0x39,0xbb,0x56,0x3d,0x0d,0x7b,0x74,0x82,0xfe,0x5a,0x78,0x3d,0x66,0xe8,0x3a,0xdf,
+			 0x51,0x6f,0x3e,0xf4,0x86,0x20,0x8d,0xe1,0x81,0x22,0x02,0xf7,0x0d,0xb5,0x1a,0x0f,
+			 0xfc,0x59,0xb6,0x60,0xc9,0xdb,0x38,0x0b,0x5b,0x95,0xa5,0x94,0xda,0x42,0x2d,0x90,
+			 0x47,0xeb,0x73,0x31,0x9f,0x20,0xf6,0x81,0xc2,0xef,0x33,0x77,0x51,0xd8,0x2c,0xe4}
+		};
+	#elif defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0x9e,0xf2,0x60,0x7c,0xbd,0x7c,0x19,0x5c,0x79,0xc6,0x1b,0x7e,0xb0,0x65,0x1b,0xc3,
+			 0x70,0x0d,0x89,0xfc,0x72,0xb2,0x03,0x72,0x15,0xcb,0x8e,0x8c,0x49,0x50,0x4c,0x27,
+			 0x99,0xda,0x47,0x32,0x5e,0xb4,0xa2,0x07,0x83,0x51,0x6b,0x06,0x37,0x60,0x42,0xc4,
+			 0x59,0x49,0x99,0xdd,0xc0,0xd2,0x08,0x94,0x7f,0xe3,0x9e,0x4e,0x43,0x8e,0x5b,0xba},
+			{0x86,0x6f,0x3b,0x11,0xb8,0xca,0x4b,0x6e,0xa7,0x6f,0xc2,0xc9,0x33,0xb7,0x8b,0x9f,
+			 0xa3,0xb9,0xf5,0xb5,0x62,0xa6,0x17,0x66,0xe4,0xc3,0x9d,0x9b,0xca,0x51,0xb0,0x2f,
+			 0xda,0x09,0xc1,0x77,0xed,0x8b,0x89,0xc2,0x69,0x5a,0x34,0x05,0x4a,0x1f,0x4d,0x76,
+			 0xcb,0xd5,0xa4,0x78,0xfa,0x1b,0xb9,0x5b,0xbc,0x3d,0xce,0x04,0x63,0x99,0xad,0x54}
+		};
+	#endif
+#elif defined(SCRYPT_SKEIN512)
+	#if defined(SCRYPT_SALSA)
+		static const uint8_t post_vectors[][64] = {
+			{0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69,
+			 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87,
+			 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f,
+			 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e},
+			{0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e,
+			 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b,
+			 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb,
+			 0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00}
+		};
+	#elif defined(SCRYPT_CHACHA)
+		static const uint8_t post_vectors[][64] = {
+			{0xd1,0x12,0x6d,0x64,0x10,0x0e,0x98,0x6c,0xbe,0x70,0x21,0xd9,0xc6,0x04,0x62,0xa4,
+			 0x29,0x13,0x9a,0x3c,0xf8,0xe9,0x1e,0x87,0x9f,0x88,0xf4,0x98,0x01,0x41,0x8e,0xce,
+			 0x60,0xf7,0xbe,0x17,0x0a,0xec,0xd6,0x30,0x80,0xcf,0x6b,0x1e,0xcf,0x95,0xa0,0x4d,
+			 0x37,0xed,0x3a,0x09,0xd1,0xeb,0x0c,0x80,0x82,0x22,0x8e,0xd3,0xb1,0x7f,0xd6,0xa8},
+			{0x5c,0x5c,0x05,0xe2,0x75,0xa5,0xa4,0xec,0x81,0x97,0x9c,0x5b,0xd7,0x26,0xb3,0x16,
+			 0xb4,0x02,0x8c,0x56,0xe6,0x32,0x57,0x33,0x47,0x19,0x06,0x6c,0xde,0x68,0x41,0x37,
+			 0x5b,0x7d,0xa7,0xb3,0x73,0xeb,0x82,0xca,0x0f,0x86,0x2e,0x6b,0x47,0xa2,0x70,0x39,
+			 0x35,0xfd,0x2d,0x2e,0x7b,0xc3,0x68,0xbb,0x52,0x42,0x19,0x3b,0x78,0x96,0xe7,0xc8}
+		};
+	#elif defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
+			 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
+			 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
+			 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
+			{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
+			 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
+			 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
+			 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
+		};
+	#endif
+#elif defined(SCRYPT_KECCAK512)
+	#if defined(SCRYPT_SALSA)
+		static const uint8_t post_vectors[][64] = {
+			{0xc2,0x7b,0xbe,0x1d,0xf1,0x99,0xd8,0xe7,0x1b,0xac,0xe0,0x9d,0xeb,0x5a,0xfe,0x21,
+			 0x71,0xff,0x41,0x51,0x4f,0xbe,0x41,0x01,0x15,0xe2,0xb7,0xb9,0x55,0x15,0x25,0xa1,
+			 0x40,0x4c,0x66,0x29,0x32,0xb7,0xc9,0x62,0x60,0x88,0xe0,0x99,0x39,0xae,0xce,0x25,
+			 0x3c,0x11,0x89,0xdd,0xc6,0x14,0xd7,0x3e,0xa3,0x6d,0x07,0x2e,0x56,0xa0,0xff,0x97},
+			{0x3c,0x91,0x12,0x4a,0x37,0x7d,0xd6,0x96,0xd2,0x9b,0x5d,0xea,0xb8,0xb9,0x82,0x4e,
+			 0x4f,0x6b,0x60,0x4c,0x59,0x01,0xe5,0x73,0xfd,0xf6,0xb8,0x9a,0x5a,0xd3,0x7c,0x7a,
+			 0xd2,0x4f,0x8e,0x74,0xc1,0x90,0x88,0xa0,0x3f,0x55,0x75,0x79,0x10,0xd0,0x09,0x79,
+			 0x0f,0x6c,0x74,0x0c,0x05,0x08,0x3c,0x8c,0x94,0x7b,0x30,0x56,0xca,0xdf,0xdf,0x34}
+		};
+	#elif defined(SCRYPT_CHACHA)
+		static const uint8_t post_vectors[][64] = {
+			{0x77,0xcb,0x70,0xbf,0xae,0xd4,0x4c,0x5b,0xbc,0xd3,0xec,0x8a,0x82,0x43,0x8d,0xb3,
+			 0x7f,0x1f,0xfb,0x70,0x36,0x32,0x4d,0xa6,0xb7,0x13,0x37,0x77,0x30,0x0c,0x3c,0xfb,
+			 0x2c,0x20,0x8f,0x2a,0xf4,0x47,0x4d,0x69,0x8e,0xae,0x2d,0xad,0xba,0x35,0xe9,0x2f,
+			 0xe6,0x99,0x7a,0xf8,0xcf,0x70,0x78,0xbb,0x0c,0x72,0x64,0x95,0x8b,0x36,0x77,0x3d},
+			{0xc6,0x43,0x17,0x16,0x87,0x09,0x5f,0x12,0xed,0x21,0xe2,0xb4,0xad,0x55,0xa1,0xa1,
+			 0x49,0x50,0x90,0x70,0xab,0x81,0x83,0x7a,0xcd,0xdf,0x23,0x52,0x19,0xc0,0xa2,0xd8,
+			 0x8e,0x98,0xeb,0xf0,0x37,0xab,0xad,0xfd,0x1c,0x04,0x97,0x18,0x42,0x85,0xf7,0x4b,
+			 0x18,0x2c,0x55,0xd3,0xa9,0xe6,0x89,0xfb,0x58,0x0a,0xb2,0x37,0xb9,0xf8,0xfb,0xc5}
+		};
+	#elif defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0xc7,0x34,0x95,0x02,0x5e,0x31,0x0d,0x1f,0x10,0x38,0x9c,0x3f,0x04,0x53,0xed,0x05,
+			 0x27,0x38,0xc1,0x3f,0x6a,0x0f,0xc5,0xa3,0x9b,0x73,0x8a,0x28,0x7e,0x5d,0x3c,0xdc,
+			 0x9d,0x5a,0x09,0xbf,0x8c,0x0a,0xad,0xe4,0x73,0x52,0xe3,0x6d,0xaa,0xd1,0x8b,0xbf,
+			 0xa3,0xb7,0xf0,0x58,0xad,0x22,0x24,0xc9,0xaa,0x96,0xb7,0x5d,0xfc,0x5f,0xb0,0xcf},
+			{0x76,0x22,0xfd,0xe8,0xa2,0x79,0x8e,0x9d,0x43,0x8c,0x7a,0xba,0x78,0xb7,0x84,0xf1,
+			 0xc8,0xee,0x3b,0xae,0x31,0x89,0xbf,0x7e,0xd0,0x4b,0xc1,0x2d,0x58,0x5d,0x84,0x6b,
+			 0xec,0x86,0x56,0xe0,0x87,0x94,0x7f,0xbc,0xf9,0x48,0x92,0xef,0x54,0x7f,0x23,0x8d,
+			 0x4f,0x8b,0x0a,0x75,0xa7,0x39,0x0e,0x46,0x6e,0xee,0x58,0xc8,0xfa,0xea,0x90,0x53}
+		};
+	#endif
+#elif defined(SCRYPT_KECCAK256)
+	#if defined(SCRYPT_SALSA)
+		static const uint8_t post_vectors[][64] = {
+			{0x2e,0x96,0xd8,0x87,0x45,0xcd,0xd6,0xc8,0xf6,0xd2,0x87,0x33,0x50,0xc7,0x04,0xe5,
+			 0x3c,0x4b,0x48,0x44,0x57,0xc1,0x74,0x09,0x76,0x02,0xaa,0xd3,0x7b,0xf3,0xbf,0xed,
+			 0x4b,0x72,0xd7,0x1b,0x49,0x6b,0xe0,0x44,0x83,0xee,0x8f,0xaf,0xa1,0xb5,0x33,0xa9,
+			 0x9e,0x86,0xab,0xe2,0x9f,0xcf,0x68,0x6e,0x7e,0xbd,0xf5,0x7a,0x83,0x4b,0x1c,0x10},
+			{0x42,0x7e,0xf9,0x4b,0x72,0x61,0xda,0x2d,0xb3,0x27,0x0e,0xe1,0xd9,0xde,0x5f,0x3e,
+			 0x64,0x2f,0xd6,0xda,0x90,0x59,0xce,0xbf,0x02,0x5b,0x32,0xf7,0x6d,0x94,0x51,0x7b,
+			 0xb6,0xa6,0x0d,0x99,0x3e,0x7f,0x39,0xbe,0x1b,0x1d,0x6c,0x97,0x12,0xd8,0xb7,0xfd,
+			 0x5b,0xb5,0xf3,0x73,0x5a,0x89,0xb2,0xdd,0xcc,0x3d,0x74,0x2e,0x3d,0x9e,0x3c,0x22}
+		};
+	#elif defined(SCRYPT_CHACHA)
+		static const uint8_t post_vectors[][64] = {
+			{0x76,0x1d,0x5b,0x8f,0xa9,0xe1,0xa6,0x01,0xcb,0xc5,0x7a,0x5f,0x02,0x23,0xb6,0x82,
+			 0x57,0x79,0x60,0x2f,0x05,0x7f,0xb8,0x0a,0xcb,0x5e,0x54,0x11,0x49,0x2e,0xdd,0x85,
+			 0x83,0x30,0x67,0xb3,0x24,0x5c,0xce,0xfc,0x32,0xcf,0x12,0xc3,0xff,0xe0,0x79,0x36,
+			 0x74,0x17,0xa6,0x3e,0xcd,0xa0,0x7e,0xcb,0x37,0xeb,0xcb,0xb6,0xe1,0xb9,0xf5,0x15},
+			{0xf5,0x66,0xa7,0x4c,0xe4,0xdc,0x18,0x56,0x2f,0x3e,0x86,0x4d,0x92,0xa5,0x5c,0x5a,
+			 0x8f,0xc3,0x6b,0x32,0xdb,0xe5,0x72,0x50,0x84,0xfc,0x6e,0x5d,0x15,0x77,0x3d,0xca,
+			 0xc5,0x2b,0x20,0x3c,0x78,0x37,0x80,0x78,0x23,0x56,0x91,0xa0,0xce,0xa4,0x06,0x5a,
+			 0x7f,0xe3,0xbf,0xab,0x51,0x57,0x32,0x2c,0x0a,0xf0,0xc5,0x6f,0xf4,0xcb,0xff,0x42}
+		};
+	#elif defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0xb0,0xb7,0x10,0xb5,0x1f,0x2b,0x7f,0xaf,0x9d,0x95,0x5f,0x4c,0x2d,0x98,0x7c,0xc1,
+			 0xbc,0x37,0x2f,0x50,0x8d,0xb2,0x9f,0xfd,0x48,0x0d,0xe0,0x44,0x19,0xdf,0x28,0x6c,
+			 0xab,0xbf,0x1e,0x17,0x26,0xcc,0x57,0x95,0x18,0x17,0x83,0x4c,0x12,0x48,0xd9,0xee,
+			 0x4b,0x00,0x29,0x06,0x31,0x01,0x6b,0x8c,0x26,0x39,0xbf,0xe4,0xe4,0xd4,0x6a,0x26},
+			{0xa0,0x40,0xb2,0xf2,0x11,0xb6,0x5f,0x3d,0x4c,0x1e,0xef,0x59,0xd4,0x98,0xdb,0x14,
+			 0x01,0xff,0xe3,0x34,0xd7,0x19,0xcd,0xeb,0xde,0x52,0x1c,0xf4,0x86,0x43,0xc9,0xe2,
+			 0xfb,0xf9,0x4f,0x0a,0xbb,0x1f,0x5c,0x6a,0xdf,0xb9,0x28,0xfa,0xac,0xc4,0x48,0xed,
+			 0xcc,0xd2,0x2e,0x25,0x5f,0xf3,0x56,0x1d,0x2d,0x23,0x22,0xc1,0xbc,0xff,0x78,0x80}
+		};
+	#endif
+#else
+	static const uint8_t post_vectors[][64] = {{0}};
+#endif
+
diff --git a/scrypt.c b/scrypt.c
new file mode 100644
index 0000000..24ad35a
--- /dev/null
+++ b/scrypt.c
@@ -0,0 +1,686 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "scrypt.h"
+#include <stdlib.h>
+
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
+#include "stdint.h"
+#else
+#include <stdint.h>
+#endif
+
+#include <string.h>
+
+/* Decode the four bytes at pp as a big-endian (MSB first) 32-bit value. */
+static __inline uint32_t
+be32dec(const void *pp)
+{
+	const uint8_t *bytes = (const uint8_t *)pp;
+	return (((uint32_t)bytes[0] << 24) | ((uint32_t)bytes[1] << 16) |
+	    ((uint32_t)bytes[2] << 8) | (uint32_t)bytes[3]);
+}
+
+/* Encode x into the four bytes at pp, most significant byte first. */
+static __inline void
+be32enc(void *pp, uint32_t x)
+{
+	uint8_t *out = (uint8_t *)pp;
+	out[0] = (uint8_t)(x >> 24);
+	out[1] = (uint8_t)(x >> 16);
+	out[2] = (uint8_t)(x >> 8);
+	out[3] = (uint8_t)x;
+}
+
+/* Decode the four bytes at pp as a little-endian (LSB first) 32-bit value. */
+static __inline uint32_t
+le32dec(const void *pp)
+{
+	const uint8_t *bytes = (const uint8_t *)pp;
+	return (((uint32_t)bytes[3] << 24) | ((uint32_t)bytes[2] << 16) |
+	    ((uint32_t)bytes[1] << 8) | (uint32_t)bytes[0]);
+}
+
+/* Encode x into the four bytes at pp, least significant byte first. */
+static __inline void
+le32enc(void *pp, uint32_t x)
+{
+	uint8_t *out = (uint8_t *)pp;
+	out[0] = (uint8_t)x;
+	out[1] = (uint8_t)(x >> 8);
+	out[2] = (uint8_t)(x >> 16);
+	out[3] = (uint8_t)(x >> 24);
+}
+
+/* Streaming SHA-256 state: hash words, bit count (count[0] high / count[1] low 32 bits), pending partial block. */
+typedef struct SHA256Context {
+	uint32_t state[8];
+	uint32_t count[2];
+	unsigned char buf[64];
+} SHA256_CTX;
+/* HMAC-SHA256 state: the inner (ictx) and outer (octx) SHA-256 contexts. */
+typedef struct HMAC_SHA256Context {
+	SHA256_CTX ictx;
+	SHA256_CTX octx;
+} HMAC_SHA256_CTX;
+
+/*
+ * Serialize len/4 words from src into dst as len big-endian bytes.
+ * len is expected to be a multiple of 4; any remainder is ignored.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+	const size_t words = len / 4;
+	size_t w;
+	for (w = 0; w != words; w++)
+		be32enc(&dst[4 * w], src[w]);
+}
+
+/*
+ * Parse len big-endian bytes from src into len/4 words in dst.
+ * len is expected to be a multiple of 4; any remainder is ignored.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	const size_t words = len / 4;
+	size_t w;
+	for (w = 0; w != words; w++)
+		dst[w] = be32dec(&src[4 * w]);
+}
+
+/* Elementary functions used by SHA256 (arguments parenthesized; operands are 32-bit unsigned words) */
+#define Ch(x, y, z)	(((x) & ((y) ^ (z))) ^ (z))
+#define Maj(x, y, z)	(((x) & ((y) | (z))) | ((y) & (z)))
+#define SHR(x, n)	((x) >> (n))
+#define ROTR(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
+#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function; expands to four statements that assign the caller's t0 and t1 */
+#define RND(a, b, c, d, e, f, g, h, k)			\
+	t0 = h + S1(e) + Ch(e, f, g) + k;		\
+	t1 = S0(a) + Maj(a, b, c);			\
+	d += t0;					\
+	h  = t0 + t1;
+
+/* Round i on state S: the a..h roles shift by one slot of S per round instead of moving the values */
+#define RNDr(S, W, i, k)			\
+	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
+	    S[(66 - i) % 8], S[(67 - i) % 8],	\
+	    S[(68 - i) % 8], S[(69 - i) % 8],	\
+	    S[(70 - i) % 8], S[(71 - i) % 8],	\
+	    W[i] + k)
+
+/*
+ * SHA256 block compression function.  Updates the eight-word state in place
+ * from one 64-byte block: expand it to 64 schedule words, run 64 rounds, add.
+ */
+static void
+SHA256_Transform(uint32_t * state, const unsigned char block[64])
+{
+	uint32_t W[64];
+	uint32_t S[8];
+	uint32_t t0, t1;
+	int i;
+
+	/* 1. Prepare message schedule W: 16 big-endian words from the block, 48 derived. */
+	be32dec_vect(W, block, 64);
+	for (i = 16; i < 64; i++)
+		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+
+	/* 2. Initialize working variables from the current state (8 words = 32 bytes). */
+	memcpy(S, state, 32);
+
+	/* 3. Mix: one RNDr per round, each with its round constant. */
+	RNDr(S, W, 0, 0x428a2f98);
+	RNDr(S, W, 1, 0x71374491);
+	RNDr(S, W, 2, 0xb5c0fbcf);
+	RNDr(S, W, 3, 0xe9b5dba5);
+	RNDr(S, W, 4, 0x3956c25b);
+	RNDr(S, W, 5, 0x59f111f1);
+	RNDr(S, W, 6, 0x923f82a4);
+	RNDr(S, W, 7, 0xab1c5ed5);
+	RNDr(S, W, 8, 0xd807aa98);
+	RNDr(S, W, 9, 0x12835b01);
+	RNDr(S, W, 10, 0x243185be);
+	RNDr(S, W, 11, 0x550c7dc3);
+	RNDr(S, W, 12, 0x72be5d74);
+	RNDr(S, W, 13, 0x80deb1fe);
+	RNDr(S, W, 14, 0x9bdc06a7);
+	RNDr(S, W, 15, 0xc19bf174);
+	RNDr(S, W, 16, 0xe49b69c1);
+	RNDr(S, W, 17, 0xefbe4786);
+	RNDr(S, W, 18, 0x0fc19dc6);
+	RNDr(S, W, 19, 0x240ca1cc);
+	RNDr(S, W, 20, 0x2de92c6f);
+	RNDr(S, W, 21, 0x4a7484aa);
+	RNDr(S, W, 22, 0x5cb0a9dc);
+	RNDr(S, W, 23, 0x76f988da);
+	RNDr(S, W, 24, 0x983e5152);
+	RNDr(S, W, 25, 0xa831c66d);
+	RNDr(S, W, 26, 0xb00327c8);
+	RNDr(S, W, 27, 0xbf597fc7);
+	RNDr(S, W, 28, 0xc6e00bf3);
+	RNDr(S, W, 29, 0xd5a79147);
+	RNDr(S, W, 30, 0x06ca6351);
+	RNDr(S, W, 31, 0x14292967);
+	RNDr(S, W, 32, 0x27b70a85);
+	RNDr(S, W, 33, 0x2e1b2138);
+	RNDr(S, W, 34, 0x4d2c6dfc);
+	RNDr(S, W, 35, 0x53380d13);
+	RNDr(S, W, 36, 0x650a7354);
+	RNDr(S, W, 37, 0x766a0abb);
+	RNDr(S, W, 38, 0x81c2c92e);
+	RNDr(S, W, 39, 0x92722c85);
+	RNDr(S, W, 40, 0xa2bfe8a1);
+	RNDr(S, W, 41, 0xa81a664b);
+	RNDr(S, W, 42, 0xc24b8b70);
+	RNDr(S, W, 43, 0xc76c51a3);
+	RNDr(S, W, 44, 0xd192e819);
+	RNDr(S, W, 45, 0xd6990624);
+	RNDr(S, W, 46, 0xf40e3585);
+	RNDr(S, W, 47, 0x106aa070);
+	RNDr(S, W, 48, 0x19a4c116);
+	RNDr(S, W, 49, 0x1e376c08);
+	RNDr(S, W, 50, 0x2748774c);
+	RNDr(S, W, 51, 0x34b0bcb5);
+	RNDr(S, W, 52, 0x391c0cb3);
+	RNDr(S, W, 53, 0x4ed8aa4a);
+	RNDr(S, W, 54, 0x5b9cca4f);
+	RNDr(S, W, 55, 0x682e6ff3);
+	RNDr(S, W, 56, 0x748f82ee);
+	RNDr(S, W, 57, 0x78a5636f);
+	RNDr(S, W, 58, 0x84c87814);
+	RNDr(S, W, 59, 0x8cc70208);
+	RNDr(S, W, 60, 0x90befffa);
+	RNDr(S, W, 61, 0xa4506ceb);
+	RNDr(S, W, 62, 0xbef9a3f7);
+	RNDr(S, W, 63, 0xc67178f2);
+
+	/* 4. Mix local working variables into global state */
+	for (i = 0; i < 8; i++)
+		state[i] += S[i];
+
+	/* Clean the stack.  NOTE(review): these are dead stores a compiler may elide. */
+	memset(W, 0, 256);
+	memset(S, 0, 32);
+	t0 = t1 = 0;
+}
+/* Padding source for SHA256_Pad: a 0x80 marker byte followed by zeros; never written, hence const. */
+static const unsigned char PAD[64] = {
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* SHA-256 initialization.  Resets ctx to begin a new SHA-256 operation. */
+static void
+SHA256_Init(SHA256_CTX * ctx)
+{
+	/* Magic initialization constants */
+	static const uint32_t initial_state[8] = {
+		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+	};
+	int w;
+
+	/* Zero bits processed so far */
+	ctx->count[0] = 0;
+	ctx->count[1] = 0;
+
+	for (w = 0; w < 8; w++)
+		ctx->state[w] = initial_state[w];
+}
+
+/*
+ * Add bytes into the hash.  Absorbs len bytes from in; every completed
+ * 64-byte block is compressed into ctx->state and the remainder is kept
+ * in ctx->buf for the next call.
+ */
+static void
+SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len)
+{
+	const unsigned char *data = in;
+
+	/* Bytes already buffered from previous updates, and the space left */
+	const uint32_t used = (ctx->count[1] >> 3) & 0x3f;
+	const uint32_t room = 64 - used;
+	/* The length in bits, split into low and high 32-bit halves */
+	const uint32_t lo_bits = ((uint32_t)len) << 3;
+	const uint32_t hi_bits = (uint32_t)(len >> 29);
+
+	/* Advance the bit counter, carrying out of the low word */
+	ctx->count[1] += lo_bits;
+	if (ctx->count[1] < lo_bits)
+		ctx->count[0]++;
+	ctx->count[0] += hi_bits;
+
+	/* Too little data to complete a block: just buffer it */
+	if (len < room) {
+		memcpy(ctx->buf + used, data, len);
+		return;
+	}
+
+	/* Top up and compress the partially filled block */
+	memcpy(ctx->buf + used, data, room);
+	SHA256_Transform(ctx->state, ctx->buf);
+	data += room;
+	len -= room;
+
+	/* Compress whole blocks straight from the input */
+	for (; len >= 64; data += 64, len -= 64)
+		SHA256_Transform(ctx->state, data);
+
+	/* Keep the tail for the next call */
+	memcpy(ctx->buf, data, len);
+}
+
+/* Append the SHA-256 padding and the 64-bit big-endian message bit-count. */
+static void
+SHA256_Pad(SHA256_CTX * ctx)
+{
+	unsigned char bitcount[8];
+	uint32_t fill, padbytes;
+
+	/* Capture the bit-count first: feeding the pad through SHA256_Update advances ctx->count */
+	be32enc_vect(bitcount, ctx->count, 8);
+
+	/* Pad with 1--64 bytes so that the buffered length becomes 56 mod 64 */
+	fill = (ctx->count[1] >> 3) & 0x3f;
+	if (fill < 56)
+		padbytes = 56 - fill;
+	else
+		padbytes = 120 - fill;
+	SHA256_Update(ctx, PAD, (size_t)padbytes);
+
+	/* Add the terminating bit-count */
+	SHA256_Update(ctx, bitcount, 8);
+}
+
+/*
+ * SHA-256 finalization.  Pads the message, writes the 32-byte big-endian
+ * digest, then zeroes *ctx so it must be re-initialized before reuse.
+ */
+static void
+SHA256_Final(unsigned char digest[32], SHA256_CTX * ctx)
+{
+
+	/* Finish the message: padding plus bit-count */
+	SHA256_Pad(ctx);
+
+	/* Export the eight state words as the digest */
+	be32enc_vect(digest, ctx->state, 32);
+
+	/* Wipe the whole context */
+	memset(ctx, 0, sizeof(SHA256_CTX));
+}
+
+/* Initialize an HMAC-SHA256 operation with the given key (Klen bytes at _K). */
+static void
+HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
+{
+	unsigned char pad[64], khash[32];
+	const unsigned char * K = _K;
+	size_t i;
+
+	/* If Klen > 64, the key is really SHA256(K). */
+	if (Klen > 64) {
+		SHA256_Init(&ctx->ictx);
+		SHA256_Update(&ctx->ictx, K, Klen);
+		SHA256_Final(khash, &ctx->ictx);
+		K = khash;
+		Klen = 32;
+	}
+
+	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
+	SHA256_Init(&ctx->ictx);
+	memset(pad, 0x36, 64);
+	for (i = 0; i < Klen; i++)
+		pad[i] ^= K[i];
+	SHA256_Update(&ctx->ictx, pad, 64);
+
+	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
+	SHA256_Init(&ctx->octx);
+	memset(pad, 0x5c, 64);
+	for (i = 0; i < Klen; i++)
+		pad[i] ^= K[i];
+	SHA256_Update(&ctx->octx, pad, 64);
+
+	/* Clean the stack: khash holds the hashed key and pad still holds the key xor 0x5c. */
+	memset(khash, 0, 32);
+	memset(pad, 0, 64);
+}
+
+/* Add len bytes from in to the message of the HMAC-SHA256 operation. */
+static void
+HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
+{
+
+	/* Message data goes to the inner SHA256 operation only; octx is not touched here. */
+	SHA256_Update(&ctx->ictx, in, len);
+}
+
+/* Finish an HMAC-SHA256 operation and write the 32-byte MAC to digest. */
+static void
+HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
+{
+	unsigned char inner[32];
+
+	/* Close the inner hash to obtain its digest. */
+	SHA256_Final(inner, &ctx->ictx);
+
+	/* Append that digest to the outer SHA256 operation. */
+	SHA256_Update(&ctx->octx, inner, sizeof(inner));
+
+	/* The outer digest is the HMAC output. */
+	SHA256_Final(digest, &ctx->octx);
+
+	/* Clean the stack. */
+	memset(inner, 0, sizeof(inner));
+}
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
+ * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
+ */
+static void
+PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
+    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
+{
+	HMAC_SHA256_CTX PShctx, hctx;
+	size_t i, clen;
+	uint8_t ivec[4], U[32], T[32];
+	uint64_t j;
+	int k;
+
+	/* Compute HMAC state after processing P and S. */
+	HMAC_SHA256_Init(&PShctx, passwd, passwdlen);
+	HMAC_SHA256_Update(&PShctx, salt, saltlen);
+
+	/* Iterate through the blocks. */
+	for (i = 0; i * 32 < dkLen; i++) {
+		/* Generate INT(i + 1). */
+		be32enc(ivec, (uint32_t)(i + 1));
+
+		/* Compute U_1 = PRF(P, S || INT(i + 1)). */
+		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
+		HMAC_SHA256_Update(&hctx, ivec, 4);
+		HMAC_SHA256_Final(U, &hctx);
+
+		/* T_i = U_1 ... */
+		memcpy(T, U, 32);
+
+		for (j = 2; j <= c; j++) {
+			/* Compute U_j. */
+			HMAC_SHA256_Init(&hctx, passwd, passwdlen);
+			HMAC_SHA256_Update(&hctx, U, 32);
+			HMAC_SHA256_Final(U, &hctx);
+
+			/* ... xor U_j ... */
+			for (k = 0; k < 32; k++)
+				T[k] ^= U[k];
+		}
+
+		/* Copy as many bytes as necessary into buf. */
+		clen = dkLen - i * 32;
+		if (clen > 32)
+			clen = 32;
+		memcpy(&buf[i * 32], T, clen);
+	}
+
+	/* Clean PShctx, since we never called _Final on it. */
+	memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX));
+	/* Also clear the last PRF output and block accumulator left on the stack. */
+	memset(U, 0, 32);
+	memset(T, 0, 32);
+}
+
+/* Forward declarations for the scrypt core: block copy/xor helpers, salsa20/8, BlockMix, Integerify, SMix. */
+static void blkcpy(void *, void *, size_t);
+static void blkxor(void *, void *, size_t);
+static void salsa20_8(uint32_t[16]);
+static void blockmix_salsa8(uint32_t *, uint32_t *, uint32_t *, size_t);
+static uint64_t integerify(void *, size_t);
+static void smix(uint8_t *, size_t, uint64_t, uint32_t *, uint32_t *);
+
+/* Copy len bytes from src to dest (non-overlapping; every caller in this file passes a multiple of 64). */
+static void
+blkcpy(void * dest, void * src, size_t len)
+{
+	/*
+	 * memcpy instead of a loop over size_t lvalues: callers pass uint32_t
+	 * arrays (e.g. salsa20_8's local x[16]) that need not be size_t-aligned
+	 * and must not be accessed through a size_t pointer (strict aliasing).
+	 */
+	memcpy(dest, src, len);
+}
+
+/* XOR len bytes of src into dest, in uint32_t words (the callers' element type; size_t lvalues would alias illegally). */
+static void
+blkxor(void * dest, void * src, size_t len)
+{
+	uint32_t * D = dest;
+	const uint32_t * S = src;
+	size_t L = len / sizeof(uint32_t);
+	size_t i;
+	for (i = 0; i < L; i++)
+		D[i] ^= S[i];
+}
+
+/**
+ * salsa20_8(B):
+ * Apply the salsa20/8 core to the 16-word block B in place: 4 double-rounds on a copy, then add the copy into B.
+ */
+static void
+salsa20_8(uint32_t B[16])
+{
+	uint32_t x[16];
+	size_t i;
+
+	blkcpy(x, B, 64);
+	for (i = 0; i < 8; i += 2) {
+#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+		/* Operate on columns. */
+		x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
+		x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
+
+		x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
+		x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
+
+		x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
+		x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
+
+		x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
+		x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
+
+		/* Operate on rows. */
+		x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
+		x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
+
+		x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
+		x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
+
+		x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
+		x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
+
+		x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
+		x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
+#undef R
+	}
+	for (i = 0; i < 16; i++)
+		B[i] += x[i];
+}
+
+/**
+ * blockmix_salsa8(Bin, Bout, X, r):
+ * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
+ * bytes in length; the output Bout must also be the same size.  The
+ * temporary space X must be 64 bytes.  Even-numbered output blocks land in
+ * the first half of Bout and odd-numbered ones in the second half.
+ */
+static void
+blockmix_salsa8(uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r)
+{
+	size_t pair;
+
+	/* 1: X <-- B_{2r - 1} */
+	blkcpy(X, &Bin[(2 * r - 1) * 16], 64);
+
+	/* 2: for i = 0 to 2r - 1 do, handling blocks 2*pair and 2*pair + 1 per pass */
+	for (pair = 0; pair < r; pair++) {
+		uint32_t * even_in = &Bin[pair * 32];
+		uint32_t * odd_in = even_in + 16;
+
+		/* 3: X <-- H(X \xor B_{2*pair}) */
+		blkxor(X, even_in, 64);
+		salsa20_8(X);
+		/* 4, 6: Y_{2*pair} is stored as block `pair` of the first half */
+		blkcpy(&Bout[pair * 16], X, 64);
+
+		/* 3: X <-- H(X \xor B_{2*pair + 1}) */
+		blkxor(X, odd_in, 64);
+		salsa20_8(X);
+		/* 4, 6: Y_{2*pair + 1} is stored as block `pair` of the second half */
+		blkcpy(&Bout[(pair + r) * 16], X, 64);
+	}
+}
+
+/**
+ * integerify(B, r):
+ * Return the first 64 bits of B_{2r-1}, the last 64-byte block of B, read as
+ * two uint32_t words with word 0 as the low half.
+ */
+static uint64_t
+integerify(void * B, size_t r)
+{
+	const uint32_t * last = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
+	return ((uint64_t)last[0] + ((uint64_t)(last[1]) << 32));
+}
+
+/**
+ * smix(B, r, N, V, XY):
+ * Compute B = SMix_r(B, N).  The input B must be 128r bytes in length;
+ * the temporary storage V must be 128rN bytes in length; the temporary
+ * storage XY must be 256r + 64 bytes in length.  The value N must be a
+ * power of 2 greater than 1.  The arrays B, V, and XY must be aligned to a
+ * multiple of 64 bytes.
+ */
+static void
+smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
+{
+	uint32_t * X = XY;
+	uint32_t * Y = &XY[32 * r];
+	uint32_t * Z = &XY[64 * r];
+	uint64_t i;
+	uint64_t j;
+	size_t k;
+
+	/* 1: X <-- B (decoded from little-endian bytes into words) */
+	for (k = 0; k < 32 * r; k++)
+		X[k] = le32dec(&B[4 * k]);
+
+	/* 2: for i = 0 to N - 1 do (two steps per pass, ping-ponging between X and Y) */
+	for (i = 0; i < N; i += 2) {
+		/* 3: V_i <-- X */
+		blkcpy(&V[i * (32 * r)], X, 128 * r);
+
+		/* 4: Y <-- H(X) */
+		blockmix_salsa8(X, Y, Z, r);
+
+		/* 3: V_{i+1} <-- Y */
+		blkcpy(&V[(i + 1) * (32 * r)], Y, 128 * r);
+
+		/* 4: X <-- H(Y) */
+		blockmix_salsa8(Y, X, Z, r);
+	}
+
+	/* 6: for i = 0 to N - 1 do (again two steps per pass) */
+	for (i = 0; i < N; i += 2) {
+		/* 7: j <-- Integerify(X) mod N (as a mask, which needs N to be a power of 2) */
+		j = integerify(X, r) & (N - 1);
+
+		/* 8: Y <-- H(X \xor V_j) */
+		blkxor(X, &V[j * (32 * r)], 128 * r);
+		blockmix_salsa8(X, Y, Z, r);
+
+		/* 7: j <-- Integerify(Y) mod N */
+		j = integerify(Y, r) & (N - 1);
+
+		/* 8: X <-- H(Y \xor V_j) */
+		blkxor(Y, &V[j * (32 * r)], 128 * r);
+		blockmix_salsa8(Y, X, Z, r);
+	}
+
+	/* 10: B' <-- X (re-encoded as little-endian bytes) */
+	for (k = 0; k < 32 * r; k++)
+		le32enc(&B[4 * k], X[k]);
+}
+
+/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
+   (scrypt with N = 1024, r = 1, p = 1; the input serves as both password and salt)
+   scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
+   (the leading 63 bytes are slack for aligning the working area to 64 bytes)
+ */
+void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratchpad)
+{
+	const uint32_t N = 1024;
+	const uint32_t r = 1;
+	const uint32_t p = 1;
+	const uint8_t * data = (const uint8_t*)input;
+
+	/* B starts at the first 64-byte boundary inside scratchpad; XY and V follow it */
+	uint8_t * B = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+	uint32_t * XY = (uint32_t *)(B + (128 * r * p));
+	uint32_t * V = (uint32_t *)(B + (128 * r * p) + (256 * r + 64));
+	uint32_t chunk;
+
+	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+	PBKDF2_SHA256(data, 80, data, 80, 1, B, p * 128 * r);
+
+	/* 2: for i = 0 to p - 1 do */
+	for (chunk = 0; chunk < p; chunk++) {
+		/* 3: B_i <-- MF(B_i, N) */
+		smix(&B[chunk * 128 * r], r, N, V, XY);
+	}
+
+	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+	PBKDF2_SHA256(data, 80, B, p * 128 * r, 1, (uint8_t*)output, 32);
+}
+
+void scrypt_1024_1_1_256(const char* input, char* output)
+{
+	char scratchpad[131583]; /* 63 + 128 + 320 + 131072 bytes of working space, taken from the stack */
+	scrypt_1024_1_1_256_sp(input, output, scratchpad);
+}
diff --git a/scrypt.h b/scrypt.h
new file mode 100644
index 0000000..ece0134
--- /dev/null
+++ b/scrypt.h
@@ -0,0 +1,8 @@
+#ifndef SCRYPT_H
+#define SCRYPT_H
+/* scrypt with N=1024, r=1, p=1: 80-byte input -> 32-byte output; the _sp variant uses a caller-provided scratchpad. */
+void scrypt_1024_1_1_256(const char* input, char* output);
+void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratchpad);
+#define  scrypt_scratchpad_size 131583 /* minimum scratchpad bytes; no trailing ';' so it is usable inside expressions */
+
+#endif
\ No newline at end of file
diff --git a/scryptjane.c b/scryptjane.c
new file mode 100644
index 0000000..ca4f1aa
--- /dev/null
+++ b/scryptjane.c
@@ -0,0 +1,182 @@
+/*
+	scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
+
+	Public Domain or MIT License, whichever is easier
+*/
+
+#include <string.h>
+
+#include "scryptjane.h"
+#include "scryptjane/scrypt-jane-portable.h"
+#include "scryptjane/scrypt-jane-hash.h"
+#include "scryptjane/scrypt-jane-romix.h"
+#include "scryptjane/scrypt-jane-test-vectors.h"
+
+
+#define scrypt_maxN 30  /* (1 << (30 + 1)) = ~2 billion */
+#if (SCRYPT_BLOCK_BYTES == 64)
+#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 128)
+#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 256)
+#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 512)
+#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
+#endif
+#define scrypt_maxr scrypt_r_32kb /* 32kb */
+#define scrypt_maxp 25  /* (1 << 25) = ~33 million */
+
+#include <stdio.h>
+#include <malloc.h>
+/* Default fatal-error handler: prints msg and a newline to stderr, then exits with status 1. */
+static void
+scrypt_fatal_error_default(const char *msg) {
+	fprintf(stderr, "%s\n", msg);
+	exit(1);
+}
+
+static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
+
+void
+scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
+	scrypt_fatal_error = fn;
+}
+
+/* Run the mix, hash and scrypt self-tests; returns 7 with bit 1, 2 or 4 cleared for each respective failure. */
+static int
+scrypt_power_on_self_test() {
+	uint8_t digest[64];
+	uint32_t idx = 0;
+	int passed = 7, vectors_ok = 1;
+
+	if (!scrypt_test_mix()) {
+#if !defined(SCRYPT_TEST)
+		scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
+#endif
+		passed &= ~1;
+	}
+
+	if (!scrypt_test_hash()) {
+#if !defined(SCRYPT_TEST)
+		scrypt_fatal_error("scrypt: hash function power-on-self-test failed");
+#endif
+		passed &= ~2;
+	}
+
+	for (; post_settings[idx].pw; idx++) {
+		const scrypt_test_setting *cfg = &post_settings[idx];
+		scrypt((uint8_t *)cfg->pw, strlen(cfg->pw), (uint8_t *)cfg->salt, strlen(cfg->salt), cfg->Nfactor, cfg->rfactor, cfg->pfactor, digest, sizeof(digest));
+		vectors_ok &= scrypt_verify(post_vectors[idx], digest, sizeof(digest));
+	}
+
+	if (!vectors_ok) {
+#if !defined(SCRYPT_TEST)
+		scrypt_fatal_error("scrypt: scrypt power-on-self-test failed");
+#endif
+		passed &= ~4;
+	}
+
+	return passed;
+}
+/* Result of scrypt_alloc: mem is the start of the allocation, ptr the address callers use as the buffer. */
+typedef struct scrypt_aligned_alloc_t {
+	uint8_t *mem, *ptr;
+} scrypt_aligned_alloc;
+
+#if defined(SCRYPT_TEST_SPEED)
+static uint8_t *mem_base = (uint8_t *)0;
+static size_t mem_bump = 0;
+
+/* bump allocator over one arena malloc'd on first use; allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
+static scrypt_aligned_alloc
+scrypt_alloc(uint64_t size) {
+	scrypt_aligned_alloc aa;
+	if (mem_base == (uint8_t *)0) {
+		uint8_t *arena = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
+		if (!arena)
+			scrypt_fatal_error("scrypt: out of memory");
+		/* round the arena start up to a SCRYPT_BLOCK_BYTES boundary */
+		mem_base = (uint8_t *)(((size_t)arena + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+	}
+	aa.ptr = aa.mem = mem_base + mem_bump;
+	mem_bump += (size_t)size;
+	return aa;
+}
+/* Speed-test build: rewinds the bump offset to 0, so all outstanding allocations are released at once; aa is unused. */
+static void
+scrypt_free(scrypt_aligned_alloc *aa) {
+	mem_bump = 0;
+}
+#else
+/* malloc size bytes plus alignment slack; ptr is mem rounded up to a SCRYPT_BLOCK_BYTES boundary */
+static scrypt_aligned_alloc
+scrypt_alloc(uint64_t size) {
+	scrypt_aligned_alloc aa;
+	size += (SCRYPT_BLOCK_BYTES - 1);
+	if (size > (size_t)-1)
+		scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
+	aa.mem = (uint8_t *)malloc((size_t)size);
+	if (!aa.mem)
+		scrypt_fatal_error("scrypt: out of memory");
+	aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+	return aa;
+}
+/* Release an allocation made by scrypt_alloc: frees aa->mem (the malloc result), not the aligned aa->ptr. */
+static void
+scrypt_free(scrypt_aligned_alloc *aa) {
+	free(aa->mem);
+}
+#endif
+
+/* out <- scrypt(password, salt) with N = 2^(Nfactor+1), r = 2^rfactor, p = 2^pfactor; out-of-range factors and failed allocations go to the fatal-error handler */
+void
+scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) {
+	scrypt_aligned_alloc YX, V;
+	uint8_t *X, *Y;
+	uint32_t N, r, p, chunk_bytes, i;
+	uint64_t X_bytes, YX_bytes;
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
+#endif
+#if !defined(SCRYPT_TEST)
+	static int power_on_self_test = 0;
+	if (!power_on_self_test) {
+		power_on_self_test = 1;
+		if (!scrypt_power_on_self_test())
+			scrypt_fatal_error("scrypt: power on self test failed");
+	}
+#endif
+
+	if (Nfactor > scrypt_maxN)
+		scrypt_fatal_error("scrypt: N out of range");
+	if (rfactor > scrypt_maxr)
+		scrypt_fatal_error("scrypt: r out of range");
+	if (pfactor > scrypt_maxp)
+		scrypt_fatal_error("scrypt: p out of range");
+
+	/* shift an unsigned 1: (1 << 31) overflows a signed int when Nfactor is 30 */
+	N = ((uint32_t)1 << (Nfactor + 1));
+	r = ((uint32_t)1 << rfactor);
+	p = ((uint32_t)1 << pfactor);
+	/* sizes in 64 bits: p * chunk_bytes exceeds 32 bits near the top of the allowed ranges */
+	chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
+	X_bytes = (uint64_t)chunk_bytes * p;
+	YX_bytes = X_bytes + chunk_bytes;
+	V = scrypt_alloc((uint64_t)N * chunk_bytes);
+	YX = scrypt_alloc(YX_bytes);
+
+	/* 1: X = PBKDF2(password, salt); NOTE(review): assumes the length parameters are size_t - confirm */
+	Y = YX.ptr;
+	X = Y + chunk_bytes;
+	scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, (size_t)X_bytes);
+
+	/* 2: X = ROMix(X) */
+	for (i = 0; i < p; i++)
+		scrypt_ROMix((scrypt_mix_word_t *)(X + ((size_t)chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r);
+
+	/* 3: Out = PBKDF2(password, X) */
+	scrypt_pbkdf2(password, password_len, X, (size_t)X_bytes, 1, out, bytes);
+	scrypt_ensure_zero(YX.ptr, (size_t)YX_bytes);
+	scrypt_free(&V);
+	scrypt_free(&YX);
+}
diff --git a/scryptjane.h b/scryptjane.h
new file mode 100644
index 0000000..00a28dd
--- /dev/null
+++ b/scryptjane.h
@@ -0,0 +1,32 @@
+#ifndef SCRYPT_JANE_H
+#define SCRYPT_JANE_H
+
+
+#define SCRYPT_KECCAK512
+#define SCRYPT_CHACHA
+#define SCRYPT_CHOOSE_COMPILETIME
+
+/*
+	Nfactor: Increases CPU & Memory Hardness
+	N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used
+
+	rfactor: Increases Memory Hardness
+	r = (1 << rfactor): How large a chunk is
+
+	pfactor: Increases CPU Hardness
+	p = (1 << pfactor): Number of times to mix the main chunk
+
+	A block is the basic mixing unit (salsa/chacha block = 64 bytes)
+	A chunk is (2 * r) blocks
+
+	~Memory used = (N + p + 1) * ((2 * r) * block size)
+*/
+
+#include <stdlib.h>
+
+typedef void (*scrypt_fatal_errorfn)(const char *msg);
+void scrypt_set_fatal_error(scrypt_fatal_errorfn fn);
+
+void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes);
+
+#endif /* SCRYPT_JANE_H */
diff --git a/scryptn.c b/scryptn.c
new file mode 100644
index 0000000..11623dc
--- /dev/null
+++ b/scryptn.c
@@ -0,0 +1,687 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "scryptn.h"
+
+/* Read a 32-bit unsigned integer stored big-endian at pp (pp[0] is the MSB). */
+static __inline uint32_t
+be32dec(const void *pp)
+{
+	const uint8_t *bytes = pp;
+	return ((uint32_t)bytes[0] << 24) | ((uint32_t)bytes[1] << 16) |
+	    ((uint32_t)bytes[2] << 8) | (uint32_t)bytes[3];
+}
+
+/* Store x at pp as four bytes in big-endian order (MSB first). */
+static __inline void
+be32enc(void *pp, uint32_t x)
+{
+	uint8_t *out = pp;
+	int idx;
+
+	for (idx = 3; idx >= 0; idx--, x >>= 8)
+		out[idx] = (uint8_t)(x & 0xff);
+}
+
+/* Read a 32-bit unsigned integer stored little-endian at pp (pp[0] is the LSB). */
+static __inline uint32_t
+le32dec(const void *pp)
+{
+	const uint8_t *bytes = pp;
+	return (uint32_t)bytes[0] | ((uint32_t)bytes[1] << 8) |
+	    ((uint32_t)bytes[2] << 16) | ((uint32_t)bytes[3] << 24);
+}
+
+/* Store x at pp as four bytes in little-endian order (LSB first). */
+static __inline void
+le32enc(void *pp, uint32_t x)
+{
+	uint8_t *out = pp;
+	int idx;
+
+	for (idx = 0; idx < 4; idx++, x >>= 8)
+		out[idx] = (uint8_t)(x & 0xff);
+}
+
+
+typedef struct SHA256Context {
+	uint32_t state[8];	/* chaining value; seeded by SHA256_Init */
+	uint32_t count[2];	/* bits hashed so far: [0] is the high word, [1] the low word */
+	unsigned char buf[64];	/* bytes of a not-yet-complete 64-byte block */
+} SHA256_CTX;
+
+typedef struct HMAC_SHA256Context {
+	SHA256_CTX ictx;	/* inner hash: fed (K xor 0x36 block), then the message */
+	SHA256_CTX octx;	/* outer hash: fed (K xor 0x5c block), then the inner digest */
+} HMAC_SHA256_CTX;
+
+/*
+ * Serialize the len/4 words of src into dst, each as four big-endian bytes.
+ * len is a byte count and is assumed to be a multiple of 4.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+	const uint32_t *end = src + len / 4;
+
+	for (; src != end; src++, dst += 4)
+		be32enc(dst, *src);
+}
+
+/*
+ * Parse len bytes of src as len/4 big-endian 32-bit words and store them in
+ * dst.  len is a byte count and is assumed to be a multiple of 4.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	uint32_t *end = dst + len / 4;
+
+	for (; dst != end; dst++, src += 4)
+		*dst = be32dec(src);
+}
+
+/* Elementary functions used by SHA256.  Arguments are not parenthesized: pass simple operands only. */
+#define Ch(x, y, z)	((x & (y ^ z)) ^ z)
+#define Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define SHR(x, n)	(x >> n)
+#define ROTR(x, n)	((x >> n) | (x << (32 - n)))
+#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function; expands to several statements and uses temporaries t0 and t1 from the caller's scope */
+#define RND(a, b, c, d, e, f, g, h, k)			\
+	t0 = h + S1(e) + Ch(e, f, g) + k;		\
+	t1 = S0(a) + Maj(a, b, c);			\
+	d += t0;					\
+	h  = t0 + t1;
+
+/* Round i on state array S: the a..h roles rotate through S by i (mod 8), so no words are moved between rounds */
+#define RNDr(S, W, i, k)			\
+	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
+	    S[(66 - i) % 8], S[(67 - i) % 8],	\
+	    S[(68 - i) % 8], S[(69 - i) % 8],	\
+	    S[(70 - i) % 8], S[(71 - i) % 8],	\
+	    W[i] + k)
+
+/*
+ * SHA256 block compression function.  The eight-word state is updated in
+ * place from one 64-byte input block; W, S, t0 and t1 are local scratch.
+ */
+static void
+SHA256_Transform(uint32_t * state, const unsigned char block[64])
+{
+	uint32_t W[64];
+	uint32_t S[8];
+	uint32_t t0, t1;
+	int i;
+
+	/* 1. Prepare message schedule W: 16 big-endian words from block, expanded to 64. */
+	be32dec_vect(W, block, 64);
+	for (i = 16; i < 64; i++)
+		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+
+	/* 2. Initialize working variables from the current state (8 words = 32 bytes). */
+	memcpy(S, state, 32);
+
+	/* 3. Mix: 64 rounds, each pairing W[i] with its round constant. */
+	RNDr(S, W, 0, 0x428a2f98);
+	RNDr(S, W, 1, 0x71374491);
+	RNDr(S, W, 2, 0xb5c0fbcf);
+	RNDr(S, W, 3, 0xe9b5dba5);
+	RNDr(S, W, 4, 0x3956c25b);
+	RNDr(S, W, 5, 0x59f111f1);
+	RNDr(S, W, 6, 0x923f82a4);
+	RNDr(S, W, 7, 0xab1c5ed5);
+	RNDr(S, W, 8, 0xd807aa98);
+	RNDr(S, W, 9, 0x12835b01);
+	RNDr(S, W, 10, 0x243185be);
+	RNDr(S, W, 11, 0x550c7dc3);
+	RNDr(S, W, 12, 0x72be5d74);
+	RNDr(S, W, 13, 0x80deb1fe);
+	RNDr(S, W, 14, 0x9bdc06a7);
+	RNDr(S, W, 15, 0xc19bf174);
+	RNDr(S, W, 16, 0xe49b69c1);
+	RNDr(S, W, 17, 0xefbe4786);
+	RNDr(S, W, 18, 0x0fc19dc6);
+	RNDr(S, W, 19, 0x240ca1cc);
+	RNDr(S, W, 20, 0x2de92c6f);
+	RNDr(S, W, 21, 0x4a7484aa);
+	RNDr(S, W, 22, 0x5cb0a9dc);
+	RNDr(S, W, 23, 0x76f988da);
+	RNDr(S, W, 24, 0x983e5152);
+	RNDr(S, W, 25, 0xa831c66d);
+	RNDr(S, W, 26, 0xb00327c8);
+	RNDr(S, W, 27, 0xbf597fc7);
+	RNDr(S, W, 28, 0xc6e00bf3);
+	RNDr(S, W, 29, 0xd5a79147);
+	RNDr(S, W, 30, 0x06ca6351);
+	RNDr(S, W, 31, 0x14292967);
+	RNDr(S, W, 32, 0x27b70a85);
+	RNDr(S, W, 33, 0x2e1b2138);
+	RNDr(S, W, 34, 0x4d2c6dfc);
+	RNDr(S, W, 35, 0x53380d13);
+	RNDr(S, W, 36, 0x650a7354);
+	RNDr(S, W, 37, 0x766a0abb);
+	RNDr(S, W, 38, 0x81c2c92e);
+	RNDr(S, W, 39, 0x92722c85);
+	RNDr(S, W, 40, 0xa2bfe8a1);
+	RNDr(S, W, 41, 0xa81a664b);
+	RNDr(S, W, 42, 0xc24b8b70);
+	RNDr(S, W, 43, 0xc76c51a3);
+	RNDr(S, W, 44, 0xd192e819);
+	RNDr(S, W, 45, 0xd6990624);
+	RNDr(S, W, 46, 0xf40e3585);
+	RNDr(S, W, 47, 0x106aa070);
+	RNDr(S, W, 48, 0x19a4c116);
+	RNDr(S, W, 49, 0x1e376c08);
+	RNDr(S, W, 50, 0x2748774c);
+	RNDr(S, W, 51, 0x34b0bcb5);
+	RNDr(S, W, 52, 0x391c0cb3);
+	RNDr(S, W, 53, 0x4ed8aa4a);
+	RNDr(S, W, 54, 0x5b9cca4f);
+	RNDr(S, W, 55, 0x682e6ff3);
+	RNDr(S, W, 56, 0x748f82ee);
+	RNDr(S, W, 57, 0x78a5636f);
+	RNDr(S, W, 58, 0x84c87814);
+	RNDr(S, W, 59, 0x8cc70208);
+	RNDr(S, W, 60, 0x90befffa);
+	RNDr(S, W, 61, 0xa4506ceb);
+	RNDr(S, W, 62, 0xbef9a3f7);
+	RNDr(S, W, 63, 0xc67178f2);
+
+	/* 4. Mix local working variables into global state */
+	for (i = 0; i < 8; i++)
+		state[i] += S[i];
+
+	/* Clean the stack. NOTE(review): plain memset/assignment of dead locals may be optimized away. */
+	memset(W, 0, 256);
+	memset(S, 0, 32);
+	t0 = t1 = 0;
+}
+
+/*
+ * Padding source for SHA256_Pad: one 0x80 byte followed by zeros.  The
+ * remaining 63 elements are zero-initialized.  Declared const because the
+ * table is only ever read.
+ */
+static const unsigned char PAD[64] = { 0x80 };
+
+/* SHA-256 initialization.  Resets ctx to the start of a new hash computation. */
+static void
+SHA256_Init(SHA256_CTX * ctx)
+{
+	/* Fixed initialization constants for the eight state words */
+	static const uint32_t initial_state[8] = {
+		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+	};
+	size_t w;
+
+	/* No bits have been hashed yet */
+	ctx->count[0] = 0;
+	ctx->count[1] = 0;
+
+	for (w = 0; w < 8; w++)
+		ctx->state[w] = initial_state[w];
+}
+
+/* Absorb len bytes from in, compressing each 64-byte block as soon as it fills. */
+static void
+SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len)
+{
+	const unsigned char *data = in;
+	uint32_t lo_bits, hi_bits;
+	uint32_t used;
+	uint32_t room;
+
+	/* Bytes already waiting in ctx->buf from earlier calls */
+	used = (ctx->count[1] >> 3) & 0x3f;
+	room = 64 - used;
+
+	/* len expressed in bits, split into low and high 32-bit words */
+	lo_bits = ((uint32_t)len) << 3;
+	hi_bits = (uint32_t)(len >> 29);
+
+	/* Advance the two-word bit counter, carrying out of the low word */
+	ctx->count[1] += lo_bits;
+	if (ctx->count[1] < lo_bits)
+		ctx->count[0]++;
+	ctx->count[0] += hi_bits;
+
+	/* Not enough input to complete a block: just buffer it */
+	if (len < room) {
+		memcpy(&ctx->buf[used], data, len);
+		return;
+	}
+
+	/* Top up the buffered block and compress it */
+	memcpy(&ctx->buf[used], data, room);
+	SHA256_Transform(ctx->state, ctx->buf);
+	data += room;
+	len -= room;
+
+	/* Compress whole blocks straight from the input */
+	for (; len >= 64; data += 64, len -= 64)
+		SHA256_Transform(ctx->state, data);
+
+	/* Stash the tail for the next call */
+	memcpy(ctx->buf, data, len);
+}
+
+/* Append the padding bytes and the 8-byte big-endian bit count to the hash. */
+static void
+SHA256_Pad(SHA256_CTX * ctx)
+{
+	unsigned char bitcount[8];
+	uint32_t buffered;
+	size_t padding;
+
+	/*
+	 * Serialize the bit count now; the SHA256_Update calls below change it.
+	 */
+	be32enc_vect(bitcount, ctx->count, 8);
+
+	/* Pad with 1--64 bytes so the message length becomes 56 mod 64 */
+	buffered = (ctx->count[1] >> 3) & 0x3f;
+	padding = (buffered >= 56) ? (120 - buffered) : (56 - buffered);
+	SHA256_Update(ctx, PAD, padding);
+
+	/* The 8-byte bit count completes the final block */
+	SHA256_Update(ctx, bitcount, 8);
+}
+
+/*
+ * SHA-256 finalization.  Pads whatever input has been absorbed, writes the
+ * eight state words to digest as 32 big-endian bytes, and then zeroes the
+ * context so it holds no hash state afterwards.
+ */
+static void
+SHA256_Final(unsigned char digest[32], SHA256_CTX * ctx)
+{
+	/* Flush buffered input together with the padding and bit count */
+	SHA256_Pad(ctx);
+
+	/* Export the state, most significant byte of each word first */
+	be32enc_vect(digest, ctx->state, 8 * sizeof(uint32_t));
+
+	/* Wipe the context */
+	memset(ctx, 0, sizeof(SHA256_CTX));
+}
+
+/* Start an HMAC-SHA256 computation keyed with the Klen bytes at _K. */
+static void
+HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
+{
+	const unsigned char * key = _K;
+	unsigned char ipad[64], opad[64];
+	unsigned char hashed_key[32];
+	size_t n;
+
+	/* Keys longer than 64 bytes are replaced by their SHA256 digest. */
+	if (Klen > 64) {
+		SHA256_Init(&ctx->ictx);
+		SHA256_Update(&ctx->ictx, key, Klen);
+		SHA256_Final(hashed_key, &ctx->ictx);
+		key = hashed_key;
+		Klen = 32;
+	}
+
+	/* Build both 64-byte key blocks in one pass over the key. */
+	memset(ipad, 0x36, 64);
+	memset(opad, 0x5c, 64);
+	for (n = 0; n < Klen; n++) {
+		ipad[n] ^= key[n];
+		opad[n] ^= key[n];
+	}
+
+	/* Inner hash starts with (K xor 0x36 block), outer with (K xor 0x5c block). */
+	SHA256_Init(&ctx->ictx);
+	SHA256_Update(&ctx->ictx, ipad, 64);
+	SHA256_Init(&ctx->octx);
+	SHA256_Update(&ctx->octx, opad, 64);
+
+	/* Clean the stack. */
+	memset(hashed_key, 0, 32);
+}
+
+/* Feed len more message bytes into the HMAC; only the inner hash absorbs them. */
+static void
+HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
+{
+	SHA256_CTX * inner = &ctx->ictx;
+
+	SHA256_Update(inner, in, len);
+}
+
+/* Finish the HMAC-SHA256 computation and write the 32-byte result to digest. */
+static void
+HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
+{
+	unsigned char inner_digest[32];
+	SHA256_CTX * inner = &ctx->ictx;
+	SHA256_CTX * outer = &ctx->octx;
+
+	/* Close the inner hash over the key block and the message. */
+	SHA256_Final(inner_digest, inner);
+
+	/* The outer hash absorbs the inner digest and yields the result. */
+	SHA256_Update(outer, inner_digest, 32);
+	SHA256_Final(digest, outer);
+
+	/* Clean the stack. */
+	memset(inner_digest, 0, sizeof(inner_digest));
+}
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Derive dkLen bytes into buf with PBKDF2(passwd, salt, c, dkLen), using
+ * HMAC-SHA256 as the PRF.  The value dkLen must be at most 32 * (2^32 - 1).
+ */
+static void
+PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
+    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
+{
+	HMAC_SHA256_CTX salted, work;
+	uint8_t counter[4];
+	uint8_t u[32];
+	uint8_t t[32];
+	size_t block;
+	size_t take;
+	uint64_t round;
+	int b;
+
+	/* HMAC state after absorbing P and S; cloned for every output block. */
+	HMAC_SHA256_Init(&salted, passwd, passwdlen);
+	HMAC_SHA256_Update(&salted, salt, saltlen);
+
+	/* Each pass produces one 32-byte block of output. */
+	for (block = 0; block * 32 < dkLen; block++) {
+		/* Big-endian block index, counted from 1. */
+		be32enc(counter, (uint32_t)(block + 1));
+
+		/* U_1 = PRF(P, S || INT(block + 1)). */
+		work = salted;
+		HMAC_SHA256_Update(&work, counter, 4);
+		HMAC_SHA256_Final(u, &work);
+
+		/* T starts out as U_1 ... */
+		memcpy(t, u, sizeof(t));
+
+		for (round = 2; round <= c; round++) {
+			/* U_j = PRF(P, U_{j-1}). */
+			HMAC_SHA256_Init(&work, passwd, passwdlen);
+			HMAC_SHA256_Update(&work, u, 32);
+			HMAC_SHA256_Final(u, &work);
+
+			/* ... and accumulates the xor of every U_j. */
+			for (b = 0; b < 32; b++)
+				t[b] ^= u[b];
+		}
+
+		/* Emit this block, truncating the final one to fit dkLen. */
+		take = dkLen - block * 32;
+		if (take > 32)
+			take = 32;
+		memcpy(buf + block * 32, t, take);
+	}
+
+	/* salted never went through _Final, so wipe it by hand. */
+	memset(&salted, 0, sizeof(salted));
+}
+
+/* Forward declarations of the scrypt core helpers defined below in this file. */
+static void blkcpy(void *, void *, size_t);
+static void blkxor(void *, void *, size_t);
+static void salsa20_8(uint32_t[16]);
+static void blockmix_salsa8(uint32_t *, uint32_t *, uint32_t *, size_t);
+static uint64_t integerify(void *, size_t);
+static void smix(uint8_t *, size_t, uint64_t, uint32_t *, uint32_t *);
+
+/* Copy len bytes from src to dest one size_t at a time (whole words only). */
+static void
+blkcpy(void * dest, void * src, size_t len)
+{
+	size_t * out = dest;
+	size_t * in = src;
+	size_t * stop = in + len / sizeof(size_t);
+
+	while (in != stop)
+		*out++ = *in++;
+}
+
+/* XOR len bytes of src into dest one size_t at a time (whole words only). */
+static void
+blkxor(void * dest, void * src, size_t len)
+{
+	size_t * out = dest;
+	size_t * in = src;
+	size_t * stop = in + len / sizeof(size_t);
+
+	while (in != stop)
+		*out++ ^= *in++;
+}
+
+/**
+ * salsa20_8(B):
+ * Apply the salsa20/8 core to the 16-word block B, updating it in place.
+ */
+static void
+salsa20_8(uint32_t B[16])
+{
+	uint32_t x[16];
+	size_t i;
+
+	blkcpy(x, B, 64);	/* work on a copy so it can be added back into B at the end */
+	for (i = 0; i < 8; i += 2) {	/* each pass does a column step and a row step */
+#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+		/* Operate on columns. */
+		x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
+		x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
+
+		x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
+		x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
+
+		x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
+		x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
+
+		x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
+		x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
+
+		/* Operate on rows. */
+		x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
+		x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
+
+		x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
+		x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
+
+		x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
+		x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
+
+		x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
+		x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
+#undef R
+	}
+	for (i = 0; i < 16; i++)	/* add the mixed words back into the input block */
+		B[i] += x[i];
+}
+
+/**
+ * blockmix_salsa8(Bin, Bout, X, r):
+ * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
+ * bytes in length; the output Bout must also be the same size.  The
+ * temporary space X must be 64 bytes.
+ */
+static void
+blockmix_salsa8(uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r)
+{
+	size_t i;
+
+	/* 1: X <-- B_{2r - 1} */
+	blkcpy(X, &Bin[(2 * r - 1) * 16], 64);
+
+	/* 2: for i = 0 to 2r - 1 do (unrolled: each pass handles blocks i and i + 1) */
+	for (i = 0; i < 2 * r; i += 2) {
+		/* 3: X <-- H(X \xor B_i) */
+		blkxor(X, &Bin[i * 16], 64);
+		salsa20_8(X);
+
+		/* 4: Y_i <-- X */
+		/* 6: even-indexed Y_i is stored as block i / 2 in the first half of Bout */
+		blkcpy(&Bout[i * 8], X, 64);
+
+		/* 3: X <-- H(X \xor B_{i+1}) */
+		blkxor(X, &Bin[i * 16 + 16], 64);
+		salsa20_8(X);
+
+		/* 4: Y_{i+1} <-- X */
+		/* 6: odd-indexed Y_{i+1} is stored as block i / 2 of the second half of Bout */
+		blkcpy(&Bout[i * 8 + r * 16], X, 64);
+	}
+}
+
+/**
+ * integerify(B, r):
+ * Return the first two 32-bit words of B_{2r-1} as one 64-bit integer.
+ */
+static uint64_t
+integerify(void * B, size_t r)
+{
+	uintptr_t last_block = (uintptr_t)(B) + (2 * r - 1) * 64;
+	const uint32_t * w = (const uint32_t *)last_block;
+	return ((uint64_t)w[1] << 32) | w[0];
+}
+
+/**
+ * smix(B, r, N, V, XY):
+ * Compute B = SMix_r(B, N) in place.  B must be 128r bytes, V 128rN bytes
+ * and XY 256r + 64 bytes; XY is split into X (128r), Y (128r) and the
+ * 64-byte blockmix temporary Z.  N must be a power of 2 greater than 1:
+ * both loops advance i by 2 and reduce indices with "& (N - 1)".  The
+ * arrays B, V, and XY must be aligned to a multiple of 64 bytes.
+ */
+static void
+smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
+{
+	uint32_t * X = XY;	/* first 32r words of XY */
+	uint32_t * Y = &XY[32 * r];	/* next 32r words of XY */
+	uint32_t * Z = &XY[64 * r];	/* trailing scratch handed to blockmix_salsa8 */
+	uint64_t i;
+	uint64_t j;
+	size_t k;
+
+	/* 1: X <-- B (decoded from little-endian bytes into words) */
+	for (k = 0; k < 32 * r; k++)
+		X[k] = le32dec(&B[4 * k]);
+
+	/* 2: for i = 0 to N - 1 do (unrolled by two; X and Y swap roles) */
+	for (i = 0; i < N; i += 2) {
+		/* 3: V_i <-- X */
+		blkcpy(&V[i * (32 * r)], X, 128 * r);
+
+		/* 4: X <-- H(X); the result lands in Y */
+		blockmix_salsa8(X, Y, Z, r);
+
+		/* 3: V_{i+1} <-- X (currently held in Y) */
+		blkcpy(&V[(i + 1) * (32 * r)], Y, 128 * r);
+
+		/* 4: X <-- H(X); the result lands back in X */
+		blockmix_salsa8(Y, X, Z, r);
+	}
+
+	/* 6: for i = 0 to N - 1 do (unrolled by two; X and Y swap roles) */
+	for (i = 0; i < N; i += 2) {
+		/* 7: j <-- Integerify(X) mod N */
+		j = integerify(X, r) & (N - 1);
+
+		/* 8: X <-- H(X \xor V_j); the result lands in Y */
+		blkxor(X, &V[j * (32 * r)], 128 * r);
+		blockmix_salsa8(X, Y, Z, r);
+
+		/* 7: j <-- Integerify(X) mod N (X is currently held in Y) */
+		j = integerify(Y, r) & (N - 1);
+
+		/* 8: X <-- H(X \xor V_j); the result lands back in X */
+		blkxor(Y, &V[j * (32 * r)], 128 * r);
+		blockmix_salsa8(Y, X, Z, r);
+	}
+
+	/* 10: B' <-- X (re-encoded as little-endian bytes) */
+	for (k = 0; k < 32 * r; k++)
+		le32enc(&B[4 * k], X[k]);
+}
+
+/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output (r = p = 1)
+   scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
+ */
+void scrypt_N_1_1_256_sp(const char* input, char* output, char* scratchpad, uint32_t N)
+{
+	const uint32_t r = 1;
+	const uint32_t p = 1;
+	const uint8_t * header = (const uint8_t *)input;
+	uintptr_t base;
+	uint8_t * B;
+	uint32_t * XY;
+	uint32_t * V;
+	uint32_t lane;
+
+	/* Carve the scratchpad into B, then XY, then V, starting at the next 64-byte boundary */
+	base = ((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63);
+	B = (uint8_t *)base;
+	XY = (uint32_t *)(B + (128 * r * p));
+	V = (uint32_t *)(B + (128 * r * p) + (256 * r + 64));
+
+	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen); the input is used as both P and S */
+	PBKDF2_SHA256(header, 80, header, 80, 1, B, p * 128 * r);
+
+	/* 2-3: B_i <-- MF(B_i, N) for i = 0 to p - 1 */
+	for (lane = 0; lane < p; lane++)
+		smix(&B[lane * 128 * r], r, N, V, XY);
+
+	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) with dkLen = 32 */
+	PBKDF2_SHA256(header, 80, B, p * 128 * r, 1, (uint8_t*)output, 32);
+}
+
+/* Hash an 80-byte input to 32 bytes using a heap scratchpad; aborts if the scratchpad cannot be allocated. */
+void scrypt_N_1_1_256(const char* input, char* output, uint32_t N)
+{
+	/* 128 * N bytes for V plus 511 for alignment slack, B and XY; multiply in size_t, not 32-bit arithmetic */
+	char *scratchpad = malloc((size_t)128 * N + 512);
+	if (scratchpad == NULL)
+		abort();	/* fail explicitly rather than hashing through a null scratchpad */
+	scrypt_N_1_1_256_sp(input, output, scratchpad, N);
+	free(scratchpad);
+}
+
diff --git a/scryptn.h b/scryptn.h
new file mode 100644
index 0000000..ba461a2
--- /dev/null
+++ b/scryptn.h
@@ -0,0 +1,16 @@
+#ifndef SCRYPT_H
+#define SCRYPT_H
+#include <stdint.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* scrypt with r = p = 1: reads 80 bytes from input and writes 32 bytes to output. */
+void scrypt_N_1_1_256(const char* input, char* output, uint32_t N);
+void scrypt_N_1_1_256_sp(const char* input, char* output, char* scratchpad, uint32_t N);
+/* The _sp variant needs a caller-supplied scratchpad of at least 128 * N + 511 bytes (131583 for N = 1024). */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/aes_helper.c b/sha3/aes_helper.c
new file mode 100644
index 0000000..75b7cc6
--- /dev/null
+++ b/sha3/aes_helper.c
@@ -0,0 +1,392 @@
+/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */
+/*
+ * AES tables. This file is not meant to be compiled by itself; it
+ * is included by some hash function implementations. It contains
+ * the precomputed tables and helper macros for evaluating an AES
+ * round, optionally with a final XOR with a subkey.
+ *
+ * By default, this file defines the tables and macros for little-endian
+ * processing (i.e. it is assumed that the input bytes have been read
+ * from memory and assembled with the little-endian convention). If
+ * the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value)
+ * when this file is included, then the tables and macros for big-endian
+ * processing are defined instead. The big-endian tables and macros have
+ * names distinct from the little-endian tables and macros, hence it is
+ * possible to have both simultaneously, by including this file twice
+ * (with and without the AES_BIG_ENDIAN macro).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include "sph_types.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+#if AES_BIG_ENDIAN
+/* Big-endian build: AESx() byte-swaps each table constant and AES0..AES3 name the *_BE tables. */
+#define AESx(x)   ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \
+                  | ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
+                  | ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
+                  | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+
+#define AES0      AES0_BE
+#define AES1      AES1_BE
+#define AES2      AES2_BE
+#define AES3      AES3_BE
+/* One table-driven round: Yn = four lookups (top byte of Xn first) xored with subkey Kn. */
+#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3)   do { \
+		(Y0) = AES0[((X0) >> 24) & 0xFF] \
+			^ AES1[((X1) >> 16) & 0xFF] \
+			^ AES2[((X2) >> 8) & 0xFF] \
+			^ AES3[(X3) & 0xFF] ^ (K0); \
+		(Y1) = AES0[((X1) >> 24) & 0xFF] \
+			^ AES1[((X2) >> 16) & 0xFF] \
+			^ AES2[((X3) >> 8) & 0xFF] \
+			^ AES3[(X0) & 0xFF] ^ (K1); \
+		(Y2) = AES0[((X2) >> 24) & 0xFF] \
+			^ AES1[((X3) >> 16) & 0xFF] \
+			^ AES2[((X0) >> 8) & 0xFF] \
+			^ AES3[(X1) & 0xFF] ^ (K2); \
+		(Y3) = AES0[((X3) >> 24) & 0xFF] \
+			^ AES1[((X0) >> 16) & 0xFF] \
+			^ AES2[((X1) >> 8) & 0xFF] \
+			^ AES3[(X2) & 0xFF] ^ (K3); \
+	} while (0)
+/* Same round with all four subkeys set to 0. */
+#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
+	AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
+
+#else
+/* Little-endian build (default): table constants are used as written and AES0..AES3 name the *_LE tables. */
+#define AESx(x)   SPH_C32(x)
+#define AES0      AES0_LE
+#define AES1      AES1_LE
+#define AES2      AES2_LE
+#define AES3      AES3_LE
+/* One table-driven round: Yn = four lookups (low byte of Xn first) xored with subkey Kn. */
+#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3)   do { \
+		(Y0) = AES0[(X0) & 0xFF] \
+			^ AES1[((X1) >> 8) & 0xFF] \
+			^ AES2[((X2) >> 16) & 0xFF] \
+			^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \
+		(Y1) = AES0[(X1) & 0xFF] \
+			^ AES1[((X2) >> 8) & 0xFF] \
+			^ AES2[((X3) >> 16) & 0xFF] \
+			^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \
+		(Y2) = AES0[(X2) & 0xFF] \
+			^ AES1[((X3) >> 8) & 0xFF] \
+			^ AES2[((X0) >> 16) & 0xFF] \
+			^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \
+		(Y3) = AES0[(X3) & 0xFF] \
+			^ AES1[((X0) >> 8) & 0xFF] \
+			^ AES2[((X1) >> 16) & 0xFF] \
+			^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \
+	} while (0)
+/* Same round with all four subkeys set to 0. */
+#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
+	AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
+
+#endif
+
+/*
+ * The AES*[] tables allow a fast evaluation of an AES round: AESi[] combines
+ * SubBytes for a byte at row i with MixColumns for the column where that byte
+ * goes after ShiftRows.  Each has 256 entries (one per byte value) wrapped in AESx().
+ */
+
+static const sph_u32 AES0[256] = {
+	AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
+	AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
+	AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
+	AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
+	AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
+	AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
+	AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
+	AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
+	AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
+	AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
+	AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
+	AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
+	AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
+	AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
+	AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
+	AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
+	AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
+	AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
+	AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
+	AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
+	AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
+	AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
+	AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
+	AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
+	AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
+	AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
+	AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
+	AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
+	AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
+	AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
+	AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
+	AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
+	AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
+	AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
+	AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
+	AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
+	AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
+	AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
+	AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
+	AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
+	AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14),
+	AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8),
+	AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4),
+	AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2),
+	AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA),
+	AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949),
+	AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF),
+	AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810),
+	AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C),
+	AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697),
+	AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E),
+	AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F),
+	AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC),
+	AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C),
+	AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969),
+	AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27),
+	AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122),
+	AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433),
+	AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9),
+	AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5),
+	AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A),
+	AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0),
+	AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E),
+	AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
+};
+
+static const sph_u32 AES1[256] = {	/* NOTE(review): spot checks show each entry is the AES0 entry rotated left 8 bits -- not verified for all 256 */
+	AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
+	AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
+	AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
+	AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A),
+	AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87),
+	AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B),
+	AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA),
+	AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B),
+	AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A),
+	AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F),
+	AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908),
+	AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F),
+	AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E),
+	AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5),
+	AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D),
+	AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F),
+	AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E),
+	AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB),
+	AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE),
+	AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397),
+	AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C),
+	AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED),
+	AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B),
+	AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A),
+	AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16),
+	AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194),
+	AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81),
+	AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3),
+	AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A),
+	AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104),
+	AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263),
+	AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D),
+	AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F),
+	AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39),
+	AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47),
+	AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695),
+	AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F),
+	AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83),
+	AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C),
+	AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76),
+	AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E),
+	AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4),
+	AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6),
+	AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B),
+	AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7),
+	AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0),
+	AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25),
+	AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018),
+	AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72),
+	AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751),
+	AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21),
+	AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85),
+	AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA),
+	AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12),
+	AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0),
+	AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9),
+	AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233),
+	AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7),
+	AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920),
+	AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A),
+	AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17),
+	AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8),
+	AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11),
+	AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
+};
+/* AES2[i] is AES1[i] rotated left by 8 bits, e.g. 0x16162C3A -> 0x162C3A16 at i = 255 (spot-checked on the visible rows). */
+static const sph_u32 AES2[256] = {
+	AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
+	AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
+	AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
+	AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76),
+	AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D),
+	AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0),
+	AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF),
+	AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0),
+	AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26),
+	AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC),
+	AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1),
+	AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15),
+	AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3),
+	AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A),
+	AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2),
+	AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75),
+	AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A),
+	AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0),
+	AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3),
+	AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784),
+	AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED),
+	AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B),
+	AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39),
+	AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF),
+	AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB),
+	AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485),
+	AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F),
+	AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8),
+	AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F),
+	AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5),
+	AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321),
+	AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2),
+	AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC),
+	AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917),
+	AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D),
+	AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573),
+	AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC),
+	AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388),
+	AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14),
+	AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB),
+	AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A),
+	AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C),
+	AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662),
+	AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79),
+	AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D),
+	AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9),
+	AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA),
+	AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808),
+	AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E),
+	AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6),
+	AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F),
+	AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A),
+	AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66),
+	AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E),
+	AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9),
+	AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E),
+	AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311),
+	AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794),
+	AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9),
+	AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF),
+	AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D),
+	AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868),
+	AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F),
+	AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
+};
+/* AES3[i] is AES2[i] rotated left by 8 bits, e.g. 0x63C6A563 -> 0xC6A56363 at i = 0 (spot-checked, not exhaustively verified). */
+static const sph_u32 AES3[256] = {
+	AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
+	AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
+	AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
+	AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676),
+	AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D),
+	AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0),
+	AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF),
+	AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0),
+	AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626),
+	AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC),
+	AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1),
+	AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515),
+	AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3),
+	AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A),
+	AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2),
+	AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575),
+	AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A),
+	AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0),
+	AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3),
+	AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484),
+	AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED),
+	AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B),
+	AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939),
+	AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF),
+	AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB),
+	AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585),
+	AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F),
+	AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8),
+	AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F),
+	AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5),
+	AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121),
+	AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2),
+	AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC),
+	AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717),
+	AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D),
+	AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373),
+	AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC),
+	AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888),
+	AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414),
+	AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB),
+	AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A),
+	AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C),
+	AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262),
+	AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979),
+	AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D),
+	AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9),
+	AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA),
+	AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808),
+	AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E),
+	AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6),
+	AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F),
+	AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A),
+	AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666),
+	AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E),
+	AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9),
+	AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E),
+	AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111),
+	AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494),
+	AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9),
+	AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF),
+	AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D),
+	AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868),
+	AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F),
+	AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
+};
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/blake.c b/sha3/blake.c
new file mode 100644
index 0000000..0650b9c
--- /dev/null
+++ b/sha3/blake.c
@@ -0,0 +1,1120 @@
+/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
+/*
+ * BLAKE implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_blake.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+/* A global SPH_SMALL_FOOTPRINT request turns on the BLAKE-specific flag unless one is already defined. */
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_SMALL_FOOTPRINT_BLAKE   1
+#endif
+/* Small footprint selects the compact (table-driven, looped) 32-bit rounds. */
+#if SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_COMPACT_BLAKE_32   1
+#endif
+/* With SPH_64, compact 64-bit rounds are used for small footprint or when SPH_64_TRUE is unset. */
+#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
+#define SPH_COMPACT_BLAKE_64   1
+#endif
+/* MSVC only: disable warning C4146 from this point on. */
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+/* Eight-word initial value, 224-bit variant by name; each word equals the low 32 bits of the matching IV384 word. */
+static const sph_u32 IV224[8] = {
+	SPH_C32(0xC1059ED8), SPH_C32(0x367CD507),
+	SPH_C32(0x3070DD17), SPH_C32(0xF70E5939),
+	SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
+	SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
+};
+/* Eight-word initial value, 256-bit variant by name; each word equals the high 32 bits of the matching IV512 word. */
+static const sph_u32 IV256[8] = {
+	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
+	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
+	SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+#if SPH_64
+/* Eight 64-bit initial-value words, 384-bit variant by name (only built when SPH_64 is set). */
+static const sph_u64 IV384[8] = {
+	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
+	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
+	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
+	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
+};
+/* Eight 64-bit initial-value words, 512-bit variant by name (only built when SPH_64 is set). */
+static const sph_u64 IV512[8] = {
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+#endif
+
+#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
+/* sigma[r][i]: index into M[] and CS[]/CB[] used for input i of round r by the looped ROUND_S/ROUND_B. */
+static const unsigned sigma[16][16] = {
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
+};
+
+/* Reference copy of the ten distinct permutations; sigma rows 10..15 above repeat rows 0..5:
+  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+ 14 10  4  8  9 15 13  6  1 12  0  2 11  7  5  3
+ 11  8 12  0  5  2 15 13 10 14  3  6  7  1  9  4
+  7  9  3  1 13 12 11 14  2  6  5 10  4  0 15  8
+  9  0  5  7  2  4 10 15 14  1 11 12  6  8  3 13
+  2 12  6 10  0 11  8  3  4 13  7  5 15 14  1  9
+ 12  5  1 15 14 13  4 10  0  7  6  3  9  2  8 11
+ 13 11  7 14 12  1  3  9  5  0 15  4  8  6  2 10
+  6 15 14  9 11  3  0  8 12  2 13  7  1  4 10  5
+ 10  2  8  4  7  6  1  5 15 11  9 14  3 12 13  0
+*/
+#endif
+/* Z<r><i> is sigma row r, entry i, written as one hex digit so Mx/CSx/CBx can paste it onto M, CS or CB. */
+#define Z00   0
+#define Z01   1
+#define Z02   2
+#define Z03   3
+#define Z04   4
+#define Z05   5
+#define Z06   6
+#define Z07   7
+#define Z08   8
+#define Z09   9
+#define Z0A   A
+#define Z0B   B
+#define Z0C   C
+#define Z0D   D
+#define Z0E   E
+#define Z0F   F
+
+#define Z10   E
+#define Z11   A
+#define Z12   4
+#define Z13   8
+#define Z14   9
+#define Z15   F
+#define Z16   D
+#define Z17   6
+#define Z18   1
+#define Z19   C
+#define Z1A   0
+#define Z1B   2
+#define Z1C   B
+#define Z1D   7
+#define Z1E   5
+#define Z1F   3
+
+#define Z20   B
+#define Z21   8
+#define Z22   C
+#define Z23   0
+#define Z24   5
+#define Z25   2
+#define Z26   F
+#define Z27   D
+#define Z28   A
+#define Z29   E
+#define Z2A   3
+#define Z2B   6
+#define Z2C   7
+#define Z2D   1
+#define Z2E   9
+#define Z2F   4
+
+#define Z30   7
+#define Z31   9
+#define Z32   3
+#define Z33   1
+#define Z34   D
+#define Z35   C
+#define Z36   B
+#define Z37   E
+#define Z38   2
+#define Z39   6
+#define Z3A   5
+#define Z3B   A
+#define Z3C   4
+#define Z3D   0
+#define Z3E   F
+#define Z3F   8
+
+#define Z40   9
+#define Z41   0
+#define Z42   5
+#define Z43   7
+#define Z44   2
+#define Z45   4
+#define Z46   A
+#define Z47   F
+#define Z48   E
+#define Z49   1
+#define Z4A   B
+#define Z4B   C
+#define Z4C   6
+#define Z4D   8
+#define Z4E   3
+#define Z4F   D
+
+#define Z50   2
+#define Z51   C
+#define Z52   6
+#define Z53   A
+#define Z54   0
+#define Z55   B
+#define Z56   8
+#define Z57   3
+#define Z58   4
+#define Z59   D
+#define Z5A   7
+#define Z5B   5
+#define Z5C   F
+#define Z5D   E
+#define Z5E   1
+#define Z5F   9
+
+#define Z60   C
+#define Z61   5
+#define Z62   1
+#define Z63   F
+#define Z64   E
+#define Z65   D
+#define Z66   4
+#define Z67   A
+#define Z68   0
+#define Z69   7
+#define Z6A   6
+#define Z6B   3
+#define Z6C   9
+#define Z6D   2
+#define Z6E   8
+#define Z6F   B
+
+#define Z70   D
+#define Z71   B
+#define Z72   7
+#define Z73   E
+#define Z74   C
+#define Z75   1
+#define Z76   3
+#define Z77   9
+#define Z78   5
+#define Z79   0
+#define Z7A   F
+#define Z7B   4
+#define Z7C   8
+#define Z7D   6
+#define Z7E   2
+#define Z7F   A
+
+#define Z80   6
+#define Z81   F
+#define Z82   E
+#define Z83   9
+#define Z84   B
+#define Z85   3
+#define Z86   0
+#define Z87   8
+#define Z88   C
+#define Z89   2
+#define Z8A   D
+#define Z8B   7
+#define Z8C   1
+#define Z8D   4
+#define Z8E   A
+#define Z8F   5
+
+#define Z90   A
+#define Z91   2
+#define Z92   8
+#define Z93   4
+#define Z94   7
+#define Z95   6
+#define Z96   1
+#define Z97   5
+#define Z98   F
+#define Z99   B
+#define Z9A   9
+#define Z9B   E
+#define Z9C   3
+#define Z9D   C
+#define Z9E   D
+#define Z9F   0
+/* Mx(r, i) -> M<Z<r><i>>: the extra Mx_ level lets Z<r><i> expand to its hex digit before it is pasted onto M. */
+#define Mx(r, i)    Mx_(Z ## r ## i)
+#define Mx_(n)      Mx__(n)
+#define Mx__(n)     M ## n
+/* CSx(r, i) -> CS<Z<r><i>>: same two-step expansion, selecting one of the CS0..CSF macros. */
+#define CSx(r, i)   CSx_(Z ## r ## i)
+#define CSx_(n)     CSx__(n)
+#define CSx__(n)    CS ## n
+/* 32-bit constants as macros: selected by CSx() in the unrolled rounds; CS0..CS7 also seed V8..VF in COMPRESS32. */
+#define CS0   SPH_C32(0x243F6A88)
+#define CS1   SPH_C32(0x85A308D3)
+#define CS2   SPH_C32(0x13198A2E)
+#define CS3   SPH_C32(0x03707344)
+#define CS4   SPH_C32(0xA4093822)
+#define CS5   SPH_C32(0x299F31D0)
+#define CS6   SPH_C32(0x082EFA98)
+#define CS7   SPH_C32(0xEC4E6C89)
+#define CS8   SPH_C32(0x452821E6)
+#define CS9   SPH_C32(0x38D01377)
+#define CSA   SPH_C32(0xBE5466CF)
+#define CSB   SPH_C32(0x34E90C6C)
+#define CSC   SPH_C32(0xC0AC29B7)
+#define CSD   SPH_C32(0xC97C50DD)
+#define CSE   SPH_C32(0x3F84D5B5)
+#define CSF   SPH_C32(0xB5470917)
+
+#if SPH_COMPACT_BLAKE_32
+/* Same sixteen values as CS0..CSF, as an array so the looped ROUND_S can index it with sigma[r][i]. */
+static const sph_u32 CS[16] = {
+	SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
+	SPH_C32(0x13198A2E), SPH_C32(0x03707344),
+	SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
+	SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
+	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
+	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
+	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
+	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
+};
+
+#endif
+
+#if SPH_64
+/* CBx(r, i) -> CB<Z<r><i>>: 64-bit counterpart of CSx, with the same two-step expansion before pasting. */
+#define CBx(r, i)   CBx_(Z ## r ## i)
+#define CBx_(n)     CBx__(n)
+#define CBx__(n)    CB ## n
+/* 64-bit constants as macros for CBx() and COMPRESS64; CB0..CB7 are the CS0..CSF values concatenated in pairs. */
+#define CB0   SPH_C64(0x243F6A8885A308D3)
+#define CB1   SPH_C64(0x13198A2E03707344)
+#define CB2   SPH_C64(0xA4093822299F31D0)
+#define CB3   SPH_C64(0x082EFA98EC4E6C89)
+#define CB4   SPH_C64(0x452821E638D01377)
+#define CB5   SPH_C64(0xBE5466CF34E90C6C)
+#define CB6   SPH_C64(0xC0AC29B7C97C50DD)
+#define CB7   SPH_C64(0x3F84D5B5B5470917)
+#define CB8   SPH_C64(0x9216D5D98979FB1B)
+#define CB9   SPH_C64(0xD1310BA698DFB5AC)
+#define CBA   SPH_C64(0x2FFD72DBD01ADFB7)
+#define CBB   SPH_C64(0xB8E1AFED6A267E96)
+#define CBC   SPH_C64(0xBA7C9045F12C7F99)
+#define CBD   SPH_C64(0x24A19947B3916CF7)
+#define CBE   SPH_C64(0x0801F2E2858EFC16)
+#define CBF   SPH_C64(0x636920D871574E69)
+
+#if SPH_COMPACT_BLAKE_64
+/* Same sixteen values as CB0..CBF, as an array so the looped ROUND_B can index it with sigma[r][i]. */
+static const sph_u64 CB[16] = {
+	SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
+	SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
+	SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
+	SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
+	SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
+	SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
+	SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
+	SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
+};
+
+#endif
+
+#endif
+/* 32-bit G step on (a, b, c, d): two add/xor/rotate passes (rotr 16, 12 then 8, 7); m0 is mixed with c1, m1 with c0. */
+#define GS(m0, m1, c0, c1, a, b, c, d)   do { \
+		a = SPH_T32(a + b + (m0 ^ c1)); \
+		d = SPH_ROTR32(d ^ a, 16); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 12); \
+		a = SPH_T32(a + b + (m1 ^ c0)); \
+		d = SPH_ROTR32(d ^ a, 8); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 7); \
+	} while (0)
+
+#if SPH_COMPACT_BLAKE_32
+/* Looped round r (r may be a run-time value): GS on the four columns, then the four diagonals, of V0..VF as a 4x4 row-major grid. */
+#define ROUND_S(r)   do { \
+		GS(M[sigma[r][0x0]], M[sigma[r][0x1]], \
+			CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
+		GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \
+			CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \
+		GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \
+			CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \
+		GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \
+			CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \
+		GS(M[sigma[r][0x8]], M[sigma[r][0x9]], \
+			CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \
+		GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \
+			CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \
+		GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \
+			CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \
+		GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \
+			CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
+	} while (0)
+
+#else
+/* Unrolled round r: r must be a literal digit 0..9, because Mx/CSx paste it into a Z<r><i> macro name. */
+#define ROUND_S(r)   do { \
+		GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
+		GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+		GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+		GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+		GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
+		GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+		GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+		GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+#endif
+
+#if SPH_64
+/* 64-bit G step on (a, b, c, d): same shape as GS with rotations 32, 25 then 16, 11; m0 is mixed with c1, m1 with c0. */
+#define GB(m0, m1, c0, c1, a, b, c, d)   do { \
+		a = SPH_T64(a + b + (m0 ^ c1)); \
+		d = SPH_ROTR64(d ^ a, 32); \
+		c = SPH_T64(c + d); \
+		b = SPH_ROTR64(b ^ c, 25); \
+		a = SPH_T64(a + b + (m1 ^ c0)); \
+		d = SPH_ROTR64(d ^ a, 16); \
+		c = SPH_T64(c + d); \
+		b = SPH_ROTR64(b ^ c, 11); \
+	} while (0)
+
+#if SPH_COMPACT_BLAKE_64
+/* Looped 64-bit round r (r may be a run-time value): GB on the four columns, then the four diagonals, of V0..VF. */
+#define ROUND_B(r)   do { \
+		GB(M[sigma[r][0x0]], M[sigma[r][0x1]], \
+			CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
+		GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \
+			CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
+		GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \
+			CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
+		GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \
+			CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
+		GB(M[sigma[r][0x8]], M[sigma[r][0x9]], \
+			CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
+		GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \
+			CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
+		GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \
+			CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
+		GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \
+			CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
+	} while (0)
+
+#else
+/* Unrolled 64-bit round r: r must be a literal digit 0..9, because Mx/CBx paste it into a Z<r><i> macro name. */
+#define ROUND_B(r)   do { \
+		GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
+		GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
+		GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
+		GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
+		GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
+		GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
+		GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
+		GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+#endif
+
+#endif
+/* Declares 32-bit locals for the working state; the expansion already ends in ';', so use sites add none. */
+#define DECL_STATE32 \
+	sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \
+	sph_u32 S0, S1, S2, S3, T0, T1;
+/* Load H[0..7], S[0..3], T0 and T1 from *state into the locals declared by DECL_STATE32. */
+#define READ_STATE32(state)   do { \
+		H0 = (state)->H[0]; \
+		H1 = (state)->H[1]; \
+		H2 = (state)->H[2]; \
+		H3 = (state)->H[3]; \
+		H4 = (state)->H[4]; \
+		H5 = (state)->H[5]; \
+		H6 = (state)->H[6]; \
+		H7 = (state)->H[7]; \
+		S0 = (state)->S[0]; \
+		S1 = (state)->S[1]; \
+		S2 = (state)->S[2]; \
+		S3 = (state)->S[3]; \
+		T0 = (state)->T0; \
+		T1 = (state)->T1; \
+	} while (0)
+/* Store the DECL_STATE32 locals back into *state (H[0..7], S[0..3], T0, T1). */
+#define WRITE_STATE32(state)   do { \
+		(state)->H[0] = H0; \
+		(state)->H[1] = H1; \
+		(state)->H[2] = H2; \
+		(state)->H[3] = H3; \
+		(state)->H[4] = H4; \
+		(state)->H[5] = H5; \
+		(state)->H[6] = H6; \
+		(state)->H[7] = H7; \
+		(state)->S[0] = S0; \
+		(state)->S[1] = S1; \
+		(state)->S[2] = S2; \
+		(state)->S[3] = S3; \
+		(state)->T0 = T0; \
+		(state)->T1 = T1; \
+	} while (0)
+
+#if SPH_COMPACT_BLAKE_32
+/* Compact compression: mixes the 64 bytes at buf into H0..H7 with 14 looped rounds; needs buf and the DECL_STATE32 locals in scope. */
+#define COMPRESS32   do { \
+		sph_u32 M[16]; \
+		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+		unsigned r; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CS0; \
+		V9 = S1 ^ CS1; \
+		VA = S2 ^ CS2; \
+		VB = S3 ^ CS3; \
+		VC = T0 ^ CS4; \
+		VD = T0 ^ CS5; \
+		VE = T1 ^ CS6; \
+		VF = T1 ^ CS7; \
+		M[0x0] = sph_dec32be_aligned(buf +  0); \
+		M[0x1] = sph_dec32be_aligned(buf +  4); \
+		M[0x2] = sph_dec32be_aligned(buf +  8); \
+		M[0x3] = sph_dec32be_aligned(buf + 12); \
+		M[0x4] = sph_dec32be_aligned(buf + 16); \
+		M[0x5] = sph_dec32be_aligned(buf + 20); \
+		M[0x6] = sph_dec32be_aligned(buf + 24); \
+		M[0x7] = sph_dec32be_aligned(buf + 28); \
+		M[0x8] = sph_dec32be_aligned(buf + 32); \
+		M[0x9] = sph_dec32be_aligned(buf + 36); \
+		M[0xA] = sph_dec32be_aligned(buf + 40); \
+		M[0xB] = sph_dec32be_aligned(buf + 44); \
+		M[0xC] = sph_dec32be_aligned(buf + 48); \
+		M[0xD] = sph_dec32be_aligned(buf + 52); \
+		M[0xE] = sph_dec32be_aligned(buf + 56); \
+		M[0xF] = sph_dec32be_aligned(buf + 60); \
+		for (r = 0; r < 14; r ++) \
+			ROUND_S(r); \
+		H0 ^= S0 ^ V0 ^ V8; \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#else
+/* Unrolled compression: mixes the 64 bytes at buf into H0..H7 with 14 rounds (permutations 0..9, then 0..3 again). */
+#define COMPRESS32   do { \
+		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
+		sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
+		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CS0; \
+		V9 = S1 ^ CS1; \
+		VA = S2 ^ CS2; \
+		VB = S3 ^ CS3; \
+		VC = T0 ^ CS4; \
+		VD = T0 ^ CS5; \
+		VE = T1 ^ CS6; \
+		VF = T1 ^ CS7; \
+		M0 = sph_dec32be_aligned(buf +  0); \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		M8 = sph_dec32be_aligned(buf + 32); \
+		M9 = sph_dec32be_aligned(buf + 36); \
+		MA = sph_dec32be_aligned(buf + 40); \
+		MB = sph_dec32be_aligned(buf + 44); \
+		MC = sph_dec32be_aligned(buf + 48); \
+		MD = sph_dec32be_aligned(buf + 52); \
+		ME = sph_dec32be_aligned(buf + 56); \
+		MF = sph_dec32be_aligned(buf + 60); \
+		ROUND_S(0); \
+		ROUND_S(1); \
+		ROUND_S(2); \
+		ROUND_S(3); \
+		ROUND_S(4); \
+		ROUND_S(5); \
+		ROUND_S(6); \
+		ROUND_S(7); \
+		ROUND_S(8); \
+		ROUND_S(9); \
+		ROUND_S(0); \
+		ROUND_S(1); \
+		ROUND_S(2); \
+		ROUND_S(3); \
+		H0 ^= S0 ^ V0 ^ V8; \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#endif
+
+#if SPH_64
+/* Declares 64-bit locals for the working state; the expansion already ends in ';'. */
+#define DECL_STATE64 \
+	sph_u64 H0, H1, H2, H3, H4, H5, H6, H7; \
+	sph_u64 S0, S1, S2, S3, T0, T1;
+/* Load H[0..7], S[0..3], T0 and T1 from *state into the locals declared by DECL_STATE64. */
+#define READ_STATE64(state)   do { \
+		H0 = (state)->H[0]; \
+		H1 = (state)->H[1]; \
+		H2 = (state)->H[2]; \
+		H3 = (state)->H[3]; \
+		H4 = (state)->H[4]; \
+		H5 = (state)->H[5]; \
+		H6 = (state)->H[6]; \
+		H7 = (state)->H[7]; \
+		S0 = (state)->S[0]; \
+		S1 = (state)->S[1]; \
+		S2 = (state)->S[2]; \
+		S3 = (state)->S[3]; \
+		T0 = (state)->T0; \
+		T1 = (state)->T1; \
+	} while (0)
+/* Store the DECL_STATE64 locals back into *state (H[0..7], S[0..3], T0, T1). */
+#define WRITE_STATE64(state)   do { \
+		(state)->H[0] = H0; \
+		(state)->H[1] = H1; \
+		(state)->H[2] = H2; \
+		(state)->H[3] = H3; \
+		(state)->H[4] = H4; \
+		(state)->H[5] = H5; \
+		(state)->H[6] = H6; \
+		(state)->H[7] = H7; \
+		(state)->S[0] = S0; \
+		(state)->S[1] = S1; \
+		(state)->S[2] = S2; \
+		(state)->S[3] = S3; \
+		(state)->T0 = T0; \
+		(state)->T1 = T1; \
+	} while (0)
+
+#if SPH_COMPACT_BLAKE_64
+/* Compact compression: mixes the 128 bytes at buf into H0..H7 with 16 looped rounds; needs buf and the DECL_STATE64 locals in scope. */
+#define COMPRESS64   do { \
+		sph_u64 M[16]; \
+		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
+		unsigned r; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CB0; \
+		V9 = S1 ^ CB1; \
+		VA = S2 ^ CB2; \
+		VB = S3 ^ CB3; \
+		VC = T0 ^ CB4; \
+		VD = T0 ^ CB5; \
+		VE = T1 ^ CB6; \
+		VF = T1 ^ CB7; \
+		M[0x0] = sph_dec64be_aligned(buf +   0); \
+		M[0x1] = sph_dec64be_aligned(buf +   8); \
+		M[0x2] = sph_dec64be_aligned(buf +  16); \
+		M[0x3] = sph_dec64be_aligned(buf +  24); \
+		M[0x4] = sph_dec64be_aligned(buf +  32); \
+		M[0x5] = sph_dec64be_aligned(buf +  40); \
+		M[0x6] = sph_dec64be_aligned(buf +  48); \
+		M[0x7] = sph_dec64be_aligned(buf +  56); \
+		M[0x8] = sph_dec64be_aligned(buf +  64); \
+		M[0x9] = sph_dec64be_aligned(buf +  72); \
+		M[0xA] = sph_dec64be_aligned(buf +  80); \
+		M[0xB] = sph_dec64be_aligned(buf +  88); \
+		M[0xC] = sph_dec64be_aligned(buf +  96); \
+		M[0xD] = sph_dec64be_aligned(buf + 104); \
+		M[0xE] = sph_dec64be_aligned(buf + 112); \
+		M[0xF] = sph_dec64be_aligned(buf + 120); \
+		for (r = 0; r < 16; r ++) \
+			ROUND_B(r); \
+		H0 ^= S0 ^ V0 ^ V8; \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#else
+/* Unrolled compression: mixes the 128 bytes at buf into H0..H7 with 16 rounds (permutations 0..9, then 0..5 again). */
+#define COMPRESS64   do { \
+		sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
+		sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \
+		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CB0; \
+		V9 = S1 ^ CB1; \
+		VA = S2 ^ CB2; \
+		VB = S3 ^ CB3; \
+		VC = T0 ^ CB4; \
+		VD = T0 ^ CB5; \
+		VE = T1 ^ CB6; \
+		VF = T1 ^ CB7; \
+		M0 = sph_dec64be_aligned(buf +   0); \
+		M1 = sph_dec64be_aligned(buf +   8); \
+		M2 = sph_dec64be_aligned(buf +  16); \
+		M3 = sph_dec64be_aligned(buf +  24); \
+		M4 = sph_dec64be_aligned(buf +  32); \
+		M5 = sph_dec64be_aligned(buf +  40); \
+		M6 = sph_dec64be_aligned(buf +  48); \
+		M7 = sph_dec64be_aligned(buf +  56); \
+		M8 = sph_dec64be_aligned(buf +  64); \
+		M9 = sph_dec64be_aligned(buf +  72); \
+		MA = sph_dec64be_aligned(buf +  80); \
+		MB = sph_dec64be_aligned(buf +  88); \
+		MC = sph_dec64be_aligned(buf +  96); \
+		MD = sph_dec64be_aligned(buf + 104); \
+		ME = sph_dec64be_aligned(buf + 112); \
+		MF = sph_dec64be_aligned(buf + 120); \
+		ROUND_B(0); \
+		ROUND_B(1); \
+		ROUND_B(2); \
+		ROUND_B(3); \
+		ROUND_B(4); \
+		ROUND_B(5); \
+		ROUND_B(6); \
+		ROUND_B(7); \
+		ROUND_B(8); \
+		ROUND_B(9); \
+		ROUND_B(0); \
+		ROUND_B(1); \
+		ROUND_B(2); \
+		ROUND_B(3); \
+		ROUND_B(4); \
+		ROUND_B(5); \
+		H0 ^= S0 ^ V0 ^ V8; \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#endif
+
+#endif
+
+static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 }; /* all-zero salt handed to blake32_init() by the 224/256 init functions */
+
+/* Reset sc: empty buffer, zero T0/T1 counter, 4 salt words taken from salt and 8 chaining words taken from iv. */
+static void blake32_init(sph_blake_small_context *sc, const sph_u32 *iv, const sph_u32 *salt)
+{
+	sc->ptr = 0;
+	sc->T1 = 0;
+	sc->T0 = 0;
+	memcpy(sc->S, salt, 4 * sizeof *salt);
+	memcpy(sc->H, iv, 8 * sizeof *iv);
+}
+
+static void
+blake32(sph_blake_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE32
+	/* Absorb len bytes from data into sc; every time the buffer becomes full, bump the counter and compress. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) { /* input does not fill the buffer: stash it and return without compressing */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* NOTE(review): READ_STATE32/WRITE_STATE32 are defined outside this chunk; presumably they move the hash state (incl. T0/T1) between sc and locals -- confirm. */
+	READ_STATE32(sc);
+	while (len > 0) {
+		size_t clen;
+		/* clen: free space left in the buffer, capped at the remaining input */
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			if ((T0 = SPH_T32(T0 + 512)) < 512) /* T0 += 512 before compressing; a result below 512 means it wrapped, so carry into T1 */
+				T1 = SPH_T32(T1 + 1);
+			COMPRESS32;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE32(sc);
+	sc->ptr = ptr;
+}
+
+static void
+blake32_close(sph_blake_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	union {
+		unsigned char buf[64];
+		sph_u32 dummy; /* never read; presumably present to align buf for the *_aligned encoders -- confirm */
+	} u;
+	size_t ptr, k;
+	unsigned bit_len;
+	unsigned z;
+	sph_u32 th, tl;
+	unsigned char *out;
+	/* Finish the hash: pad with the top n bits of ub, a 1 bit, zeros and the th:tl counter, then write out_size_w32 words of H to dst. */
+	ptr = sc->ptr;
+	bit_len = ((unsigned)ptr << 3) + n; /* bits pending in the last block: buffered bytes plus n extra bits */
+	z = 0x80 >> n;
+	u.buf[ptr] = ((ub & -z) | z) & 0xFF; /* keep the top n bits of ub, follow them with a single 1 bit */
+	tl = sc->T0 + bit_len; /* th:tl is the counter value written into the last 8 bytes of the padding */
+	th = sc->T1;
+	if (ptr == 0 && n == 0) { /* no pending bits: preset so the +512 step in blake32() leaves T0 = T1 = 0 */
+		sc->T0 = SPH_C32(0xFFFFFE00);
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+	} else if (sc->T0 == 0) { /* subtracting from T0 would borrow: wrap T0 and take 1 from T1 */
+		sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
+		sc->T1 = SPH_T32(sc->T1 - 1);
+	} else {
+		sc->T0 -= 512 - bit_len; /* after the +512 step in blake32(), T0 ends at its old value + bit_len */
+	}
+	if (bit_len <= 446) { /* padding and counter fit in the current block */
+		memset(u.buf + ptr + 1, 0, 55 - ptr);
+		if (out_size_w32 == 8)
+			u.buf[55] |= 1; /* |= rather than =: when ptr == 55 this byte already holds the padding bits */
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf + ptr, 64 - ptr);
+	} else { /* not enough room: zero-fill and compress this block, then emit a second one */
+		memset(u.buf + ptr + 1, 0, 63 - ptr);
+		blake32(sc, u.buf + ptr, 64 - ptr);
+		sc->T0 = SPH_C32(0xFFFFFE00); /* preset so the extra block is compressed with T0 = T1 = 0 */
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+		memset(u.buf, 0, 56);
+		if (out_size_w32 == 8)
+			u.buf[55] = 1;
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf, 64);
+	}
+	out = dst;
+	for (k = 0; k < out_size_w32; k ++)
+		sph_enc32be(out + (k << 2), sc->H[k]); /* emit H[0..out_size_w32-1], 4 bytes per word */
+}
+
+#if SPH_64
+
+static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; /* all-zero salt handed to blake64_init() by the 384/512 init functions */
+
+/* Reset sc: empty buffer, zero T0/T1 counter, 4 salt words taken from salt and 8 chaining words taken from iv. */
+static void blake64_init(sph_blake_big_context *sc, const sph_u64 *iv, const sph_u64 *salt)
+{
+	sc->ptr = 0;
+	sc->T1 = 0;
+	sc->T0 = 0;
+	memcpy(sc->S, salt, 4 * sizeof *salt);
+	memcpy(sc->H, iv, 8 * sizeof *iv);
+}
+
+static void
+blake64(sph_blake_big_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE64
+	/* Absorb len bytes from data into sc; every time the buffer becomes full, bump the counter and compress. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) { /* input does not fill the buffer: stash it and return without compressing */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* NOTE(review): READ_STATE64/WRITE_STATE64 are defined outside this chunk; presumably they move the hash state (incl. T0/T1) between sc and locals -- confirm. */
+	READ_STATE64(sc);
+	while (len > 0) {
+		size_t clen;
+		/* clen: free space left in the buffer, capped at the remaining input */
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			if ((T0 = SPH_T64(T0 + 1024)) < 1024) /* T0 += 1024 before compressing; a result below 1024 means it wrapped, so carry into T1 */
+				T1 = SPH_T64(T1 + 1);
+			COMPRESS64;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE64(sc);
+	sc->ptr = ptr;
+}
+
+static void
+blake64_close(sph_blake_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
+{
+	union {
+		unsigned char buf[128];
+		sph_u64 dummy; /* never read; presumably present to align buf for the *_aligned encoders -- confirm */
+	} u;
+	size_t ptr, k;
+	unsigned bit_len;
+	unsigned z;
+	sph_u64 th, tl;
+	unsigned char *out;
+	/* Finish the hash: pad with the top n bits of ub, a 1 bit, zeros and the th:tl counter, then write out_size_w64 words of H to dst. */
+	ptr = sc->ptr;
+	bit_len = ((unsigned)ptr << 3) + n; /* bits pending in the last block: buffered bytes plus n extra bits */
+	z = 0x80 >> n;
+	u.buf[ptr] = ((ub & -z) | z) & 0xFF; /* keep the top n bits of ub, follow them with a single 1 bit */
+	tl = sc->T0 + bit_len; /* th:tl is the counter value written into the last 16 bytes of the padding */
+	th = sc->T1;
+	if (ptr == 0 && n == 0) { /* no pending bits: preset so the +1024 step in blake64() leaves T0 = T1 = 0 */
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
+		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	} else if (sc->T0 == 0) { /* subtracting from T0 would borrow: wrap T0 and take 1 from T1 */
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
+		sc->T1 = SPH_T64(sc->T1 - 1);
+	} else {
+		sc->T0 -= 1024 - bit_len; /* after the +1024 step in blake64(), T0 ends at its old value + bit_len */
+	}
+	if (bit_len <= 894) { /* padding and counter fit in the current block */
+		memset(u.buf + ptr + 1, 0, 111 - ptr);
+		if (out_size_w64 == 8)
+			u.buf[111] |= 1; /* |= rather than =: when ptr == 111 this byte already holds the padding bits */
+		sph_enc64be_aligned(u.buf + 112, th);
+		sph_enc64be_aligned(u.buf + 120, tl);
+		blake64(sc, u.buf + ptr, 128 - ptr);
+	} else { /* not enough room: zero-fill and compress this block, then emit a second one */
+		memset(u.buf + ptr + 1, 0, 127 - ptr);
+		blake64(sc, u.buf + ptr, 128 - ptr);
+		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); /* preset so the extra block is compressed with T0 = T1 = 0 */
+		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+		memset(u.buf, 0, 112);
+		if (out_size_w64 == 8)
+			u.buf[111] = 1;
+		sph_enc64be_aligned(u.buf + 112, th);
+		sph_enc64be_aligned(u.buf + 120, tl);
+		blake64(sc, u.buf, 128);
+	}
+	out = dst;
+	for (k = 0; k < out_size_w64; k ++)
+		sph_enc64be(out + (k << 3), sc->H[k]); /* emit H[0..out_size_w64-1], 8 bytes per word */
+}
+
+#endif
+
+/* see sph_blake.h -- (re)initialises the context at cc with IV224 and the all-zero salt */
+void
+sph_blake224_init(void *cc)
+{
+	blake32_init(cc, IV224, salt_zero_small);
+}
+
+/* see sph_blake.h -- feeds len bytes from data into the context at cc; delegates to blake32() */
+void
+sph_blake224(void *cc, const void *data, size_t len)
+{
+	blake32(cc, data, len);
+}
+
+/* see sph_blake.h -- finishes with no extra bits: same as sph_blake224_addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake224_close(void *cc, void *dst)
+{
+	sph_blake224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- appends the top n bits of ub, writes 7 32-bit words (28 bytes) to dst, then re-initialises cc */
+void
+sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake32_close(cc, ub, n, dst, 7);
+	sph_blake224_init(cc);
+}
+
+/* see sph_blake.h -- (re)initialises the context at cc with IV256 and the all-zero salt */
+void
+sph_blake256_init(void *cc)
+{
+	blake32_init(cc, IV256, salt_zero_small);
+}
+
+/* see sph_blake.h -- feeds len bytes from data into the context at cc; delegates to blake32() */
+void
+sph_blake256(void *cc, const void *data, size_t len)
+{
+	blake32(cc, data, len);
+}
+
+/* see sph_blake.h -- finishes with no extra bits: same as sph_blake256_addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake256_close(void *cc, void *dst)
+{
+	sph_blake256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- appends the top n bits of ub, writes 8 32-bit words (32 bytes) to dst, then re-initialises cc */
+void
+sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake32_close(cc, ub, n, dst, 8);
+	sph_blake256_init(cc);
+}
+
+#if SPH_64
+
+/* see sph_blake.h -- (re)initialises the context at cc with IV384 and the all-zero salt */
+void
+sph_blake384_init(void *cc)
+{
+	blake64_init(cc, IV384, salt_zero_big);
+}
+
+/* see sph_blake.h -- feeds len bytes from data into the context at cc; delegates to blake64() */
+void
+sph_blake384(void *cc, const void *data, size_t len)
+{
+	blake64(cc, data, len);
+}
+
+/* see sph_blake.h -- finishes with no extra bits: same as sph_blake384_addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake384_close(void *cc, void *dst)
+{
+	sph_blake384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- appends the top n bits of ub, writes 6 64-bit words (48 bytes) to dst, then re-initialises cc */
+void
+sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake64_close(cc, ub, n, dst, 6);
+	sph_blake384_init(cc);
+}
+
+/* see sph_blake.h -- (re)initialises the context at cc with IV512 and the all-zero salt */
+void
+sph_blake512_init(void *cc)
+{
+	blake64_init(cc, IV512, salt_zero_big);
+}
+
+/* see sph_blake.h -- feeds len bytes from data into the context at cc; delegates to blake64() */
+void
+sph_blake512(void *cc, const void *data, size_t len)
+{
+	blake64(cc, data, len);
+}
+
+/* see sph_blake.h -- finishes with no extra bits: same as sph_blake512_addbits_and_close(cc, 0, 0, dst) */
+void
+sph_blake512_close(void *cc, void *dst)
+{
+	sph_blake512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h -- appends the top n bits of ub, writes 8 64-bit words (64 bytes) to dst, then re-initialises cc */
+void
+sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake64_close(cc, ub, n, dst, 8);
+	sph_blake512_init(cc);
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/bmw.c b/sha3/bmw.c
new file mode 100644
index 0000000..b89a881
--- /dev/null
+++ b/sha3/bmw.c
@@ -0,0 +1,965 @@
+/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * BMW implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "sph_bmw.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
+#define SPH_SMALL_FOOTPRINT_BMW   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 IV224[] = { /* 16 words; read byte by byte the values run 0x00..0x3F in order */
+	SPH_C32(0x00010203), SPH_C32(0x04050607),
+	SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
+	SPH_C32(0x10111213), SPH_C32(0x14151617),
+	SPH_C32(0x18191A1B), SPH_C32(0x1C1D1E1F),
+	SPH_C32(0x20212223), SPH_C32(0x24252627),
+	SPH_C32(0x28292A2B), SPH_C32(0x2C2D2E2F),
+	SPH_C32(0x30313233), SPH_C32(0x34353637),
+	SPH_C32(0x38393A3B), SPH_C32(0x3C3D3E3F)
+};
+
+static const sph_u32 IV256[] = { /* 16 words; read byte by byte the values run 0x40..0x7F in order */
+	SPH_C32(0x40414243), SPH_C32(0x44454647),
+	SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
+	SPH_C32(0x50515253), SPH_C32(0x54555657),
+	SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
+	SPH_C32(0x60616263), SPH_C32(0x64656667),
+	SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
+	SPH_C32(0x70717273), SPH_C32(0x74757677),
+	SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
+};
+
+#if SPH_64
+
+static const sph_u64 IV384[] = { /* 16 words; read byte by byte the values run 0x00..0x7F in order */
+	SPH_C64(0x0001020304050607), SPH_C64(0x08090A0B0C0D0E0F),
+	SPH_C64(0x1011121314151617), SPH_C64(0x18191A1B1C1D1E1F),
+	SPH_C64(0x2021222324252627), SPH_C64(0x28292A2B2C2D2E2F),
+	SPH_C64(0x3031323334353637), SPH_C64(0x38393A3B3C3D3E3F),
+	SPH_C64(0x4041424344454647), SPH_C64(0x48494A4B4C4D4E4F),
+	SPH_C64(0x5051525354555657), SPH_C64(0x58595A5B5C5D5E5F),
+	SPH_C64(0x6061626364656667), SPH_C64(0x68696A6B6C6D6E6F),
+	SPH_C64(0x7071727374757677), SPH_C64(0x78797A7B7C7D7E7F)
+};
+
+static const sph_u64 IV512[] = { /* 16 words; read byte by byte the values run 0x80..0xFF in order */
+	SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
+	SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
+	SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
+	SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
+	SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
+	SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
+	SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
+	SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+};
+
+#endif
+
+#define XCAT(x, y)    XCAT_(x, y)
+#define XCAT_(x, y)   x ## y
+
+#define LPAR   (
+
+#define I16_16    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+#define I16_17    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+#define I16_18    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17
+#define I16_19    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+#define I16_20    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+#define I16_21    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+#define I16_22    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+#define I16_23    7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+#define I16_24    8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define I16_25    9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+#define I16_26   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+#define I16_27   11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+#define I16_28   12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+#define I16_29   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+#define I16_30   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+#define I16_31   15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+
+#define M16_16    0,  1,  3,  4,  7, 10, 11
+#define M16_17    1,  2,  4,  5,  8, 11, 12
+#define M16_18    2,  3,  5,  6,  9, 12, 13
+#define M16_19    3,  4,  6,  7, 10, 13, 14
+#define M16_20    4,  5,  7,  8, 11, 14, 15
+#define M16_21    5,  6,  8,  9, 12, 15, 16
+#define M16_22    6,  7,  9, 10, 13,  0,  1
+#define M16_23    7,  8, 10, 11, 14,  1,  2
+#define M16_24    8,  9, 11, 12, 15,  2,  3
+#define M16_25    9, 10, 12, 13,  0,  3,  4
+#define M16_26   10, 11, 13, 14,  1,  4,  5
+#define M16_27   11, 12, 14, 15,  2,  5,  6
+#define M16_28   12, 13, 15, 16,  3,  6,  7
+#define M16_29   13, 14,  0,  1,  4,  7,  8
+#define M16_30   14, 15,  1,  2,  5,  8,  9
+#define M16_31   15, 16,  2,  3,  6,  9, 10
+
+#define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
+                  ^ SPH_ROTL32(x,  4) ^ SPH_ROTL32(x, 19))
+#define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x,  8) ^ SPH_ROTL32(x, 23))
+#define ss2(x)    (((x) >> 2) ^ SPH_T32((x) << 1) \
+                  ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
+#define ss3(x)    (((x) >> 2) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
+#define ss4(x)    (((x) >> 1) ^ (x))
+#define ss5(x)    (((x) >> 2) ^ (x))
+#define rs1(x)    SPH_ROTL32(x,  3)
+#define rs2(x)    SPH_ROTL32(x,  7)
+#define rs3(x)    SPH_ROTL32(x, 13)
+#define rs4(x)    SPH_ROTL32(x, 16)
+#define rs5(x)    SPH_ROTL32(x, 19)
+#define rs6(x)    SPH_ROTL32(x, 23)
+#define rs7(x)    SPH_ROTL32(x, 27)
+
+#define Ks(j)   SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
+
+#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) /* the seven j*m arguments are one M16_nn list: message indices paired with rotation counts */ \
+	(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
+		- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
+
+#define expand1s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+		+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+		+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+		+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand1s(qf, mf, hf, i16) /* i16 must be a literal 16..31: it is pasted onto I16_ and M16_ to select the index lists */ \
+	expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1s_(qf, mf, hf, i16, ix, iy) /* LPAR defers the call so ix and iy are already replaced by their comma lists when expand1s_inner is invoked */ \
+	expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#define expand2s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+		+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+		+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+		+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand2s(qf, mf, hf, i16) /* i16 must be a literal 16..31: it is pasted onto I16_ and M16_ to select the index lists */ \
+	expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2s_(qf, mf, hf, i16, ix, iy) /* LPAR defers the call so ix and iy are already replaced by their comma lists when expand2s_inner is invoked */ \
+	expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#if SPH_64
+
+#define sb0(x)    (((x) >> 1) ^ SPH_T64((x) << 3) \
+                  ^ SPH_ROTL64(x,  4) ^ SPH_ROTL64(x, 37))
+#define sb1(x)    (((x) >> 1) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
+#define sb2(x)    (((x) >> 2) ^ SPH_T64((x) << 1) \
+                  ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
+#define sb3(x)    (((x) >> 2) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
+#define sb4(x)    (((x) >> 1) ^ (x))
+#define sb5(x)    (((x) >> 2) ^ (x))
+#define rb1(x)    SPH_ROTL64(x,  5)
+#define rb2(x)    SPH_ROTL64(x, 11)
+#define rb3(x)    SPH_ROTL64(x, 27)
+#define rb4(x)    SPH_ROTL64(x, 32)
+#define rb5(x)    SPH_ROTL64(x, 37)
+#define rb6(x)    SPH_ROTL64(x, 43)
+#define rb7(x)    SPH_ROTL64(x, 53)
+
+#define Kb(j)   SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
+
+#if SPH_SMALL_FOOTPRINT_BMW
+
+static const sph_u64 Kb_tab[] = {
+	Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
+	Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
+};
+
+#define rol_off(mf, j, off) \
+	SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
+
+#define add_elt_b(mf, hf, j) \
+	(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
+		- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
+
+#define expand1b(qf, mf, hf, i) \
+	SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
+		+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
+		+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
+		+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
+		+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
+		+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
+		+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
+		+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+
+#define expand2b(qf, mf, hf, i) \
+	SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
+		+ qf((i) - 14) + rb2(qf((i) - 13)) \
+		+ qf((i) - 12) + rb3(qf((i) - 11)) \
+		+ qf((i) - 10) + rb4(qf((i) - 9)) \
+		+ qf((i) - 8) + rb5(qf((i) - 7)) \
+		+ qf((i) - 6) + rb6(qf((i) - 5)) \
+		+ qf((i) - 4) + rb7(qf((i) - 3)) \
+		+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+
+#else
+
+#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
+		- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
+
+#define expand1b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
+		+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
+		+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
+		+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand1b(qf, mf, hf, i16) \
+	expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1b_(qf, mf, hf, i16, ix, iy) \
+	expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#define expand2b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
+		+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
+		+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
+		+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand2b(qf, mf, hf, i16) \
+	expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2b_(qf, mf, hf, i16, ix, iy) \
+	expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#endif
+
+#endif
+
+#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
+	tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
+	op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
+
+#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
+#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
+#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
+#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
+#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
+#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
+#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
+#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
+#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
+#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
+#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
+#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
+#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
+#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
+#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
+#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
+
+#if SPH_SMALL_FOOTPRINT_BMW
+
+#define MAKE_Qas   do { \
+		unsigned u; \
+		sph_u32 Ws[16]; \
+		Ws[ 0] = Ws0; \
+		Ws[ 1] = Ws1; \
+		Ws[ 2] = Ws2; \
+		Ws[ 3] = Ws3; \
+		Ws[ 4] = Ws4; \
+		Ws[ 5] = Ws5; \
+		Ws[ 6] = Ws6; \
+		Ws[ 7] = Ws7; \
+		Ws[ 8] = Ws8; \
+		Ws[ 9] = Ws9; \
+		Ws[10] = Ws10; \
+		Ws[11] = Ws11; \
+		Ws[12] = Ws12; \
+		Ws[13] = Ws13; \
+		Ws[14] = Ws14; \
+		Ws[15] = Ws15; \
+		for (u = 0; u < 15; u += 5) { \
+			qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
+			qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
+			qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
+			qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
+			qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
+		} \
+		qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
+	} while (0)
+
+#define MAKE_Qbs   do { \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+#else
+
+#define MAKE_Qas   do { \
+		qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
+		qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
+		qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
+		qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
+		qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
+		qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
+		qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
+		qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
+		qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
+		qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
+		qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
+		qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
+		qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
+		qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
+		qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
+		qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
+	} while (0)
+
+#define MAKE_Qbs   do { \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+#endif
+
+#define MAKE_Qs   do { \
+		MAKE_Qas; \
+		MAKE_Qbs; \
+	} while (0)
+
+#define Qs(j)   (qt[j])
+
+#if SPH_64
+
+#define Wb0    MAKE_W(SPH_T64,  5, -,  7, +, 10, +, 13, +, 14)
+#define Wb1    MAKE_W(SPH_T64,  6, -,  8, +, 11, +, 14, -, 15)
+#define Wb2    MAKE_W(SPH_T64,  0, +,  7, +,  9, -, 12, +, 15)
+#define Wb3    MAKE_W(SPH_T64,  0, -,  1, +,  8, -, 10, +, 13)
+#define Wb4    MAKE_W(SPH_T64,  1, +,  2, +,  9, -, 11, -, 14)
+#define Wb5    MAKE_W(SPH_T64,  3, -,  2, +, 10, -, 12, +, 15)
+#define Wb6    MAKE_W(SPH_T64,  4, -,  0, -,  3, -, 11, +, 13)
+#define Wb7    MAKE_W(SPH_T64,  1, -,  4, -,  5, -, 12, -, 14)
+#define Wb8    MAKE_W(SPH_T64,  2, -,  5, -,  6, +, 13, -, 15)
+#define Wb9    MAKE_W(SPH_T64,  0, -,  3, +,  6, -,  7, +, 14)
+#define Wb10   MAKE_W(SPH_T64,  8, -,  1, -,  4, -,  7, +, 15)
+#define Wb11   MAKE_W(SPH_T64,  8, -,  0, -,  2, -,  5, +,  9)
+#define Wb12   MAKE_W(SPH_T64,  1, +,  3, -,  6, -,  9, +, 10)
+#define Wb13   MAKE_W(SPH_T64,  2, +,  4, +,  7, +, 10, +, 11)
+#define Wb14   MAKE_W(SPH_T64,  3, -,  5, +,  8, -, 11, -, 12)
+#define Wb15   MAKE_W(SPH_T64, 12, -,  4, -,  6, -,  9, +, 13)
+
+#if SPH_SMALL_FOOTPRINT_BMW
+
+#define MAKE_Qab   do { \
+		unsigned u; \
+		sph_u64 Wb[16]; \
+		Wb[ 0] = Wb0; \
+		Wb[ 1] = Wb1; \
+		Wb[ 2] = Wb2; \
+		Wb[ 3] = Wb3; \
+		Wb[ 4] = Wb4; \
+		Wb[ 5] = Wb5; \
+		Wb[ 6] = Wb6; \
+		Wb[ 7] = Wb7; \
+		Wb[ 8] = Wb8; \
+		Wb[ 9] = Wb9; \
+		Wb[10] = Wb10; \
+		Wb[11] = Wb11; \
+		Wb[12] = Wb12; \
+		Wb[13] = Wb13; \
+		Wb[14] = Wb14; \
+		Wb[15] = Wb15; \
+		for (u = 0; u < 15; u += 5) { \
+			qt[u + 0] = SPH_T64(sb0(Wb[u + 0]) + H(u + 1)); \
+			qt[u + 1] = SPH_T64(sb1(Wb[u + 1]) + H(u + 2)); \
+			qt[u + 2] = SPH_T64(sb2(Wb[u + 2]) + H(u + 3)); \
+			qt[u + 3] = SPH_T64(sb3(Wb[u + 3]) + H(u + 4)); \
+			qt[u + 4] = SPH_T64(sb4(Wb[u + 4]) + H(u + 5)); \
+		} \
+		qt[15] = SPH_T64(sb0(Wb[15]) + H(0)); \
+	} while (0)
+
+#define MAKE_Qbb   do { \
+		unsigned u; \
+		for (u = 16; u < 18; u ++) \
+			qt[u] = expand1b(Qb, M, H, u); \
+		for (u = 18; u < 32; u ++) \
+			qt[u] = expand2b(Qb, M, H, u); \
+	} while (0)
+
+#else
+
+#define MAKE_Qab   do { \
+		qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
+		qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
+		qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
+		qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
+		qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
+		qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
+		qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
+		qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
+		qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
+		qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
+		qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
+		qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
+		qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
+		qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
+		qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
+		qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
+	} while (0)
+
+#define MAKE_Qbb   do { \
+		qt[16] = expand1b(Qb, M, H, 16); \
+		qt[17] = expand1b(Qb, M, H, 17); \
+		qt[18] = expand2b(Qb, M, H, 18); \
+		qt[19] = expand2b(Qb, M, H, 19); \
+		qt[20] = expand2b(Qb, M, H, 20); \
+		qt[21] = expand2b(Qb, M, H, 21); \
+		qt[22] = expand2b(Qb, M, H, 22); \
+		qt[23] = expand2b(Qb, M, H, 23); \
+		qt[24] = expand2b(Qb, M, H, 24); \
+		qt[25] = expand2b(Qb, M, H, 25); \
+		qt[26] = expand2b(Qb, M, H, 26); \
+		qt[27] = expand2b(Qb, M, H, 27); \
+		qt[28] = expand2b(Qb, M, H, 28); \
+		qt[29] = expand2b(Qb, M, H, 29); \
+		qt[30] = expand2b(Qb, M, H, 30); \
+		qt[31] = expand2b(Qb, M, H, 31); \
+	} while (0)
+
+#endif
+
+#define MAKE_Qb   do { \
+		MAKE_Qab; \
+		MAKE_Qbb; \
+	} while (0)
+
+#define Qb(j)   (qt[j])
+
+#endif
+
+#define FOLD(type, mkQ, tt, rol, mf, qf, dhf)   do { /* mkQ fills qt[]; the rest folds qt[] and the message words mf() into dhf(0..15) */ \
+		type qt[32], xl, xh; \
+		mkQ; \
+		xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) /* xl: XOR of qt[16..23] */ \
+			^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
+		xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) /* xh: XOR of qt[16..31] */ \
+			^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
+		dhf( 0) = tt(((xh <<  5) ^ (qf(16) >>  5) ^ mf( 0)) \
+			+ (xl ^ qf(24) ^ qf( 0))); \
+		dhf( 1) = tt(((xh >>  7) ^ (qf(17) <<  8) ^ mf( 1)) \
+			+ (xl ^ qf(25) ^ qf( 1))); \
+		dhf( 2) = tt(((xh >>  5) ^ (qf(18) <<  5) ^ mf( 2)) \
+			+ (xl ^ qf(26) ^ qf( 2))); \
+		dhf( 3) = tt(((xh >>  1) ^ (qf(19) <<  5) ^ mf( 3)) \
+			+ (xl ^ qf(27) ^ qf( 3))); \
+		dhf( 4) = tt(((xh >>  3) ^ (qf(20) <<  0) ^ mf( 4)) \
+			+ (xl ^ qf(28) ^ qf( 4))); \
+		dhf( 5) = tt(((xh <<  6) ^ (qf(21) >>  6) ^ mf( 5)) \
+			+ (xl ^ qf(29) ^ qf( 5))); \
+		dhf( 6) = tt(((xh >>  4) ^ (qf(22) <<  6) ^ mf( 6)) \
+			+ (xl ^ qf(30) ^ qf( 6))); \
+		dhf( 7) = tt(((xh >> 11) ^ (qf(23) <<  2) ^ mf( 7)) \
+			+ (xl ^ qf(31) ^ qf( 7))); \
+		dhf( 8) = tt(rol(dhf(4),  9) + (xh ^ qf(24) ^ mf( 8)) /* dhf(8..15) read the dhf(0..7) values assigned just above, so order matters */ \
+			+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
+		dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
+			+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
+		dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
+			+ ((xl << 6) ^ qf(17) ^ qf(10))); \
+		dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
+			+ ((xl << 4) ^ qf(18) ^ qf(11))); \
+		dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
+			+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
+		dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
+			+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
+		dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
+			+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
+		dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
+			+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
+	} while (0)
+
+#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
+
+#if SPH_64
+
+#define FOLDb   FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH)
+
+#endif
+
+static void
+compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
+{
+#if SPH_LITTLE_FAST
+#define M(x)    sph_dec32le_aligned(data + 4 * (x))
+#else
+	sph_u32 mv[16];
+	/* SPH_LITTLE_FAST unset: decode the 16 message words once into mv[] instead of re-reading data on every M(x) */
+	mv[ 0] = sph_dec32le_aligned(data +  0);
+	mv[ 1] = sph_dec32le_aligned(data +  4);
+	mv[ 2] = sph_dec32le_aligned(data +  8);
+	mv[ 3] = sph_dec32le_aligned(data + 12);
+	mv[ 4] = sph_dec32le_aligned(data + 16);
+	mv[ 5] = sph_dec32le_aligned(data + 20);
+	mv[ 6] = sph_dec32le_aligned(data + 24);
+	mv[ 7] = sph_dec32le_aligned(data + 28);
+	mv[ 8] = sph_dec32le_aligned(data + 32);
+	mv[ 9] = sph_dec32le_aligned(data + 36);
+	mv[10] = sph_dec32le_aligned(data + 40);
+	mv[11] = sph_dec32le_aligned(data + 44);
+	mv[12] = sph_dec32le_aligned(data + 48);
+	mv[13] = sph_dec32le_aligned(data + 52);
+	mv[14] = sph_dec32le_aligned(data + 56);
+	mv[15] = sph_dec32le_aligned(data + 60);
+#define M(x)    (mv[x])
+#endif
+#define H(x)    (h[x])
+#define dH(x)   (dh[x])
+	/* One compression: FOLDs reads the 64-byte block via M(), the chaining input via H(), and assigns dh[0..15] via dH(). */
+	FOLDs;
+
+#undef M
+#undef H
+#undef dH
+}
+
+static const sph_u32 final_s[16] = { /* chaining input of the last compress_small() call in bmw32_close(): 0xaaaaaaa0 + index */
+	SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
+	SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
+	SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
+	SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
+	SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
+	SPH_C32(0xaaaaaaaf)
+};
+
+/* Reset sc: zero the bit counter, empty the buffer and load the chaining words from iv (sizeof sc->H bytes). */
+static void bmw32_init(sph_bmw_small_context *sc, const sph_u32 *iv)
+{
+#if !SPH_64
+	sc->bit_count_low = 0;
+	sc->bit_count_high = 0;
+#else
+	sc->bit_count = 0;
+#endif
+	sc->ptr = 0;
+	memcpy(sc->H, iv, sizeof sc->H);
+}
+
+static void
+bmw32(sph_bmw_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	sph_u32 htmp[16];
+	sph_u32 *h1, *h2;
+#if !SPH_64
+	sph_u32 tmp;
+#endif
+	/* Absorb len bytes from data: bump the bit counter, then buffer the input and compress each full block. */
+#if SPH_64
+	sc->bit_count += (sph_u64)len << 3; /* the counter is kept in bits */
+#else
+	tmp = sc->bit_count_low;
+	sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
+	if (sc->bit_count_low < tmp) /* low word wrapped: carry into the high word */
+		sc->bit_count_high ++;
+	sc->bit_count_high += len >> 29; /* the part of len * 8 that does not fit in the low 32-bit word */
+#endif
+	buf = sc->buf;
+	ptr = sc->ptr;
+	h1 = sc->H; /* h1: current chaining value, h2: output scratch; the two are swapped after every block */
+	h2 = htmp;
+	while (len > 0) {
+		size_t clen;
+		/* clen: free space left in the buffer, capped at the remaining input */
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		ptr += clen;
+		if (ptr == sizeof sc->buf) {
+			sph_u32 *ht;
+			/* compress h1 -> h2, then swap so that h1 again names the newest value */
+			compress_small(buf, h1, h2);
+			ht = h1;
+			h1 = h2;
+			h2 = ht;
+			ptr = 0;
+		}
+	}
+	sc->ptr = ptr;
+	if (h1 != sc->H) /* odd number of blocks processed: the newest value sits in htmp */
+		memcpy(sc->H, h1, sizeof sc->H);
+}
+
+static void
+bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32)
+{
+	unsigned char *buf, *out;
+	size_t ptr, u, v;
+	unsigned z;
+	sph_u32 h1[16], h2[16], *h;
+	/* Finish: pad, append the bit count, compress, run one more compression with final_s, write out_size_w32 words to dst. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n;
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF; /* top n bits of ub followed by a single 1 bit */
+	h = sc->H;
+	if (ptr > (sizeof sc->buf) - 8) { /* no room left for the 8-byte bit count: zero-fill and compress this block first */
+		memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+		compress_small(buf, h, h1);
+		ptr = 0;
+		h = h1;
+	}
+	memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
+#if SPH_64
+	sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
+		SPH_T64(sc->bit_count + n)); /* total bit count, including the n extra bits */
+#else
+	sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
+		sc->bit_count_low + n); /* low word first, then the high word */
+	sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
+		SPH_T32(sc->bit_count_high));
+#endif
+	compress_small(buf, h, h2);
+	for (u = 0; u < 16; u ++)
+		sph_enc32le_aligned(buf + 4 * u, h2[u]); /* serialise h2 so that it becomes the message block below */
+	compress_small(buf, final_s, h1); /* final pass: final_s is the chaining input, h2 the message */
+	out = dst;
+	for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
+		sph_enc32le(out + 4 * u, h1[v]); /* the output is the last out_size_w32 words of h1 */
+}
+
+#if SPH_64
+/* 64-bit-word compression: M(i) is the i-th LE word of the 128 bytes at data, H reads h, dH names dh; FOLDb is defined outside this view. */
+static void
+compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
+{
+#if SPH_LITTLE_FAST
+#define M(x)    sph_dec64le_aligned(data + 8 * (x))
+#else
+	sph_u64 mv[16];
+
+	mv[ 0] = sph_dec64le_aligned(data +   0);
+	mv[ 1] = sph_dec64le_aligned(data +   8);
+	mv[ 2] = sph_dec64le_aligned(data +  16);
+	mv[ 3] = sph_dec64le_aligned(data +  24);
+	mv[ 4] = sph_dec64le_aligned(data +  32);
+	mv[ 5] = sph_dec64le_aligned(data +  40);
+	mv[ 6] = sph_dec64le_aligned(data +  48);
+	mv[ 7] = sph_dec64le_aligned(data +  56);
+	mv[ 8] = sph_dec64le_aligned(data +  64);
+	mv[ 9] = sph_dec64le_aligned(data +  72);
+	mv[10] = sph_dec64le_aligned(data +  80);
+	mv[11] = sph_dec64le_aligned(data +  88);
+	mv[12] = sph_dec64le_aligned(data +  96);
+	mv[13] = sph_dec64le_aligned(data + 104);
+	mv[14] = sph_dec64le_aligned(data + 112);
+	mv[15] = sph_dec64le_aligned(data + 120);
+#define M(x)    (mv[x])
+#endif
+#define H(x)    (h[x])
+#define dH(x)   (dh[x])
+
+	FOLDb;	/* callers treat dh as the output (see bmw64) */
+
+#undef M
+#undef H
+#undef dH
+}
+/* Fixed chaining input for the extra compress_big() pass that ends bmw64_close(). */
+static const sph_u64 final_b[16] = {
+	SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
+	SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
+	SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
+	SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
+	SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
+	SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
+	SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
+	SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
+};
+/* Start a 64-bit-word BMW computation: copy iv into sc->H, empty the buffer, zero the bit counter. */
+static void
+bmw64_init(sph_bmw_big_context *sc, const sph_u64 *iv)
+{
+	memcpy(sc->H, iv, sizeof sc->H);
+	sc->ptr = 0;	/* no buffered input yet */
+	sc->bit_count = 0;
+}
+/* Absorb len bytes of data: count them, buffer them, and compress every time sc->buf fills up. */
+static void
+bmw64(sph_bmw_big_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	sph_u64 htmp[16];
+	sph_u64 *h1, *h2;
+
+	sc->bit_count += (sph_u64)len << 3;	/* bytes -> bits */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	h1 = sc->H;	/* current chaining value */
+	h2 = htmp;	/* receives the output of the next compression */
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;	/* room left in the buffer */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		ptr += clen;
+		if (ptr == sizeof sc->buf) {
+			sph_u64 *ht;
+
+			compress_big(buf, h1, h2);
+			ht = h1;	/* swap roles: the output becomes the next input */
+			h1 = h2;
+			h2 = ht;
+			ptr = 0;
+		}
+	}
+	sc->ptr = ptr;
+	if (h1 != sc->H)	/* latest value sits in htmp: copy it back */
+		memcpy(sc->H, h1, sizeof sc->H);
+}
+/* Pad (n extra bits from the top of ub, then a 1 bit), append the bit length, finalize; emit the last out_size_w64 words. */
+static void
+bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w64)
+{
+	unsigned char *buf, *out;
+	size_t ptr, u, v;
+	unsigned z;
+	sph_u64 h1[16], h2[16], *h;
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n;	/* position of the padding 1 bit, right after the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
+	h = sc->H;
+	if (ptr > (sizeof sc->buf) - 8) {	/* no room left for the 8-byte length field */
+		memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+		compress_big(buf, h, h1);
+		ptr = 0;
+		h = h1;
+	}
+	memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
+	sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
+		SPH_T64(sc->bit_count + n));
+	compress_big(buf, h, h2);
+	for (u = 0; u < 16; u ++)	/* re-encode h2 so it can be fed back in as a block */
+		sph_enc64le_aligned(buf + 8 * u, h2[u]);
+	compress_big(buf, final_b, h1);	/* extra pass: h2 as data, final_b as chaining input */
+	out = dst;
+	for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)
+		sph_enc64le(out + 8 * u, h1[v]);	/* little-endian, unaligned-safe encoder */
+}
+
+#endif
+
+/* see sph_bmw.h -- reset cc to the BMW-224 starting state (bmw32_init with IV224) */
+void
+sph_bmw224_init(void *cc)
+{
+	bmw32_init(cc, IV224);
+}
+
+/* see sph_bmw.h -- absorb len bytes of data into the BMW-224 context cc */
+void
+sph_bmw224(void *cc, const void *data, size_t len)
+{
+	bmw32(cc, data, len);
+}
+
+/* see sph_bmw.h -- finish with no extra bits; forwards to the addbits variant */
+void
+sph_bmw224_close(void *cc, void *dst)
+{
+	sph_bmw224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_bmw.h -- finish with n extra bits from ub, write 7 32-bit words to dst, then re-init cc */
+void
+sph_bmw224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw32_close(cc, ub, n, dst, 7);
+	sph_bmw224_init(cc);
+}
+
+/* see sph_bmw.h -- reset cc to the BMW-256 starting state (bmw32_init with IV256) */
+void
+sph_bmw256_init(void *cc)
+{
+	bmw32_init(cc, IV256);
+}
+
+/* see sph_bmw.h -- absorb len bytes of data into the BMW-256 context cc */
+void
+sph_bmw256(void *cc, const void *data, size_t len)
+{
+	bmw32(cc, data, len);
+}
+
+/* see sph_bmw.h -- finish with no extra bits; forwards to the addbits variant */
+void
+sph_bmw256_close(void *cc, void *dst)
+{
+	sph_bmw256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_bmw.h -- finish with n extra bits from ub, write 8 32-bit words to dst, then re-init cc */
+void
+sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw32_close(cc, ub, n, dst, 8);
+	sph_bmw256_init(cc);
+}
+
+#if SPH_64
+
+/* see sph_bmw.h -- reset cc to the BMW-384 starting state (bmw64_init with IV384) */
+void
+sph_bmw384_init(void *cc)
+{
+	bmw64_init(cc, IV384);
+}
+
+/* see sph_bmw.h -- absorb len bytes of data into the BMW-384 context cc */
+void
+sph_bmw384(void *cc, const void *data, size_t len)
+{
+	bmw64(cc, data, len);
+}
+
+/* see sph_bmw.h -- finish with no extra bits; forwards to the addbits variant */
+void
+sph_bmw384_close(void *cc, void *dst)
+{
+	sph_bmw384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_bmw.h -- finish with n extra bits from ub, write 6 64-bit words to dst, then re-init cc */
+void
+sph_bmw384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw64_close(cc, ub, n, dst, 6);
+	sph_bmw384_init(cc);
+}
+
+/* see sph_bmw.h -- reset cc to the BMW-512 starting state (bmw64_init with IV512) */
+void
+sph_bmw512_init(void *cc)
+{
+	bmw64_init(cc, IV512);
+}
+
+/* see sph_bmw.h -- absorb len bytes of data into the BMW-512 context cc */
+void
+sph_bmw512(void *cc, const void *data, size_t len)
+{
+	bmw64(cc, data, len);
+}
+
+/* see sph_bmw.h -- finish with no extra bits; forwards to the addbits variant */
+void
+sph_bmw512_close(void *cc, void *dst)
+{
+	sph_bmw512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_bmw.h -- finish with n extra bits from ub, write 8 64-bit words to dst, then re-init cc */
+void
+sph_bmw512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw64_close(cc, ub, n, dst, 8);
+	sph_bmw512_init(cc);
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/cubehash.c b/sha3/cubehash.c
new file mode 100644
index 0000000..9322fe1
--- /dev/null
+++ b/sha3/cubehash.c
@@ -0,0 +1,723 @@
+/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * CubeHash implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_cubehash.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH
+#define SPH_SMALL_FOOTPRINT_CUBEHASH   1
+#endif
+
+/*
+ * Tests on an Intel Core2 Q6600 (32-bit and 64-bit mode), a PowerPC G3
+ * and a MIPS-compatible CPU (Broadcom BCM3302) suggested these settings:
+ *  -- full unroll, no state copy on the "big" systems (x86, PowerPC)
+ *  -- unroll to 4 or 8, state copy on the "small" system (MIPS)
+ * NOTE(review): the defaults below pair NOCOPY=1 with small footprint and
+ * NOCOPY=0 otherwise, i.e. the opposite copy policy -- confirm intent. */
+
+#if SPH_SMALL_FOOTPRINT_CUBEHASH
+
+#if !defined SPH_CUBEHASH_UNROLL
+#define SPH_CUBEHASH_UNROLL   4
+#endif
+#if !defined SPH_CUBEHASH_NOCOPY
+#define SPH_CUBEHASH_NOCOPY   1
+#endif
+
+#else
+
+#if !defined SPH_CUBEHASH_UNROLL
+#define SPH_CUBEHASH_UNROLL   0
+#endif
+#if !defined SPH_CUBEHASH_NOCOPY
+#define SPH_CUBEHASH_NOCOPY   0
+#endif
+
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+/* 32-word starting state that sph_cubehash224_init() copies into the context. */
+static const sph_u32 IV224[] = {
+	SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22),
+	SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24),
+	SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3),
+	SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774),
+	SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66),
+	SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0),
+	SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314),
+	SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE),
+	SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF),
+	SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE),
+	SPH_C32(0xFD20151C), SPH_C32(0x00CB573E)
+};
+/* 32-word starting state that sph_cubehash256_init() copies into the context. */
+static const sph_u32 IV256[] = {
+	SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71),
+	SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63),
+	SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696),
+	SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C),
+	SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00),
+	SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5),
+	SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549),
+	SPH_C32(0x5FA25603), SPH_C32(0x65C892FD), SPH_C32(0x93CB6285),
+	SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD),
+	SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6),
+	SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A)
+};
+/* 32-word starting state that sph_cubehash384_init() copies into the context. */
+static const sph_u32 IV384[] = {
+	SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453),
+	SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88),
+	SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922),
+	SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052),
+	SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E),
+	SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D),
+	SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70),
+	SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4),
+	SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC),
+	SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01),
+	SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E)
+};
+/* 32-word starting state that sph_cubehash512_init() copies into the context. */
+static const sph_u32 IV512[] = {
+	SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B),
+	SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C),
+	SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787),
+	SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537),
+	SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33),
+	SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485),
+	SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159),
+	SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9),
+	SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456),
+	SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1),
+	SPH_C32(0x7795D246), SPH_C32(0xD43E3B44)
+};
+
+#define T32      SPH_T32
+#define ROTL32   SPH_ROTL32
+
+#if SPH_CUBEHASH_NOCOPY /* work on sc->state[] in place */
+/* No locals in this mode: DECL/READ/WRITE expand to nothing and x0..xv alias the context words. */
+#define DECL_STATE
+#define READ_STATE(cc)
+#define WRITE_STATE(cc)
+/* The aliases name `sc` itself (not a macro argument), so users must call their context variable sc. */
+#define x0   ((sc)->state[ 0])
+#define x1   ((sc)->state[ 1])
+#define x2   ((sc)->state[ 2])
+#define x3   ((sc)->state[ 3])
+#define x4   ((sc)->state[ 4])
+#define x5   ((sc)->state[ 5])
+#define x6   ((sc)->state[ 6])
+#define x7   ((sc)->state[ 7])
+#define x8   ((sc)->state[ 8])
+#define x9   ((sc)->state[ 9])
+#define xa   ((sc)->state[10])
+#define xb   ((sc)->state[11])
+#define xc   ((sc)->state[12])
+#define xd   ((sc)->state[13])
+#define xe   ((sc)->state[14])
+#define xf   ((sc)->state[15])
+#define xg   ((sc)->state[16])
+#define xh   ((sc)->state[17])
+#define xi   ((sc)->state[18])
+#define xj   ((sc)->state[19])
+#define xk   ((sc)->state[20])
+#define xl   ((sc)->state[21])
+#define xm   ((sc)->state[22])
+#define xn   ((sc)->state[23])
+#define xo   ((sc)->state[24])
+#define xp   ((sc)->state[25])
+#define xq   ((sc)->state[26])
+#define xr   ((sc)->state[27])
+#define xs   ((sc)->state[28])
+#define xt   ((sc)->state[29])
+#define xu   ((sc)->state[30])
+#define xv   ((sc)->state[31])
+
+#else /* work on 32 local copies x0..xv */
+/* Declares the 32 working words as block-scope variables. */
+#define DECL_STATE \
+	sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \
+	sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \
+	sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \
+	sph_u32 xo, xp, xq, xr, xs, xt, xu, xv;
+/* Load the locals from (cc)->state[0..31]. */
+#define READ_STATE(cc)   do { \
+		x0 = (cc)->state[ 0]; \
+		x1 = (cc)->state[ 1]; \
+		x2 = (cc)->state[ 2]; \
+		x3 = (cc)->state[ 3]; \
+		x4 = (cc)->state[ 4]; \
+		x5 = (cc)->state[ 5]; \
+		x6 = (cc)->state[ 6]; \
+		x7 = (cc)->state[ 7]; \
+		x8 = (cc)->state[ 8]; \
+		x9 = (cc)->state[ 9]; \
+		xa = (cc)->state[10]; \
+		xb = (cc)->state[11]; \
+		xc = (cc)->state[12]; \
+		xd = (cc)->state[13]; \
+		xe = (cc)->state[14]; \
+		xf = (cc)->state[15]; \
+		xg = (cc)->state[16]; \
+		xh = (cc)->state[17]; \
+		xi = (cc)->state[18]; \
+		xj = (cc)->state[19]; \
+		xk = (cc)->state[20]; \
+		xl = (cc)->state[21]; \
+		xm = (cc)->state[22]; \
+		xn = (cc)->state[23]; \
+		xo = (cc)->state[24]; \
+		xp = (cc)->state[25]; \
+		xq = (cc)->state[26]; \
+		xr = (cc)->state[27]; \
+		xs = (cc)->state[28]; \
+		xt = (cc)->state[29]; \
+		xu = (cc)->state[30]; \
+		xv = (cc)->state[31]; \
+	} while (0)
+/* Store the locals back to (cc)->state[0..31]. */
+#define WRITE_STATE(cc)   do { \
+		(cc)->state[ 0] = x0; \
+		(cc)->state[ 1] = x1; \
+		(cc)->state[ 2] = x2; \
+		(cc)->state[ 3] = x3; \
+		(cc)->state[ 4] = x4; \
+		(cc)->state[ 5] = x5; \
+		(cc)->state[ 6] = x6; \
+		(cc)->state[ 7] = x7; \
+		(cc)->state[ 8] = x8; \
+		(cc)->state[ 9] = x9; \
+		(cc)->state[10] = xa; \
+		(cc)->state[11] = xb; \
+		(cc)->state[12] = xc; \
+		(cc)->state[13] = xd; \
+		(cc)->state[14] = xe; \
+		(cc)->state[15] = xf; \
+		(cc)->state[16] = xg; \
+		(cc)->state[17] = xh; \
+		(cc)->state[18] = xi; \
+		(cc)->state[19] = xj; \
+		(cc)->state[20] = xk; \
+		(cc)->state[21] = xl; \
+		(cc)->state[22] = xm; \
+		(cc)->state[23] = xn; \
+		(cc)->state[24] = xo; \
+		(cc)->state[25] = xp; \
+		(cc)->state[26] = xq; \
+		(cc)->state[27] = xr; \
+		(cc)->state[28] = xs; \
+		(cc)->state[29] = xt; \
+		(cc)->state[30] = xu; \
+		(cc)->state[31] = xv; \
+	} while (0)
+
+#endif /* SPH_CUBEHASH_NOCOPY */
+/* XOR the first 32 bytes of `buf` (a local of the caller), as 8 LE 32-bit words, into x0..x7. */
+#define INPUT_BLOCK   do { \
+		x0 ^= sph_dec32le_aligned(buf +  0); \
+		x1 ^= sph_dec32le_aligned(buf +  4); \
+		x2 ^= sph_dec32le_aligned(buf +  8); \
+		x3 ^= sph_dec32le_aligned(buf + 12); \
+		x4 ^= sph_dec32le_aligned(buf + 16); \
+		x5 ^= sph_dec32le_aligned(buf + 20); \
+		x6 ^= sph_dec32le_aligned(buf + 24); \
+		x7 ^= sph_dec32le_aligned(buf + 28); \
+	} while (0)
+/* First round of each even/odd pair: add, rotate by 7, xor, add, rotate by 11, xor; the index pairing differs per phase. */
+#define ROUND_EVEN   do { \
+		xg = T32(x0 + xg); /* phase 1: x[g..v] += x[0..f], then x[0..f] <<<= 7 */ \
+		x0 = ROTL32(x0, 7); \
+		xh = T32(x1 + xh); \
+		x1 = ROTL32(x1, 7); \
+		xi = T32(x2 + xi); \
+		x2 = ROTL32(x2, 7); \
+		xj = T32(x3 + xj); \
+		x3 = ROTL32(x3, 7); \
+		xk = T32(x4 + xk); \
+		x4 = ROTL32(x4, 7); \
+		xl = T32(x5 + xl); \
+		x5 = ROTL32(x5, 7); \
+		xm = T32(x6 + xm); \
+		x6 = ROTL32(x6, 7); \
+		xn = T32(x7 + xn); \
+		x7 = ROTL32(x7, 7); \
+		xo = T32(x8 + xo); \
+		x8 = ROTL32(x8, 7); \
+		xp = T32(x9 + xp); \
+		x9 = ROTL32(x9, 7); \
+		xq = T32(xa + xq); \
+		xa = ROTL32(xa, 7); \
+		xr = T32(xb + xr); \
+		xb = ROTL32(xb, 7); \
+		xs = T32(xc + xs); \
+		xc = ROTL32(xc, 7); \
+		xt = T32(xd + xt); \
+		xd = ROTL32(xd, 7); \
+		xu = T32(xe + xu); \
+		xe = ROTL32(xe, 7); \
+		xv = T32(xf + xv); \
+		xf = ROTL32(xf, 7); \
+		x8 ^= xg; /* phase 2: xor, with the two 8-word halves of x[0..f] exchanged */ \
+		x9 ^= xh; \
+		xa ^= xi; \
+		xb ^= xj; \
+		xc ^= xk; \
+		xd ^= xl; \
+		xe ^= xm; \
+		xf ^= xn; \
+		x0 ^= xo; \
+		x1 ^= xp; \
+		x2 ^= xq; \
+		x3 ^= xr; \
+		x4 ^= xs; \
+		x5 ^= xt; \
+		x6 ^= xu; \
+		x7 ^= xv; \
+		xi = T32(x8 + xi); /* phase 3: add into re-paired targets, then <<<= 11 */ \
+		x8 = ROTL32(x8, 11); \
+		xj = T32(x9 + xj); \
+		x9 = ROTL32(x9, 11); \
+		xg = T32(xa + xg); \
+		xa = ROTL32(xa, 11); \
+		xh = T32(xb + xh); \
+		xb = ROTL32(xb, 11); \
+		xm = T32(xc + xm); \
+		xc = ROTL32(xc, 11); \
+		xn = T32(xd + xn); \
+		xd = ROTL32(xd, 11); \
+		xk = T32(xe + xk); \
+		xe = ROTL32(xe, 11); \
+		xl = T32(xf + xl); \
+		xf = ROTL32(xf, 11); \
+		xq = T32(x0 + xq); \
+		x0 = ROTL32(x0, 11); \
+		xr = T32(x1 + xr); \
+		x1 = ROTL32(x1, 11); \
+		xo = T32(x2 + xo); \
+		x2 = ROTL32(x2, 11); \
+		xp = T32(x3 + xp); \
+		x3 = ROTL32(x3, 11); \
+		xu = T32(x4 + xu); \
+		x4 = ROTL32(x4, 11); \
+		xv = T32(x5 + xv); \
+		x5 = ROTL32(x5, 11); \
+		xs = T32(x6 + xs); \
+		x6 = ROTL32(x6, 11); \
+		xt = T32(x7 + xt); \
+		x7 = ROTL32(x7, 11); \
+		xc ^= xi; /* phase 4: final xor of the round */ \
+		xd ^= xj; \
+		xe ^= xg; \
+		xf ^= xh; \
+		x8 ^= xm; \
+		x9 ^= xn; \
+		xa ^= xk; \
+		xb ^= xl; \
+		x4 ^= xq; \
+		x5 ^= xr; \
+		x6 ^= xo; \
+		x7 ^= xp; \
+		x0 ^= xu; \
+		x1 ^= xv; \
+		x2 ^= xs; \
+		x3 ^= xt; \
+	} while (0)
+/* Second round of each pair: same add/rotate/xor phases as ROUND_EVEN, with the index pairing ROUND_EVEN leaves behind. */
+#define ROUND_ODD   do { \
+		xj = T32(xc + xj); /* phase 1: add, then <<<= 7 */ \
+		xc = ROTL32(xc, 7); \
+		xi = T32(xd + xi); \
+		xd = ROTL32(xd, 7); \
+		xh = T32(xe + xh); \
+		xe = ROTL32(xe, 7); \
+		xg = T32(xf + xg); \
+		xf = ROTL32(xf, 7); \
+		xn = T32(x8 + xn); \
+		x8 = ROTL32(x8, 7); \
+		xm = T32(x9 + xm); \
+		x9 = ROTL32(x9, 7); \
+		xl = T32(xa + xl); \
+		xa = ROTL32(xa, 7); \
+		xk = T32(xb + xk); \
+		xb = ROTL32(xb, 7); \
+		xr = T32(x4 + xr); \
+		x4 = ROTL32(x4, 7); \
+		xq = T32(x5 + xq); \
+		x5 = ROTL32(x5, 7); \
+		xp = T32(x6 + xp); \
+		x6 = ROTL32(x6, 7); \
+		xo = T32(x7 + xo); \
+		x7 = ROTL32(x7, 7); \
+		xv = T32(x0 + xv); \
+		x0 = ROTL32(x0, 7); \
+		xu = T32(x1 + xu); \
+		x1 = ROTL32(x1, 7); \
+		xt = T32(x2 + xt); \
+		x2 = ROTL32(x2, 7); \
+		xs = T32(x3 + xs); \
+		x3 = ROTL32(x3, 7); \
+		x4 ^= xj; /* phase 2: xor */ \
+		x5 ^= xi; \
+		x6 ^= xh; \
+		x7 ^= xg; \
+		x0 ^= xn; \
+		x1 ^= xm; \
+		x2 ^= xl; \
+		x3 ^= xk; \
+		xc ^= xr; \
+		xd ^= xq; \
+		xe ^= xp; \
+		xf ^= xo; \
+		x8 ^= xv; \
+		x9 ^= xu; \
+		xa ^= xt; \
+		xb ^= xs; \
+		xh = T32(x4 + xh); /* phase 3: add, then <<<= 11 */ \
+		x4 = ROTL32(x4, 11); \
+		xg = T32(x5 + xg); \
+		x5 = ROTL32(x5, 11); \
+		xj = T32(x6 + xj); \
+		x6 = ROTL32(x6, 11); \
+		xi = T32(x7 + xi); \
+		x7 = ROTL32(x7, 11); \
+		xl = T32(x0 + xl); \
+		x0 = ROTL32(x0, 11); \
+		xk = T32(x1 + xk); \
+		x1 = ROTL32(x1, 11); \
+		xn = T32(x2 + xn); \
+		x2 = ROTL32(x2, 11); \
+		xm = T32(x3 + xm); \
+		x3 = ROTL32(x3, 11); \
+		xp = T32(xc + xp); \
+		xc = ROTL32(xc, 11); \
+		xo = T32(xd + xo); \
+		xd = ROTL32(xd, 11); \
+		xr = T32(xe + xr); \
+		xe = ROTL32(xe, 11); \
+		xq = T32(xf + xq); \
+		xf = ROTL32(xf, 11); \
+		xt = T32(x8 + xt); \
+		x8 = ROTL32(x8, 11); \
+		xs = T32(x9 + xs); \
+		x9 = ROTL32(x9, 11); \
+		xv = T32(xa + xv); \
+		xa = ROTL32(xa, 11); \
+		xu = T32(xb + xu); \
+		xb = ROTL32(xb, 11); \
+		x0 ^= xh; /* phase 4: final xor of the round */ \
+		x1 ^= xg; \
+		x2 ^= xj; \
+		x3 ^= xi; \
+		x4 ^= xl; \
+		x5 ^= xk; \
+		x6 ^= xn; \
+		x7 ^= xm; \
+		x8 ^= xp; \
+		x9 ^= xo; \
+		xa ^= xr; \
+		xb ^= xq; \
+		xc ^= xt; \
+		xd ^= xs; \
+		xe ^= xv; \
+		xf ^= xu; \
+	} while (0)
+
+/*
+ * There is no need to unroll all 16 rounds. The word-swapping permutation
+ * is an involution, so we need to unroll an even number of rounds. On
+ * "big" systems, unrolling 4 rounds yields about 97% of the speed
+ * achieved with full unrolling; and it keeps the code more compact
+ * for small architectures.
+ */
+
+#if SPH_CUBEHASH_UNROLL == 2 /* loop 8 times over one even/odd pair */
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 8; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#elif SPH_CUBEHASH_UNROLL == 4 /* loop 4 times over two pairs */
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 4; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#elif SPH_CUBEHASH_UNROLL == 8 /* loop twice over four pairs */
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 2; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#else /* any other value (including the default 0): all 16 rounds written out */
+
+#define SIXTEEN_ROUNDS   do { \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+	} while (0)
+
+#endif /* SPH_CUBEHASH_UNROLL */
+/* Start a CubeHash computation: copy iv into sc->state and mark the input buffer empty. */
+static void
+cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv)
+{
+	memcpy(sc->state, iv, sizeof sc->state);
+	sc->ptr = 0;
+}
+/* Absorb len bytes of data; each time sc->buf fills up it is mixed in (INPUT_BLOCK) and followed by 16 rounds. */
+static void
+cubehash_core(sph_cubehash_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {	/* fast path: input does not complete a block */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;	/* room left in the buffer */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			INPUT_BLOCK;
+			SIXTEEN_ROUNDS;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE(sc);
+	sc->ptr = ptr;	/* bytes left over for the next call */
+}
+/* Pad the last block (n extra bits from the top of ub, a 1 bit, zeros), run the closing rounds, emit out_size_w32 LE words. */
+static void
+cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32)
+{
+	unsigned char *buf, *out;
+	size_t ptr;
+	unsigned z;
+	int i;
+	DECL_STATE
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n;	/* position of the padding 1 bit, right after the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	READ_STATE(sc);
+	INPUT_BLOCK;
+	for (i = 0; i < 11; i ++) {	/* one pass for the padded block, then ten more */
+		SIXTEEN_ROUNDS;
+		if (i == 0)	/* once the padded block is processed, flip the low bit of the last word */
+			xv ^= SPH_C32(1);
+	}
+	WRITE_STATE(sc);
+	out = dst;
+	for (z = 0; z < out_size_w32; z ++)	/* z is reused as the word index */
+		sph_enc32le(out + (z << 2), sc->state[z]);
+}
+
+/* see sph_cubehash.h -- reset cc to the CubeHash-224 starting state (IV224) */
+void
+sph_cubehash224_init(void *cc)
+{
+	cubehash_init(cc, IV224);
+}
+
+/* see sph_cubehash.h -- absorb len bytes of data into the CubeHash-224 context cc */
+void
+sph_cubehash224(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h -- finish with no extra bits; forwards to the addbits variant */
+void
+sph_cubehash224_close(void *cc, void *dst)
+{
+	sph_cubehash224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h -- finish with n extra bits from ub, write 7 32-bit words to dst, then re-init cc */
+void
+sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 7);
+	sph_cubehash224_init(cc);
+}
+
+/* see sph_cubehash.h -- reset cc to the CubeHash-256 starting state (IV256) */
+void
+sph_cubehash256_init(void *cc)
+{
+	cubehash_init(cc, IV256);
+}
+
+/* see sph_cubehash.h -- absorb len bytes of data into the CubeHash-256 context cc */
+void
+sph_cubehash256(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h -- finish with no extra bits; forwards to the addbits variant */
+void
+sph_cubehash256_close(void *cc, void *dst)
+{
+	sph_cubehash256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h -- finish with n extra bits from ub, write 8 32-bit words to dst, then re-init cc */
+void
+sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 8);
+	sph_cubehash256_init(cc);
+}
+
+/* see sph_cubehash.h -- reset cc to the CubeHash-384 starting state (IV384) */
+void
+sph_cubehash384_init(void *cc)
+{
+	cubehash_init(cc, IV384);
+}
+
+/* see sph_cubehash.h -- absorb len bytes of data into the CubeHash-384 context cc */
+void
+sph_cubehash384(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h -- finish with no extra bits; forwards to the addbits variant */
+void
+sph_cubehash384_close(void *cc, void *dst)
+{
+	sph_cubehash384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h -- finish with n extra bits from ub, write 12 32-bit words to dst, then re-init cc */
+void
+sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 12);
+	sph_cubehash384_init(cc);
+}
+
+/* see sph_cubehash.h -- reset cc to the CubeHash-512 starting state (IV512) */
+void
+sph_cubehash512_init(void *cc)
+{
+	cubehash_init(cc, IV512);
+}
+
+/* see sph_cubehash.h -- absorb len bytes of data into the CubeHash-512 context cc */
+void
+sph_cubehash512(void *cc, const void *data, size_t len)
+{
+	cubehash_core(cc, data, len);
+}
+
+/* see sph_cubehash.h -- finish with no extra bits; forwards to the addbits variant */
+void
+sph_cubehash512_close(void *cc, void *dst)
+{
+	sph_cubehash512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_cubehash.h -- finish with n extra bits from ub, write 16 32-bit words to dst, then re-init cc */
+void
+sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	cubehash_close(cc, ub, n, dst, 16);
+	sph_cubehash512_init(cc);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/echo.c b/sha3/echo.c
new file mode 100644
index 0000000..667e3f3
--- /dev/null
+++ b/sha3/echo.c
@@ -0,0 +1,1031 @@
+/* $Id: echo.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * ECHO implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_echo.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_ECHO
+#define SPH_SMALL_FOOTPRINT_ECHO   1
+#endif
+
+/*
+ * Some measurements tend to show that the 64-bit implementation offers
+ * better performance only on "64-bit architectures", i.e. those which
+ * have actual 64-bit registers.
+ */
+#if !defined SPH_ECHO_64 && SPH_64_TRUE
+#define SPH_ECHO_64   1
+#endif
+
+/*
+ * We can use a 64-bit implementation only if a 64-bit type is available.
+ */
+#if !SPH_64
+#undef SPH_ECHO_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#define T32   SPH_T32
+#define C32   SPH_C32
+#if SPH_64
+#define C64   SPH_C64
+#endif
+
+#define AES_BIG_ENDIAN   0
+#include "aes_helper.c"
+
+#if SPH_ECHO_64
+/* 64-bit variant: the working state W is 16 words, each held as two sph_u64 (same layout for small and big). */
+#define DECL_STATE_SMALL   \
+	sph_u64 W[16][2];
+
+#define DECL_STATE_BIG   \
+	sph_u64 W[16][2];
+/* Fill W: W[0..3] from sc->u.Vb (8 sph_u64), W[4..15] from 192 bytes of sc->buf decoded as LE 64-bit pairs. */
+#define INPUT_BLOCK_SMALL(sc)   do { \
+		unsigned u; \
+		memcpy(W, sc->u.Vb, 8 * sizeof(sph_u64)); \
+		for (u = 0; u < 12; u ++) { \
+			W[u + 4][0] = sph_dec64le_aligned( \
+				sc->buf + 16 * u); \
+			W[u + 4][1] = sph_dec64le_aligned( \
+				sc->buf + 16 * u + 8); \
+		} \
+	} while (0)
+/* Fill W: W[0..7] from sc->u.Vb (16 sph_u64), W[8..15] from 128 bytes of sc->buf decoded as LE 64-bit pairs. */
+#define INPUT_BLOCK_BIG(sc)   do { \
+		unsigned u; \
+		memcpy(W, sc->u.Vb, 16 * sizeof(sph_u64)); \
+		for (u = 0; u < 8; u ++) { \
+			W[u + 8][0] = sph_dec64le_aligned( \
+				sc->buf + 16 * u); \
+			W[u + 8][1] = sph_dec64le_aligned( \
+				sc->buf + 16 * u + 8); \
+		} \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_ECHO
+/* Apply a keyed then an unkeyed AES round to each of the 16 words of W; the key K0..K3 is a counter bumped per word. */
+static void
+aes_2rounds_all(sph_u64 W[16][2],
+	sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3)
+{
+	int n;
+	sph_u32 K0 = *pK0;	/* work on local copies of the counter */
+	sph_u32 K1 = *pK1;
+	sph_u32 K2 = *pK2;
+	sph_u32 K3 = *pK3;
+
+	for (n = 0; n < 16; n ++) {
+		sph_u64 Wl = W[n][0];
+		sph_u64 Wh = W[n][1];
+		sph_u32 X0 = (sph_u32)Wl;	/* split the word into four 32-bit columns */
+		sph_u32 X1 = (sph_u32)(Wl >> 32);
+		sph_u32 X2 = (sph_u32)Wh;
+		sph_u32 X3 = (sph_u32)(Wh >> 32);
+		sph_u32 Y0, Y1, Y2, Y3;
+		AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3);
+		AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
+		W[n][0] = (sph_u64)X0 | ((sph_u64)X1 << 32);
+		W[n][1] = (sph_u64)X2 | ((sph_u64)X3 << 32);
+		if ((K0 = T32(K0 + 1)) == 0) {	/* increment K3:K2:K1:K0 with carry */
+			if ((K1 = T32(K1 + 1)) == 0)
+				if ((K2 = T32(K2 + 1)) == 0)
+					K3 = T32(K3 + 1);
+		}
+	}
+	*pK0 = K0;	/* hand the advanced counter back to the caller */
+	*pK1 = K1;
+	*pK2 = K2;
+	*pK3 = K3;
+}
+/* Small-footprint form: one call handles all 16 words and advances the caller's K0..K3. */
+#define BIG_SUB_WORDS   do { \
+		aes_2rounds_all(W, &K0, &K1, &K2, &K3); \
+	} while (0)
+
+#else
+/* Keyed then unkeyed AES round on one word X (two sph_u64); then increments the counter K0..K3 found in the caller's scope. */
+#define AES_2ROUNDS(X)   do { \
+		sph_u32 X0 = (sph_u32)(X[0]); \
+		sph_u32 X1 = (sph_u32)(X[0] >> 32); \
+		sph_u32 X2 = (sph_u32)(X[1]); \
+		sph_u32 X3 = (sph_u32)(X[1] >> 32); \
+		sph_u32 Y0, Y1, Y2, Y3; \
+		AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); \
+		AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); \
+		X[0] = (sph_u64)X0 | ((sph_u64)X1 << 32); \
+		X[1] = (sph_u64)X2 | ((sph_u64)X3 << 32); \
+		if ((K0 = T32(K0 + 1)) == 0) { /* carry into K1, K2, K3 */ \
+			if ((K1 = T32(K1 + 1)) == 0) \
+				if ((K2 = T32(K2 + 1)) == 0) \
+					K3 = T32(K3 + 1); \
+		} \
+	} while (0)
+/* Unrolled form: AES_2ROUNDS on W[0] through W[15], in index order (the counter advances once per word). */
+#define BIG_SUB_WORDS   do { \
+		AES_2ROUNDS(W[ 0]); \
+		AES_2ROUNDS(W[ 1]); \
+		AES_2ROUNDS(W[ 2]); \
+		AES_2ROUNDS(W[ 3]); \
+		AES_2ROUNDS(W[ 4]); \
+		AES_2ROUNDS(W[ 5]); \
+		AES_2ROUNDS(W[ 6]); \
+		AES_2ROUNDS(W[ 7]); \
+		AES_2ROUNDS(W[ 8]); \
+		AES_2ROUNDS(W[ 9]); \
+		AES_2ROUNDS(W[10]); \
+		AES_2ROUNDS(W[11]); \
+		AES_2ROUNDS(W[12]); \
+		AES_2ROUNDS(W[13]); \
+		AES_2ROUNDS(W[14]); \
+		AES_2ROUNDS(W[15]); \
+	} while (0)
+
+#endif
+/* Rotate four words by one place: W[a] <- W[b] <- W[c] <- W[d] <- old W[a] (both 64-bit halves). */
+#define SHIFT_ROW1(a, b, c, d)   do { \
+		sph_u64 tmp; \
+		tmp = W[a][0]; \
+		W[a][0] = W[b][0]; \
+		W[b][0] = W[c][0]; \
+		W[c][0] = W[d][0]; \
+		W[d][0] = tmp; \
+		tmp = W[a][1]; \
+		W[a][1] = W[b][1]; \
+		W[b][1] = W[c][1]; \
+		W[c][1] = W[d][1]; \
+		W[d][1] = tmp; \
+	} while (0)
+/* Rotate four words by two places, i.e. swap W[a] with W[c] and W[b] with W[d] (both 64-bit halves). */
+#define SHIFT_ROW2(a, b, c, d)   do { \
+		sph_u64 tmp; \
+		tmp = W[a][0]; \
+		W[a][0] = W[c][0]; \
+		W[c][0] = tmp; \
+		tmp = W[b][0]; \
+		W[b][0] = W[d][0]; \
+		W[d][0] = tmp; \
+		tmp = W[a][1]; \
+		W[a][1] = W[c][1]; \
+		W[c][1] = tmp; \
+		tmp = W[b][1]; \
+		W[b][1] = W[d][1]; \
+		W[d][1] = tmp; \
+	} while (0)
+/* Rotate by three places, expressed as SHIFT_ROW1 with the arguments reversed. */
+#define SHIFT_ROW3(a, b, c, d)   SHIFT_ROW1(d, c, b, a)
+/* Words {1,5,9,13}, {2,6,10,14}, {3,7,11,15} are rotated by 1, 2 and 3; words {0,4,8,12} stay put. */
+#define BIG_SHIFT_ROWS   do { \
+		SHIFT_ROW1(1, 5, 9, 13); \
+		SHIFT_ROW2(2, 6, 10, 14); \
+		SHIFT_ROW3(3, 7, 11, 15); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_ECHO
+/* Mix words ia..id bytewise: out = (2a^3b^c^d, a^2b^3c^d, a^b^2c^3d, 3a^b^c^2d), doubling done per byte with 0x1B reduction. */
+static void
+mix_column(sph_u64 W[16][2], int ia, int ib, int ic, int id)
+{
+	int n;
+
+	for (n = 0; n < 2; n ++) {	/* low and high 64-bit half of each word */
+		sph_u64 a = W[ia][n];
+		sph_u64 b = W[ib][n];
+		sph_u64 c = W[ic][n];
+		sph_u64 d = W[id][n];
+		sph_u64 ab = a ^ b;
+		sph_u64 bc = b ^ c;
+		sph_u64 cd = c ^ d;
+		/* abx/bcx/cdx: each byte doubled; top bits become 27 (0x1B), the rest shifts left */
+		sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U
+			^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1);
+		sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U
+			^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1);
+		sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U
+			^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1);
+		W[ia][n] = abx ^ bc ^ d;
+		W[ib][n] = bcx ^ a ^ cd;
+		W[ic][n] = cdx ^ ab ^ d;
+		W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c;
+	}
+}
+/* Small-footprint form: forwards to the function above, using the W in scope. */
+#define MIX_COLUMN(a, b, c, d)   mix_column(W, a, b, c, d)
+
+#else
+/* Mix half n of words ia..id bytewise: out = (2a^3b^c^d, a^2b^3c^d, a^b^2c^3d, 3a^b^c^2d), doubling with 0x1B reduction. */
+#define MIX_COLUMN1(ia, ib, ic, id, n)   do { \
+		sph_u64 a = W[ia][n]; \
+		sph_u64 b = W[ib][n]; \
+		sph_u64 c = W[ic][n]; \
+		sph_u64 d = W[id][n]; \
+		sph_u64 ab = a ^ b; \
+		sph_u64 bc = b ^ c; \
+		sph_u64 cd = c ^ d; \
+		sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U \
+			^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); \
+		sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U \
+			^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); \
+		sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U \
+			^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); \
+		W[ia][n] = abx ^ bc ^ d; \
+		W[ib][n] = bcx ^ a ^ cd; \
+		W[ic][n] = cdx ^ ab ^ d; \
+		W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \
+	} while (0)
+/* Unrolled form: mix both 64-bit halves of the four words. */
+#define MIX_COLUMN(a, b, c, d)   do { \
+		MIX_COLUMN1(a, b, c, d, 0); \
+		MIX_COLUMN1(a, b, c, d, 1); \
+	} while (0)
+
+#endif
+/* BIG_MIX_COLUMNS: run MIX_COLUMN on each group of four consecutive words of W (0-3, 4-7, 8-11, 12-15). */
+#define BIG_MIX_COLUMNS   do { \
+		MIX_COLUMN(0, 1, 2, 3); \
+		MIX_COLUMN(4, 5, 6, 7); \
+		MIX_COLUMN(8, 9, 10, 11); \
+		MIX_COLUMN(12, 13, 14, 15); \
+	} while (0)
+/* BIG_ROUND: one round on W = BIG_SUB_WORDS, then BIG_SHIFT_ROWS, then BIG_MIX_COLUMNS, in that order. */
+#define BIG_ROUND   do { \
+		BIG_SUB_WORDS; \
+		BIG_SHIFT_ROWS; \
+		BIG_MIX_COLUMNS; \
+	} while (0)
+/* FINAL_SMALL: XOR three 64-byte slices of sc->buf and all 32 64-bit elements of W into sc->u.Vb. */
+#define FINAL_SMALL   do { \
+		unsigned u; \
+		sph_u64 *VV = &sc->u.Vb[0][0]; /* chaining value as a flat array */ \
+		sph_u64 *WW = &W[0][0]; /* working state as a flat array */ \
+		for (u = 0; u < 8; u ++) { \
+			VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \
+				^ sph_dec64le_aligned(sc->buf + (u * 8) + 64) \
+				^ sph_dec64le_aligned(sc->buf + (u * 8) + 128) \
+				^ WW[u] ^ WW[u + 8] \
+				^ WW[u + 16] ^ WW[u + 24]; \
+		} \
+	} while (0)
+/* FINAL_BIG: XOR the first 128 bytes of sc->buf and both 16-element halves of W into sc->u.Vb. */
+#define FINAL_BIG   do { \
+		unsigned u; \
+		sph_u64 *VV = &sc->u.Vb[0][0]; /* chaining value as a flat array */ \
+		sph_u64 *WW = &W[0][0]; /* working state as a flat array */ \
+		for (u = 0; u < 16; u ++) { \
+			VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \
+				^ WW[u] ^ WW[u + 16]; \
+		} \
+	} while (0)
+/* COMPRESS_SMALL: load W via INPUT_BLOCK_SMALL, run 8 BIG_ROUNDs keyed by a copy of the counter, then FINAL_SMALL. */
+#define COMPRESS_SMALL(sc)   do { \
+		sph_u32 K0 = sc->C0; /* K0..K3: local copy of sc->C0..C3 used by BIG_SUB_WORDS */ \
+		sph_u32 K1 = sc->C1; \
+		sph_u32 K2 = sc->C2; \
+		sph_u32 K3 = sc->C3; \
+		unsigned u; \
+		INPUT_BLOCK_SMALL(sc); \
+		for (u = 0; u < 8; u ++) { \
+			BIG_ROUND; \
+		} \
+		FINAL_SMALL; \
+	} while (0)
+/* COMPRESS_BIG: load W via INPUT_BLOCK_BIG, run 10 BIG_ROUNDs keyed by a copy of the counter, then FINAL_BIG. */
+#define COMPRESS_BIG(sc)   do { \
+		sph_u32 K0 = sc->C0; /* K0..K3: local copy of sc->C0..C3 used by BIG_SUB_WORDS */ \
+		sph_u32 K1 = sc->C1; \
+		sph_u32 K2 = sc->C2; \
+		sph_u32 K3 = sc->C3; \
+		unsigned u; \
+		INPUT_BLOCK_BIG(sc); \
+		for (u = 0; u < 10; u ++) { \
+			BIG_ROUND; \
+		} \
+		FINAL_BIG; \
+	} while (0)
+
+#else
+/* DECL_STATE_SMALL: declares the local working state W, 16 words of four 32-bit elements each. */
+#define DECL_STATE_SMALL   \
+	sph_u32 W[16][4];
+/* DECL_STATE_BIG: same declaration as DECL_STATE_SMALL; the working state has 16 words in both variants. */
+#define DECL_STATE_BIG   \
+	sph_u32 W[16][4];
+/* INPUT_BLOCK_SMALL: W[0..3] = chaining value sc->u.Vs; W[4..15] = 192 bytes of sc->buf decoded little-endian. */
+#define INPUT_BLOCK_SMALL(sc)   do { \
+		unsigned u; \
+		memcpy(W, sc->u.Vs, 16 * sizeof(sph_u32)); /* 16 elements = first 4 words of W */ \
+		for (u = 0; u < 12; u ++) { \
+			W[u + 4][0] = sph_dec32le_aligned( \
+				sc->buf + 16 * u); \
+			W[u + 4][1] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 4); \
+			W[u + 4][2] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 8); \
+			W[u + 4][3] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 12); \
+		} \
+	} while (0)
+/* INPUT_BLOCK_BIG: W[0..7] = chaining value sc->u.Vs; W[8..15] = 128 bytes of sc->buf decoded little-endian. */
+#define INPUT_BLOCK_BIG(sc)   do { \
+		unsigned u; \
+		memcpy(W, sc->u.Vs, 32 * sizeof(sph_u32)); /* 32 elements = first 8 words of W */ \
+		for (u = 0; u < 8; u ++) { \
+			W[u + 8][0] = sph_dec32le_aligned( \
+				sc->buf + 16 * u); \
+			W[u + 8][1] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 4); \
+			W[u + 8][2] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 8); \
+			W[u + 8][3] = sph_dec32le_aligned( \
+				sc->buf + 16 * u + 12); \
+		} \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_ECHO
+/* aes_2rounds_all (small-footprint build): two AES rounds on each of the 16 words of W; *pK0..*pK3 are read and updated. */
+static void
+aes_2rounds_all(sph_u32 W[16][4],
+	sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3)
+{
+	int n;
+	sph_u32 K0 = *pK0;
+	sph_u32 K1 = *pK1;
+	sph_u32 K2 = *pK2;
+	sph_u32 K3 = *pK3;
+	/* Per word: a keyed round (key K0..K3) into Y0..Y3, an unkeyed round back into the word, then K += 1. */
+	for (n = 0; n < 16; n ++) {
+		sph_u32 *X = W[n];
+		sph_u32 Y0, Y1, Y2, Y3;
+		AES_ROUND_LE(X[0], X[1], X[2], X[3],
+			K0, K1, K2, K3, Y0, Y1, Y2, Y3);
+		AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]);
+		if ((K0 = T32(K0 + 1)) == 0) { /* K0 wrapped to zero: carry into K1, then K2, then K3 */
+			if ((K1 = T32(K1 + 1)) == 0)
+				if ((K2 = T32(K2 + 1)) == 0)
+					K3 = T32(K3 + 1);
+		}
+	}
+	*pK0 = K0; /* hand the advanced key back to the caller */
+	*pK1 = K1;
+	*pK2 = K2;
+	*pK3 = K3;
+}
+/* BIG_SUB_WORDS (small-footprint build): one call to aes_2rounds_all() on the W and K0..K3 in scope. */
+#define BIG_SUB_WORDS   do { \
+		aes_2rounds_all(W, &K0, &K1, &K2, &K3); \
+	} while (0)
+
+#else
+/* AES_2ROUNDS: keyed AES round then unkeyed AES round on word X in place; then K0..K3 (from the enclosing scope) += 1. */
+#define AES_2ROUNDS(X)   do { \
+		sph_u32 Y0, Y1, Y2, Y3; \
+		AES_ROUND_LE(X[0], X[1], X[2], X[3], \
+			K0, K1, K2, K3, Y0, Y1, Y2, Y3); \
+		AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); \
+		if ((K0 = T32(K0 + 1)) == 0) { /* K0 wrapped to zero: carry into K1, then K2, then K3 */ \
+			if ((K1 = T32(K1 + 1)) == 0) \
+				if ((K2 = T32(K2 + 1)) == 0) \
+					K3 = T32(K3 + 1); \
+		} \
+	} while (0)
+/* BIG_SUB_WORDS (unrolled build): AES_2ROUNDS on W[0] through W[15] in index order; the key advances after each word. */
+#define BIG_SUB_WORDS   do { \
+		AES_2ROUNDS(W[ 0]); \
+		AES_2ROUNDS(W[ 1]); \
+		AES_2ROUNDS(W[ 2]); \
+		AES_2ROUNDS(W[ 3]); \
+		AES_2ROUNDS(W[ 4]); \
+		AES_2ROUNDS(W[ 5]); \
+		AES_2ROUNDS(W[ 6]); \
+		AES_2ROUNDS(W[ 7]); \
+		AES_2ROUNDS(W[ 8]); \
+		AES_2ROUNDS(W[ 9]); \
+		AES_2ROUNDS(W[10]); \
+		AES_2ROUNDS(W[11]); \
+		AES_2ROUNDS(W[12]); \
+		AES_2ROUNDS(W[13]); \
+		AES_2ROUNDS(W[14]); \
+		AES_2ROUNDS(W[15]); \
+	} while (0)
+
+#endif
+/* SHIFT_ROW1: rotate four words of W: W[a] <- W[b] <- W[c] <- W[d] <- old W[a], for all four 32-bit elements. */
+#define SHIFT_ROW1(a, b, c, d)   do { \
+		sph_u32 tmp; \
+		tmp = W[a][0]; /* element 0 */ \
+		W[a][0] = W[b][0]; \
+		W[b][0] = W[c][0]; \
+		W[c][0] = W[d][0]; \
+		W[d][0] = tmp; \
+		tmp = W[a][1]; /* element 1 */ \
+		W[a][1] = W[b][1]; \
+		W[b][1] = W[c][1]; \
+		W[c][1] = W[d][1]; \
+		W[d][1] = tmp; \
+		tmp = W[a][2]; /* element 2 */ \
+		W[a][2] = W[b][2]; \
+		W[b][2] = W[c][2]; \
+		W[c][2] = W[d][2]; \
+		W[d][2] = tmp; \
+		tmp = W[a][3]; /* element 3 */ \
+		W[a][3] = W[b][3]; \
+		W[b][3] = W[c][3]; \
+		W[c][3] = W[d][3]; \
+		W[d][3] = tmp; \
+	} while (0)
+/* SHIFT_ROW2: swap W[a] with W[c] and W[b] with W[d], for all four 32-bit elements. */
+#define SHIFT_ROW2(a, b, c, d)   do { \
+		sph_u32 tmp; \
+		tmp = W[a][0]; /* element 0: a <-> c */ \
+		W[a][0] = W[c][0]; \
+		W[c][0] = tmp; \
+		tmp = W[b][0]; /* element 0: b <-> d */ \
+		W[b][0] = W[d][0]; \
+		W[d][0] = tmp; \
+		tmp = W[a][1]; /* element 1: a <-> c */ \
+		W[a][1] = W[c][1]; \
+		W[c][1] = tmp; \
+		tmp = W[b][1]; /* element 1: b <-> d */ \
+		W[b][1] = W[d][1]; \
+		W[d][1] = tmp; \
+		tmp = W[a][2]; /* element 2: a <-> c */ \
+		W[a][2] = W[c][2]; \
+		W[c][2] = tmp; \
+		tmp = W[b][2]; /* element 2: b <-> d */ \
+		W[b][2] = W[d][2]; \
+		W[d][2] = tmp; \
+		tmp = W[a][3]; /* element 3: a <-> c */ \
+		W[a][3] = W[c][3]; \
+		W[c][3] = tmp; \
+		tmp = W[b][3]; /* element 3: b <-> d */ \
+		W[b][3] = W[d][3]; \
+		W[d][3] = tmp; \
+	} while (0)
+/* SHIFT_ROW3: SHIFT_ROW1 with the indices reversed, i.e. the same rotation in the opposite direction. */
+#define SHIFT_ROW3(a, b, c, d)   SHIFT_ROW1(d, c, b, a)
+/* BIG_SHIFT_ROWS: permute words 1/5/9/13, 2/6/10/14 and 3/7/11/15 of W; words 0/4/8/12 stay in place. */
+#define BIG_SHIFT_ROWS   do { \
+		SHIFT_ROW1(1, 5, 9, 13); \
+		SHIFT_ROW2(2, 6, 10, 14); \
+		SHIFT_ROW3(3, 7, 11, 15); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_ECHO
+/* mix_column (small-footprint build): mix words ia, ib, ic, id of W in place, one 32-bit element at a time. */
+static void
+mix_column(sph_u32 W[16][4], int ia, int ib, int ic, int id)
+{
+	int n;
+	/* "2*x" below: each byte of x shifted left by one, with 27 (0x1B) XORed into bytes whose top bit was set. */
+	for (n = 0; n < 4; n ++) {
+		sph_u32 a = W[ia][n];
+		sph_u32 b = W[ib][n];
+		sph_u32 c = W[ic][n];
+		sph_u32 d = W[id][n];
+		sph_u32 ab = a ^ b;
+		sph_u32 bc = b ^ c;
+		sph_u32 cd = c ^ d;
+		sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U /* abx = 2*(a^b) */
+			^ ((ab & C32(0x7F7F7F7F)) << 1);
+		sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U /* bcx = 2*(b^c) */
+			^ ((bc & C32(0x7F7F7F7F)) << 1);
+		sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U /* cdx = 2*(c^d) */
+			^ ((cd & C32(0x7F7F7F7F)) << 1);
+		W[ia][n] = abx ^ bc ^ d; /* 2*(a^b) ^ b ^ c ^ d */
+		W[ib][n] = bcx ^ a ^ cd; /* 2*(b^c) ^ a ^ c ^ d */
+		W[ic][n] = cdx ^ ab ^ d; /* 2*(c^d) ^ a ^ b ^ d */
+		W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; /* (abx ^ bcx ^ cdx) ^ a ^ b ^ c */
+	}
+}
+/* MIX_COLUMN (small-footprint build): forwards to mix_column() on the W visible at the expansion site. */
+#define MIX_COLUMN(a, b, c, d)   mix_column(W, a, b, c, d)
+
+#else
+/* MIX_COLUMN1: inline mixing of element n (0..3) of words ia..id of W; same arithmetic as mix_column(). */
+#define MIX_COLUMN1(ia, ib, ic, id, n)   do { \
+		sph_u32 a = W[ia][n]; \
+		sph_u32 b = W[ib][n]; \
+		sph_u32 c = W[ic][n]; \
+		sph_u32 d = W[id][n]; \
+		sph_u32 ab = a ^ b; \
+		sph_u32 bc = b ^ c; \
+		sph_u32 cd = c ^ d; \
+		sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U /* byte-wise doubling of a^b, 27 = 0x1B */ \
+			^ ((ab & C32(0x7F7F7F7F)) << 1); \
+		sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U /* byte-wise doubling of b^c */ \
+			^ ((bc & C32(0x7F7F7F7F)) << 1); \
+		sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U /* byte-wise doubling of c^d */ \
+			^ ((cd & C32(0x7F7F7F7F)) << 1); \
+		W[ia][n] = abx ^ bc ^ d; \
+		W[ib][n] = bcx ^ a ^ cd; \
+		W[ic][n] = cdx ^ ab ^ d; \
+		W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \
+	} while (0)
+/* MIX_COLUMN (unrolled build): apply MIX_COLUMN1 to all four 32-bit elements of the four words. */
+#define MIX_COLUMN(a, b, c, d)   do { \
+		MIX_COLUMN1(a, b, c, d, 0); \
+		MIX_COLUMN1(a, b, c, d, 1); \
+		MIX_COLUMN1(a, b, c, d, 2); \
+		MIX_COLUMN1(a, b, c, d, 3); \
+	} while (0)
+
+#endif
+/* BIG_MIX_COLUMNS: run MIX_COLUMN on each group of four consecutive words of W (0-3, 4-7, 8-11, 12-15). */
+#define BIG_MIX_COLUMNS   do { \
+		MIX_COLUMN(0, 1, 2, 3); \
+		MIX_COLUMN(4, 5, 6, 7); \
+		MIX_COLUMN(8, 9, 10, 11); \
+		MIX_COLUMN(12, 13, 14, 15); \
+	} while (0)
+/* BIG_ROUND: one round on W = BIG_SUB_WORDS, then BIG_SHIFT_ROWS, then BIG_MIX_COLUMNS, in that order. */
+#define BIG_ROUND   do { \
+		BIG_SUB_WORDS; \
+		BIG_SHIFT_ROWS; \
+		BIG_MIX_COLUMNS; \
+	} while (0)
+/* FINAL_SMALL: XOR three 64-byte slices of sc->buf and all 64 32-bit elements of W into sc->u.Vs. */
+#define FINAL_SMALL   do { \
+		unsigned u; \
+		sph_u32 *VV = &sc->u.Vs[0][0]; /* chaining value as a flat array */ \
+		sph_u32 *WW = &W[0][0]; /* working state as a flat array */ \
+		for (u = 0; u < 16; u ++) { \
+			VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \
+				^ sph_dec32le_aligned(sc->buf + (u * 4) + 64) \
+				^ sph_dec32le_aligned(sc->buf + (u * 4) + 128) \
+				^ WW[u] ^ WW[u + 16] \
+				^ WW[u + 32] ^ WW[u + 48]; \
+		} \
+	} while (0)
+/* FINAL_BIG: XOR the first 128 bytes of sc->buf and both 32-element halves of W into sc->u.Vs. */
+#define FINAL_BIG   do { \
+		unsigned u; \
+		sph_u32 *VV = &sc->u.Vs[0][0]; /* chaining value as a flat array */ \
+		sph_u32 *WW = &W[0][0]; /* working state as a flat array */ \
+		for (u = 0; u < 32; u ++) { \
+			VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \
+				^ WW[u] ^ WW[u + 32]; \
+		} \
+	} while (0)
+/* COMPRESS_SMALL: load W via INPUT_BLOCK_SMALL, run 8 BIG_ROUNDs keyed by a copy of the counter, then FINAL_SMALL. */
+#define COMPRESS_SMALL(sc)   do { \
+		sph_u32 K0 = sc->C0; /* K0..K3: local copy of sc->C0..C3 used by BIG_SUB_WORDS */ \
+		sph_u32 K1 = sc->C1; \
+		sph_u32 K2 = sc->C2; \
+		sph_u32 K3 = sc->C3; \
+		unsigned u; \
+		INPUT_BLOCK_SMALL(sc); \
+		for (u = 0; u < 8; u ++) { \
+			BIG_ROUND; \
+		} \
+		FINAL_SMALL; \
+	} while (0)
+/* COMPRESS_BIG: load W via INPUT_BLOCK_BIG, run 10 BIG_ROUNDs keyed by a copy of the counter, then FINAL_BIG. */
+#define COMPRESS_BIG(sc)   do { \
+		sph_u32 K0 = sc->C0; /* K0..K3: local copy of sc->C0..C3 used by BIG_SUB_WORDS */ \
+		sph_u32 K1 = sc->C1; \
+		sph_u32 K2 = sc->C2; \
+		sph_u32 K3 = sc->C3; \
+		unsigned u; \
+		INPUT_BLOCK_BIG(sc); \
+		for (u = 0; u < 10; u ++) { \
+			BIG_ROUND; \
+		} \
+		FINAL_BIG; \
+	} while (0)
+
+#endif
+/* INCR_COUNTER: add val to the counter held in sc->C0..C3 (C0 is the lowest limb); T32 presumably truncates to 32 bits. */
+#define INCR_COUNTER(sc, val)   do { \
+		sc->C0 = T32(sc->C0 + (sph_u32)(val)); \
+		if (sc->C0 < (sph_u32)(val)) { /* the sum wrapped: carry into C1, then C2, then C3 */ \
+			if ((sc->C1 = T32(sc->C1 + 1)) == 0) \
+				if ((sc->C2 = T32(sc->C2 + 1)) == 0) \
+					sc->C3 = T32(sc->C3 + 1); \
+		} \
+	} while (0)
+/* echo_small_init: reset sc; each of the 4 chaining words gets out_len (output size in bits) in element 0, zero elsewhere. */
+static void
+echo_small_init(sph_echo_small_context *sc, unsigned out_len)
+{
+#if SPH_ECHO_64
+	sc->u.Vb[0][0] = (sph_u64)out_len;
+	sc->u.Vb[0][1] = 0;
+	sc->u.Vb[1][0] = (sph_u64)out_len;
+	sc->u.Vb[1][1] = 0;
+	sc->u.Vb[2][0] = (sph_u64)out_len;
+	sc->u.Vb[2][1] = 0;
+	sc->u.Vb[3][0] = (sph_u64)out_len;
+	sc->u.Vb[3][1] = 0;
+#else
+	sc->u.Vs[0][0] = (sph_u32)out_len;
+	sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0;
+	sc->u.Vs[1][0] = (sph_u32)out_len;
+	sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0;
+	sc->u.Vs[2][0] = (sph_u32)out_len;
+	sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0;
+	sc->u.Vs[3][0] = (sph_u32)out_len;
+	sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0;
+#endif
+	sc->ptr = 0; /* nothing buffered yet */
+	sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; /* counter restarts at zero */
+}
+/* echo_big_init: reset sc; each of the 8 chaining words gets out_len (output size in bits) in element 0, zero elsewhere. */
+static void
+echo_big_init(sph_echo_big_context *sc, unsigned out_len)
+{
+#if SPH_ECHO_64
+	sc->u.Vb[0][0] = (sph_u64)out_len;
+	sc->u.Vb[0][1] = 0;
+	sc->u.Vb[1][0] = (sph_u64)out_len;
+	sc->u.Vb[1][1] = 0;
+	sc->u.Vb[2][0] = (sph_u64)out_len;
+	sc->u.Vb[2][1] = 0;
+	sc->u.Vb[3][0] = (sph_u64)out_len;
+	sc->u.Vb[3][1] = 0;
+	sc->u.Vb[4][0] = (sph_u64)out_len;
+	sc->u.Vb[4][1] = 0;
+	sc->u.Vb[5][0] = (sph_u64)out_len;
+	sc->u.Vb[5][1] = 0;
+	sc->u.Vb[6][0] = (sph_u64)out_len;
+	sc->u.Vb[6][1] = 0;
+	sc->u.Vb[7][0] = (sph_u64)out_len;
+	sc->u.Vb[7][1] = 0;
+#else
+	sc->u.Vs[0][0] = (sph_u32)out_len;
+	sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0;
+	sc->u.Vs[1][0] = (sph_u32)out_len;
+	sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0;
+	sc->u.Vs[2][0] = (sph_u32)out_len;
+	sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0;
+	sc->u.Vs[3][0] = (sph_u32)out_len;
+	sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0;
+	sc->u.Vs[4][0] = (sph_u32)out_len;
+	sc->u.Vs[4][1] = sc->u.Vs[4][2] = sc->u.Vs[4][3] = 0;
+	sc->u.Vs[5][0] = (sph_u32)out_len;
+	sc->u.Vs[5][1] = sc->u.Vs[5][2] = sc->u.Vs[5][3] = 0;
+	sc->u.Vs[6][0] = (sph_u32)out_len;
+	sc->u.Vs[6][1] = sc->u.Vs[6][2] = sc->u.Vs[6][3] = 0;
+	sc->u.Vs[7][0] = (sph_u32)out_len;
+	sc->u.Vs[7][1] = sc->u.Vs[7][2] = sc->u.Vs[7][3] = 0;
+#endif
+	sc->ptr = 0; /* nothing buffered yet */
+	sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; /* counter restarts at zero */
+}
+/* echo_small_compress: run one COMPRESS_SMALL pass over sc->buf, updating the chaining value in sc. */
+static void
+echo_small_compress(sph_echo_small_context *sc)
+{
+	DECL_STATE_SMALL
+	/* DECL_STATE_SMALL above declares the local W that the COMPRESS_SMALL macro chain refers to by name. */
+	COMPRESS_SMALL(sc);
+}
+/* echo_big_compress: run one COMPRESS_BIG pass over sc->buf, updating the chaining value in sc. */
+static void
+echo_big_compress(sph_echo_big_context *sc)
+{
+	DECL_STATE_BIG
+	/* DECL_STATE_BIG above declares the local W that the COMPRESS_BIG macro chain refers to by name. */
+	COMPRESS_BIG(sc);
+}
+/* echo_small_core: append len bytes from data to sc->buf, compressing each time the buffer becomes full. */
+static void
+echo_small_core(sph_echo_small_context *sc,
+	const unsigned char *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	/* Fast path: the input leaves the buffer short of full, so it is only copied in. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* General path: copy in pieces that fit; every full buffer advances the counter and is compressed. */
+	while (len > 0) {
+		size_t clen;
+		/* clen = free space in the buffer, capped at the bytes still to consume */
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data += clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			INCR_COUNTER(sc, 1536); /* 1536 = 8 * 192; presumably the block length in bits */
+			echo_small_compress(sc);
+			ptr = 0;
+		}
+	}
+	sc->ptr = ptr; /* remember how many bytes remain buffered */
+}
+/* echo_big_core: append len bytes from data to sc->buf, compressing each time the buffer becomes full. */
+static void
+echo_big_core(sph_echo_big_context *sc,
+	const unsigned char *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	/* Fast path: the input leaves the buffer short of full, so it is only copied in. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* General path: copy in pieces that fit; every full buffer advances the counter and is compressed. */
+	while (len > 0) {
+		size_t clen;
+		/* clen = free space in the buffer, capped at the bytes still to consume */
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data += clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			INCR_COUNTER(sc, 1024); /* 1024 = 8 * 128; presumably the block length in bits */
+			echo_big_compress(sc);
+			ptr = 0;
+		}
+	}
+	sc->ptr = ptr; /* remember how many bytes remain buffered */
+}
+/* echo_small_close: append the top n bits of ub plus padding, finish hashing, write out_size_w32 32-bit words to dst, re-init sc. */
+static void
+echo_small_close(sph_echo_small_context *sc, unsigned ub, unsigned n,
+	void *dst, unsigned out_size_w32)
+{
+	unsigned char *buf;
+	size_t ptr;
+	unsigned z;
+	unsigned elen;
+	union {
+		unsigned char tmp[32];
+		sph_u32 dummy; /* dummy members presumably only force alignment for the *_aligned encoders */
+#if SPH_ECHO_64
+		sph_u64 dummy2;
+#endif
+	} u;
+#if SPH_ECHO_64
+	sph_u64 *VV;
+#else
+	sph_u32 *VV;
+#endif
+	unsigned k;
+	/* Account for the pending bits, then save the resulting counter for the block trailer. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	elen = ((unsigned)ptr << 3) + n; /* 8 bits per buffered byte plus the n extra bits */
+	INCR_COUNTER(sc, elen);
+	sph_enc32le_aligned(u.tmp, sc->C0);
+	sph_enc32le_aligned(u.tmp + 4, sc->C1);
+	sph_enc32le_aligned(u.tmp + 8, sc->C2);
+	sph_enc32le_aligned(u.tmp + 12, sc->C3);
+	/*
+	 * If elen is zero, then this block actually contains no message
+	 * bit, only the first padding bit.
+	 */
+	if (elen == 0) {
+		sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; /* such a block is compressed with a zero counter */
+	}
+	z = 0x80 >> n; /* z marks the bit just below the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF; /* keep the top n bits of ub and set bit z as the padding bit */
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	if (ptr > ((sizeof sc->buf) - 18)) { /* no room left for the 18 trailer bytes: flush this block first */
+		echo_small_compress(sc);
+		sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; /* the extra block is compressed with a zero counter */
+		memset(buf, 0, sizeof sc->buf);
+	}
+	sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); /* output size in bits, 16-bit little-endian */
+	memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); /* the counter saved above fills the last 16 bytes */
+	echo_small_compress(sc);
+#if SPH_ECHO_64
+	for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) /* two output words per 64-bit element, rounded up */
+		sph_enc64le_aligned(u.tmp + (k << 3), VV[k]);
+#else
+	for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++)
+		sph_enc32le_aligned(u.tmp + (k << 2), VV[k]);
+#endif
+	memcpy(dst, u.tmp, out_size_w32 << 2); /* 4 bytes per output word */
+	echo_small_init(sc, out_size_w32 << 5); /* leave sc ready for a new message of the same output size */
+}
+/* echo_big_close: append the top n bits of ub plus padding, finish hashing, write out_size_w32 32-bit words to dst, re-init sc. */
+static void
+echo_big_close(sph_echo_big_context *sc, unsigned ub, unsigned n,
+	void *dst, unsigned out_size_w32)
+{
+	unsigned char *buf;
+	size_t ptr;
+	unsigned z;
+	unsigned elen;
+	union {
+		unsigned char tmp[64];
+		sph_u32 dummy; /* dummy members presumably only force alignment for the *_aligned encoders */
+#if SPH_ECHO_64
+		sph_u64 dummy2;
+#endif
+	} u;
+#if SPH_ECHO_64
+	sph_u64 *VV;
+#else
+	sph_u32 *VV;
+#endif
+	unsigned k;
+	/* Account for the pending bits, then save the resulting counter for the block trailer. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	elen = ((unsigned)ptr << 3) + n; /* 8 bits per buffered byte plus the n extra bits */
+	INCR_COUNTER(sc, elen);
+	sph_enc32le_aligned(u.tmp, sc->C0);
+	sph_enc32le_aligned(u.tmp + 4, sc->C1);
+	sph_enc32le_aligned(u.tmp + 8, sc->C2);
+	sph_enc32le_aligned(u.tmp + 12, sc->C3);
+	/*
+	 * If elen is zero, then this block actually contains no message
+	 * bit, only the first padding bit.
+	 */
+	if (elen == 0) {
+		sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; /* such a block is compressed with a zero counter */
+	}
+	z = 0x80 >> n; /* z marks the bit just below the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF; /* keep the top n bits of ub and set bit z as the padding bit */
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	if (ptr > ((sizeof sc->buf) - 18)) { /* no room left for the 18 trailer bytes: flush this block first */
+		echo_big_compress(sc);
+		sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; /* the extra block is compressed with a zero counter */
+		memset(buf, 0, sizeof sc->buf);
+	}
+	sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); /* output size in bits, 16-bit little-endian */
+	memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); /* the counter saved above fills the last 16 bytes */
+	echo_big_compress(sc);
+#if SPH_ECHO_64
+	for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) /* two output words per 64-bit element, rounded up */
+		sph_enc64le_aligned(u.tmp + (k << 3), VV[k]);
+#else
+	for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++)
+		sph_enc32le_aligned(u.tmp + (k << 2), VV[k]);
+#endif
+	memcpy(dst, u.tmp, out_size_w32 << 2); /* 4 bytes per output word */
+	echo_big_init(sc, out_size_w32 << 5); /* leave sc ready for a new message of the same output size */
+}
+
+/* see sph_echo.h -- reset the context cc for a 224-bit output (echo_small_init with out_len 224) */
+void
+sph_echo224_init(void *cc)
+{
+	echo_small_init(cc, 224);
+}
+
+/* see sph_echo.h -- feed len bytes of data into the context cc (forwards to echo_small_core) */
+void
+sph_echo224(void *cc, const void *data, size_t len)
+{
+	echo_small_core(cc, data, len);
+}
+
+/* see sph_echo.h -- finish with no extra bits; writes 7 32-bit words (28 bytes) to dst and re-initializes cc */
+void
+sph_echo224_close(void *cc, void *dst)
+{
+	echo_small_close(cc, 0, 0, dst, 7);
+}
+
+/* see sph_echo.h -- finish after appending the top n bits of ub; writes 28 bytes to dst and re-initializes cc */
+void
+sph_echo224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_small_close(cc, ub, n, dst, 7);
+}
+
+/* see sph_echo.h -- reset the context cc for a 256-bit output (echo_small_init with out_len 256) */
+void
+sph_echo256_init(void *cc)
+{
+	echo_small_init(cc, 256);
+}
+
+/* see sph_echo.h -- feed len bytes of data into the context cc (forwards to echo_small_core) */
+void
+sph_echo256(void *cc, const void *data, size_t len)
+{
+	echo_small_core(cc, data, len);
+}
+
+/* see sph_echo.h -- finish with no extra bits; writes 8 32-bit words (32 bytes) to dst and re-initializes cc */
+void
+sph_echo256_close(void *cc, void *dst)
+{
+	echo_small_close(cc, 0, 0, dst, 8);
+}
+
+/* see sph_echo.h -- finish after appending the top n bits of ub; writes 32 bytes to dst and re-initializes cc */
+void
+sph_echo256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_small_close(cc, ub, n, dst, 8);
+}
+
+/* see sph_echo.h -- reset the context cc for a 384-bit output (echo_big_init with out_len 384) */
+void
+sph_echo384_init(void *cc)
+{
+	echo_big_init(cc, 384);
+}
+
+/* see sph_echo.h -- feed len bytes of data into the context cc (forwards to echo_big_core) */
+void
+sph_echo384(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h -- finish with no extra bits; writes 12 32-bit words (48 bytes) to dst and re-initializes cc */
+void
+sph_echo384_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 12);
+}
+
+/* see sph_echo.h -- finish after appending the top n bits of ub; writes 48 bytes to dst and re-initializes cc */
+void
+sph_echo384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 12);
+}
+
+/* see sph_echo.h -- reset the context cc for a 512-bit output (echo_big_init with out_len 512) */
+void
+sph_echo512_init(void *cc)
+{
+	echo_big_init(cc, 512);
+}
+
+/* see sph_echo.h -- feed len bytes of data into the context cc (forwards to echo_big_core) */
+void
+sph_echo512(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h -- finish with no extra bits; writes 16 32-bit words (64 bytes) to dst and re-initializes cc */
+void
+sph_echo512_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 16);
+}
+
+/* see sph_echo.h -- finish after appending the top n bits of ub; writes 64 bytes to dst and re-initializes cc */
+void
+sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 16);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/groestl.c b/sha3/groestl.c
new file mode 100644
index 0000000..928bc41
--- /dev/null
+++ b/sha3/groestl.c
@@ -0,0 +1,3123 @@
+/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */
+/*
+ * Groestl implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_groestl.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+/* The global SPH_SMALL_FOOTPRINT switch turns on the Groestl-specific one unless the latter is already defined. */
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_GROESTL
+#define SPH_SMALL_FOOTPRINT_GROESTL   1
+#endif
+
+/*
+ * Apparently, the 32-bit-only version is not faster than the 64-bit
+ * version unless using the "small footprint" code on a 32-bit machine.
+ */
+#if !defined SPH_GROESTL_64 /* a value already defined by the build is kept */
+#if SPH_SMALL_FOOTPRINT_GROESTL && !SPH_64_TRUE
+#define SPH_GROESTL_64   0 /* small footprint without SPH_64_TRUE: use the 32-bit code */
+#else
+#define SPH_GROESTL_64   1 /* every other configuration: use the 64-bit code */
+#endif
+#endif
+/* Without SPH_64 the selector is removed again, so later "#if SPH_GROESTL_64" tests evaluate as false. */
+#if !SPH_64
+#undef SPH_GROESTL_64
+#endif
+/* MSVC only: silence warning C4146 (unary minus applied to an unsigned type). */
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal representation may use either big-endian or
+ * little-endian. Using the platform default representation speeds up
+ * encoding and decoding between bytes and the matrix columns.
+ */
+
+#undef USE_LE
+#if SPH_GROESTL_LITTLE_ENDIAN /* explicit override: little-endian */
+#define USE_LE   1
+#elif SPH_GROESTL_BIG_ENDIAN /* explicit override: big-endian */
+#define USE_LE   0
+#elif SPH_LITTLE_ENDIAN /* no override: follow the platform setting */
+#define USE_LE   1
+#endif /* in any other case USE_LE stays undefined and "#if USE_LE" picks the big-endian branch */
+
+#if USE_LE
+/* Little-endian representation. C32e byte-reverses its constant; B32_n extracts byte n counting from the least significant byte. */
+#define C32e(x)     ((SPH_C32(x) >> 24) \
+                    | ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
+                    | ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
+                    | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+#define dec32e_aligned   sph_dec32le_aligned
+#define enc32e           sph_enc32le
+#define B32_0(x)    ((x) & 0xFF)
+#define B32_1(x)    (((x) >> 8) & 0xFF)
+#define B32_2(x)    (((x) >> 16) & 0xFF)
+#define B32_3(x)    ((x) >> 24)
+/* R32u/R32d: build one 32-bit value from 16-bit halves of u and d (shift directions mirror the big-endian branch). */
+#define R32u(u, d)   SPH_T32(((u) << 16) | ((d) >> 16))
+#define R32d(u, d)   SPH_T32(((u) >> 16) | ((d) << 16))
+/* PC32xx/QC32xx: constants derived from (j, r); presumably the round constants of the P and Q permutations. */
+#define PC32up(j, r)   ((sph_u32)((j) + (r)))
+#define PC32dn(j, r)   0
+#define QC32up(j, r)   SPH_C32(0xFFFFFFFF)
+#define QC32dn(j, r)   (((sph_u32)(r) << 24) ^ SPH_T32(~((sph_u32)(j) << 24)))
+/* 64-bit counterparts of the helpers above, defined only when SPH_64 is set. */
+#if SPH_64
+#define C64e(x)     ((SPH_C64(x) >> 56) \
+                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
+                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
+                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
+                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
+                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
+                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
+                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
+#define dec64e_aligned   sph_dec64le_aligned
+#define enc64e           sph_enc64le
+#define B64_0(x)    ((x) & 0xFF)
+#define B64_1(x)    (((x) >> 8) & 0xFF)
+#define B64_2(x)    (((x) >> 16) & 0xFF)
+#define B64_3(x)    (((x) >> 24) & 0xFF)
+#define B64_4(x)    (((x) >> 32) & 0xFF)
+#define B64_5(x)    (((x) >> 40) & 0xFF)
+#define B64_6(x)    (((x) >> 48) & 0xFF)
+#define B64_7(x)    ((x) >> 56)
+#define R64         SPH_ROTL64
+#define PC64(j, r)  ((sph_u64)((j) + (r)))
+#define QC64(j, r)  (((sph_u64)(r) << 56) ^ SPH_T64(~((sph_u64)(j) << 56)))
+#endif
+
+#else
+/* Big-endian representation. C32e leaves its constant unchanged; B32_n extracts byte n counting from the most significant byte. */
+#define C32e(x)     SPH_C32(x)
+#define dec32e_aligned   sph_dec32be_aligned
+#define enc32e           sph_enc32be
+#define B32_0(x)    ((x) >> 24)
+#define B32_1(x)    (((x) >> 16) & 0xFF)
+#define B32_2(x)    (((x) >> 8) & 0xFF)
+#define B32_3(x)    ((x) & 0xFF)
+/* R32u/R32d: build one 32-bit value from 16-bit halves of u and d (shift directions mirror the little-endian branch). */
+#define R32u(u, d)   SPH_T32(((u) >> 16) | ((d) << 16))
+#define R32d(u, d)   SPH_T32(((u) << 16) | ((d) >> 16))
+/* PC32xx/QC32xx: constants derived from (j, r); presumably the round constants of the P and Q permutations. */
+#define PC32up(j, r)   ((sph_u32)((j) + (r)) << 24)
+#define PC32dn(j, r)   0
+#define QC32up(j, r)   SPH_C32(0xFFFFFFFF)
+#define QC32dn(j, r)   ((sph_u32)(r) ^ SPH_T32(~(sph_u32)(j)))
+/* 64-bit counterparts of the helpers above, defined only when SPH_64 is set. */
+#if SPH_64
+#define C64e(x)     SPH_C64(x)
+#define dec64e_aligned   sph_dec64be_aligned
+#define enc64e           sph_enc64be
+#define B64_0(x)    ((x) >> 56)
+#define B64_1(x)    (((x) >> 48) & 0xFF)
+#define B64_2(x)    (((x) >> 40) & 0xFF)
+#define B64_3(x)    (((x) >> 32) & 0xFF)
+#define B64_4(x)    (((x) >> 24) & 0xFF)
+#define B64_5(x)    (((x) >> 16) & 0xFF)
+#define B64_6(x)    (((x) >> 8) & 0xFF)
+#define B64_7(x)    ((x) & 0xFF)
+#define R64         SPH_ROTR64
+#define PC64(j, r)  ((sph_u64)((j) + (r)) << 56)
+#define QC64(j, r)  ((sph_u64)(r) ^ SPH_T64(~(sph_u64)(j)))
+#endif
+
+#endif
+
+#if SPH_GROESTL_64
+/* T0: 256 64-bit table entries; C64e() stores each constant in the byte order selected by USE_LE. */
+static const sph_u64 T0[] = {
+	C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8),
+	C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6),
+	C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6),
+	C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491),
+	C64e(0x6090f050f0c05060), C64e(0x0207050305040302),
+	C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56),
+	C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5),
+	C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec),
+	C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f),
+	C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa),
+	C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2),
+	C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb),
+	C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3),
+	C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45),
+	C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753),
+	C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b),
+	C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1),
+	C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c),
+	C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e),
+	C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83),
+	C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451),
+	C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9),
+	C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab),
+	C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a),
+	C64e(0x081c140c14100c08), C64e(0x9563f652f6315295),
+	C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d),
+	C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137),
+	C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f),
+	C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624),
+	C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf),
+	C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e),
+	C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea),
+	C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d),
+	C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34),
+	C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc),
+	C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b),
+	C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76),
+	C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d),
+	C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd),
+	C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713),
+	C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9),
+	C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1),
+	C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3),
+	C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6),
+	C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d),
+	C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72),
+	C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498),
+	C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85),
+	C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5),
+	C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed),
+	C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a),
+	C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411),
+	C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9),
+	C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe),
+	C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478),
+	C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b),
+	C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d),
+	C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05),
+	C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21),
+	C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1),
+	C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177),
+	C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342),
+	C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5),
+	C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf),
+	C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418),
+	C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3),
+	C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235),
+	C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e),
+	C64e(0x936af957f93d5793), C64e(0x55580df20daaf255),
+	C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a),
+	C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba),
+	C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6),
+	C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819),
+	C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3),
+	C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54),
+	C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b),
+	C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7),
+	C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28),
+	C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc),
+	C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad),
+	C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664),
+	C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14),
+	C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c),
+	C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8),
+	C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd),
+	C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4),
+	C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431),
+	C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2),
+	C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b),
+	C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da),
+	C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1),
+	C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049),
+	C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac),
+	C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf),
+	C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4),
+	C64e(0x476720e9208ee947), C64e(0x1038281828201810),
+	C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0),
+	C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c),
+	C64e(0x38546c246c702438), C64e(0x575f08f108aef157),
+	C64e(0x732152c752e6c773), C64e(0x9764f351f3355197),
+	C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1),
+	C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e),
+	C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61),
+	C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f),
+	C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c),
+	C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc),
+	C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506),
+	C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c),
+	C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a),
+	C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069),
+	C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899),
+	C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927),
+	C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb),
+	C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322),
+	C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9),
+	C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733),
+	C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c),
+	C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9),
+	C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa),
+	C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5),
+	C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859),
+	C64e(0x09929b809b128009), C64e(0x1a2339173934171a),
+	C64e(0x651075da75cada65), C64e(0xd784533153b531d7),
+	C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0),
+	C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029),
+	C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e),
+	C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8),
+	C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c)
+};
+
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+
+static const sph_u64 T1[] = {
+	C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84),
+	C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d),
+	C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd),
+	C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954),
+	C64e(0x606090f050f0c050), C64e(0x0202070503050403),
+	C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d),
+	C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162),
+	C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a),
+	C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d),
+	C64e(0x898949c040c00940), C64e(0xfafa68928792ef87),
+	C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb),
+	C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b),
+	C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67),
+	C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea),
+	C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7),
+	C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b),
+	C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c),
+	C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a),
+	C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41),
+	C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f),
+	C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4),
+	C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908),
+	C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73),
+	C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f),
+	C64e(0x08081c140c14100c), C64e(0x959563f652f63152),
+	C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e),
+	C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1),
+	C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5),
+	C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836),
+	C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d),
+	C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69),
+	C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f),
+	C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e),
+	C64e(0x5858c49c749cb074), C64e(0x343446722e72682e),
+	C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2),
+	C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb),
+	C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d),
+	C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face),
+	C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e),
+	C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697),
+	C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968),
+	C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c),
+	C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f),
+	C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed),
+	C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146),
+	C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b),
+	C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4),
+	C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a),
+	C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a),
+	C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116),
+	C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7),
+	C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294),
+	C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910),
+	C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781),
+	C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044),
+	C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3),
+	C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe),
+	C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a),
+	C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc),
+	C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904),
+	C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1),
+	C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463),
+	C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a),
+	C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d),
+	C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014),
+	C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f),
+	C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2),
+	C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39),
+	C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2),
+	C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447),
+	C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7),
+	C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795),
+	C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298),
+	C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f),
+	C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e),
+	C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683),
+	C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529),
+	C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c),
+	C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2),
+	C64e(0x161631271d272c1d), C64e(0xadad379a769a4176),
+	C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856),
+	C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e),
+	C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a),
+	C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4),
+	C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e),
+	C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6),
+	C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4),
+	C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b),
+	C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43),
+	C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7),
+	C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964),
+	C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0),
+	C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa),
+	C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525),
+	C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e),
+	C64e(0x47476720e9208ee9), C64e(0x1010382818282018),
+	C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88),
+	C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872),
+	C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1),
+	C64e(0x73732152c752e6c7), C64e(0x979764f351f33551),
+	C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c),
+	C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21),
+	C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc),
+	C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85),
+	C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842),
+	C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa),
+	C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05),
+	C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812),
+	C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f),
+	C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0),
+	C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958),
+	C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9),
+	C64e(0xd9d991483848a938), C64e(0xebebde351335cd13),
+	C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433),
+	C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970),
+	C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7),
+	C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822),
+	C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920),
+	C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff),
+	C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a),
+	C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8),
+	C64e(0x0909929b809b1280), C64e(0x1a1a233917393417),
+	C64e(0x65651075da75cada), C64e(0xd7d784533153b531),
+	C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8),
+	C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0),
+	C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11),
+	C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc),
+	C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a)
+};
+
+static const sph_u64 T2[] = {
+	C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb),
+	C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7),
+	C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7),
+	C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39),
+	C64e(0x50606090f050f0c0), C64e(0x0302020705030504),
+	C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac),
+	C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671),
+	C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3),
+	C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e),
+	C64e(0x40898949c040c009), C64e(0x87fafa68928792ef),
+	C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f),
+	C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded),
+	C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d),
+	C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a),
+	C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6),
+	C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d),
+	C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9),
+	C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98),
+	C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc),
+	C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d),
+	C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2),
+	C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9),
+	C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d),
+	C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154),
+	C64e(0x0c08081c140c1410), C64e(0x52959563f652f631),
+	C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221),
+	C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e),
+	C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e),
+	C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48),
+	C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5),
+	C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c),
+	C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf),
+	C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a),
+	C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268),
+	C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3),
+	C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6),
+	C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec),
+	C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa),
+	C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1),
+	C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226),
+	C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869),
+	C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499),
+	C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd),
+	C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77),
+	C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01),
+	C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4),
+	C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b),
+	C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11),
+	C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91),
+	C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1),
+	C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f),
+	C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722),
+	C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9),
+	C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7),
+	C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0),
+	C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96),
+	C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba),
+	C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a),
+	C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42),
+	C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9),
+	C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee),
+	C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584),
+	C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1),
+	C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765),
+	C64e(0x4c818155d44cd419), C64e(0x141818243c143c30),
+	C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d),
+	C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a),
+	C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c),
+	C64e(0x5793936af957f93d), C64e(0xf25555580df20daa),
+	C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4),
+	C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f),
+	C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7),
+	C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332),
+	C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d),
+	C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8),
+	C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16),
+	C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95),
+	C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450),
+	C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63),
+	C64e(0x1d161631271d272c), C64e(0x76adad379a769a41),
+	C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8),
+	C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228),
+	C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18),
+	C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b),
+	C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261),
+	C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193),
+	C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762),
+	C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff),
+	C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d),
+	C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af),
+	C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79),
+	C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92),
+	C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543),
+	C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85),
+	C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3),
+	C64e(0xe947476720e9208e), C64e(0x1810103828182820),
+	C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb),
+	C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8),
+	C64e(0x243838546c246c70), C64e(0xf157575f08f108ae),
+	C64e(0xc773732152c752e6), C64e(0x51979764f351f335),
+	C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459),
+	C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c),
+	C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2),
+	C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e),
+	C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8),
+	C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583),
+	C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c),
+	C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638),
+	C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4),
+	C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2),
+	C64e(0x911717bfa891a82e), C64e(0x58999971e858e829),
+	C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e),
+	C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd),
+	C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544),
+	C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049),
+	C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266),
+	C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678),
+	C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089),
+	C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f),
+	C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51),
+	C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2),
+	C64e(0x800909929b809b12), C64e(0x171a1a2339173934),
+	C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5),
+	C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb),
+	C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52),
+	C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c),
+	C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b),
+	C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58)
+};
+
+static const sph_u64 T3[] = {
+	C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497),
+	C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c),
+	C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc),
+	C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc),
+	C64e(0xc050606090f050f0), C64e(0x0403020207050305),
+	C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87),
+	C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6),
+	C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5),
+	C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc),
+	C64e(0x0940898949c040c0), C64e(0xef87fafa68928792),
+	C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26),
+	C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d),
+	C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9),
+	C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25),
+	C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702),
+	C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed),
+	C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24),
+	C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe),
+	C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3),
+	C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1),
+	C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407),
+	C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818),
+	C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395),
+	C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41),
+	C64e(0x100c08081c140c14), C64e(0x3152959563f652f6),
+	C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2),
+	C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8),
+	C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4),
+	C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a),
+	C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47),
+	C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb),
+	C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba),
+	C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9),
+	C64e(0xb0745858c49c749c), C64e(0x682e343446722e72),
+	C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd),
+	C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16),
+	C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7),
+	C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49),
+	C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42),
+	C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2),
+	C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8),
+	C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74),
+	C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21),
+	C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c),
+	C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca),
+	C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd),
+	C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467),
+	C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade),
+	C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e),
+	C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a),
+	C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762),
+	C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7),
+	C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030),
+	C64e(0x080604040e0a060a), C64e(0xe781fefe66988198),
+	C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc),
+	C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e),
+	C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19),
+	C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85),
+	C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf),
+	C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c),
+	C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158),
+	C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5),
+	C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e),
+	C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7),
+	C64e(0x194c818155d44cd4), C64e(0x30141818243c143c),
+	C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71),
+	C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd),
+	C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b),
+	C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d),
+	C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9),
+	C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732),
+	C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4),
+	C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3),
+	C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81),
+	C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82),
+	C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e),
+	C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b),
+	C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44),
+	C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d),
+	C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a),
+	C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa),
+	C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22),
+	C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e),
+	C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437),
+	C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2),
+	C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1),
+	C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7),
+	C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86),
+	C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5),
+	C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2),
+	C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac),
+	C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b),
+	C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15),
+	C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f),
+	C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89),
+	C64e(0x8ee947476720e920), C64e(0x2018101038281828),
+	C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883),
+	C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296),
+	C64e(0x70243838546c246c), C64e(0xaef157575f08f108),
+	C64e(0xe6c773732152c752), C64e(0x3551979764f351f3),
+	C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84),
+	C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163),
+	C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f),
+	C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594),
+	C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6),
+	C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5),
+	C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f),
+	C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236),
+	C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1),
+	C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b),
+	C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8),
+	C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0),
+	C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335),
+	C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355),
+	C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090),
+	C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2),
+	C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266),
+	C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060),
+	C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a),
+	C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e),
+	C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813),
+	C64e(0x12800909929b809b), C64e(0x34171a1a23391739),
+	C64e(0xcada65651075da75), C64e(0xb531d7d784533153),
+	C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3),
+	C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb),
+	C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133),
+	C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f),
+	C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e)
+};
+
+#endif
+
+static const sph_u64 T4[] = {
+	C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784),
+	C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d),
+	C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd),
+	C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54),
+	C64e(0xf0c050606090f050), C64e(0x0504030202070503),
+	C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d),
+	C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662),
+	C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a),
+	C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d),
+	C64e(0xc00940898949c040), C64e(0x92ef87fafa689287),
+	C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb),
+	C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b),
+	C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967),
+	C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea),
+	C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7),
+	C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b),
+	C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c),
+	C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a),
+	C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341),
+	C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f),
+	C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4),
+	C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808),
+	C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573),
+	C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f),
+	C64e(0x14100c08081c140c), C64e(0xf63152959563f652),
+	C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e),
+	C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1),
+	C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5),
+	C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36),
+	C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d),
+	C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69),
+	C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f),
+	C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e),
+	C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e),
+	C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2),
+	C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb),
+	C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d),
+	C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce),
+	C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e),
+	C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297),
+	C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868),
+	C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c),
+	C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f),
+	C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced),
+	C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46),
+	C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b),
+	C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4),
+	C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a),
+	C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a),
+	C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16),
+	C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7),
+	C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794),
+	C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010),
+	C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881),
+	C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44),
+	C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3),
+	C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe),
+	C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a),
+	C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc),
+	C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04),
+	C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1),
+	C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563),
+	C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a),
+	C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d),
+	C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14),
+	C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f),
+	C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2),
+	C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39),
+	C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2),
+	C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947),
+	C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7),
+	C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495),
+	C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398),
+	C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f),
+	C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e),
+	C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83),
+	C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29),
+	C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c),
+	C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2),
+	C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76),
+	C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56),
+	C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e),
+	C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a),
+	C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4),
+	C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e),
+	C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6),
+	C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4),
+	C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b),
+	C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543),
+	C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7),
+	C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64),
+	C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0),
+	C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa),
+	C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25),
+	C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e),
+	C64e(0x208ee947476720e9), C64e(0x2820181010382818),
+	C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388),
+	C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672),
+	C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1),
+	C64e(0x52e6c773732152c7), C64e(0xf33551979764f351),
+	C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c),
+	C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321),
+	C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc),
+	C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485),
+	C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642),
+	C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa),
+	C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05),
+	C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612),
+	C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f),
+	C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0),
+	C64e(0xa82e911717bfa891), C64e(0xe82958999971e858),
+	C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9),
+	C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513),
+	C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533),
+	C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070),
+	C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7),
+	C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622),
+	C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020),
+	C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff),
+	C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a),
+	C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8),
+	C64e(0x9b12800909929b80), C64e(0x3934171a1a233917),
+	C64e(0x75cada65651075da), C64e(0x53b531d7d7845331),
+	C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8),
+	C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0),
+	C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311),
+	C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc),
+	C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a)
+};
+
+#if !SPH_SMALL_FOOTPRINT_GROESTL
+
+static const sph_u64 T5[] = {
+	C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97),
+	C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c),
+	C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc),
+	C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc),
+	C64e(0x50f0c050606090f0), C64e(0x0305040302020705),
+	C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187),
+	C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6),
+	C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5),
+	C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc),
+	C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892),
+	C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426),
+	C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d),
+	C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9),
+	C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025),
+	C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102),
+	C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed),
+	C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524),
+	C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be),
+	C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3),
+	C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1),
+	C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607),
+	C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118),
+	C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95),
+	C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41),
+	C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6),
+	C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2),
+	C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8),
+	C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4),
+	C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a),
+	C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847),
+	C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb),
+	C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba),
+	C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9),
+	C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672),
+	C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd),
+	C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16),
+	C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7),
+	C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449),
+	C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42),
+	C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2),
+	C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8),
+	C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574),
+	C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221),
+	C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c),
+	C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca),
+	C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd),
+	C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67),
+	C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde),
+	C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e),
+	C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a),
+	C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862),
+	C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7),
+	C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930),
+	C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698),
+	C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc),
+	C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e),
+	C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419),
+	C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085),
+	C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf),
+	C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c),
+	C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58),
+	C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5),
+	C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e),
+	C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7),
+	C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c),
+	C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271),
+	C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd),
+	C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b),
+	C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d),
+	C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9),
+	C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832),
+	C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4),
+	C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3),
+	C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281),
+	C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682),
+	C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e),
+	C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b),
+	C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44),
+	C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d),
+	C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a),
+	C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa),
+	C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622),
+	C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e),
+	C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37),
+	C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2),
+	C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1),
+	C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7),
+	C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486),
+	C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5),
+	C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2),
+	C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac),
+	C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b),
+	C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915),
+	C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f),
+	C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89),
+	C64e(0xe9208ee947476720), C64e(0x1828201810103828),
+	C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383),
+	C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96),
+	C64e(0x246c70243838546c), C64e(0xf108aef157575f08),
+	C64e(0xc752e6c773732152), C64e(0x51f33551979764f3),
+	C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584),
+	C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63),
+	C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f),
+	C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94),
+	C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6),
+	C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5),
+	C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f),
+	C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36),
+	C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1),
+	C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b),
+	C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8),
+	C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0),
+	C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35),
+	C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755),
+	C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990),
+	C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2),
+	C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66),
+	C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960),
+	C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a),
+	C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e),
+	C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13),
+	C64e(0x809b12800909929b), C64e(0x173934171a1a2339),
+	C64e(0xda75cada65651075), C64e(0x3153b531d7d78453),
+	C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3),
+	C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb),
+	C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33),
+	C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f),
+	C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e)
+};
+
+static const sph_u64 T6[] = { /* 256 entries; indexed by B64_6() in the table-per-byte RSTT/RBTT variants */
+	C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f),
+	C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a),
+	C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a),
+	C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d),
+	C64e(0xf050f0c050606090), C64e(0x0503050403020207),
+	C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1),
+	C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513),
+	C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59),
+	C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3),
+	C64e(0xc040c00940898949), C64e(0x928792ef87fafa68),
+	C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294),
+	C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6),
+	C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a),
+	C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560),
+	C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351),
+	C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76),
+	C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5),
+	C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2),
+	C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd),
+	C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352),
+	C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156),
+	C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1),
+	C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e),
+	C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b),
+	C64e(0x140c14100c08081c), C64e(0xf652f63152959563),
+	C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f),
+	C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf),
+	C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb),
+	C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e),
+	C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98),
+	C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5),
+	C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50),
+	C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4),
+	C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446),
+	C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11),
+	C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d),
+	C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1),
+	C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34),
+	C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f),
+	C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1),
+	C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901),
+	C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5),
+	C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2),
+	C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a),
+	C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47),
+	C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af),
+	C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff),
+	C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b),
+	C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb),
+	C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7),
+	C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8),
+	C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6),
+	C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9),
+	C64e(0x0a060a080604040e), C64e(0x988198e781fefe66),
+	C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4),
+	C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75),
+	C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44),
+	C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580),
+	C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe),
+	C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd),
+	C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f),
+	C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7),
+	C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb),
+	C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08),
+	C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824),
+	C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2),
+	C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8),
+	C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65),
+	C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558),
+	C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3),
+	C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88),
+	C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642),
+	C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa),
+	C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322),
+	C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6),
+	C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95),
+	C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc),
+	C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c),
+	C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81),
+	C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37),
+	C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e),
+	C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436),
+	C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12),
+	C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f),
+	C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f),
+	C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435),
+	C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6),
+	C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274),
+	C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e),
+	C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18),
+	C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d),
+	C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972),
+	C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9),
+	C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0),
+	C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d),
+	C64e(0x20e9208ee9474767), C64e(0x2818282018101038),
+	C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073),
+	C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca),
+	C64e(0x6c246c7024383854), C64e(0x08f108aef157575f),
+	C64e(0x52c752e6c7737321), C64e(0xf351f33551979764),
+	C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125),
+	C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d),
+	C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e),
+	C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b),
+	C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba),
+	C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29),
+	C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609),
+	C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a),
+	C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b),
+	C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902),
+	C64e(0xa891a82e911717bf), C64e(0xe858e82958999971),
+	C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7),
+	C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde),
+	C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277),
+	C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939),
+	C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1),
+	C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a),
+	C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9),
+	C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0),
+	C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b),
+	C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a),
+	C64e(0x9b809b1280090992), C64e(0x39173934171a1a23),
+	C64e(0x75da75cada656510), C64e(0x533153b531d7d784),
+	C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003),
+	C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2),
+	C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d),
+	C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7),
+	C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62)
+}; /* NOTE(review): as written, the first and last constants equal the matching T7 constants rotated left by 8 bits; presumably true of every entry - confirm */
+
+static const sph_u64 T7[] = { /* 256 entries; indexed by B64_7() in the table-per-byte RSTT/RBTT variants */
+	C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8),
+	C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6),
+	C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6),
+	C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191),
+	C64e(0x90f050f0c0506060), C64e(0x0705030504030202),
+	C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656),
+	C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5),
+	C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec),
+	C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f),
+	C64e(0x49c040c009408989), C64e(0x68928792ef87fafa),
+	C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2),
+	C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb),
+	C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3),
+	C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545),
+	C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353),
+	C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b),
+	C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1),
+	C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c),
+	C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e),
+	C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383),
+	C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151),
+	C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9),
+	C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab),
+	C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a),
+	C64e(0x1c140c14100c0808), C64e(0x63f652f631529595),
+	C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d),
+	C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737),
+	C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f),
+	C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424),
+	C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf),
+	C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e),
+	C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea),
+	C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d),
+	C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434),
+	C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc),
+	C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b),
+	C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676),
+	C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d),
+	C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd),
+	C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313),
+	C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9),
+	C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1),
+	C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3),
+	C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6),
+	C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d),
+	C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272),
+	C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898),
+	C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585),
+	C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5),
+	C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded),
+	C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a),
+	C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111),
+	C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9),
+	C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe),
+	C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878),
+	C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b),
+	C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d),
+	C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505),
+	C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121),
+	C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1),
+	C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777),
+	C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242),
+	C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5),
+	C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf),
+	C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818),
+	C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3),
+	C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535),
+	C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e),
+	C64e(0x6af957f93d579393), C64e(0x580df20daaf25555),
+	C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a),
+	C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba),
+	C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6),
+	C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919),
+	C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3),
+	C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454),
+	C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b),
+	C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7),
+	C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828),
+	C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc),
+	C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad),
+	C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464),
+	C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414),
+	C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c),
+	C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8),
+	C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd),
+	C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4),
+	C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131),
+	C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2),
+	C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b),
+	C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada),
+	C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1),
+	C64e(0xf16dd26d23d29c9c), C64e(0x723be03b92e04949),
+	C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac),
+	C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf),
+	C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4),
+	C64e(0x6720e9208ee94747), C64e(0x3828182820181010),
+	C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0),
+	C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c),
+	C64e(0x546c246c70243838), C64e(0x5f08f108aef15757),
+	C64e(0x2152c752e6c77373), C64e(0x64f351f335519797),
+	C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1),
+	C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e),
+	C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161),
+	C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f),
+	C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c),
+	C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc),
+	C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606),
+	C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c),
+	C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a),
+	C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969),
+	C64e(0xbfa891a82e911717), C64e(0x71e858e829589999),
+	C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727),
+	C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb),
+	C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222),
+	C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9),
+	C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333),
+	C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c),
+	C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9),
+	C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa),
+	C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5),
+	C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959),
+	C64e(0x929b809b12800909), C64e(0x2339173934171a1a),
+	C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7),
+	C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0),
+	C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929),
+	C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e),
+	C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8),
+	C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c)
+}; /* NOTE(review): as written, the first and last constants equal the matching T6 constants rotated right by 8 bits; presumably true of every entry - confirm */
+
+#endif
+
+#define DECL_STATE_SMALL /* declares H, the local 8-word state the SMALL macros operate on */ \
+	sph_u64 H[8];
+
+#define READ_STATE_SMALL(sc)   do { /* load the context state into the local H */ \
+		memcpy(H, (sc)->state.wide, sizeof H); /* copies sizeof H bytes; H must be in scope */ \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { /* store the local H back into the context state */ \
+		memcpy((sc)->state.wide, H, sizeof H); /* copies sizeof H bytes; H must be in scope */ \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* small-footprint form: t[d] = XOR of eight lookups using only T0 and T4 */ \
+		t[d] = T0[B64_0(a[b0])] /* B64_k is defined elsewhere; presumably selects byte k of the word - confirm */ \
+			^ R64(T0[B64_1(a[b1])],  8) /* R64(T0[..], 8/16/24) stands where the other branch uses T1, T2, T3 */ \
+			^ R64(T0[B64_2(a[b2])], 16) \
+			^ R64(T0[B64_3(a[b3])], 24) \
+			^ T4[B64_4(a[b4])] \
+			^ R64(T4[B64_5(a[b5])],  8) /* R64(T4[..], 8/16/24) stands where the other branch uses T5, T6, T7 */ \
+			^ R64(T4[B64_6(a[b6])], 16) \
+			^ R64(T4[B64_7(a[b7])], 24); \
+	} while (0)
+
+#else
+
+#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* t[d] = XOR over k of Tk[B64_k(a[bk])]; t must be in scope */ \
+		t[d] = T0[B64_0(a[b0])] /* B64_k is defined elsewhere; presumably selects byte k of the word - confirm */ \
+			^ T1[B64_1(a[b1])] \
+			^ T2[B64_2(a[b2])] \
+			^ T3[B64_3(a[b3])] \
+			^ T4[B64_4(a[b4])] \
+			^ T5[B64_5(a[b5])] \
+			^ T6[B64_6(a[b6])] \
+			^ T7[B64_7(a[b7])]; \
+	} while (0)
+
+#endif
+
+#define ROUND_SMALL_P(a, r)   do { /* one P round, applied in place to the 8-word array a, for round index r */ \
+		sph_u64 t[8]; /* round output, filled by RSTT */ \
+		a[0] ^= PC64(0x00, r); /* round constants: word i is XORed with PC64(i << 4, r) */ \
+		a[1] ^= PC64(0x10, r); \
+		a[2] ^= PC64(0x20, r); \
+		a[3] ^= PC64(0x30, r); \
+		a[4] ^= PC64(0x40, r); \
+		a[5] ^= PC64(0x50, r); \
+		a[6] ^= PC64(0x60, r); \
+		a[7] ^= PC64(0x70, r); \
+		RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); /* t[d] reads words d, d+1, ..., d+7 (mod 8) */ \
+		RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \
+		RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \
+		RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \
+		RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \
+		RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \
+		RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \
+		RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \
+		a[0] = t[0]; /* written back only after every t[d] has been computed from the old a */ \
+		a[1] = t[1]; \
+		a[2] = t[2]; \
+		a[3] = t[3]; \
+		a[4] = t[4]; \
+		a[5] = t[5]; \
+		a[6] = t[6]; \
+		a[7] = t[7]; \
+	} while (0)
+
+#define ROUND_SMALL_Q(a, r)   do { /* one Q round, applied in place to the 8-word array a, for round index r */ \
+		sph_u64 t[8]; /* round output, filled by RSTT */ \
+		a[0] ^= QC64(0x00, r); /* round constants: word i is XORed with QC64(i << 4, r) */ \
+		a[1] ^= QC64(0x10, r); \
+		a[2] ^= QC64(0x20, r); \
+		a[3] ^= QC64(0x30, r); \
+		a[4] ^= QC64(0x40, r); \
+		a[5] ^= QC64(0x50, r); \
+		a[6] ^= QC64(0x60, r); \
+		a[7] ^= QC64(0x70, r); \
+		RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); /* t[d] reads words d+1, d+3, d+5, d+7, d, d+2, d+4, d+6 (mod 8) */ \
+		RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \
+		RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \
+		RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \
+		RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \
+		RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \
+		RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \
+		RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \
+		a[0] = t[0]; /* written back only after every t[d] has been computed from the old a */ \
+		a[1] = t[1]; \
+		a[2] = t[2]; \
+		a[3] = t[3]; \
+		a[4] = t[4]; \
+		a[5] = t[5]; \
+		a[6] = t[6]; \
+		a[7] = t[7]; \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define PERM_SMALL_P(a)   do { /* small-footprint form: runs ROUND_SMALL_P on a for r = 0..9, one round per iteration */ \
+		int r; \
+		for (r = 0; r < 10; r ++) \
+			ROUND_SMALL_P(a, r); \
+	} while (0)
+
+#define PERM_SMALL_Q(a)   do { /* small-footprint form: runs ROUND_SMALL_Q on a for r = 0..9, one round per iteration */ \
+		int r; \
+		for (r = 0; r < 10; r ++) \
+			ROUND_SMALL_Q(a, r); \
+	} while (0)
+
+#else
+
+/*
+ * Apparently, unrolling more than that confuses GCC, resulting in
+ * lower performance, even though L1 cache would be no problem.
+ */
+#define PERM_SMALL_P(a)   do { /* runs ROUND_SMALL_P on a for r = 0..9 */ \
+		int r; \
+		for (r = 0; r < 10; r += 2) { /* two rounds per iteration: 5 iterations, 10 rounds */ \
+			ROUND_SMALL_P(a, r + 0); \
+			ROUND_SMALL_P(a, r + 1); \
+		} \
+	} while (0)
+
+#define PERM_SMALL_Q(a)   do { /* runs ROUND_SMALL_Q on a for r = 0..9 */ \
+		int r; \
+		for (r = 0; r < 10; r += 2) { /* two rounds per iteration: 5 iterations, 10 rounds */ \
+			ROUND_SMALL_Q(a, r + 0); \
+			ROUND_SMALL_Q(a, r + 1); \
+		} \
+	} while (0)
+
+#endif
+
+#define COMPRESS_SMALL   do { /* folds one block into H: H ^= P(m ^ H) ^ Q(m); uses buf and H from the enclosing scope */ \
+		sph_u64 g[8], m[8]; /* g becomes the P input, m the Q input */ \
+		size_t u; \
+		for (u = 0; u < 8; u ++) { \
+			m[u] = dec64e_aligned(buf + (u << 3)); /* word u read at offset 8*u from buf; assumes buf is a byte pointer - confirm */ \
+			g[u] = m[u] ^ H[u]; \
+		} \
+		PERM_SMALL_P(g); /* both permutations work in place */ \
+		PERM_SMALL_Q(m); \
+		for (u = 0; u < 8; u ++) \
+			H[u] ^= g[u] ^ m[u]; \
+	} while (0)
+
+#define FINAL_SMALL   do { /* output transform on the local state: H ^= P(H) */ \
+		sph_u64 x[8]; /* scratch copy so that P does not overwrite H */ \
+		size_t u; \
+		memcpy(x, H, sizeof x); \
+		PERM_SMALL_P(x); \
+		for (u = 0; u < 8; u ++) \
+			H[u] ^= x[u]; \
+	} while (0)
+
+#define DECL_STATE_BIG /* declares H, the local 16-word state the BIG macros operate on */ \
+	sph_u64 H[16];
+
+#define READ_STATE_BIG(sc)   do { /* load the context state into the local H */ \
+		memcpy(H, (sc)->state.wide, sizeof H); /* copies sizeof H bytes; H must be in scope */ \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the local H back into the context state */ \
+		memcpy((sc)->state.wide, H, sizeof H); /* copies sizeof H bytes; H must be in scope */ \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* small-footprint form: t[d] = XOR of eight lookups using only T0 and T4 */ \
+		t[d] = T0[B64_0(a[b0])] /* B64_k is defined elsewhere; presumably selects byte k of the word - confirm */ \
+			^ R64(T0[B64_1(a[b1])],  8) /* R64(T0[..], 8/16/24) stands where the other branch uses T1, T2, T3 */ \
+			^ R64(T0[B64_2(a[b2])], 16) \
+			^ R64(T0[B64_3(a[b3])], 24) \
+			^ T4[B64_4(a[b4])] \
+			^ R64(T4[B64_5(a[b5])],  8) /* R64(T4[..], 8/16/24) stands where the other branch uses T5, T6, T7 */ \
+			^ R64(T4[B64_6(a[b6])], 16) \
+			^ R64(T4[B64_7(a[b7])], 24); \
+	} while (0)
+
+#else
+
+#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* t[d] = XOR over k of Tk[B64_k(a[bk])]; t must be in scope */ \
+		t[d] = T0[B64_0(a[b0])] /* B64_k is defined elsewhere; presumably selects byte k of the word - confirm */ \
+			^ T1[B64_1(a[b1])] \
+			^ T2[B64_2(a[b2])] \
+			^ T3[B64_3(a[b3])] \
+			^ T4[B64_4(a[b4])] \
+			^ T5[B64_5(a[b5])] \
+			^ T6[B64_6(a[b6])] \
+			^ T7[B64_7(a[b7])]; \
+	} while (0)
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define ROUND_BIG_P(a, r)   do { /* small-footprint form: one P round, in place, on the 16-word array a */ \
+		sph_u64 t[16]; /* round output, filled by RBTT */ \
+		size_t u; \
+		a[0x0] ^= PC64(0x00, r); /* round constants: word i is XORed with PC64(i << 4, r) */ \
+		a[0x1] ^= PC64(0x10, r); \
+		a[0x2] ^= PC64(0x20, r); \
+		a[0x3] ^= PC64(0x30, r); \
+		a[0x4] ^= PC64(0x40, r); \
+		a[0x5] ^= PC64(0x50, r); \
+		a[0x6] ^= PC64(0x60, r); \
+		a[0x7] ^= PC64(0x70, r); \
+		a[0x8] ^= PC64(0x80, r); \
+		a[0x9] ^= PC64(0x90, r); \
+		a[0xA] ^= PC64(0xA0, r); \
+		a[0xB] ^= PC64(0xB0, r); \
+		a[0xC] ^= PC64(0xC0, r); \
+		a[0xD] ^= PC64(0xD0, r); \
+		a[0xE] ^= PC64(0xE0, r); \
+		a[0xF] ^= PC64(0xF0, r); \
+		for (u = 0; u < 16; u += 4) { /* four outputs per iteration; t[d] reads words d..d+6 and d+11, with & 0xF wrapping mod 16 */ \
+			RBTT(u + 0, a, u + 0, (u + 1) & 0xF, \
+				(u + 2) & 0xF, (u + 3) & 0xF, (u + 4) & 0xF, \
+				(u + 5) & 0xF, (u + 6) & 0xF, (u + 11) & 0xF); \
+			RBTT(u + 1, a, u + 1, (u + 2) & 0xF, \
+				(u + 3) & 0xF, (u + 4) & 0xF, (u + 5) & 0xF, \
+				(u + 6) & 0xF, (u + 7) & 0xF, (u + 12) & 0xF); \
+			RBTT(u + 2, a, u + 2, (u + 3) & 0xF, \
+				(u + 4) & 0xF, (u + 5) & 0xF, (u + 6) & 0xF, \
+				(u + 7) & 0xF, (u + 8) & 0xF, (u + 13) & 0xF); \
+			RBTT(u + 3, a, u + 3, (u + 4) & 0xF, \
+				(u + 5) & 0xF, (u + 6) & 0xF, (u + 7) & 0xF, \
+				(u + 8) & 0xF, (u + 9) & 0xF, (u + 14) & 0xF); \
+		} \
+		memcpy(a, t, sizeof t); /* new state replaces a only after all 16 outputs are computed */ \
+	} while (0)
+
+#define ROUND_BIG_Q(a, r)   do { /* small-footprint form: one Q round, in place, on the 16-word array a */ \
+		sph_u64 t[16]; /* round output, filled by RBTT */ \
+		size_t u; \
+		a[0x0] ^= QC64(0x00, r); /* round constants: word i is XORed with QC64(i << 4, r) */ \
+		a[0x1] ^= QC64(0x10, r); \
+		a[0x2] ^= QC64(0x20, r); \
+		a[0x3] ^= QC64(0x30, r); \
+		a[0x4] ^= QC64(0x40, r); \
+		a[0x5] ^= QC64(0x50, r); \
+		a[0x6] ^= QC64(0x60, r); \
+		a[0x7] ^= QC64(0x70, r); \
+		a[0x8] ^= QC64(0x80, r); \
+		a[0x9] ^= QC64(0x90, r); \
+		a[0xA] ^= QC64(0xA0, r); \
+		a[0xB] ^= QC64(0xB0, r); \
+		a[0xC] ^= QC64(0xC0, r); \
+		a[0xD] ^= QC64(0xD0, r); \
+		a[0xE] ^= QC64(0xE0, r); \
+		a[0xF] ^= QC64(0xF0, r); \
+		for (u = 0; u < 16; u += 4) { /* four outputs per iteration; t[d] reads words d+1, d+3, d+5, d+11, d, d+2, d+4, d+6 (mod 16) */ \
+			RBTT(u + 0, a, (u + 1) & 0xF, (u + 3) & 0xF, \
+				(u + 5) & 0xF, (u + 11) & 0xF, (u + 0) & 0xF, \
+				(u + 2) & 0xF, (u + 4) & 0xF, (u + 6) & 0xF); \
+			RBTT(u + 1, a, (u + 2) & 0xF, (u + 4) & 0xF, \
+				(u + 6) & 0xF, (u + 12) & 0xF, (u + 1) & 0xF, \
+				(u + 3) & 0xF, (u + 5) & 0xF, (u + 7) & 0xF); \
+			RBTT(u + 2, a, (u + 3) & 0xF, (u + 5) & 0xF, \
+				(u + 7) & 0xF, (u + 13) & 0xF, (u + 2) & 0xF, \
+				(u + 4) & 0xF, (u + 6) & 0xF, (u + 8) & 0xF); \
+			RBTT(u + 3, a, (u + 4) & 0xF, (u + 6) & 0xF, \
+				(u + 8) & 0xF, (u + 14) & 0xF, (u + 3) & 0xF, \
+				(u + 5) & 0xF, (u + 7) & 0xF, (u + 9) & 0xF); \
+		} \
+		memcpy(a, t, sizeof t); /* new state replaces a only after all 16 outputs are computed */ \
+	} while (0)
+
+#else
+
+#define ROUND_BIG_P(a, r)   do { /* one P round, applied in place to the 16-word array a, for round index r */ \
+		sph_u64 t[16]; /* round output, filled by RBTT */ \
+		a[0x0] ^= PC64(0x00, r); /* round constants: word i is XORed with PC64(i << 4, r) */ \
+		a[0x1] ^= PC64(0x10, r); \
+		a[0x2] ^= PC64(0x20, r); \
+		a[0x3] ^= PC64(0x30, r); \
+		a[0x4] ^= PC64(0x40, r); \
+		a[0x5] ^= PC64(0x50, r); \
+		a[0x6] ^= PC64(0x60, r); \
+		a[0x7] ^= PC64(0x70, r); \
+		a[0x8] ^= PC64(0x80, r); \
+		a[0x9] ^= PC64(0x90, r); \
+		a[0xA] ^= PC64(0xA0, r); \
+		a[0xB] ^= PC64(0xB0, r); \
+		a[0xC] ^= PC64(0xC0, r); \
+		a[0xD] ^= PC64(0xD0, r); \
+		a[0xE] ^= PC64(0xE0, r); \
+		a[0xF] ^= PC64(0xF0, r); \
+		RBTT(0x0, a, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); /* t[d] reads words d, d+1, ..., d+6 and d+11 (mod 16) */ \
+		RBTT(0x1, a, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \
+		RBTT(0x2, a, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \
+		RBTT(0x3, a, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \
+		RBTT(0x4, a, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \
+		RBTT(0x5, a, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \
+		RBTT(0x6, a, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \
+		RBTT(0x7, a, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \
+		RBTT(0x8, a, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \
+		RBTT(0x9, a, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \
+		RBTT(0xA, a, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \
+		RBTT(0xB, a, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \
+		RBTT(0xC, a, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \
+		RBTT(0xD, a, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \
+		RBTT(0xE, a, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \
+		RBTT(0xF, a, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \
+		a[0x0] = t[0x0]; /* written back only after every t[d] has been computed from the old a */ \
+		a[0x1] = t[0x1]; \
+		a[0x2] = t[0x2]; \
+		a[0x3] = t[0x3]; \
+		a[0x4] = t[0x4]; \
+		a[0x5] = t[0x5]; \
+		a[0x6] = t[0x6]; \
+		a[0x7] = t[0x7]; \
+		a[0x8] = t[0x8]; \
+		a[0x9] = t[0x9]; \
+		a[0xA] = t[0xA]; \
+		a[0xB] = t[0xB]; \
+		a[0xC] = t[0xC]; \
+		a[0xD] = t[0xD]; \
+		a[0xE] = t[0xE]; \
+		a[0xF] = t[0xF]; \
+	} while (0)
+
+#define ROUND_BIG_Q(a, r)   do { /* one Q round, applied in place to the 16-word array a, for round index r */ \
+		sph_u64 t[16]; /* round output, filled by RBTT */ \
+		a[0x0] ^= QC64(0x00, r); /* round constants: word i is XORed with QC64(i << 4, r) */ \
+		a[0x1] ^= QC64(0x10, r); \
+		a[0x2] ^= QC64(0x20, r); \
+		a[0x3] ^= QC64(0x30, r); \
+		a[0x4] ^= QC64(0x40, r); \
+		a[0x5] ^= QC64(0x50, r); \
+		a[0x6] ^= QC64(0x60, r); \
+		a[0x7] ^= QC64(0x70, r); \
+		a[0x8] ^= QC64(0x80, r); \
+		a[0x9] ^= QC64(0x90, r); \
+		a[0xA] ^= QC64(0xA0, r); \
+		a[0xB] ^= QC64(0xB0, r); \
+		a[0xC] ^= QC64(0xC0, r); \
+		a[0xD] ^= QC64(0xD0, r); \
+		a[0xE] ^= QC64(0xE0, r); \
+		a[0xF] ^= QC64(0xF0, r); \
+		RBTT(0x0, a, 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); /* t[d] reads words d+1, d+3, d+5, d+11, d, d+2, d+4, d+6 (mod 16) */ \
+		RBTT(0x1, a, 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \
+		RBTT(0x2, a, 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \
+		RBTT(0x3, a, 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \
+		RBTT(0x4, a, 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \
+		RBTT(0x5, a, 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \
+		RBTT(0x6, a, 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \
+		RBTT(0x7, a, 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \
+		RBTT(0x8, a, 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \
+		RBTT(0x9, a, 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \
+		RBTT(0xA, a, 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \
+		RBTT(0xB, a, 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \
+		RBTT(0xC, a, 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \
+		RBTT(0xD, a, 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \
+		RBTT(0xE, a, 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \
+		RBTT(0xF, a, 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \
+		a[0x0] = t[0x0]; /* written back only after every t[d] has been computed from the old a */ \
+		a[0x1] = t[0x1]; \
+		a[0x2] = t[0x2]; \
+		a[0x3] = t[0x3]; \
+		a[0x4] = t[0x4]; \
+		a[0x5] = t[0x5]; \
+		a[0x6] = t[0x6]; \
+		a[0x7] = t[0x7]; \
+		a[0x8] = t[0x8]; \
+		a[0x9] = t[0x9]; \
+		a[0xA] = t[0xA]; \
+		a[0xB] = t[0xB]; \
+		a[0xC] = t[0xC]; \
+		a[0xD] = t[0xD]; \
+		a[0xE] = t[0xE]; \
+		a[0xF] = t[0xF]; \
+	} while (0)
+
+#endif
+
+#define PERM_BIG_P(a)   do { /* runs ROUND_BIG_P on a for r = 0..13 */ \
+		int r; \
+		for (r = 0; r < 14; r += 2) { /* two rounds per iteration: 7 iterations, 14 rounds */ \
+			ROUND_BIG_P(a, r + 0); \
+			ROUND_BIG_P(a, r + 1); \
+		} \
+	} while (0)
+
+#define PERM_BIG_Q(a)   do { /* runs ROUND_BIG_Q on a for r = 0..13 */ \
+		int r; \
+		for (r = 0; r < 14; r += 2) { /* two rounds per iteration: 7 iterations, 14 rounds */ \
+			ROUND_BIG_Q(a, r + 0); \
+			ROUND_BIG_Q(a, r + 1); \
+		} \
+	} while (0)
+
+/* obsolete
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define COMPRESS_BIG   do { \
+		sph_u64 g[16], m[16], *ya; \
+		const sph_u64 *yc; \
+		size_t u; \
+		int i; \
+		for (u = 0; u < 16; u ++) { \
+			m[u] = dec64e_aligned(buf + (u << 3)); \
+			g[u] = m[u] ^ H[u]; \
+		} \
+		ya = g; \
+		yc = CP; \
+		for (i = 0; i < 2; i ++) { \
+			PERM_BIG(ya, yc); \
+			ya = m; \
+			yc = CQ; \
+		} \
+		for (u = 0; u < 16; u ++) { \
+			H[u] ^= g[u] ^ m[u]; \
+		} \
+	} while (0)
+
+#else
+*/
+
+#define COMPRESS_BIG   do { /* folds one block into H: H ^= P(m ^ H) ^ Q(m); uses buf and H from the enclosing scope */ \
+		sph_u64 g[16], m[16]; /* g becomes the P input, m the Q input */ \
+		size_t u; \
+		for (u = 0; u < 16; u ++) { \
+			m[u] = dec64e_aligned(buf + (u << 3)); /* word u read at offset 8*u from buf; assumes buf is a byte pointer - confirm */ \
+			g[u] = m[u] ^ H[u]; \
+		} \
+		PERM_BIG_P(g); /* both permutations work in place */ \
+		PERM_BIG_Q(m); \
+		for (u = 0; u < 16; u ++) { \
+			H[u] ^= g[u] ^ m[u]; \
+		} \
+	} while (0)
+
+/* obsolete
+#endif
+*/
+
+#define FINAL_BIG   do { /* output transform on the local state: H ^= P(H) */ \
+		sph_u64 x[16]; /* scratch copy so that P does not overwrite H */ \
+		size_t u; \
+		memcpy(x, H, sizeof x); \
+		PERM_BIG_P(x); \
+		for (u = 0; u < 16; u ++) \
+			H[u] ^= x[u]; \
+	} while (0)
+
+#else
+
+static const sph_u32 T0up[] = { /* 256 32-bit entries for the #else branch; not referenced by the 64-bit macros above */
+	C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d),
+	C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54),
+	C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d),
+	C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a),
+	C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287),
+	C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b),
+	C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea),
+	C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b),
+	C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a),
+	C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f),
+	C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808),
+	C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f),
+	C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e),
+	C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5),
+	C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d),
+	C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f),
+	C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e),
+	C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb),
+	C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce),
+	C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297),
+	C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c),
+	C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced),
+	C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b),
+	C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a),
+	C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16),
+	C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794),
+	C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881),
+	C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3),
+	C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a),
+	C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04),
+	C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563),
+	C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d),
+	C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f),
+	C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39),
+	C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947),
+	C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495),
+	C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f),
+	C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83),
+	C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c),
+	C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76),
+	C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e),
+	C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4),
+	C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6),
+	C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b),
+	C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7),
+	C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0),
+	C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25),
+	C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818),
+	C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672),
+	C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351),
+	C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321),
+	C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485),
+	C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa),
+	C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612),
+	C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0),
+	C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9),
+	C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533),
+	C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7),
+	C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020),
+	C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a),
+	C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917),
+	C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8),
+	C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311),
+	C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a)
+}; /* NOTE(review): by name, presumably the upper 32-bit halves of 64-bit table entries whose lower halves are in T0dn - confirm */
+
+static const sph_u32 T0dn[] = { /* 256-entry lookup table (64 rows of 4); RSTT indexes it with B32_0(a[..]) */
+	C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6),
+	C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491),
+	C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56),
+	C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec),
+	C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa),
+	C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb),
+	C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45),
+	C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b),
+	C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c),
+	C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83),
+	C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9),
+	C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a),
+	C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d),
+	C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f),
+	C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf),
+	C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea),
+	C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34),
+	C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b),
+	C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d),
+	C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713),
+	C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1),
+	C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6),
+	C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72),
+	C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85),
+	C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed),
+	C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411),
+	C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe),
+	C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b),
+	C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05),
+	C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1),
+	C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342),
+	C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf),
+	C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3),
+	C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e),
+	C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a),
+	C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6),
+	C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3),
+	C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b),
+	C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28),
+	C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad),
+	C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14),
+	C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8),
+	C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4),
+	C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2),
+	C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da),
+	C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049),
+	C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf),
+	C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810),
+	C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c),
+	C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197),
+	C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e),
+	C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f),
+	C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc),
+	C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c),
+	C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069),
+	C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927),
+	C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322),
+	C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733),
+	C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9),
+	C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5),
+	C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a),
+	C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0),
+	C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e),
+	C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c)
+};
+
+static const sph_u32 T1up[] = { /* 256-entry lookup table (64 rows of 4); RSTT indexes it with B32_1(a[..]). NOTE(review): literals look like the T0up:T0dn pair rotated by one byte - confirm */
+	C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c),
+	C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc),
+	C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187),
+	C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5),
+	C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892),
+	C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d),
+	C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025),
+	C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed),
+	C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be),
+	C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1),
+	C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118),
+	C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41),
+	C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2),
+	C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4),
+	C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847),
+	C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba),
+	C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672),
+	C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16),
+	C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449),
+	C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2),
+	C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574),
+	C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c),
+	C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd),
+	C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde),
+	C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a),
+	C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7),
+	C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698),
+	C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e),
+	C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085),
+	C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c),
+	C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5),
+	C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7),
+	C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271),
+	C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b),
+	C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9),
+	C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4),
+	C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281),
+	C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e),
+	C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44),
+	C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a),
+	C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622),
+	C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37),
+	C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1),
+	C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486),
+	C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2),
+	C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b),
+	C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f),
+	C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828),
+	C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96),
+	C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3),
+	C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63),
+	C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94),
+	C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5),
+	C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36),
+	C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b),
+	C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0),
+	C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755),
+	C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2),
+	C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960),
+	C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e),
+	C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339),
+	C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3),
+	C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33),
+	C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e)
+};
+
+static const sph_u32 T1dn[] = { /* 256-entry lookup table (64 rows of 4); RSTT indexes it with B32_1(a[..]) */
+	C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d),
+	C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954),
+	C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d),
+	C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a),
+	C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87),
+	C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b),
+	C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea),
+	C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b),
+	C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a),
+	C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f),
+	C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908),
+	C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f),
+	C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e),
+	C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5),
+	C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d),
+	C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f),
+	C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e),
+	C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb),
+	C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face),
+	C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697),
+	C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c),
+	C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed),
+	C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b),
+	C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a),
+	C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116),
+	C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294),
+	C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781),
+	C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3),
+	C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a),
+	C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904),
+	C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463),
+	C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d),
+	C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f),
+	C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39),
+	C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447),
+	C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795),
+	C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f),
+	C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683),
+	C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c),
+	C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176),
+	C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e),
+	C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4),
+	C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6),
+	C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b),
+	C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7),
+	C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0),
+	C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525),
+	C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018),
+	C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872),
+	C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551),
+	C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21),
+	C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85),
+	C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa),
+	C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812),
+	C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0),
+	C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9),
+	C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433),
+	C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7),
+	C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920),
+	C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a),
+	C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417),
+	C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8),
+	C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11),
+	C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a)
+};
+
+static const sph_u32 T2up[] = { /* 256-entry lookup table (64 rows of 4); RSTT indexes it with B32_2(a[..]) */
+	C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a),
+	C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d),
+	C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1),
+	C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59),
+	C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68),
+	C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6),
+	C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560),
+	C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76),
+	C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2),
+	C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352),
+	C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1),
+	C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b),
+	C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f),
+	C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb),
+	C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98),
+	C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50),
+	C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446),
+	C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d),
+	C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34),
+	C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1),
+	C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5),
+	C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a),
+	C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af),
+	C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b),
+	C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7),
+	C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6),
+	C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66),
+	C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75),
+	C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580),
+	C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd),
+	C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7),
+	C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08),
+	C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2),
+	C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65),
+	C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3),
+	C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642),
+	C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322),
+	C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95),
+	C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c),
+	C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37),
+	C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436),
+	C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f),
+	C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435),
+	C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274),
+	C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18),
+	C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972),
+	C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0),
+	C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038),
+	C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca),
+	C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764),
+	C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d),
+	C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b),
+	C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29),
+	C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a),
+	C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902),
+	C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7),
+	C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277),
+	C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1),
+	C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9),
+	C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b),
+	C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23),
+	C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003),
+	C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d),
+	C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62)
+};
+
+static const sph_u32 T2dn[] = { /* 256-entry lookup table (64 rows of 4); RSTT indexes it with B32_2(a[..]) */
+	C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7),
+	C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39),
+	C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac),
+	C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3),
+	C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef),
+	C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded),
+	C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a),
+	C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d),
+	C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98),
+	C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d),
+	C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9),
+	C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154),
+	C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221),
+	C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e),
+	C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5),
+	C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf),
+	C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268),
+	C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6),
+	C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa),
+	C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226),
+	C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499),
+	C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77),
+	C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4),
+	C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11),
+	C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1),
+	C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722),
+	C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7),
+	C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96),
+	C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a),
+	C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9),
+	C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584),
+	C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765),
+	C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d),
+	C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c),
+	C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4),
+	C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7),
+	C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d),
+	C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16),
+	C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450),
+	C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41),
+	C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228),
+	C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b),
+	C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193),
+	C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff),
+	C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af),
+	C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92),
+	C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85),
+	C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820),
+	C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8),
+	C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335),
+	C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c),
+	C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e),
+	C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583),
+	C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638),
+	C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2),
+	C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e),
+	C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544),
+	C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266),
+	C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089),
+	C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51),
+	C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934),
+	C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb),
+	C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c),
+	C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58)
+};
+
+static const sph_u32 T3up[] = { /* 256-entry lookup table (64 rows of 4); RSTT indexes it with B32_3(a[..]) */
+	C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6),
+	C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191),
+	C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656),
+	C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec),
+	C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa),
+	C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb),
+	C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545),
+	C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b),
+	C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c),
+	C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383),
+	C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9),
+	C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a),
+	C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d),
+	C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f),
+	C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf),
+	C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea),
+	C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434),
+	C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b),
+	C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d),
+	C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313),
+	C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1),
+	C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6),
+	C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272),
+	C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585),
+	C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded),
+	C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111),
+	C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe),
+	C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b),
+	C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505),
+	C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1),
+	C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242),
+	C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf),
+	C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3),
+	C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e),
+	C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a),
+	C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6),
+	C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3),
+	C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b),
+	C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828),
+	C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad),
+	C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414),
+	C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8),
+	C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4),
+	C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2),
+	C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada),
+	C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949),
+	C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf),
+	C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010),
+	C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c),
+	C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797),
+	C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e),
+	C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f),
+	C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc),
+	C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c),
+	C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969),
+	C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727),
+	C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222),
+	C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333),
+	C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9),
+	C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5),
+	C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a),
+	C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0),
+	C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e),
+	C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c)
+};
+
+static const sph_u32 T3dn[] = { /* 256-entry lookup table (64 rows of 4); RSTT indexes it with B32_3(a[..]) */
+	C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c),
+	C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc),
+	C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87),
+	C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5),
+	C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792),
+	C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d),
+	C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25),
+	C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed),
+	C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe),
+	C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1),
+	C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818),
+	C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41),
+	C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2),
+	C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4),
+	C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47),
+	C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba),
+	C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72),
+	C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16),
+	C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49),
+	C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2),
+	C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74),
+	C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c),
+	C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd),
+	C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade),
+	C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a),
+	C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7),
+	C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198),
+	C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e),
+	C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85),
+	C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c),
+	C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5),
+	C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7),
+	C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71),
+	C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b),
+	C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9),
+	C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4),
+	C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81),
+	C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e),
+	C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44),
+	C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a),
+	C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22),
+	C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437),
+	C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1),
+	C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86),
+	C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2),
+	C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b),
+	C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f),
+	C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828),
+	C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296),
+	C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3),
+	C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163),
+	C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594),
+	C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5),
+	C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236),
+	C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b),
+	C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0),
+	C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355),
+	C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2),
+	C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060),
+	C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e),
+	C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739),
+	C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3),
+	C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133),
+	C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e)
+};
+
+#define DECL_STATE_SMALL /* declares the local working state: 16 32-bit words named H */ \
+	sph_u32 H[16];
+
+#define READ_STATE_SMALL(sc)   do { /* load the local H from (sc)->state.narrow */ \
+		memcpy(H, (sc)->state.narrow, sizeof H); \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { /* store the local H back into (sc)->state.narrow */ \
+		memcpy((sc)->state.narrow, H, sizeof H); \
+	} while (0)
+
+#define XCAT(x, y)    XCAT_(x, y)	/* pastes x and y after both have been macro-expanded */
+#define XCAT_(x, y)   x ## y	/* raw token paste; going through XCAT lets the arguments expand first */
+
+#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* sets t[d0] and t[d1] from a[b0..b7]; an array t must be in scope */ \
+		t[d0] = T0up[B32_0(a[b0])] /* XOR of eight table lookups, one per source word */ \
+			^ T1up[B32_1(a[b1])] \
+			^ T2up[B32_2(a[b2])] \
+			^ T3up[B32_3(a[b3])] \
+			^ T0dn[B32_0(a[b4])] \
+			^ T1dn[B32_1(a[b5])] \
+			^ T2dn[B32_2(a[b6])] \
+			^ T3dn[B32_3(a[b7])]; \
+		t[d1] = T0dn[B32_0(a[b0])] /* same eight indices as t[d0], with the up and dn tables exchanged */ \
+			^ T1dn[B32_1(a[b1])] \
+			^ T2dn[B32_2(a[b2])] \
+			^ T3dn[B32_3(a[b3])] \
+			^ T0up[B32_0(a[b4])] \
+			^ T1up[B32_1(a[b5])] \
+			^ T2up[B32_2(a[b6])] \
+			^ T3up[B32_3(a[b7])]; \
+	} while (0)
+
+#define ROUND_SMALL_P(a, r)   do { /* one P round with index r, applied in place to the 16-word array a */ \
+		sph_u32 t[16]; \
+		a[0x0] ^= PC32up(0x00, r); /* first: XOR the P round constants into all 16 words */ \
+		a[0x1] ^= PC32dn(0x00, r); \
+		a[0x2] ^= PC32up(0x10, r); \
+		a[0x3] ^= PC32dn(0x10, r); \
+		a[0x4] ^= PC32up(0x20, r); \
+		a[0x5] ^= PC32dn(0x20, r); \
+		a[0x6] ^= PC32up(0x30, r); \
+		a[0x7] ^= PC32dn(0x30, r); \
+		a[0x8] ^= PC32up(0x40, r); \
+		a[0x9] ^= PC32dn(0x40, r); \
+		a[0xA] ^= PC32up(0x50, r); \
+		a[0xB] ^= PC32dn(0x50, r); \
+		a[0xC] ^= PC32up(0x60, r); \
+		a[0xD] ^= PC32dn(0x60, r); \
+		a[0xE] ^= PC32up(0x70, r); \
+		a[0xF] ^= PC32dn(0x70, r); \
+		RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); /* then: build the next state in t, two words per RSTT */ \
+		RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); \
+		RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); \
+		RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); \
+		RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); \
+		RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); \
+		RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); \
+		RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); \
+		memcpy(a, t, sizeof t); /* finally: t replaces a */ \
+	} while (0)
+
+#define ROUND_SMALL_Q(a, r)   do { /* one Q round with index r, applied in place to the 16-word array a */ \
+		sph_u32 t[16]; \
+		a[0x0] ^= QC32up(0x00, r); /* first: XOR the Q round constants into all 16 words */ \
+		a[0x1] ^= QC32dn(0x00, r); \
+		a[0x2] ^= QC32up(0x10, r); \
+		a[0x3] ^= QC32dn(0x10, r); \
+		a[0x4] ^= QC32up(0x20, r); \
+		a[0x5] ^= QC32dn(0x20, r); \
+		a[0x6] ^= QC32up(0x30, r); \
+		a[0x7] ^= QC32dn(0x30, r); \
+		a[0x8] ^= QC32up(0x40, r); \
+		a[0x9] ^= QC32dn(0x40, r); \
+		a[0xA] ^= QC32up(0x50, r); \
+		a[0xB] ^= QC32dn(0x50, r); \
+		a[0xC] ^= QC32up(0x60, r); \
+		a[0xD] ^= QC32dn(0x60, r); \
+		a[0xE] ^= QC32up(0x70, r); \
+		a[0xF] ^= QC32dn(0x70, r); \
+		RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); /* then: build the next state in t; source indices differ from the P round */ \
+		RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); \
+		RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); \
+		RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); \
+		RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); \
+		RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); \
+		RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); \
+		RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); \
+		memcpy(a, t, sizeof t); /* finally: t replaces a */ \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define PERM_SMALL_P(a)   do { /* small-footprint build: runs P rounds 0..9 on a, one round per iteration */ \
+		int r; \
+		for (r = 0; r < 10; r ++) \
+			ROUND_SMALL_P(a, r); \
+	} while (0)
+
+#define PERM_SMALL_Q(a)   do { /* small-footprint build: runs Q rounds 0..9 on a, one round per iteration */ \
+		int r; \
+		for (r = 0; r < 10; r ++) \
+			ROUND_SMALL_Q(a, r); \
+	} while (0)
+
+#else
+
+#define PERM_SMALL_P(a)   do { /* runs P rounds 0..9 on a, two rounds expanded per loop iteration */ \
+		int r; \
+		for (r = 0; r < 10; r += 2) { \
+			ROUND_SMALL_P(a, r + 0); \
+			ROUND_SMALL_P(a, r + 1); \
+		} \
+	} while (0)
+
+#define PERM_SMALL_Q(a)   do { /* runs Q rounds 0..9 on a, two rounds expanded per loop iteration */ \
+		int r; \
+		for (r = 0; r < 10; r += 2) { \
+			ROUND_SMALL_Q(a, r + 0); \
+			ROUND_SMALL_Q(a, r + 1); \
+		} \
+	} while (0)
+
+#endif
+
+#define COMPRESS_SMALL   do { /* folds the 64-byte block at buf into H; buf and H come from the enclosing scope */ \
+		sph_u32 g[16], m[16]; \
+		size_t u; \
+		for (u = 0; u < 16; u ++) { \
+			m[u] = dec32e_aligned(buf + (u << 2)); /* m = message block as 16 words */ \
+			g[u] = m[u] ^ H[u]; /* g = message XOR chaining value */ \
+		} \
+		PERM_SMALL_P(g); \
+		PERM_SMALL_Q(m); \
+		for (u = 0; u < 16; u ++) \
+			H[u] ^= g[u] ^ m[u]; /* H ^= P(H ^ m) ^ Q(m) */ \
+	} while (0)
+
+#define FINAL_SMALL   do { /* final transform on the local H: H ^= P(H) */ \
+		sph_u32 x[16]; \
+		size_t u; \
+		memcpy(x, H, sizeof x); /* permute a copy so H itself is still available for the XOR */ \
+		PERM_SMALL_P(x); \
+		for (u = 0; u < 16; u ++) \
+			H[u] ^= x[u]; \
+	} while (0)
+
+#define DECL_STATE_BIG /* declares the local working state: 32 32-bit words named H */ \
+	sph_u32 H[32];
+
+#define READ_STATE_BIG(sc)   do { /* load the local H from (sc)->state.narrow */ \
+		memcpy(H, (sc)->state.narrow, sizeof H); \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store the local H back into (sc)->state.narrow */ \
+		memcpy((sc)->state.narrow, H, sizeof H); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* small-footprint RBTT: sets t[d0], t[d1] using only the T0/T1 tables; t must be in scope */ \
+		sph_u32 fu2 = T0up[B32_2(a[b2])]; /* up/dn lookup pairs for the byte-2 and byte-3 terms */ \
+		sph_u32 fd2 = T0dn[B32_2(a[b2])]; \
+		sph_u32 fu3 = T1up[B32_3(a[b3])]; \
+		sph_u32 fd3 = T1dn[B32_3(a[b3])]; \
+		sph_u32 fu6 = T0up[B32_2(a[b6])]; \
+		sph_u32 fd6 = T0dn[B32_2(a[b6])]; \
+		sph_u32 fu7 = T1up[B32_3(a[b7])]; \
+		sph_u32 fd7 = T1dn[B32_3(a[b7])]; \
+		t[d0] = T0up[B32_0(a[b0])] \
+			^ T1up[B32_1(a[b1])] \
+			^ R32u(fu2, fd2) /* R32u/R32d combine a pair in place of a T2/T3 lookup */ \
+			^ R32u(fu3, fd3) \
+			^ T0dn[B32_0(a[b4])] \
+			^ T1dn[B32_1(a[b5])] \
+			^ R32d(fu6, fd6) \
+			^ R32d(fu7, fd7); \
+		t[d1] = T0dn[B32_0(a[b0])] /* mirror of t[d0]: up and dn roles exchanged */ \
+			^ T1dn[B32_1(a[b1])] \
+			^ R32d(fu2, fd2) \
+			^ R32d(fu3, fd3) \
+			^ T0up[B32_0(a[b4])] \
+			^ T1up[B32_1(a[b5])] \
+			^ R32u(fu6, fd6) \
+			^ R32u(fu7, fd7); \
+	} while (0)
+
+#else
+
+#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7)   do { /* sets t[d0] and t[d1] from a[b0..b7]; an array t must be in scope */ \
+		t[d0] = T0up[B32_0(a[b0])] /* XOR of eight table lookups, one per source word */ \
+			^ T1up[B32_1(a[b1])] \
+			^ T2up[B32_2(a[b2])] \
+			^ T3up[B32_3(a[b3])] \
+			^ T0dn[B32_0(a[b4])] \
+			^ T1dn[B32_1(a[b5])] \
+			^ T2dn[B32_2(a[b6])] \
+			^ T3dn[B32_3(a[b7])]; \
+		t[d1] = T0dn[B32_0(a[b0])] /* same eight indices as t[d0], with the up and dn tables exchanged */ \
+			^ T1dn[B32_1(a[b1])] \
+			^ T2dn[B32_2(a[b2])] \
+			^ T3dn[B32_3(a[b3])] \
+			^ T0up[B32_0(a[b4])] \
+			^ T1up[B32_1(a[b5])] \
+			^ T2up[B32_2(a[b6])] \
+			^ T3up[B32_3(a[b7])]; \
+	} while (0)
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define ROUND_BIG_P(a, r)   do { /* small-footprint build: one P round with index r, in place on the 32-word array a */ \
+		sph_u32 t[32]; \
+		size_t u; \
+		a[0x00] ^= PC32up(0x00, r); /* first: XOR the P round constants into all 32 words */ \
+		a[0x01] ^= PC32dn(0x00, r); \
+		a[0x02] ^= PC32up(0x10, r); \
+		a[0x03] ^= PC32dn(0x10, r); \
+		a[0x04] ^= PC32up(0x20, r); \
+		a[0x05] ^= PC32dn(0x20, r); \
+		a[0x06] ^= PC32up(0x30, r); \
+		a[0x07] ^= PC32dn(0x30, r); \
+		a[0x08] ^= PC32up(0x40, r); \
+		a[0x09] ^= PC32dn(0x40, r); \
+		a[0x0A] ^= PC32up(0x50, r); \
+		a[0x0B] ^= PC32dn(0x50, r); \
+		a[0x0C] ^= PC32up(0x60, r); \
+		a[0x0D] ^= PC32dn(0x60, r); \
+		a[0x0E] ^= PC32up(0x70, r); \
+		a[0x0F] ^= PC32dn(0x70, r); \
+		a[0x10] ^= PC32up(0x80, r); \
+		a[0x11] ^= PC32dn(0x80, r); \
+		a[0x12] ^= PC32up(0x90, r); \
+		a[0x13] ^= PC32dn(0x90, r); \
+		a[0x14] ^= PC32up(0xA0, r); \
+		a[0x15] ^= PC32dn(0xA0, r); \
+		a[0x16] ^= PC32up(0xB0, r); \
+		a[0x17] ^= PC32dn(0xB0, r); \
+		a[0x18] ^= PC32up(0xC0, r); \
+		a[0x19] ^= PC32dn(0xC0, r); \
+		a[0x1A] ^= PC32up(0xD0, r); \
+		a[0x1B] ^= PC32dn(0xD0, r); \
+		a[0x1C] ^= PC32up(0xE0, r); \
+		a[0x1D] ^= PC32dn(0xE0, r); \
+		a[0x1E] ^= PC32up(0xF0, r); \
+		a[0x1F] ^= PC32dn(0xF0, r); \
+		for (u = 0; u < 32; u += 8) { /* then: 4 iterations x 4 RBTT fill all 32 words of t */ \
+			RBTT(u + 0x00, (u + 0x01) & 0x1F, a, /* & 0x1F wraps source indices modulo 32 */ \
+				u + 0x00, (u + 0x02) & 0x1F, \
+				(u + 0x04) & 0x1F, (u + 0x06) & 0x1F, \
+				(u + 0x09) & 0x1F, (u + 0x0B) & 0x1F, \
+				(u + 0x0D) & 0x1F, (u + 0x17) & 0x1F); \
+			RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \
+				u + 0x02, (u + 0x04) & 0x1F, \
+				(u + 0x06) & 0x1F, (u + 0x08) & 0x1F, \
+				(u + 0x0B) & 0x1F, (u + 0x0D) & 0x1F, \
+				(u + 0x0F) & 0x1F, (u + 0x19) & 0x1F); \
+			RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \
+				u + 0x04, (u + 0x06) & 0x1F, \
+				(u + 0x08) & 0x1F, (u + 0x0A) & 0x1F, \
+				(u + 0x0D) & 0x1F, (u + 0x0F) & 0x1F, \
+				(u + 0x11) & 0x1F, (u + 0x1B) & 0x1F); \
+			RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \
+				u + 0x06, (u + 0x08) & 0x1F, \
+				(u + 0x0A) & 0x1F, (u + 0x0C) & 0x1F, \
+				(u + 0x0F) & 0x1F, (u + 0x11) & 0x1F, \
+				(u + 0x13) & 0x1F, (u + 0x1D) & 0x1F); \
+		} \
+		memcpy(a, t, sizeof t); /* finally: t replaces a */ \
+	} while (0)
+
+#define ROUND_BIG_Q(a, r)   do { /* small-footprint build: one Q round with index r, in place on the 32-word array a */ \
+		sph_u32 t[32]; \
+		size_t u; \
+		a[0x00] ^= QC32up(0x00, r); /* first: XOR the Q round constants into all 32 words */ \
+		a[0x01] ^= QC32dn(0x00, r); \
+		a[0x02] ^= QC32up(0x10, r); \
+		a[0x03] ^= QC32dn(0x10, r); \
+		a[0x04] ^= QC32up(0x20, r); \
+		a[0x05] ^= QC32dn(0x20, r); \
+		a[0x06] ^= QC32up(0x30, r); \
+		a[0x07] ^= QC32dn(0x30, r); \
+		a[0x08] ^= QC32up(0x40, r); \
+		a[0x09] ^= QC32dn(0x40, r); \
+		a[0x0A] ^= QC32up(0x50, r); \
+		a[0x0B] ^= QC32dn(0x50, r); \
+		a[0x0C] ^= QC32up(0x60, r); \
+		a[0x0D] ^= QC32dn(0x60, r); \
+		a[0x0E] ^= QC32up(0x70, r); \
+		a[0x0F] ^= QC32dn(0x70, r); \
+		a[0x10] ^= QC32up(0x80, r); \
+		a[0x11] ^= QC32dn(0x80, r); \
+		a[0x12] ^= QC32up(0x90, r); \
+		a[0x13] ^= QC32dn(0x90, r); \
+		a[0x14] ^= QC32up(0xA0, r); \
+		a[0x15] ^= QC32dn(0xA0, r); \
+		a[0x16] ^= QC32up(0xB0, r); \
+		a[0x17] ^= QC32dn(0xB0, r); \
+		a[0x18] ^= QC32up(0xC0, r); \
+		a[0x19] ^= QC32dn(0xC0, r); \
+		a[0x1A] ^= QC32up(0xD0, r); \
+		a[0x1B] ^= QC32dn(0xD0, r); \
+		a[0x1C] ^= QC32up(0xE0, r); \
+		a[0x1D] ^= QC32dn(0xE0, r); \
+		a[0x1E] ^= QC32up(0xF0, r); \
+		a[0x1F] ^= QC32dn(0xF0, r); \
+		for (u = 0; u < 32; u += 8) { /* then: 4 iterations x 4 RBTT fill all 32 words of t */ \
+			RBTT(u + 0x00, (u + 0x01) & 0x1F, a, /* & 0x1F wraps source indices modulo 32 */ \
+				(u + 0x02) & 0x1F, (u + 0x06) & 0x1F, \
+				(u + 0x0A) & 0x1F, (u + 0x16) & 0x1F, \
+				(u + 0x01) & 0x1F, (u + 0x05) & 0x1F, \
+				(u + 0x09) & 0x1F, (u + 0x0D) & 0x1F); \
+			RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \
+				(u + 0x04) & 0x1F, (u + 0x08) & 0x1F, \
+				(u + 0x0C) & 0x1F, (u + 0x18) & 0x1F, \
+				(u + 0x03) & 0x1F, (u + 0x07) & 0x1F, \
+				(u + 0x0B) & 0x1F, (u + 0x0F) & 0x1F); \
+			RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \
+				(u + 0x06) & 0x1F, (u + 0x0A) & 0x1F, \
+				(u + 0x0E) & 0x1F, (u + 0x1A) & 0x1F, \
+				(u + 0x05) & 0x1F, (u + 0x09) & 0x1F, \
+				(u + 0x0D) & 0x1F, (u + 0x11) & 0x1F); \
+			RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \
+				(u + 0x08) & 0x1F, (u + 0x0C) & 0x1F, \
+				(u + 0x10) & 0x1F, (u + 0x1C) & 0x1F, \
+				(u + 0x07) & 0x1F, (u + 0x0B) & 0x1F, \
+				(u + 0x0F) & 0x1F, (u + 0x13) & 0x1F); \
+		} \
+		memcpy(a, t, sizeof t); /* finally: t replaces a */ \
+	} while (0)
+
+#else
+
+#define ROUND_BIG_P(a, r)   do { /* one P round with index r, in place on the 32-word array a (fully unrolled) */ \
+		sph_u32 t[32]; \
+		a[0x00] ^= PC32up(0x00, r); /* first: XOR the P round constants into all 32 words */ \
+		a[0x01] ^= PC32dn(0x00, r); \
+		a[0x02] ^= PC32up(0x10, r); \
+		a[0x03] ^= PC32dn(0x10, r); \
+		a[0x04] ^= PC32up(0x20, r); \
+		a[0x05] ^= PC32dn(0x20, r); \
+		a[0x06] ^= PC32up(0x30, r); \
+		a[0x07] ^= PC32dn(0x30, r); \
+		a[0x08] ^= PC32up(0x40, r); \
+		a[0x09] ^= PC32dn(0x40, r); \
+		a[0x0A] ^= PC32up(0x50, r); \
+		a[0x0B] ^= PC32dn(0x50, r); \
+		a[0x0C] ^= PC32up(0x60, r); \
+		a[0x0D] ^= PC32dn(0x60, r); \
+		a[0x0E] ^= PC32up(0x70, r); \
+		a[0x0F] ^= PC32dn(0x70, r); \
+		a[0x10] ^= PC32up(0x80, r); \
+		a[0x11] ^= PC32dn(0x80, r); \
+		a[0x12] ^= PC32up(0x90, r); \
+		a[0x13] ^= PC32dn(0x90, r); \
+		a[0x14] ^= PC32up(0xA0, r); \
+		a[0x15] ^= PC32dn(0xA0, r); \
+		a[0x16] ^= PC32up(0xB0, r); \
+		a[0x17] ^= PC32dn(0xB0, r); \
+		a[0x18] ^= PC32up(0xC0, r); \
+		a[0x19] ^= PC32dn(0xC0, r); \
+		a[0x1A] ^= PC32up(0xD0, r); \
+		a[0x1B] ^= PC32dn(0xD0, r); \
+		a[0x1C] ^= PC32up(0xE0, r); \
+		a[0x1D] ^= PC32dn(0xE0, r); \
+		a[0x1E] ^= PC32up(0xF0, r); \
+		a[0x1F] ^= PC32dn(0xF0, r); \
+		RBTT(0x00, 0x01, a, /* then: 16 RBTT steps build the next state in t, two words each */ \
+			0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x17); \
+		RBTT(0x02, 0x03, a, \
+			0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x19); \
+		RBTT(0x04, 0x05, a, \
+			0x04, 0x06, 0x08, 0x0A, 0x0D, 0x0F, 0x11, 0x1B); \
+		RBTT(0x06, 0x07, a, \
+			0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x11, 0x13, 0x1D); \
+		RBTT(0x08, 0x09, a, \
+			0x08, 0x0A, 0x0C, 0x0E, 0x11, 0x13, 0x15, 0x1F); \
+		RBTT(0x0A, 0x0B, a, \
+			0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x15, 0x17, 0x01); \
+		RBTT(0x0C, 0x0D, a, \
+			0x0C, 0x0E, 0x10, 0x12, 0x15, 0x17, 0x19, 0x03); \
+		RBTT(0x0E, 0x0F, a, \
+			0x0E, 0x10, 0x12, 0x14, 0x17, 0x19, 0x1B, 0x05); \
+		RBTT(0x10, 0x11, a, \
+			0x10, 0x12, 0x14, 0x16, 0x19, 0x1B, 0x1D, 0x07); \
+		RBTT(0x12, 0x13, a, \
+			0x12, 0x14, 0x16, 0x18, 0x1B, 0x1D, 0x1F, 0x09); \
+		RBTT(0x14, 0x15, a, \
+			0x14, 0x16, 0x18, 0x1A, 0x1D, 0x1F, 0x01, 0x0B); \
+		RBTT(0x16, 0x17, a, \
+			0x16, 0x18, 0x1A, 0x1C, 0x1F, 0x01, 0x03, 0x0D); \
+		RBTT(0x18, 0x19, a, \
+			0x18, 0x1A, 0x1C, 0x1E, 0x01, 0x03, 0x05, 0x0F); \
+		RBTT(0x1A, 0x1B, a, \
+			0x1A, 0x1C, 0x1E, 0x00, 0x03, 0x05, 0x07, 0x11); \
+		RBTT(0x1C, 0x1D, a, \
+			0x1C, 0x1E, 0x00, 0x02, 0x05, 0x07, 0x09, 0x13); \
+		RBTT(0x1E, 0x1F, a, \
+			0x1E, 0x00, 0x02, 0x04, 0x07, 0x09, 0x0B, 0x15); \
+		memcpy(a, t, sizeof t); /* finally: t replaces a */ \
+	} while (0)
+
+#define ROUND_BIG_Q(a, r)   do { /* one Q round with index r, in place on the 32-word array a (fully unrolled) */ \
+		sph_u32 t[32]; \
+		a[0x00] ^= QC32up(0x00, r); /* first: XOR the Q round constants into all 32 words */ \
+		a[0x01] ^= QC32dn(0x00, r); \
+		a[0x02] ^= QC32up(0x10, r); \
+		a[0x03] ^= QC32dn(0x10, r); \
+		a[0x04] ^= QC32up(0x20, r); \
+		a[0x05] ^= QC32dn(0x20, r); \
+		a[0x06] ^= QC32up(0x30, r); \
+		a[0x07] ^= QC32dn(0x30, r); \
+		a[0x08] ^= QC32up(0x40, r); \
+		a[0x09] ^= QC32dn(0x40, r); \
+		a[0x0A] ^= QC32up(0x50, r); \
+		a[0x0B] ^= QC32dn(0x50, r); \
+		a[0x0C] ^= QC32up(0x60, r); \
+		a[0x0D] ^= QC32dn(0x60, r); \
+		a[0x0E] ^= QC32up(0x70, r); \
+		a[0x0F] ^= QC32dn(0x70, r); \
+		a[0x10] ^= QC32up(0x80, r); \
+		a[0x11] ^= QC32dn(0x80, r); \
+		a[0x12] ^= QC32up(0x90, r); \
+		a[0x13] ^= QC32dn(0x90, r); \
+		a[0x14] ^= QC32up(0xA0, r); \
+		a[0x15] ^= QC32dn(0xA0, r); \
+		a[0x16] ^= QC32up(0xB0, r); \
+		a[0x17] ^= QC32dn(0xB0, r); \
+		a[0x18] ^= QC32up(0xC0, r); \
+		a[0x19] ^= QC32dn(0xC0, r); \
+		a[0x1A] ^= QC32up(0xD0, r); \
+		a[0x1B] ^= QC32dn(0xD0, r); \
+		a[0x1C] ^= QC32up(0xE0, r); \
+		a[0x1D] ^= QC32dn(0xE0, r); \
+		a[0x1E] ^= QC32up(0xF0, r); \
+		a[0x1F] ^= QC32dn(0xF0, r); \
+		RBTT(0x00, 0x01, a, /* then: 16 RBTT steps build the next state in t; source indices differ from the P round */ \
+			0x02, 0x06, 0x0A, 0x16, 0x01, 0x05, 0x09, 0x0D); \
+		RBTT(0x02, 0x03, a, \
+			0x04, 0x08, 0x0C, 0x18, 0x03, 0x07, 0x0B, 0x0F); \
+		RBTT(0x04, 0x05, a, \
+			0x06, 0x0A, 0x0E, 0x1A, 0x05, 0x09, 0x0D, 0x11); \
+		RBTT(0x06, 0x07, a, \
+			0x08, 0x0C, 0x10, 0x1C, 0x07, 0x0B, 0x0F, 0x13); \
+		RBTT(0x08, 0x09, a, \
+			0x0A, 0x0E, 0x12, 0x1E, 0x09, 0x0D, 0x11, 0x15); \
+		RBTT(0x0A, 0x0B, a, \
+			0x0C, 0x10, 0x14, 0x00, 0x0B, 0x0F, 0x13, 0x17); \
+		RBTT(0x0C, 0x0D, a, \
+			0x0E, 0x12, 0x16, 0x02, 0x0D, 0x11, 0x15, 0x19); \
+		RBTT(0x0E, 0x0F, a, \
+			0x10, 0x14, 0x18, 0x04, 0x0F, 0x13, 0x17, 0x1B); \
+		RBTT(0x10, 0x11, a, \
+			0x12, 0x16, 0x1A, 0x06, 0x11, 0x15, 0x19, 0x1D); \
+		RBTT(0x12, 0x13, a, \
+			0x14, 0x18, 0x1C, 0x08, 0x13, 0x17, 0x1B, 0x1F); \
+		RBTT(0x14, 0x15, a, \
+			0x16, 0x1A, 0x1E, 0x0A, 0x15, 0x19, 0x1D, 0x01); \
+		RBTT(0x16, 0x17, a, \
+			0x18, 0x1C, 0x00, 0x0C, 0x17, 0x1B, 0x1F, 0x03); \
+		RBTT(0x18, 0x19, a, \
+			0x1A, 0x1E, 0x02, 0x0E, 0x19, 0x1D, 0x01, 0x05); \
+		RBTT(0x1A, 0x1B, a, \
+			0x1C, 0x00, 0x04, 0x10, 0x1B, 0x1F, 0x03, 0x07); \
+		RBTT(0x1C, 0x1D, a, \
+			0x1E, 0x02, 0x06, 0x12, 0x1D, 0x01, 0x05, 0x09); \
+		RBTT(0x1E, 0x1F, a, \
+			0x00, 0x04, 0x08, 0x14, 0x1F, 0x03, 0x07, 0x0B); \
+		memcpy(a, t, sizeof t); /* finally: t replaces a */ \
+	} while (0)
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_GROESTL
+
+#define PERM_BIG_P(a)   do { /* small-footprint build: runs P rounds 0..13 on a, one round per iteration */ \
+		int r; \
+		for (r = 0; r < 14; r ++) \
+			ROUND_BIG_P(a, r); \
+	} while (0)
+
+#define PERM_BIG_Q(a)   do { /* small-footprint build: runs Q rounds 0..13 on a, one round per iteration */ \
+		int r; \
+		for (r = 0; r < 14; r ++) \
+			ROUND_BIG_Q(a, r); \
+	} while (0)
+
+#else
+
+#define PERM_BIG_P(a)   do { /* runs P rounds 0..13 on a, two rounds expanded per loop iteration */ \
+		int r; \
+		for (r = 0; r < 14; r += 2) { \
+			ROUND_BIG_P(a, r + 0); \
+			ROUND_BIG_P(a, r + 1); \
+		} \
+	} while (0)
+
+#define PERM_BIG_Q(a)   do { /* runs Q rounds 0..13 on a, two rounds expanded per loop iteration */ \
+		int r; \
+		for (r = 0; r < 14; r += 2) { \
+			ROUND_BIG_Q(a, r + 0); \
+			ROUND_BIG_Q(a, r + 1); \
+		} \
+	} while (0)
+
+#endif
+
+#define COMPRESS_BIG   do { /* folds the 128-byte block at buf into H; buf and H come from the enclosing scope */ \
+		sph_u32 g[32], m[32]; \
+		size_t u; \
+		for (u = 0; u < 32; u ++) { \
+			m[u] = dec32e_aligned(buf + (u << 2)); /* m = message block as 32 words */ \
+			g[u] = m[u] ^ H[u]; /* g = message XOR chaining value */ \
+		} \
+		PERM_BIG_P(g); \
+		PERM_BIG_Q(m); \
+		for (u = 0; u < 32; u ++) \
+			H[u] ^= g[u] ^ m[u]; /* H ^= P(H ^ m) ^ Q(m) */ \
+	} while (0)
+
+#define FINAL_BIG   do { /* final transform on the local H: H ^= P(H) */ \
+		sph_u32 x[32]; \
+		size_t u; \
+		memcpy(x, H, sizeof x); /* permute a copy so H itself is still available for the XOR */ \
+		PERM_BIG_P(x); \
+		for (u = 0; u < 32; u ++) \
+			H[u] ^= x[u]; \
+	} while (0)
+
+#endif
+
+/* Reset sc for a new computation: rewind the buffer index and block counter, zero the
+ * chaining value and record out_size (digest length in bits) in its last word. */
+static void
+groestl_small_init(sph_groestl_small_context *sc, unsigned out_size)
+{
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_low = 0;
+	sc->count_high = 0;
+#endif
+	sc->ptr = 0;
+#if SPH_GROESTL_64
+	memset(sc->state.wide, 0, 7 * sizeof sc->state.wide[0]);
+#if USE_LE
+	/* the two low bytes of out_size, swapped, land in the two top bytes */
+	sc->state.wide[7] = ((sph_u64)(out_size & 0xFF00) << 40)
+		| ((sph_u64)(out_size & 0xFF) << 56);
+#else
+	sc->state.wide[7] = (sph_u64)out_size;
+#endif
+#else
+	memset(sc->state.narrow, 0, 15 * sizeof sc->state.narrow[0]);
+#if USE_LE
+	/* the two low bytes of out_size, swapped, land in the two top bytes */
+	sc->state.narrow[15] = ((sph_u32)(out_size & 0xFF00) << 8)
+		| ((sph_u32)(out_size & 0xFF) << 24);
+#else
+	sc->state.narrow[15] = (sph_u32)out_size;
+#endif
+#endif
+}
+
+/* Absorb len bytes from data: bytes accumulate in sc->buf and every time the
+ * buffer becomes full it is compressed into the state and the block counter advances. */
+static void
+groestl_small_core(sph_groestl_small_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = (const unsigned char *)data;
+	unsigned char *buf = sc->buf;	/* COMPRESS_SMALL reads the block through this name */
+	size_t ptr = sc->ptr;
+	DECL_STATE_SMALL
+
+	if (len < (sizeof sc->buf) - ptr) {
+		/* the buffer will not fill up: just stash the bytes */
+		memcpy(buf + ptr, src, len);
+		sc->ptr = ptr + len;
+		return;
+	}
+
+	READ_STATE_SMALL(sc);
+	while (len > 0) {
+		size_t room = (sizeof sc->buf) - ptr;
+		size_t take = (room > len) ? len : room;
+
+		memcpy(buf + ptr, src, take);
+		src += take;
+		len -= take;
+		ptr += take;
+		if (ptr != sizeof sc->buf)
+			continue;
+		COMPRESS_SMALL;
+		ptr = 0;
+#if SPH_64
+		sc->count ++;
+#else
+		sc->count_low = SPH_T32(sc->count_low + 1);
+		if (sc->count_low == 0)
+			sc->count_high = SPH_T32(sc->count_high + 1);
+#endif
+	}
+	WRITE_STATE_SMALL(sc);
+	sc->ptr = ptr;
+}
+
+/* Finish the hash: absorb ub's top n bits + padding + block count, apply the final transform, write out_len bytes to dst, reset sc. */
+static void
+groestl_small_close(sph_groestl_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	unsigned char pad[72];	/* pad_len is at most 128 - 56 = 72 */
+	size_t u, ptr, pad_len;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+	unsigned z = 0x80 >> n;	/* position of the padding bit, right after the n extra bits */
+	DECL_STATE_SMALL
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	pad[0] = ((ub & -z) | z) & 0xFF;	/* top n bits of ub, then a single 1 bit, then zeros */
+	if (ptr < 56) {
+		pad_len = 64 - ptr;	/* the 8-byte count still fits in the current block */
+#if SPH_64
+		count = SPH_T64(sc->count + 1);
+#else
+		count_low = SPH_T32(sc->count_low + 1);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low == 0)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	} else {
+		pad_len = 128 - ptr;	/* padding spills into one extra block */
+#if SPH_64
+		count = SPH_T64(sc->count + 2);
+#else
+		count_low = SPH_T32(sc->count_low + 2);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low <= 1)	/* adding 2 wrapped around */
+			count_high = SPH_T32(count_high + 1);
+#endif
+	}
+	memset(pad + 1, 0, pad_len - 9);
+#if SPH_64
+	sph_enc64be(pad + pad_len - 8, count);
+#else
+	/* two 32-bit halves, 4 bytes each: a 64-bit store here would run past pad + pad_len */
+	sph_enc32be(pad + pad_len - 8, count_high);
+	sph_enc32be(pad + pad_len - 4, count_low);
+#endif
+	groestl_small_core(sc, pad, pad_len);
+	READ_STATE_SMALL(sc);
+	FINAL_SMALL;
+#if SPH_GROESTL_64
+	for (u = 0; u < 4; u ++)
+		enc64e(pad + (u << 3), H[u + 4]);
+#else
+	for (u = 0; u < 8; u ++)
+		enc32e(pad + (u << 2), H[u + 8]);
+#endif
+	memcpy(dst, pad + 32 - out_len, out_len);	/* digest = last out_len of the 32 encoded bytes */
+	groestl_small_init(sc, (unsigned)out_len << 3);
+}
+
+/* Reset sc for a new computation: rewind the buffer index and block counter, zero the
+ * chaining value and record out_size (digest length in bits) in its last word. */
+static void
+groestl_big_init(sph_groestl_big_context *sc, unsigned out_size)
+{
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_low = 0;
+	sc->count_high = 0;
+#endif
+	sc->ptr = 0;
+#if SPH_GROESTL_64
+	memset(sc->state.wide, 0, 15 * sizeof sc->state.wide[0]);
+#if USE_LE
+	/* the two low bytes of out_size, swapped, land in the two top bytes */
+	sc->state.wide[15] = ((sph_u64)(out_size & 0xFF00) << 40)
+		| ((sph_u64)(out_size & 0xFF) << 56);
+#else
+	sc->state.wide[15] = (sph_u64)out_size;
+#endif
+#else
+	memset(sc->state.narrow, 0, 31 * sizeof sc->state.narrow[0]);
+#if USE_LE
+	/* the two low bytes of out_size, swapped, land in the two top bytes */
+	sc->state.narrow[31] = ((sph_u32)(out_size & 0xFF00) << 8)
+		| ((sph_u32)(out_size & 0xFF) << 24);
+#else
+	sc->state.narrow[31] = (sph_u32)out_size;
+#endif
+#endif
+}
+
+/* Absorb len bytes from data: bytes accumulate in sc->buf and every time the
+ * buffer becomes full it is compressed into the state and the block counter advances. */
+static void
+groestl_big_core(sph_groestl_big_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = (const unsigned char *)data;
+	unsigned char *buf = sc->buf;	/* COMPRESS_BIG reads the block through this name */
+	size_t ptr = sc->ptr;
+	DECL_STATE_BIG
+
+	if (len < (sizeof sc->buf) - ptr) {
+		/* the buffer will not fill up: just stash the bytes */
+		memcpy(buf + ptr, src, len);
+		sc->ptr = ptr + len;
+		return;
+	}
+
+	READ_STATE_BIG(sc);
+	while (len > 0) {
+		size_t room = (sizeof sc->buf) - ptr;
+		size_t take = (room > len) ? len : room;
+
+		memcpy(buf + ptr, src, take);
+		src += take;
+		len -= take;
+		ptr += take;
+		if (ptr != sizeof sc->buf)
+			continue;
+		COMPRESS_BIG;
+		ptr = 0;
+#if SPH_64
+		sc->count ++;
+#else
+		sc->count_low = SPH_T32(sc->count_low + 1);
+		if (sc->count_low == 0)
+			sc->count_high = SPH_T32(sc->count_high + 1);
+#endif
+	}
+	WRITE_STATE_BIG(sc);
+	sc->ptr = ptr;
+}
+
+/* Finish the hash: absorb ub's top n bits + padding + block count, apply the final transform, write out_len bytes to dst, reset sc. */
+static void
+groestl_big_close(sph_groestl_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	unsigned char pad[136];	/* pad_len is at most 256 - 120 = 136 */
+	size_t ptr, pad_len, u;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+	unsigned z = 0x80 >> n;	/* position of the padding bit, right after the n extra bits */
+	DECL_STATE_BIG
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	pad[0] = ((ub & -z) | z) & 0xFF;	/* top n bits of ub, then a single 1 bit, then zeros */
+	if (ptr < 120) {
+		pad_len = 128 - ptr;	/* the 8-byte count still fits in the current block */
+#if SPH_64
+		count = SPH_T64(sc->count + 1);
+#else
+		count_low = SPH_T32(sc->count_low + 1);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low == 0)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	} else {
+		pad_len = 256 - ptr;	/* padding spills into one extra block */
+#if SPH_64
+		count = SPH_T64(sc->count + 2);
+#else
+		count_low = SPH_T32(sc->count_low + 2);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low <= 1)	/* adding 2 wrapped around */
+			count_high = SPH_T32(count_high + 1);
+#endif
+	}
+	memset(pad + 1, 0, pad_len - 9);
+#if SPH_64
+	sph_enc64be(pad + pad_len - 8, count);
+#else
+	/* two 32-bit halves, 4 bytes each: a 64-bit store here would run past pad + pad_len */
+	sph_enc32be(pad + pad_len - 8, count_high);
+	sph_enc32be(pad + pad_len - 4, count_low);
+#endif
+	groestl_big_core(sc, pad, pad_len);
+	READ_STATE_BIG(sc);
+	FINAL_BIG;
+#if SPH_GROESTL_64
+	for (u = 0; u < 8; u ++)
+		enc64e(pad + (u << 3), H[u + 8]);
+#else
+	for (u = 0; u < 16; u ++)
+		enc32e(pad + (u << 2), H[u + 16]);
+#endif
+	memcpy(dst, pad + 64 - out_len, out_len);	/* digest = last out_len of the 64 encoded bytes */
+	groestl_big_init(sc, (unsigned)out_len << 3);
+}
+
+/* Groestl-224 context setup (224-bit output); declared in sph_groestl.h */
+void
+sph_groestl224_init(void *cc)
+{
+	groestl_small_init((sph_groestl_small_context *)cc, 224);
+}
+
+/* Groestl-224 data input: feeds len bytes to the small core; declared in sph_groestl.h */
+void
+sph_groestl224(void *cc, const void *data, size_t len)
+{
+	groestl_small_core((sph_groestl_small_context *)cc, data, len);
+}
+
+/* Groestl-224 finish with no extra bits: 28 digest bytes go to dst; declared in sph_groestl.h */
+void
+sph_groestl224_close(void *cc, void *dst)
+{
+	groestl_small_close((sph_groestl_small_context *)cc, 0, 0, dst, 28);
+}
+
+/* Groestl-224 finish after n extra bits taken from ub: 28 digest bytes go to dst; declared in sph_groestl.h */
+void
+sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_small_close((sph_groestl_small_context *)cc, ub, n, dst, 28);
+}
+
+/* Groestl-256 context setup (256-bit output); declared in sph_groestl.h */
+void
+sph_groestl256_init(void *cc)
+{
+	groestl_small_init((sph_groestl_small_context *)cc, 256);
+}
+
+/* Groestl-256 data input: feeds len bytes to the small core; declared in sph_groestl.h */
+void
+sph_groestl256(void *cc, const void *data, size_t len)
+{
+	groestl_small_core((sph_groestl_small_context *)cc, data, len);
+}
+
+/* Groestl-256 finish with no extra bits: 32 digest bytes go to dst; declared in sph_groestl.h */
+void
+sph_groestl256_close(void *cc, void *dst)
+{
+	groestl_small_close((sph_groestl_small_context *)cc, 0, 0, dst, 32);
+}
+
+/* Groestl-256 finish after n extra bits taken from ub: 32 digest bytes go to dst; declared in sph_groestl.h */
+void
+sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_small_close((sph_groestl_small_context *)cc, ub, n, dst, 32);
+}
+
+/* Groestl-384 context setup (384-bit output); declared in sph_groestl.h */
+void
+sph_groestl384_init(void *cc)
+{
+	groestl_big_init((sph_groestl_big_context *)cc, 384);
+}
+
+/* Groestl-384 data input: feeds len bytes to the big core; declared in sph_groestl.h */
+void
+sph_groestl384(void *cc, const void *data, size_t len)
+{
+	groestl_big_core((sph_groestl_big_context *)cc, data, len);
+}
+
+/* Groestl-384 finish with no extra bits: 48 digest bytes go to dst; declared in sph_groestl.h */
+void
+sph_groestl384_close(void *cc, void *dst)
+{
+	groestl_big_close((sph_groestl_big_context *)cc, 0, 0, dst, 48);
+}
+
+/* Groestl-384 finish after n extra bits taken from ub: 48 digest bytes go to dst; declared in sph_groestl.h */
+void
+sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_big_close((sph_groestl_big_context *)cc, ub, n, dst, 48);
+}
+
+/* Groestl-512 context setup (512-bit output); declared in sph_groestl.h */
+void
+sph_groestl512_init(void *cc)
+{
+	groestl_big_init((sph_groestl_big_context *)cc, 512);
+}
+
+/* Groestl-512 data input: feeds len bytes to the big core; declared in sph_groestl.h */
+void
+sph_groestl512(void *cc, const void *data, size_t len)
+{
+	groestl_big_core((sph_groestl_big_context *)cc, data, len);
+}
+
+/* Groestl-512 finish with no extra bits: 64 digest bytes go to dst; declared in sph_groestl.h */
+void
+sph_groestl512_close(void *cc, void *dst)
+{
+	groestl_big_close((sph_groestl_big_context *)cc, 0, 0, dst, 64);
+}
+
+/* Groestl-512 finish after n extra bits taken from ub: 64 digest bytes go to dst; declared in sph_groestl.h */
+void
+sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_big_close((sph_groestl_big_context *)cc, ub, n, dst, 64);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/jh.c b/sha3/jh.c
new file mode 100644
index 0000000..41487a5
--- /dev/null
+++ b/sha3/jh.c
@@ -0,0 +1,1116 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_jh.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/* Build-time selection: small-footprint code, 64-bit state words, MSVC warning. */
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH   1
+#endif
+
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64   1
+#endif
+
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+/* C4146 is MSVC's "unary minus on unsigned" warning; jh_close computes -z. */
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness: C32e()/C64e() byte-swap
+ * a constant on little-endian builds and are the identity otherwise;
+ * dec*e_aligned/enc*e alias the sph codec of the matching endianness.
+ */
+#if SPH_LITTLE_ENDIAN
+
+#define C32e(x)     ((SPH_C32(x) >> 24) \
+                    | ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
+                    | ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
+                    | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+#define dec32e_aligned   sph_dec32le_aligned
+#define enc32e           sph_enc32le
+
+#if SPH_64
+#define C64e(x)     ((SPH_C64(x) >> 56) \
+                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
+                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
+                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
+                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
+                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
+                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
+                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
+#define dec64e_aligned   sph_dec64le_aligned
+#define enc64e           sph_enc64le
+#endif
+
+#else
+
+#define C32e(x)     SPH_C32(x)
+#define dec32e_aligned   sph_dec32be_aligned
+#define enc32e           sph_enc32be
+#if SPH_64
+#define C64e(x)     SPH_C64(x)
+#define dec64e_aligned   sph_dec64be_aligned
+#define enc64e           sph_enc64be
+#endif
+
+#endif
+/* Sb: in-place nonlinear step on x0..x3 keyed by constant word c; needs `tmp` in scope. */
+#define Sb(x0, x1, x2, x3, c)   do { \
+		x3 = ~x3; \
+		x0 ^= (c) & ~x2; \
+		tmp = (c) ^ (x0 & x1); \
+		x0 ^= x2 & x3; \
+		x3 ^= ~x1 & x2; \
+		x1 ^= x0 & x2; \
+		x2 ^= x0 & ~x3; \
+		x0 ^= x1 | x3; \
+		x3 ^= x1 & x2; \
+		x1 ^= tmp & x0; \
+		x2 ^= tmp; \
+	} while (0)
+/* Lb: XOR-only mix; folds x0..x3 into x4..x7, then the new x4..x7 back into x0..x3. */
+#define Lb(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		x4 ^= x1; \
+		x5 ^= x2; \
+		x6 ^= x3 ^ x0; \
+		x7 ^= x0; \
+		x0 ^= x5; \
+		x1 ^= x6; \
+		x2 ^= x7 ^ x4; \
+		x3 ^= x4; \
+	} while (0)
+
+#if SPH_JH_64
+/* Round constants as 64-bit words: 4 per round for 42 rounds, stored through C64e. */
+static const sph_u64 C[] = {
+	C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
+	C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
+	C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a),
+	C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231),
+	C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410),
+	C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc),
+	C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0),
+	C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3),
+	C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce),
+	C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23),
+	C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8),
+	C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197),
+	C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95),
+	C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214),
+	C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80),
+	C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4),
+	C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989),
+	C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36),
+	C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7),
+	C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f),
+	C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727),
+	C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b),
+	C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e),
+	C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062),
+	C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984),
+	C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5),
+	C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2),
+	C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f),
+	C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465),
+	C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a),
+	C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1),
+	C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf),
+	C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48),
+	C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0),
+	C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134),
+	C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a),
+	C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff),
+	C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6),
+	C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae),
+	C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567),
+	C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a),
+	C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518),
+	C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446),
+	C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e),
+	C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee),
+	C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001),
+	C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779),
+	C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83),
+	C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a),
+	C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef),
+	C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d),
+	C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65),
+	C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a),
+	C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c),
+	C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d),
+	C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71),
+	C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc),
+	C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0),
+	C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c),
+	C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f),
+	C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751),
+	C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad),
+	C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56),
+	C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6),
+	C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a),
+	C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163),
+	C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826),
+	C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f),
+	C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30),
+	C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a),
+	C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3),
+	C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505),
+	C64e(0xb17681d913326cce), C64e(0x3c175284f805a262),
+	C64e(0xf42bcbb378471547), C64e(0xff46548223936a48),
+	C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e),
+	C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e),
+	C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd),
+	C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7),
+	C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be),
+	C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de),
+	C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9),
+	C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a),
+	C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
+	C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
+};
+/* Round r reads C[4r .. 4r+3]: even hi, even lo, odd hi, odd lo. */
+#define Ceven_hi(r)   (C[((r) << 2) + 0])
+#define Ceven_lo(r)   (C[((r) << 2) + 1])
+#define Codd_hi(r)    (C[((r) << 2) + 2])
+#define Codd_lo(r)    (C[((r) << 2) + 3])
+/* S: run Sb on the h halves and on the l halves, with cb's hi/lo constant for round r. */
+#define S(x0, x1, x2, x3, cb, r)   do { \
+		Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
+		Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
+	} while (0)
+/* L: run the Lb mix on the h halves, then on the l halves, of the eight words. */
+#define L(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
+			x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
+		Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
+			x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
+	} while (0)
+/* Wz: in both halves of x, exchange each n-bit group under mask c with the one above it. */
+#define Wz(x, c, n)   do { \
+		sph_u64 t = (x ## h & (c)) << (n); \
+		x ## h = ((x ## h >> (n)) & (c)) | t; \
+		t = (x ## l & (c)) << (n); \
+		x ## l = ((x ## l >> (n)) & (c)) | t; \
+	} while (0)
+/* W0..W5 swap 1/2/4/8/16/32-bit groups inside each half; W6 swaps the h and l halves. */
+#define W0(x)   Wz(x, SPH_C64(0x5555555555555555),  1)
+#define W1(x)   Wz(x, SPH_C64(0x3333333333333333),  2)
+#define W2(x)   Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F),  4)
+#define W3(x)   Wz(x, SPH_C64(0x00FF00FF00FF00FF),  8)
+#define W4(x)   Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
+#define W5(x)   Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
+#define W6(x)   do { \
+		sph_u64 t = x ## h; \
+		x ## h = x ## l; \
+		x ## l = t; \
+	} while (0)
+/* Declares the 16 local 64-bit state words (h0..h7, h/l halves) and Sb's scratch tmp. */
+#define DECL_STATE \
+	sph_u64 h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
+	sph_u64 h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
+	sph_u64 tmp;
+/* Load (state)->H.wide[0..15] into the locals; each word's h half precedes its l half. */
+#define READ_STATE(state)   do { \
+		h0h = (state)->H.wide[ 0]; \
+		h0l = (state)->H.wide[ 1]; \
+		h1h = (state)->H.wide[ 2]; \
+		h1l = (state)->H.wide[ 3]; \
+		h2h = (state)->H.wide[ 4]; \
+		h2l = (state)->H.wide[ 5]; \
+		h3h = (state)->H.wide[ 6]; \
+		h3l = (state)->H.wide[ 7]; \
+		h4h = (state)->H.wide[ 8]; \
+		h4l = (state)->H.wide[ 9]; \
+		h5h = (state)->H.wide[10]; \
+		h5l = (state)->H.wide[11]; \
+		h6h = (state)->H.wide[12]; \
+		h6l = (state)->H.wide[13]; \
+		h7h = (state)->H.wide[14]; \
+		h7l = (state)->H.wide[15]; \
+	} while (0)
+/* Store the 16 state locals back into (state)->H.wide[0..15], h half then l half. */
+#define WRITE_STATE(state)   do { \
+		(state)->H.wide[ 0] = h0h; \
+		(state)->H.wide[ 1] = h0l; \
+		(state)->H.wide[ 2] = h1h; \
+		(state)->H.wide[ 3] = h1l; \
+		(state)->H.wide[ 4] = h2h; \
+		(state)->H.wide[ 5] = h2l; \
+		(state)->H.wide[ 6] = h3h; \
+		(state)->H.wide[ 7] = h3l; \
+		(state)->H.wide[ 8] = h4h; \
+		(state)->H.wide[ 9] = h4l; \
+		(state)->H.wide[10] = h5h; \
+		(state)->H.wide[11] = h5l; \
+		(state)->H.wide[12] = h6h; \
+		(state)->H.wide[13] = h6l; \
+		(state)->H.wide[14] = h7h; \
+		(state)->H.wide[15] = h7l; \
+	} while (0)
+/* Declares m0..m3 (h/l) decoded from the 64 bytes at `buf` and XORs them into h0..h3. */
+#define INPUT_BUF1 \
+	sph_u64 m0h = dec64e_aligned(buf +  0); \
+	sph_u64 m0l = dec64e_aligned(buf +  8); \
+	sph_u64 m1h = dec64e_aligned(buf + 16); \
+	sph_u64 m1l = dec64e_aligned(buf + 24); \
+	sph_u64 m2h = dec64e_aligned(buf + 32); \
+	sph_u64 m2l = dec64e_aligned(buf + 40); \
+	sph_u64 m3h = dec64e_aligned(buf + 48); \
+	sph_u64 m3l = dec64e_aligned(buf + 56); \
+	h0h ^= m0h; \
+	h0l ^= m0l; \
+	h1h ^= m1h; \
+	h1l ^= m1l; \
+	h2h ^= m2h; \
+	h2l ^= m2l; \
+	h3h ^= m3h; \
+	h3l ^= m3l;
+/* XORs the message words declared by INPUT_BUF1 into h4..h7 (same scope required). */
+#define INPUT_BUF2 \
+	h4h ^= m0h; \
+	h4l ^= m0l; \
+	h5h ^= m1h; \
+	h5l ^= m1l; \
+	h6h ^= m2h; \
+	h6l ^= m2l; \
+	h7h ^= m3h; \
+	h7l ^= m3l;
+/* Initial H for JH-224: 16 words that jh_init copies into H.wide. */
+static const sph_u64 IV224[] = {
+	C64e(0x2dfedd62f99a98ac), C64e(0xae7cacd619d634e7),
+	C64e(0xa4831005bc301216), C64e(0xb86038c6c9661494),
+	C64e(0x66d9899f2580706f), C64e(0xce9ea31b1d9b1adc),
+	C64e(0x11e8325f7b366e10), C64e(0xf994857f02fa06c1),
+	C64e(0x1b4f1b5cd8c840b3), C64e(0x97f6a17f6e738099),
+	C64e(0xdcdf93a5adeaa3d3), C64e(0xa431e8dec9539a68),
+	C64e(0x22b4a98aec86a1e4), C64e(0xd574ac959ce56cf0),
+	C64e(0x15960deab5ab2bbf), C64e(0x9611dcf0dd64ea6e)
+};
+/* Initial H for JH-256: 16 words that jh_init copies into H.wide. */
+static const sph_u64 IV256[] = {
+	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
+	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
+	C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477),
+	C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8),
+	C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262),
+	C64e(0x277695f776248f94), C64e(0x87d5b6574780296c),
+	C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f),
+	C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769)
+};
+/* Initial H for JH-384: 16 words that jh_init copies into H.wide. */
+static const sph_u64 IV384[] = {
+	C64e(0x481e3bc6d813398a), C64e(0x6d3b5e894ade879b),
+	C64e(0x63faea68d480ad2e), C64e(0x332ccb21480f8267),
+	C64e(0x98aec84d9082b928), C64e(0xd455ea3041114249),
+	C64e(0x36f555b2924847ec), C64e(0xc7250a93baf43ce1),
+	C64e(0x569b7f8a27db454c), C64e(0x9efcbd496397af0e),
+	C64e(0x589fc27d26aa80cd), C64e(0x80c08b8c9deb2eda),
+	C64e(0x8a7981e8f8d5373a), C64e(0xf43967adddd17a71),
+	C64e(0xa9b4d3bda475d394), C64e(0x976c3fba9842737f)
+};
+/* Initial H for JH-512: 16 words that jh_init copies into H.wide. */
+static const sph_u64 IV512[] = {
+	C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543),
+	C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361),
+	C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80),
+	C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7),
+	C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a),
+	C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199),
+	C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
+	C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
+};
+
+#else
+/* Round constants as 32-bit words: 8 per round for 42 rounds, stored through C32e. */
+static const sph_u32 C[] = {
+	C32e(0x72d5dea2), C32e(0xdf15f867), C32e(0x7b84150a),
+	C32e(0xb7231557), C32e(0x81abd690), C32e(0x4d5a87f6),
+	C32e(0x4e9f4fc5), C32e(0xc3d12b40), C32e(0xea983ae0),
+	C32e(0x5c45fa9c), C32e(0x03c5d299), C32e(0x66b2999a),
+	C32e(0x660296b4), C32e(0xf2bb538a), C32e(0xb556141a),
+	C32e(0x88dba231), C32e(0x03a35a5c), C32e(0x9a190edb),
+	C32e(0x403fb20a), C32e(0x87c14410), C32e(0x1c051980),
+	C32e(0x849e951d), C32e(0x6f33ebad), C32e(0x5ee7cddc),
+	C32e(0x10ba1392), C32e(0x02bf6b41), C32e(0xdc786515),
+	C32e(0xf7bb27d0), C32e(0x0a2c8139), C32e(0x37aa7850),
+	C32e(0x3f1abfd2), C32e(0x410091d3), C32e(0x422d5a0d),
+	C32e(0xf6cc7e90), C32e(0xdd629f9c), C32e(0x92c097ce),
+	C32e(0x185ca70b), C32e(0xc72b44ac), C32e(0xd1df65d6),
+	C32e(0x63c6fc23), C32e(0x976e6c03), C32e(0x9ee0b81a),
+	C32e(0x2105457e), C32e(0x446ceca8), C32e(0xeef103bb),
+	C32e(0x5d8e61fa), C32e(0xfd9697b2), C32e(0x94838197),
+	C32e(0x4a8e8537), C32e(0xdb03302f), C32e(0x2a678d2d),
+	C32e(0xfb9f6a95), C32e(0x8afe7381), C32e(0xf8b8696c),
+	C32e(0x8ac77246), C32e(0xc07f4214), C32e(0xc5f4158f),
+	C32e(0xbdc75ec4), C32e(0x75446fa7), C32e(0x8f11bb80),
+	C32e(0x52de75b7), C32e(0xaee488bc), C32e(0x82b8001e),
+	C32e(0x98a6a3f4), C32e(0x8ef48f33), C32e(0xa9a36315),
+	C32e(0xaa5f5624), C32e(0xd5b7f989), C32e(0xb6f1ed20),
+	C32e(0x7c5ae0fd), C32e(0x36cae95a), C32e(0x06422c36),
+	C32e(0xce293543), C32e(0x4efe983d), C32e(0x533af974),
+	C32e(0x739a4ba7), C32e(0xd0f51f59), C32e(0x6f4e8186),
+	C32e(0x0e9dad81), C32e(0xafd85a9f), C32e(0xa7050667),
+	C32e(0xee34626a), C32e(0x8b0b28be), C32e(0x6eb91727),
+	C32e(0x47740726), C32e(0xc680103f), C32e(0xe0a07e6f),
+	C32e(0xc67e487b), C32e(0x0d550aa5), C32e(0x4af8a4c0),
+	C32e(0x91e3e79f), C32e(0x978ef19e), C32e(0x86767281),
+	C32e(0x50608dd4), C32e(0x7e9e5a41), C32e(0xf3e5b062),
+	C32e(0xfc9f1fec), C32e(0x4054207a), C32e(0xe3e41a00),
+	C32e(0xcef4c984), C32e(0x4fd794f5), C32e(0x9dfa95d8),
+	C32e(0x552e7e11), C32e(0x24c354a5), C32e(0x5bdf7228),
+	C32e(0xbdfe6e28), C32e(0x78f57fe2), C32e(0x0fa5c4b2),
+	C32e(0x05897cef), C32e(0xee49d32e), C32e(0x447e9385),
+	C32e(0xeb28597f), C32e(0x705f6937), C32e(0xb324314a),
+	C32e(0x5e8628f1), C32e(0x1dd6e465), C32e(0xc71b7704),
+	C32e(0x51b920e7), C32e(0x74fe43e8), C32e(0x23d4878a),
+	C32e(0x7d29e8a3), C32e(0x927694f2), C32e(0xddcb7a09),
+	C32e(0x9b30d9c1), C32e(0x1d1b30fb), C32e(0x5bdc1be0),
+	C32e(0xda24494f), C32e(0xf29c82bf), C32e(0xa4e7ba31),
+	C32e(0xb470bfff), C32e(0x0d324405), C32e(0xdef8bc48),
+	C32e(0x3baefc32), C32e(0x53bbd339), C32e(0x459fc3c1),
+	C32e(0xe0298ba0), C32e(0xe5c905fd), C32e(0xf7ae090f),
+	C32e(0x94703412), C32e(0x4290f134), C32e(0xa271b701),
+	C32e(0xe344ed95), C32e(0xe93b8e36), C32e(0x4f2f984a),
+	C32e(0x88401d63), C32e(0xa06cf615), C32e(0x47c1444b),
+	C32e(0x8752afff), C32e(0x7ebb4af1), C32e(0xe20ac630),
+	C32e(0x4670b6c5), C32e(0xcc6e8ce6), C32e(0xa4d5a456),
+	C32e(0xbd4fca00), C32e(0xda9d844b), C32e(0xc83e18ae),
+	C32e(0x7357ce45), C32e(0x3064d1ad), C32e(0xe8a6ce68),
+	C32e(0x145c2567), C32e(0xa3da8cf2), C32e(0xcb0ee116),
+	C32e(0x33e90658), C32e(0x9a94999a), C32e(0x1f60b220),
+	C32e(0xc26f847b), C32e(0xd1ceac7f), C32e(0xa0d18518),
+	C32e(0x32595ba1), C32e(0x8ddd19d3), C32e(0x509a1cc0),
+	C32e(0xaaa5b446), C32e(0x9f3d6367), C32e(0xe4046bba),
+	C32e(0xf6ca19ab), C32e(0x0b56ee7e), C32e(0x1fb179ea),
+	C32e(0xa9282174), C32e(0xe9bdf735), C32e(0x3b3651ee),
+	C32e(0x1d57ac5a), C32e(0x7550d376), C32e(0x3a46c2fe),
+	C32e(0xa37d7001), C32e(0xf735c1af), C32e(0x98a4d842),
+	C32e(0x78edec20), C32e(0x9e6b6779), C32e(0x41836315),
+	C32e(0xea3adba8), C32e(0xfac33b4d), C32e(0x32832c83),
+	C32e(0xa7403b1f), C32e(0x1c2747f3), C32e(0x5940f034),
+	C32e(0xb72d769a), C32e(0xe73e4e6c), C32e(0xd2214ffd),
+	C32e(0xb8fd8d39), C32e(0xdc5759ef), C32e(0x8d9b0c49),
+	C32e(0x2b49ebda), C32e(0x5ba2d749), C32e(0x68f3700d),
+	C32e(0x7d3baed0), C32e(0x7a8d5584), C32e(0xf5a5e9f0),
+	C32e(0xe4f88e65), C32e(0xa0b8a2f4), C32e(0x36103b53),
+	C32e(0x0ca8079e), C32e(0x753eec5a), C32e(0x91689492),
+	C32e(0x56e8884f), C32e(0x5bb05c55), C32e(0xf8babc4c),
+	C32e(0xe3bb3b99), C32e(0xf387947b), C32e(0x75daf4d6),
+	C32e(0x726b1c5d), C32e(0x64aeac28), C32e(0xdc34b36d),
+	C32e(0x6c34a550), C32e(0xb828db71), C32e(0xf861e2f2),
+	C32e(0x108d512a), C32e(0xe3db6433), C32e(0x59dd75fc),
+	C32e(0x1cacbcf1), C32e(0x43ce3fa2), C32e(0x67bbd13c),
+	C32e(0x02e843b0), C32e(0x330a5bca), C32e(0x8829a175),
+	C32e(0x7f34194d), C32e(0xb416535c), C32e(0x923b94c3),
+	C32e(0x0e794d1e), C32e(0x797475d7), C32e(0xb6eeaf3f),
+	C32e(0xeaa8d4f7), C32e(0xbe1a3921), C32e(0x5cf47e09),
+	C32e(0x4c232751), C32e(0x26a32453), C32e(0xba323cd2),
+	C32e(0x44a3174a), C32e(0x6da6d5ad), C32e(0xb51d3ea6),
+	C32e(0xaff2c908), C32e(0x83593d98), C32e(0x916b3c56),
+	C32e(0x4cf87ca1), C32e(0x7286604d), C32e(0x46e23ecc),
+	C32e(0x086ec7f6), C32e(0x2f9833b3), C32e(0xb1bc765e),
+	C32e(0x2bd666a5), C32e(0xefc4e62a), C32e(0x06f4b6e8),
+	C32e(0xbec1d436), C32e(0x74ee8215), C32e(0xbcef2163),
+	C32e(0xfdc14e0d), C32e(0xf453c969), C32e(0xa77d5ac4),
+	C32e(0x06585826), C32e(0x7ec11416), C32e(0x06e0fa16),
+	C32e(0x7e90af3d), C32e(0x28639d3f), C32e(0xd2c9f2e3),
+	C32e(0x009bd20c), C32e(0x5faace30), C32e(0xb7d40c30),
+	C32e(0x742a5116), C32e(0xf2e03298), C32e(0x0deb30d8),
+	C32e(0xe3cef89a), C32e(0x4bc59e7b), C32e(0xb5f17992),
+	C32e(0xff51e66e), C32e(0x048668d3), C32e(0x9b234d57),
+	C32e(0xe6966731), C32e(0xcce6a6f3), C32e(0x170a7505),
+	C32e(0xb17681d9), C32e(0x13326cce), C32e(0x3c175284),
+	C32e(0xf805a262), C32e(0xf42bcbb3), C32e(0x78471547),
+	C32e(0xff465482), C32e(0x23936a48), C32e(0x38df5807),
+	C32e(0x4e5e6565), C32e(0xf2fc7c89), C32e(0xfc86508e),
+	C32e(0x31702e44), C32e(0xd00bca86), C32e(0xf04009a2),
+	C32e(0x3078474e), C32e(0x65a0ee39), C32e(0xd1f73883),
+	C32e(0xf75ee937), C32e(0xe42c3abd), C32e(0x2197b226),
+	C32e(0x0113f86f), C32e(0xa344edd1), C32e(0xef9fdee7),
+	C32e(0x8ba0df15), C32e(0x762592d9), C32e(0x3c85f7f6),
+	C32e(0x12dc42be), C32e(0xd8a7ec7c), C32e(0xab27b07e),
+	C32e(0x538d7dda), C32e(0xaa3ea8de), C32e(0xaa25ce93),
+	C32e(0xbd0269d8), C32e(0x5af643fd), C32e(0x1a7308f9),
+	C32e(0xc05fefda), C32e(0x174a19a5), C32e(0x974d6633),
+	C32e(0x4cfd216a), C32e(0x35b49831), C32e(0xdb411570),
+	C32e(0xea1e0fbb), C32e(0xedcd549b), C32e(0x9ad063a1),
+	C32e(0x51974072), C32e(0xf6759dbf), C32e(0x91476fe2)
+};
+/* Round r reads C[8r .. 8r+7]: even lanes w3..w0, then odd lanes w3..w0. */
+#define Ceven_w3(r)   (C[((r) << 3) + 0])
+#define Ceven_w2(r)   (C[((r) << 3) + 1])
+#define Ceven_w1(r)   (C[((r) << 3) + 2])
+#define Ceven_w0(r)   (C[((r) << 3) + 3])
+#define Codd_w3(r)    (C[((r) << 3) + 4])
+#define Codd_w2(r)    (C[((r) << 3) + 5])
+#define Codd_w1(r)    (C[((r) << 3) + 6])
+#define Codd_w0(r)    (C[((r) << 3) + 7])
+/* S: run Sb on each of the four 32-bit lanes (3..0) with cb's constant for round r. */
+#define S(x0, x1, x2, x3, cb, r)   do { \
+		Sb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, cb ## w3(r)); \
+		Sb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, cb ## w2(r)); \
+		Sb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, cb ## w1(r)); \
+		Sb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, cb ## w0(r)); \
+	} while (0)
+/* L: run the Lb mix on each of the four 32-bit lanes (3..0) of the eight words. */
+#define L(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		Lb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, \
+			x4 ## 3, x5 ## 3, x6 ## 3, x7 ## 3); \
+		Lb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, \
+			x4 ## 2, x5 ## 2, x6 ## 2, x7 ## 2); \
+		Lb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, \
+			x4 ## 1, x5 ## 1, x6 ## 1, x7 ## 1); \
+		Lb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, \
+			x4 ## 0, x5 ## 0, x6 ## 0, x7 ## 0); \
+	} while (0)
+/* Wz: in all four lanes of x, exchange each n-bit group under mask c with the one above. */
+#define Wz(x, c, n)   do { \
+		sph_u32 t = (x ## 3 & (c)) << (n); \
+		x ## 3 = ((x ## 3 >> (n)) & (c)) | t; \
+		t = (x ## 2 & (c)) << (n); \
+		x ## 2 = ((x ## 2 >> (n)) & (c)) | t; \
+		t = (x ## 1 & (c)) << (n); \
+		x ## 1 = ((x ## 1 >> (n)) & (c)) | t; \
+		t = (x ## 0 & (c)) << (n); \
+		x ## 0 = ((x ## 0 >> (n)) & (c)) | t; \
+	} while (0)
+/* W0..W4 swap 1..16-bit groups in each lane; W5 swaps lanes 3<->2, 1<->0; W6 3<->1, 2<->0. */
+#define W0(x)   Wz(x, SPH_C32(0x55555555),  1)
+#define W1(x)   Wz(x, SPH_C32(0x33333333),  2)
+#define W2(x)   Wz(x, SPH_C32(0x0F0F0F0F),  4)
+#define W3(x)   Wz(x, SPH_C32(0x00FF00FF),  8)
+#define W4(x)   Wz(x, SPH_C32(0x0000FFFF), 16)
+#define W5(x)   do { \
+		sph_u32 t = x ## 3; \
+		x ## 3 = x ## 2; \
+		x ## 2 = t; \
+		t = x ## 1; \
+		x ## 1 = x ## 0; \
+		x ## 0 = t; \
+	} while (0)
+#define W6(x)   do { \
+		sph_u32 t = x ## 3; \
+		x ## 3 = x ## 1; \
+		x ## 1 = t; \
+		t = x ## 2; \
+		x ## 2 = x ## 0; \
+		x ## 0 = t; \
+	} while (0)
+/* Declares the 32 local 32-bit state words (h0..h7, lanes 3..0) and Sb's scratch tmp. */
+#define DECL_STATE \
+	sph_u32 h03, h02, h01, h00, h13, h12, h11, h10; \
+	sph_u32 h23, h22, h21, h20, h33, h32, h31, h30; \
+	sph_u32 h43, h42, h41, h40, h53, h52, h51, h50; \
+	sph_u32 h63, h62, h61, h60, h73, h72, h71, h70; \
+	sph_u32 tmp;
+/* Load (state)->H.narrow[0..31] into the locals; each word's lanes go 3, 2, 1, 0. */
+#define READ_STATE(state)   do { \
+		h03 = (state)->H.narrow[ 0]; \
+		h02 = (state)->H.narrow[ 1]; \
+		h01 = (state)->H.narrow[ 2]; \
+		h00 = (state)->H.narrow[ 3]; \
+		h13 = (state)->H.narrow[ 4]; \
+		h12 = (state)->H.narrow[ 5]; \
+		h11 = (state)->H.narrow[ 6]; \
+		h10 = (state)->H.narrow[ 7]; \
+		h23 = (state)->H.narrow[ 8]; \
+		h22 = (state)->H.narrow[ 9]; \
+		h21 = (state)->H.narrow[10]; \
+		h20 = (state)->H.narrow[11]; \
+		h33 = (state)->H.narrow[12]; \
+		h32 = (state)->H.narrow[13]; \
+		h31 = (state)->H.narrow[14]; \
+		h30 = (state)->H.narrow[15]; \
+		h43 = (state)->H.narrow[16]; \
+		h42 = (state)->H.narrow[17]; \
+		h41 = (state)->H.narrow[18]; \
+		h40 = (state)->H.narrow[19]; \
+		h53 = (state)->H.narrow[20]; \
+		h52 = (state)->H.narrow[21]; \
+		h51 = (state)->H.narrow[22]; \
+		h50 = (state)->H.narrow[23]; \
+		h63 = (state)->H.narrow[24]; \
+		h62 = (state)->H.narrow[25]; \
+		h61 = (state)->H.narrow[26]; \
+		h60 = (state)->H.narrow[27]; \
+		h73 = (state)->H.narrow[28]; \
+		h72 = (state)->H.narrow[29]; \
+		h71 = (state)->H.narrow[30]; \
+		h70 = (state)->H.narrow[31]; \
+	} while (0)
+/* Store the 32 state locals back into (state)->H.narrow[0..31], lanes 3, 2, 1, 0. */
+#define WRITE_STATE(state)   do { \
+		(state)->H.narrow[ 0] = h03; \
+		(state)->H.narrow[ 1] = h02; \
+		(state)->H.narrow[ 2] = h01; \
+		(state)->H.narrow[ 3] = h00; \
+		(state)->H.narrow[ 4] = h13; \
+		(state)->H.narrow[ 5] = h12; \
+		(state)->H.narrow[ 6] = h11; \
+		(state)->H.narrow[ 7] = h10; \
+		(state)->H.narrow[ 8] = h23; \
+		(state)->H.narrow[ 9] = h22; \
+		(state)->H.narrow[10] = h21; \
+		(state)->H.narrow[11] = h20; \
+		(state)->H.narrow[12] = h33; \
+		(state)->H.narrow[13] = h32; \
+		(state)->H.narrow[14] = h31; \
+		(state)->H.narrow[15] = h30; \
+		(state)->H.narrow[16] = h43; \
+		(state)->H.narrow[17] = h42; \
+		(state)->H.narrow[18] = h41; \
+		(state)->H.narrow[19] = h40; \
+		(state)->H.narrow[20] = h53; \
+		(state)->H.narrow[21] = h52; \
+		(state)->H.narrow[22] = h51; \
+		(state)->H.narrow[23] = h50; \
+		(state)->H.narrow[24] = h63; \
+		(state)->H.narrow[25] = h62; \
+		(state)->H.narrow[26] = h61; \
+		(state)->H.narrow[27] = h60; \
+		(state)->H.narrow[28] = h73; \
+		(state)->H.narrow[29] = h72; \
+		(state)->H.narrow[30] = h71; \
+		(state)->H.narrow[31] = h70; \
+	} while (0)
+/* Declares m0..m3 (lanes 3..0) decoded from the 64 bytes at `buf`, XORs them into h0..h3. */
+#define INPUT_BUF1 \
+	sph_u32 m03 = dec32e_aligned(buf +  0); \
+	sph_u32 m02 = dec32e_aligned(buf +  4); \
+	sph_u32 m01 = dec32e_aligned(buf +  8); \
+	sph_u32 m00 = dec32e_aligned(buf + 12); \
+	sph_u32 m13 = dec32e_aligned(buf + 16); \
+	sph_u32 m12 = dec32e_aligned(buf + 20); \
+	sph_u32 m11 = dec32e_aligned(buf + 24); \
+	sph_u32 m10 = dec32e_aligned(buf + 28); \
+	sph_u32 m23 = dec32e_aligned(buf + 32); \
+	sph_u32 m22 = dec32e_aligned(buf + 36); \
+	sph_u32 m21 = dec32e_aligned(buf + 40); \
+	sph_u32 m20 = dec32e_aligned(buf + 44); \
+	sph_u32 m33 = dec32e_aligned(buf + 48); \
+	sph_u32 m32 = dec32e_aligned(buf + 52); \
+	sph_u32 m31 = dec32e_aligned(buf + 56); \
+	sph_u32 m30 = dec32e_aligned(buf + 60); \
+	h03 ^= m03; \
+	h02 ^= m02; \
+	h01 ^= m01; \
+	h00 ^= m00; \
+	h13 ^= m13; \
+	h12 ^= m12; \
+	h11 ^= m11; \
+	h10 ^= m10; \
+	h23 ^= m23; \
+	h22 ^= m22; \
+	h21 ^= m21; \
+	h20 ^= m20; \
+	h33 ^= m33; \
+	h32 ^= m32; \
+	h31 ^= m31; \
+	h30 ^= m30;
+/* XORs the message words declared by INPUT_BUF1 into h4..h7 (same scope required). */
+#define INPUT_BUF2 \
+	h43 ^= m03; \
+	h42 ^= m02; \
+	h41 ^= m01; \
+	h40 ^= m00; \
+	h53 ^= m13; \
+	h52 ^= m12; \
+	h51 ^= m11; \
+	h50 ^= m10; \
+	h63 ^= m23; \
+	h62 ^= m22; \
+	h61 ^= m21; \
+	h60 ^= m20; \
+	h73 ^= m33; \
+	h72 ^= m32; \
+	h71 ^= m31; \
+	h70 ^= m30;
+/* Initial H for JH-224: 32 words that jh_init copies into H.narrow. */
+static const sph_u32 IV224[] = {
+	C32e(0x2dfedd62), C32e(0xf99a98ac), C32e(0xae7cacd6), C32e(0x19d634e7),
+	C32e(0xa4831005), C32e(0xbc301216), C32e(0xb86038c6), C32e(0xc9661494),
+	C32e(0x66d9899f), C32e(0x2580706f), C32e(0xce9ea31b), C32e(0x1d9b1adc),
+	C32e(0x11e8325f), C32e(0x7b366e10), C32e(0xf994857f), C32e(0x02fa06c1),
+	C32e(0x1b4f1b5c), C32e(0xd8c840b3), C32e(0x97f6a17f), C32e(0x6e738099),
+	C32e(0xdcdf93a5), C32e(0xadeaa3d3), C32e(0xa431e8de), C32e(0xc9539a68),
+	C32e(0x22b4a98a), C32e(0xec86a1e4), C32e(0xd574ac95), C32e(0x9ce56cf0),
+	C32e(0x15960dea), C32e(0xb5ab2bbf), C32e(0x9611dcf0), C32e(0xdd64ea6e)
+};
+/* Initial H for JH-256: 32 words that jh_init copies into H.narrow. */
+static const sph_u32 IV256[] = {
+	C32e(0xeb98a341), C32e(0x2c20d3eb), C32e(0x92cdbe7b), C32e(0x9cb245c1),
+	C32e(0x1c935191), C32e(0x60d4c7fa), C32e(0x260082d6), C32e(0x7e508a03),
+	C32e(0xa4239e26), C32e(0x7726b945), C32e(0xe0fb1a48), C32e(0xd41a9477),
+	C32e(0xcdb5ab26), C32e(0x026b177a), C32e(0x56f02442), C32e(0x0fff2fa8),
+	C32e(0x71a39689), C32e(0x7f2e4d75), C32e(0x1d144908), C32e(0xf77de262),
+	C32e(0x277695f7), C32e(0x76248f94), C32e(0x87d5b657), C32e(0x4780296c),
+	C32e(0x5c5e272d), C32e(0xac8e0d6c), C32e(0x518450c6), C32e(0x57057a0f),
+	C32e(0x7be4d367), C32e(0x702412ea), C32e(0x89e3ab13), C32e(0xd31cd769)
+};
+/* Initial H for JH-384: 32 words that jh_init copies into H.narrow. */
+static const sph_u32 IV384[] = {
+	C32e(0x481e3bc6), C32e(0xd813398a), C32e(0x6d3b5e89), C32e(0x4ade879b),
+	C32e(0x63faea68), C32e(0xd480ad2e), C32e(0x332ccb21), C32e(0x480f8267),
+	C32e(0x98aec84d), C32e(0x9082b928), C32e(0xd455ea30), C32e(0x41114249),
+	C32e(0x36f555b2), C32e(0x924847ec), C32e(0xc7250a93), C32e(0xbaf43ce1),
+	C32e(0x569b7f8a), C32e(0x27db454c), C32e(0x9efcbd49), C32e(0x6397af0e),
+	C32e(0x589fc27d), C32e(0x26aa80cd), C32e(0x80c08b8c), C32e(0x9deb2eda),
+	C32e(0x8a7981e8), C32e(0xf8d5373a), C32e(0xf43967ad), C32e(0xddd17a71),
+	C32e(0xa9b4d3bd), C32e(0xa475d394), C32e(0x976c3fba), C32e(0x9842737f)
+};
+/* Initial H for JH-512: 32 words that jh_init copies into H.narrow. */
+static const sph_u32 IV512[] = {
+	C32e(0x6fd14b96), C32e(0x3e00aa17), C32e(0x636a2e05), C32e(0x7a15d543),
+	C32e(0x8a225e8d), C32e(0x0c97ef0b), C32e(0xe9341259), C32e(0xf2b3c361),
+	C32e(0x891da0c1), C32e(0x536f801e), C32e(0x2aa9056b), C32e(0xea2b6d80),
+	C32e(0x588eccdb), C32e(0x2075baa6), C32e(0xa90f3a76), C32e(0xbaf83bf7),
+	C32e(0x0169e605), C32e(0x41e34a69), C32e(0x46b58a8e), C32e(0x2e6fe65a),
+	C32e(0x1047a7d0), C32e(0xc1843c24), C32e(0x3b6e71b1), C32e(0x2d5ac199),
+	C32e(0xcf57f6ec), C32e(0x9db1f856), C32e(0xa706887c), C32e(0x5716b156),
+	C32e(0xe3c2fcdf), C32e(0xe68517fb), C32e(0x545a4678), C32e(0xcc8cdd4b)
+};
+
+#endif
+/* SL(ro): round r + ro with permutation W<ro>; `r` is the enclosing E8 loop counter. */
+#define SL(ro)   SLu(r + ro, ro)
+/* SLu(r, ro): one round -- S on even then odd words, L mix, W<ro> on h1, h3, h5, h7. */
+#define SLu(r, ro)   do { \
+		S(h0, h2, h4, h6, Ceven_, r); \
+		S(h1, h3, h5, h7, Codd_, r); \
+		L(h0, h2, h4, h6, h1, h3, h5, h7); \
+		W ## ro(h1); \
+		W ## ro(h3); \
+		W ## ro(h5); \
+		W ## ro(h7); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_JH
+
+#if SPH_JH_64
+
+/*
+ * The "small footprint" 64-bit version just uses a partially unrolled
+ * loop: six iterations of seven rounds each (42 rounds in total), with
+ * SL(k) running round r + k using the W<k> permutation.
+ */
+#define E8   do { \
+		unsigned r; \
+		for (r = 0; r < 42; r += 7) { \
+			SL(0); \
+			SL(1); \
+			SL(2); \
+			SL(3); \
+			SL(4); \
+			SL(5); \
+			SL(6); \
+		} \
+	} while (0)
+
+#else
+/* Small-footprint 32-bit E8: one round per iteration; g tracks r mod 7 and picks W0..W6. */
+#define E8   do { \
+		unsigned r, g; \
+		for (r = g = 0; r < 42; r ++) { \
+			S(h0, h2, h4, h6, Ceven_, r); \
+			S(h1, h3, h5, h7, Codd_, r); \
+			L(h0, h2, h4, h6, h1, h3, h5, h7); \
+			switch (g) { \
+			case 0: \
+				W0(h1); \
+				W0(h3); \
+				W0(h5); \
+				W0(h7); \
+				break; \
+			case 1: \
+				W1(h1); \
+				W1(h3); \
+				W1(h5); \
+				W1(h7); \
+				break; \
+			case 2: \
+				W2(h1); \
+				W2(h3); \
+				W2(h5); \
+				W2(h7); \
+				break; \
+			case 3: \
+				W3(h1); \
+				W3(h3); \
+				W3(h5); \
+				W3(h7); \
+				break; \
+			case 4: \
+				W4(h1); \
+				W4(h3); \
+				W4(h5); \
+				W4(h7); \
+				break; \
+			case 5: \
+				W5(h1); \
+				W5(h3); \
+				W5(h5); \
+				W5(h7); \
+				break; \
+			case 6: \
+				W6(h1); \
+				W6(h3); \
+				W6(h5); \
+				W6(h7); \
+				break; \
+			} \
+			if (++ g == 7) \
+				g = 0; \
+		} \
+	} while (0)
+
+#endif
+
+#else
+
+#if SPH_JH_64
+
+/*
+ * On a "true 64-bit" architecture, we can unroll at will: all 42
+ * rounds are written out, round r using permutation W(r mod 7).
+ */
+#define E8   do { \
+		SLu( 0, 0); \
+		SLu( 1, 1); \
+		SLu( 2, 2); \
+		SLu( 3, 3); \
+		SLu( 4, 4); \
+		SLu( 5, 5); \
+		SLu( 6, 6); \
+		SLu( 7, 0); \
+		SLu( 8, 1); \
+		SLu( 9, 2); \
+		SLu(10, 3); \
+		SLu(11, 4); \
+		SLu(12, 5); \
+		SLu(13, 6); \
+		SLu(14, 0); \
+		SLu(15, 1); \
+		SLu(16, 2); \
+		SLu(17, 3); \
+		SLu(18, 4); \
+		SLu(19, 5); \
+		SLu(20, 6); \
+		SLu(21, 0); \
+		SLu(22, 1); \
+		SLu(23, 2); \
+		SLu(24, 3); \
+		SLu(25, 4); \
+		SLu(26, 5); \
+		SLu(27, 6); \
+		SLu(28, 0); \
+		SLu(29, 1); \
+		SLu(30, 2); \
+		SLu(31, 3); \
+		SLu(32, 4); \
+		SLu(33, 5); \
+		SLu(34, 6); \
+		SLu(35, 0); \
+		SLu(36, 1); \
+		SLu(37, 2); \
+		SLu(38, 3); \
+		SLu(39, 4); \
+		SLu(40, 5); \
+		SLu(41, 6); \
+	} while (0)
+
+#else
+
+/*
+ * We are not aiming at a small footprint, but we are still using a
+ * 32-bit implementation. Full loop unrolling would smash the L1
+ * cache on some "big" architectures (32 kB L1 cache), so the 42
+ * rounds run as six iterations of seven unrolled rounds.
+ */
+#define E8   do { \
+		unsigned r; \
+		for (r = 0; r < 42; r += 7) { \
+			SL(0); \
+			SL(1); \
+			SL(2); \
+			SL(3); \
+			SL(4); \
+			SL(5); \
+			SL(6); \
+		} \
+	} while (0)
+
+#endif
+
+#endif
+/* Reset sc for a fresh hash: zero the block counter and fill level, load H from iv. */
+static void
+jh_init(sph_jh_context *sc, const void *iv)
+{
+#if SPH_64
+	sc->block_count = 0;
+#else
+	sc->block_count_low = 0;
+	sc->block_count_high = 0;
+#endif
+#if SPH_JH_64
+	memcpy(sc->H.wide, iv, sizeof(sc->H.wide));
+#else
+	memcpy(sc->H.narrow, iv, sizeof(sc->H.narrow));
+#endif
+	sc->ptr = 0;
+}
+/* Absorb len bytes from data: buffer them in sc->buf and compress each time it fills. */
+static void
+jh_core(sph_jh_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE
+	/* The local must be called `buf`: INPUT_BUF1 decodes the block through that name. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* At least one block completes: work on local copies of H, stored back at the end. */
+	READ_STATE(sc);
+	while (len > 0) {
+		size_t clen;
+		/* clen = free space left in the buffer, capped at the remaining input. */
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			INPUT_BUF1;
+			E8;
+			INPUT_BUF2;
+#if SPH_64
+			sc->block_count ++;
+#else
+			if ((sc->block_count_low = SPH_T32(
+				sc->block_count_low + 1)) == 0)
+				sc->block_count_high ++;
+#endif
+			ptr = 0;
+		}
+	}
+	WRITE_STATE(sc);
+	sc->ptr = ptr;
+}
+/* Finish the hash: pad, absorb, copy the last out_size_w32 words to dst, re-init with iv. */
+static void
+jh_close(sph_jh_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32, const void *iv)
+{
+	unsigned z;
+	unsigned char buf[128];
+	size_t numz, u;
+#if SPH_64
+	sph_u64 l0, l1;
+#else
+	sph_u32 l0, l1, l2, l3;
+#endif
+	/* First pad byte: the top n bits of ub, then a single 1 bit, then zero bits. */
+	z = 0x80 >> n;
+	buf[0] = ((ub & -z) | z) & 0xFF;
+	if (sc->ptr == 0 && n == 0) {
+		numz = 47;
+	} else {
+		numz = 111 - sc->ptr;
+	}
+	memset(buf + 1, 0, numz);
+#if SPH_64
+	l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3) + n;
+	l1 = SPH_T64(sc->block_count >> 55);
+	sph_enc64be(buf + numz + 1, l1);
+	sph_enc64be(buf + numz + 9, l0);
+#else
+	l0 = SPH_T32(sc->block_count_low << 9) + (sc->ptr << 3) + n;
+	l1 = SPH_T32(sc->block_count_low >> 23)
+		+ SPH_T32(sc->block_count_high << 9);
+	l2 = SPH_T32(sc->block_count_high >> 23);
+	l3 = 0;
+	sph_enc32be(buf + numz +  1, l3);
+	sph_enc32be(buf + numz +  5, l2);
+	sph_enc32be(buf + numz +  9, l1);
+	sph_enc32be(buf + numz + 13, l0);
+#endif
+	jh_core(sc, buf, numz + 17);
+#if SPH_JH_64
+	for (u = 0; u < 8; u ++)
+		enc64e(buf + (u << 3), sc->H.wide[u + 8]);
+#else
+	for (u = 0; u < 16; u ++)
+		enc32e(buf + (u << 2), sc->H.narrow[u + 16]);
+#endif
+	memcpy(dst, buf + ((16 - out_size_w32) << 2), out_size_w32 << 2);
+	jh_init(sc, iv);
+}
+
+/* JH-224 init: loads IV224 and zeroes the counters via jh_init; see sph_jh.h */
+void
+sph_jh224_init(void *cc)
+{
+	jh_init(cc, IV224);
+}
+
+/* JH-224 update: buffers/absorbs len bytes from data via jh_core; see sph_jh.h */
+void
+sph_jh224(void *cc, const void *data, size_t len)
+{
+	jh_core(cc, data, len);
+}
+
+/* JH-224 close, no extra bits: 28 bytes to dst, cc reset to IV224; see sph_jh.h */
+void
+sph_jh224_close(void *cc, void *dst)
+{
+	jh_close(cc, 0, 0, dst, 7, IV224);
+}
+
+/* JH-224 close after appending the top n bits of ub: 28 bytes to dst; see sph_jh.h */
+void
+sph_jh224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	jh_close(cc, ub, n, dst, 7, IV224);
+}
+
+/* JH-256 init: loads IV256 and zeroes the counters via jh_init; see sph_jh.h */
+void
+sph_jh256_init(void *cc)
+{
+	jh_init(cc, IV256);
+}
+
+/* JH-256 update: buffers/absorbs len bytes from data via jh_core; see sph_jh.h */
+void
+sph_jh256(void *cc, const void *data, size_t len)
+{
+	jh_core(cc, data, len);
+}
+
+/* JH-256 close, no extra bits: 32 bytes to dst, cc reset to IV256; see sph_jh.h */
+void
+sph_jh256_close(void *cc, void *dst)
+{
+	jh_close(cc, 0, 0, dst, 8, IV256);
+}
+
+/* JH-256 close after appending the top n bits of ub: 32 bytes to dst; see sph_jh.h */
+void
+sph_jh256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	jh_close(cc, ub, n, dst, 8, IV256);
+}
+
+/* JH-384 init: loads IV384 and zeroes the counters via jh_init; see sph_jh.h */
+void
+sph_jh384_init(void *cc)
+{
+	jh_init(cc, IV384);
+}
+
+/* JH-384 update: buffers/absorbs len bytes from data via jh_core; see sph_jh.h */
+void
+sph_jh384(void *cc, const void *data, size_t len)
+{
+	jh_core(cc, data, len);
+}
+
+/* JH-384 close, no extra bits: 48 bytes to dst, cc reset to IV384; see sph_jh.h */
+void
+sph_jh384_close(void *cc, void *dst)
+{
+	jh_close(cc, 0, 0, dst, 12, IV384);
+}
+
+/* JH-384 close after appending the top n bits of ub: 48 bytes to dst; see sph_jh.h */
+void
+sph_jh384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	jh_close(cc, ub, n, dst, 12, IV384);
+}
+
+/* JH-512 init: loads IV512 and zeroes the counters via jh_init; see sph_jh.h */
+void
+sph_jh512_init(void *cc)
+{
+	jh_init(cc, IV512);
+}
+
+/* JH-512 update: buffers/absorbs len bytes from data via jh_core; see sph_jh.h */
+void
+sph_jh512(void *cc, const void *data, size_t len)
+{
+	jh_core(cc, data, len);
+}
+
+/* JH-512 close, no extra bits: 64 bytes to dst, cc reset to IV512; see sph_jh.h */
+void
+sph_jh512_close(void *cc, void *dst)
+{
+	jh_close(cc, 0, 0, dst, 16, IV512);
+}
+
+/* JH-512 close after appending the top n bits of ub: 64 bytes to dst; see sph_jh.h */
+void
+sph_jh512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	jh_close(cc, ub, n, dst, 16, IV512);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/keccak.c b/sha3/keccak.c
new file mode 100644
index 0000000..cff9f87
--- /dev/null
+++ b/sha3/keccak.c
@@ -0,0 +1,1824 @@
+/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */
+/*
+ * Keccak implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_keccak.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * Parameters:
+ *
+ *  SPH_KECCAK_64          use a 64-bit type
+ *  SPH_KECCAK_UNROLL      number of rounds to unroll (0 for full unroll; if undefined, a default of 8 or 2 is chosen below)
+ *  SPH_KECCAK_INTERLEAVE  use bit-interleaving (32-bit type only)
+ *  SPH_KECCAK_NOCOPY      do not copy the state into local variables
+ * 
+ * If there is no usable 64-bit type, the code automatically switches
+ * back to the 32-bit implementation.
+ *
+ * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1
+ * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core
+ * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302,
+ * 8 kB L1 code cache), seem to show that the following are optimal:
+ *
+ * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds,
+ * do not copy the state; unrolling 2, 6 or all rounds also provides
+ * near-optimal performance.
+ * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds,
+ * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds
+ * also provides near-optimal performance.
+ * -- PowerPC: use the 64-bit implementation, unroll 8 rounds,
+ * copy the state. Unrolling 4 or 6 rounds is near-optimal.
+ * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds,
+ * copy the state.
+ * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy
+ * the state. Unrolling only 1 round is also near-optimal.
+ *
+ * Also, interleaving does not always yield actual improvements when
+ * using a 32-bit implementation; in particular when the architecture
+ * does not offer a native rotation opcode (interleaving replaces one
+ * 64-bit rotation with two 32-bit rotations, which is a gain only if
+ * there is a native 32-bit rotation opcode and not a native 64-bit
+ * rotation opcode; also, interleaving implies a small overhead when
+ * processing input words).
+ *
+ * To sum up:
+ * -- when possible, use the 64-bit code
+ * -- exception: on 32-bit x86, use 32-bit code
+ * -- when using 32-bit code, use interleaving
+ * -- copy the state, except on x86
+ * -- unroll 8 rounds on "big" machines, 2 rounds on "small" machines
+ */
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK
+#define SPH_SMALL_FOOTPRINT_KECCAK   1   /* inherit the global small-footprint request unless set explicitly */
+#endif
+
+/*
+ * Unless SPH_KECCAK_64 was set by the user, pick the 64-bit code when a
+ * 64-bit type is available (SPH_64) and the target is not a 32-bit x86.
+ */
+#if !defined SPH_KECCAK_64 && SPH_64 \
+	&& !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC)
+#define SPH_KECCAK_64   1
+#endif
+
+/*
+ * 32-bit implementation: interleave by default (a user setting wins).
+ */
+#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE
+#define SPH_KECCAK_INTERLEAVE   1
+#endif
+
+/*
+ * Default unrolling: 2 rounds for small-footprint builds, 8 otherwise.
+ */
+#ifndef SPH_KECCAK_UNROLL
+#if SPH_SMALL_FOOTPRINT_KECCAK
+#define SPH_KECCAK_UNROLL   2
+#else
+#define SPH_KECCAK_UNROLL   8
+#endif
+#endif
+
+/*
+ * Default SPH_KECCAK_NOCOPY: 1 on x86 (32-bit and 64-bit alike), where
+ * the state is used in place; 0 elsewhere (state copied to locals).
+ */
+#ifndef SPH_KECCAK_NOCOPY
+#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC
+#define SPH_KECCAK_NOCOPY   1
+#else
+#define SPH_KECCAK_NOCOPY   0
+#endif
+#endif
+
+#ifdef _MSC_VER   /* silence MSVC warning C4146 (unary minus applied to an unsigned type) */
+#pragma warning (disable: 4146)
+#endif
+
+#if SPH_KECCAK_64
+
+static const sph_u64 RC[] = {   /* 24 64-bit round constants, one per Keccak round (presumably applied through XOR64_IOTA) */
+	SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
+	SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
+	SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
+	SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
+	SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
+	SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
+	SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
+	SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
+	SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
+	SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
+	SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
+	SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
+};
+
+#if SPH_KECCAK_NOCOPY
+
+#define a00   (kc->u.wide[ 0])   /* aXY aliases lane kc->u.wide[X + 5*Y] in place; expects kc in scope */
+#define a10   (kc->u.wide[ 1])
+#define a20   (kc->u.wide[ 2])
+#define a30   (kc->u.wide[ 3])
+#define a40   (kc->u.wide[ 4])
+#define a01   (kc->u.wide[ 5])
+#define a11   (kc->u.wide[ 6])
+#define a21   (kc->u.wide[ 7])
+#define a31   (kc->u.wide[ 8])
+#define a41   (kc->u.wide[ 9])
+#define a02   (kc->u.wide[10])
+#define a12   (kc->u.wide[11])
+#define a22   (kc->u.wide[12])
+#define a32   (kc->u.wide[13])
+#define a42   (kc->u.wide[14])
+#define a03   (kc->u.wide[15])
+#define a13   (kc->u.wide[16])
+#define a23   (kc->u.wide[17])
+#define a33   (kc->u.wide[18])
+#define a43   (kc->u.wide[19])
+#define a04   (kc->u.wide[20])
+#define a14   (kc->u.wide[21])
+#define a24   (kc->u.wide[22])
+#define a34   (kc->u.wide[23])
+#define a44   (kc->u.wide[24])
+
+#define DECL_STATE   /* nothing to declare: the aXY names alias kc->u.wide[] directly */
+#define READ_STATE(sc)   /* no-op: the state is used in place */
+#define WRITE_STATE(sc)   /* no-op: the state is used in place */
+
+#define INPUT_BUF(size)   do { /* XOR (size) bytes of buf into kc->u.wide[] in place, one 64-bit word per step; expects kc and buf in scope */ \
+		size_t j; \
+		for (j = 0; j < (size); j += 8) { \
+			kc->u.wide[j >> 3] ^= sph_dec64le_aligned(buf + j); \
+		} \
+	} while (0)
+
+#define INPUT_BUF144   INPUT_BUF(144)   /* in-place mode: the fixed-size names reuse the generic loop */
+#define INPUT_BUF136   INPUT_BUF(136)
+#define INPUT_BUF104   INPUT_BUF(104)
+#define INPUT_BUF72    INPUT_BUF(72)
+
+#else
+
+#define DECL_STATE /* declares 25 local 64-bit lanes a00..a44 (copy-the-state mode) */ \
+	sph_u64 a00, a01, a02, a03, a04; \
+	sph_u64 a10, a11, a12, a13, a14; \
+	sph_u64 a20, a21, a22, a23, a24; \
+	sph_u64 a30, a31, a32, a33, a34; \
+	sph_u64 a40, a41, a42, a43, a44;
+
+#define READ_STATE(state)   do { /* load all 25 lanes into the DECL_STATE locals: aXY = (state)->u.wide[X + 5*Y] */ \
+		a00 = (state)->u.wide[ 0]; \
+		a10 = (state)->u.wide[ 1]; \
+		a20 = (state)->u.wide[ 2]; \
+		a30 = (state)->u.wide[ 3]; \
+		a40 = (state)->u.wide[ 4]; \
+		a01 = (state)->u.wide[ 5]; \
+		a11 = (state)->u.wide[ 6]; \
+		a21 = (state)->u.wide[ 7]; \
+		a31 = (state)->u.wide[ 8]; \
+		a41 = (state)->u.wide[ 9]; \
+		a02 = (state)->u.wide[10]; \
+		a12 = (state)->u.wide[11]; \
+		a22 = (state)->u.wide[12]; \
+		a32 = (state)->u.wide[13]; \
+		a42 = (state)->u.wide[14]; \
+		a03 = (state)->u.wide[15]; \
+		a13 = (state)->u.wide[16]; \
+		a23 = (state)->u.wide[17]; \
+		a33 = (state)->u.wide[18]; \
+		a43 = (state)->u.wide[19]; \
+		a04 = (state)->u.wide[20]; \
+		a14 = (state)->u.wide[21]; \
+		a24 = (state)->u.wide[22]; \
+		a34 = (state)->u.wide[23]; \
+		a44 = (state)->u.wide[24]; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { /* store the 25 local lanes back: (state)->u.wide[X + 5*Y] = aXY */ \
+		(state)->u.wide[ 0] = a00; \
+		(state)->u.wide[ 1] = a10; \
+		(state)->u.wide[ 2] = a20; \
+		(state)->u.wide[ 3] = a30; \
+		(state)->u.wide[ 4] = a40; \
+		(state)->u.wide[ 5] = a01; \
+		(state)->u.wide[ 6] = a11; \
+		(state)->u.wide[ 7] = a21; \
+		(state)->u.wide[ 8] = a31; \
+		(state)->u.wide[ 9] = a41; \
+		(state)->u.wide[10] = a02; \
+		(state)->u.wide[11] = a12; \
+		(state)->u.wide[12] = a22; \
+		(state)->u.wide[13] = a32; \
+		(state)->u.wide[14] = a42; \
+		(state)->u.wide[15] = a03; \
+		(state)->u.wide[16] = a13; \
+		(state)->u.wide[17] = a23; \
+		(state)->u.wide[18] = a33; \
+		(state)->u.wide[19] = a43; \
+		(state)->u.wide[20] = a04; \
+		(state)->u.wide[21] = a14; \
+		(state)->u.wide[22] = a24; \
+		(state)->u.wide[23] = a34; \
+		(state)->u.wide[24] = a44; \
+	} while (0)
+
+#define INPUT_BUF144   do { /* XOR 18 words (144 bytes) of buf into local lanes a00..a23; expects buf in scope */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+		a41 ^= sph_dec64le_aligned(buf +  72); \
+		a02 ^= sph_dec64le_aligned(buf +  80); \
+		a12 ^= sph_dec64le_aligned(buf +  88); \
+		a22 ^= sph_dec64le_aligned(buf +  96); \
+		a32 ^= sph_dec64le_aligned(buf + 104); \
+		a42 ^= sph_dec64le_aligned(buf + 112); \
+		a03 ^= sph_dec64le_aligned(buf + 120); \
+		a13 ^= sph_dec64le_aligned(buf + 128); \
+		a23 ^= sph_dec64le_aligned(buf + 136); \
+	} while (0)
+
+#define INPUT_BUF136   do { /* XOR 17 words (136 bytes) of buf into local lanes a00..a13; expects buf in scope */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+		a41 ^= sph_dec64le_aligned(buf +  72); \
+		a02 ^= sph_dec64le_aligned(buf +  80); \
+		a12 ^= sph_dec64le_aligned(buf +  88); \
+		a22 ^= sph_dec64le_aligned(buf +  96); \
+		a32 ^= sph_dec64le_aligned(buf + 104); \
+		a42 ^= sph_dec64le_aligned(buf + 112); \
+		a03 ^= sph_dec64le_aligned(buf + 120); \
+		a13 ^= sph_dec64le_aligned(buf + 128); \
+	} while (0)
+
+#define INPUT_BUF104   do { /* XOR 13 words (104 bytes) of buf into local lanes a00..a22; expects buf in scope */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+		a41 ^= sph_dec64le_aligned(buf +  72); \
+		a02 ^= sph_dec64le_aligned(buf +  80); \
+		a12 ^= sph_dec64le_aligned(buf +  88); \
+		a22 ^= sph_dec64le_aligned(buf +  96); \
+	} while (0)
+
+#define INPUT_BUF72   do { /* XOR 9 words (72 bytes) of buf into local lanes a00..a31; expects buf in scope */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+	} while (0)
+
+#define INPUT_BUF(lim)   do { /* generic absorb: stops (break) after 72, 104 or 136 bytes when lim matches; any other lim absorbs all 144 */ \
+		a00 ^= sph_dec64le_aligned(buf +   0); \
+		a10 ^= sph_dec64le_aligned(buf +   8); \
+		a20 ^= sph_dec64le_aligned(buf +  16); \
+		a30 ^= sph_dec64le_aligned(buf +  24); \
+		a40 ^= sph_dec64le_aligned(buf +  32); \
+		a01 ^= sph_dec64le_aligned(buf +  40); \
+		a11 ^= sph_dec64le_aligned(buf +  48); \
+		a21 ^= sph_dec64le_aligned(buf +  56); \
+		a31 ^= sph_dec64le_aligned(buf +  64); \
+		if ((lim) == 72) \
+			break; \
+		a41 ^= sph_dec64le_aligned(buf +  72); \
+		a02 ^= sph_dec64le_aligned(buf +  80); \
+		a12 ^= sph_dec64le_aligned(buf +  88); \
+		a22 ^= sph_dec64le_aligned(buf +  96); \
+		if ((lim) == 104) \
+			break; \
+		a32 ^= sph_dec64le_aligned(buf + 104); \
+		a42 ^= sph_dec64le_aligned(buf + 112); \
+		a03 ^= sph_dec64le_aligned(buf + 120); \
+		a13 ^= sph_dec64le_aligned(buf + 128); \
+		if ((lim) == 136) \
+			break; \
+		a23 ^= sph_dec64le_aligned(buf + 136); \
+	} while (0)
+
+#endif
+
+#define DECL64(x)        sph_u64 x   /* 64-bit path: a lane is a single sph_u64 */
+#define MOV64(d, s)      (d = s)   /* arguments are not parenthesised: assumes plain identifiers / lvalues */
+#define XOR64(d, a, b)   (d = a ^ b)
+#define AND64(d, a, b)   (d = a & b)
+#define OR64(d, a, b)    (d = a | b)
+#define NOT64(d, s)      (d = SPH_T64(~s))   /* SPH_T64 presumably truncates to 64 bits on wider types */
+#define ROL64(d, v, n)   (d = SPH_ROTL64(v, n))
+#define XOR64_IOTA       XOR64   /* same operation as XOR64 in the 64-bit path */
+
+#else
+
+static const struct {
+	sph_u32 high, low;
+} RC[] = {   /* the 24 round constants of the 64-bit table, split into two 32-bit halves */
+#if SPH_KECCAK_INTERLEAVE   /* interleaved split: high = odd-numbered bits, low = even-numbered bits */
+	{ SPH_C32(0x00000000), SPH_C32(0x00000001) },
+	{ SPH_C32(0x00000089), SPH_C32(0x00000000) },
+	{ SPH_C32(0x8000008B), SPH_C32(0x00000000) },
+	{ SPH_C32(0x80008080), SPH_C32(0x00000000) },
+	{ SPH_C32(0x0000008B), SPH_C32(0x00000001) },
+	{ SPH_C32(0x00008000), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80008088), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80000082), SPH_C32(0x00000001) },
+	{ SPH_C32(0x0000000B), SPH_C32(0x00000000) },
+	{ SPH_C32(0x0000000A), SPH_C32(0x00000000) },
+	{ SPH_C32(0x00008082), SPH_C32(0x00000001) },
+	{ SPH_C32(0x00008003), SPH_C32(0x00000000) },
+	{ SPH_C32(0x0000808B), SPH_C32(0x00000001) },
+	{ SPH_C32(0x8000000B), SPH_C32(0x00000001) },
+	{ SPH_C32(0x8000008A), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80000081), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80000081), SPH_C32(0x00000000) },
+	{ SPH_C32(0x80000008), SPH_C32(0x00000000) },
+	{ SPH_C32(0x00000083), SPH_C32(0x00000000) },
+	{ SPH_C32(0x80008003), SPH_C32(0x00000000) },
+	{ SPH_C32(0x80008088), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80000088), SPH_C32(0x00000000) },
+	{ SPH_C32(0x00008000), SPH_C32(0x00000001) },
+	{ SPH_C32(0x80008082), SPH_C32(0x00000000) }
+#else   /* plain split: high = bits 63..32, low = bits 31..0 */
+	{ SPH_C32(0x00000000), SPH_C32(0x00000001) },
+	{ SPH_C32(0x00000000), SPH_C32(0x00008082) },
+	{ SPH_C32(0x80000000), SPH_C32(0x0000808A) },
+	{ SPH_C32(0x80000000), SPH_C32(0x80008000) },
+	{ SPH_C32(0x00000000), SPH_C32(0x0000808B) },
+	{ SPH_C32(0x00000000), SPH_C32(0x80000001) },
+	{ SPH_C32(0x80000000), SPH_C32(0x80008081) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008009) },
+	{ SPH_C32(0x00000000), SPH_C32(0x0000008A) },
+	{ SPH_C32(0x00000000), SPH_C32(0x00000088) },
+	{ SPH_C32(0x00000000), SPH_C32(0x80008009) },
+	{ SPH_C32(0x00000000), SPH_C32(0x8000000A) },
+	{ SPH_C32(0x00000000), SPH_C32(0x8000808B) },
+	{ SPH_C32(0x80000000), SPH_C32(0x0000008B) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008089) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008003) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008002) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00000080) },
+	{ SPH_C32(0x00000000), SPH_C32(0x0000800A) },
+	{ SPH_C32(0x80000000), SPH_C32(0x8000000A) },
+	{ SPH_C32(0x80000000), SPH_C32(0x80008081) },
+	{ SPH_C32(0x80000000), SPH_C32(0x00008080) },
+	{ SPH_C32(0x00000000), SPH_C32(0x80000001) },
+	{ SPH_C32(0x80000000), SPH_C32(0x80008008) }
+#endif
+};
+
+#if SPH_KECCAK_INTERLEAVE
+
+#define INTERLEAVE(xl, xh)   do { /* in place: xl <- even-numbered bits, xh <- odd-numbered bits of the 64-bit word xh:xl; arguments must be lvalues not named l, h or t */ \
+		sph_u32 l, h, t; \
+		l = (xl); h = (xh); \
+		t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \
+		t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \
+		t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \
+		t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \
+		t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \
+		t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \
+		t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \
+		t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \
+		t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \
+		l ^= t; h ^= t >> 16; \
+		(xl) = l; (xh) = h; \
+	} while (0)
+
+#define UNINTERLEAVE(xl, xh)   do { /* inverse of INTERLEAVE: the same bit swaps applied in reverse order; arguments must be lvalues not named l, h or t */ \
+		sph_u32 l, h, t; \
+		l = (xl); h = (xh); \
+		t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \
+		l ^= t; h ^= t >> 16; \
+		t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \
+		t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \
+		t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \
+		t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \
+		t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \
+		t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \
+		t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \
+		t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \
+		(xl) = l; (xh) = h; \
+	} while (0)
+
+#else
+
+#define INTERLEAVE(l, h)   /* interleaving disabled: expands to nothing */
+#define UNINTERLEAVE(l, h)   /* interleaving disabled: expands to nothing */
+
+#endif
+
+#if SPH_KECCAK_NOCOPY
+
+#define a00l   (kc->u.narrow[2 *  0 + 0])   /* aXYl / aXYh alias kc->u.narrow[2 * (X + 5*Y) + 0 / 1] in place; expects kc in scope */
+#define a00h   (kc->u.narrow[2 *  0 + 1])
+#define a10l   (kc->u.narrow[2 *  1 + 0])
+#define a10h   (kc->u.narrow[2 *  1 + 1])
+#define a20l   (kc->u.narrow[2 *  2 + 0])
+#define a20h   (kc->u.narrow[2 *  2 + 1])
+#define a30l   (kc->u.narrow[2 *  3 + 0])
+#define a30h   (kc->u.narrow[2 *  3 + 1])
+#define a40l   (kc->u.narrow[2 *  4 + 0])
+#define a40h   (kc->u.narrow[2 *  4 + 1])
+#define a01l   (kc->u.narrow[2 *  5 + 0])
+#define a01h   (kc->u.narrow[2 *  5 + 1])
+#define a11l   (kc->u.narrow[2 *  6 + 0])
+#define a11h   (kc->u.narrow[2 *  6 + 1])
+#define a21l   (kc->u.narrow[2 *  7 + 0])
+#define a21h   (kc->u.narrow[2 *  7 + 1])
+#define a31l   (kc->u.narrow[2 *  8 + 0])
+#define a31h   (kc->u.narrow[2 *  8 + 1])
+#define a41l   (kc->u.narrow[2 *  9 + 0])
+#define a41h   (kc->u.narrow[2 *  9 + 1])
+#define a02l   (kc->u.narrow[2 * 10 + 0])
+#define a02h   (kc->u.narrow[2 * 10 + 1])
+#define a12l   (kc->u.narrow[2 * 11 + 0])
+#define a12h   (kc->u.narrow[2 * 11 + 1])
+#define a22l   (kc->u.narrow[2 * 12 + 0])
+#define a22h   (kc->u.narrow[2 * 12 + 1])
+#define a32l   (kc->u.narrow[2 * 13 + 0])
+#define a32h   (kc->u.narrow[2 * 13 + 1])
+#define a42l   (kc->u.narrow[2 * 14 + 0])
+#define a42h   (kc->u.narrow[2 * 14 + 1])
+#define a03l   (kc->u.narrow[2 * 15 + 0])
+#define a03h   (kc->u.narrow[2 * 15 + 1])
+#define a13l   (kc->u.narrow[2 * 16 + 0])
+#define a13h   (kc->u.narrow[2 * 16 + 1])
+#define a23l   (kc->u.narrow[2 * 17 + 0])
+#define a23h   (kc->u.narrow[2 * 17 + 1])
+#define a33l   (kc->u.narrow[2 * 18 + 0])
+#define a33h   (kc->u.narrow[2 * 18 + 1])
+#define a43l   (kc->u.narrow[2 * 19 + 0])
+#define a43h   (kc->u.narrow[2 * 19 + 1])
+#define a04l   (kc->u.narrow[2 * 20 + 0])
+#define a04h   (kc->u.narrow[2 * 20 + 1])
+#define a14l   (kc->u.narrow[2 * 21 + 0])
+#define a14h   (kc->u.narrow[2 * 21 + 1])
+#define a24l   (kc->u.narrow[2 * 22 + 0])
+#define a24h   (kc->u.narrow[2 * 22 + 1])
+#define a34l   (kc->u.narrow[2 * 23 + 0])
+#define a34h   (kc->u.narrow[2 * 23 + 1])
+#define a44l   (kc->u.narrow[2 * 24 + 0])
+#define a44h   (kc->u.narrow[2 * 24 + 1])
+
+#define DECL_STATE   /* nothing to declare: the aXYl / aXYh names alias kc->u.narrow[] directly */
+#define READ_STATE(state)   /* no-op: the state is used in place */
+#define WRITE_STATE(state)   /* no-op: the state is used in place */
+
+#define INPUT_BUF(size)   do { /* XOR (size) bytes of buf into kc->u.narrow[] in place, 8 bytes per step, after INTERLEAVE; expects kc and buf in scope */ \
+		size_t j; \
+		for (j = 0; j < (size); j += 8) { \
+			sph_u32 tl, th; \
+			tl = sph_dec32le_aligned(buf + j + 0); \
+			th = sph_dec32le_aligned(buf + j + 4); \
+			INTERLEAVE(tl, th); \
+			kc->u.narrow[(j >> 2) + 0] ^= tl; \
+			kc->u.narrow[(j >> 2) + 1] ^= th; \
+		} \
+	} while (0)
+
+#define INPUT_BUF144   INPUT_BUF(144)   /* in-place mode: the fixed-size names reuse the generic loop */
+#define INPUT_BUF136   INPUT_BUF(136)
+#define INPUT_BUF104   INPUT_BUF(104)
+#define INPUT_BUF72    INPUT_BUF(72)
+
+#else
+
+#define DECL_STATE /* declares the 25 lanes as 50 local 32-bit halves aXYl / aXYh (copy-the-state mode) */ \
+	sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \
+	sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \
+	sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \
+	sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \
+	sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h;
+
+#define READ_STATE(state)   do { /* load all 50 halves into the DECL_STATE locals: aXYl / aXYh = (state)->u.narrow[2 * (X + 5*Y) + 0 / 1] */ \
+		a00l = (state)->u.narrow[2 *  0 + 0]; \
+		a00h = (state)->u.narrow[2 *  0 + 1]; \
+		a10l = (state)->u.narrow[2 *  1 + 0]; \
+		a10h = (state)->u.narrow[2 *  1 + 1]; \
+		a20l = (state)->u.narrow[2 *  2 + 0]; \
+		a20h = (state)->u.narrow[2 *  2 + 1]; \
+		a30l = (state)->u.narrow[2 *  3 + 0]; \
+		a30h = (state)->u.narrow[2 *  3 + 1]; \
+		a40l = (state)->u.narrow[2 *  4 + 0]; \
+		a40h = (state)->u.narrow[2 *  4 + 1]; \
+		a01l = (state)->u.narrow[2 *  5 + 0]; \
+		a01h = (state)->u.narrow[2 *  5 + 1]; \
+		a11l = (state)->u.narrow[2 *  6 + 0]; \
+		a11h = (state)->u.narrow[2 *  6 + 1]; \
+		a21l = (state)->u.narrow[2 *  7 + 0]; \
+		a21h = (state)->u.narrow[2 *  7 + 1]; \
+		a31l = (state)->u.narrow[2 *  8 + 0]; \
+		a31h = (state)->u.narrow[2 *  8 + 1]; \
+		a41l = (state)->u.narrow[2 *  9 + 0]; \
+		a41h = (state)->u.narrow[2 *  9 + 1]; \
+		a02l = (state)->u.narrow[2 * 10 + 0]; \
+		a02h = (state)->u.narrow[2 * 10 + 1]; \
+		a12l = (state)->u.narrow[2 * 11 + 0]; \
+		a12h = (state)->u.narrow[2 * 11 + 1]; \
+		a22l = (state)->u.narrow[2 * 12 + 0]; \
+		a22h = (state)->u.narrow[2 * 12 + 1]; \
+		a32l = (state)->u.narrow[2 * 13 + 0]; \
+		a32h = (state)->u.narrow[2 * 13 + 1]; \
+		a42l = (state)->u.narrow[2 * 14 + 0]; \
+		a42h = (state)->u.narrow[2 * 14 + 1]; \
+		a03l = (state)->u.narrow[2 * 15 + 0]; \
+		a03h = (state)->u.narrow[2 * 15 + 1]; \
+		a13l = (state)->u.narrow[2 * 16 + 0]; \
+		a13h = (state)->u.narrow[2 * 16 + 1]; \
+		a23l = (state)->u.narrow[2 * 17 + 0]; \
+		a23h = (state)->u.narrow[2 * 17 + 1]; \
+		a33l = (state)->u.narrow[2 * 18 + 0]; \
+		a33h = (state)->u.narrow[2 * 18 + 1]; \
+		a43l = (state)->u.narrow[2 * 19 + 0]; \
+		a43h = (state)->u.narrow[2 * 19 + 1]; \
+		a04l = (state)->u.narrow[2 * 20 + 0]; \
+		a04h = (state)->u.narrow[2 * 20 + 1]; \
+		a14l = (state)->u.narrow[2 * 21 + 0]; \
+		a14h = (state)->u.narrow[2 * 21 + 1]; \
+		a24l = (state)->u.narrow[2 * 22 + 0]; \
+		a24h = (state)->u.narrow[2 * 22 + 1]; \
+		a34l = (state)->u.narrow[2 * 23 + 0]; \
+		a34h = (state)->u.narrow[2 * 23 + 1]; \
+		a44l = (state)->u.narrow[2 * 24 + 0]; \
+		a44h = (state)->u.narrow[2 * 24 + 1]; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { /* store the 50 local halves back: (state)->u.narrow[2 * (X + 5*Y) + 0 / 1] = aXYl / aXYh */ \
+		(state)->u.narrow[2 *  0 + 0] = a00l; \
+		(state)->u.narrow[2 *  0 + 1] = a00h; \
+		(state)->u.narrow[2 *  1 + 0] = a10l; \
+		(state)->u.narrow[2 *  1 + 1] = a10h; \
+		(state)->u.narrow[2 *  2 + 0] = a20l; \
+		(state)->u.narrow[2 *  2 + 1] = a20h; \
+		(state)->u.narrow[2 *  3 + 0] = a30l; \
+		(state)->u.narrow[2 *  3 + 1] = a30h; \
+		(state)->u.narrow[2 *  4 + 0] = a40l; \
+		(state)->u.narrow[2 *  4 + 1] = a40h; \
+		(state)->u.narrow[2 *  5 + 0] = a01l; \
+		(state)->u.narrow[2 *  5 + 1] = a01h; \
+		(state)->u.narrow[2 *  6 + 0] = a11l; \
+		(state)->u.narrow[2 *  6 + 1] = a11h; \
+		(state)->u.narrow[2 *  7 + 0] = a21l; \
+		(state)->u.narrow[2 *  7 + 1] = a21h; \
+		(state)->u.narrow[2 *  8 + 0] = a31l; \
+		(state)->u.narrow[2 *  8 + 1] = a31h; \
+		(state)->u.narrow[2 *  9 + 0] = a41l; \
+		(state)->u.narrow[2 *  9 + 1] = a41h; \
+		(state)->u.narrow[2 * 10 + 0] = a02l; \
+		(state)->u.narrow[2 * 10 + 1] = a02h; \
+		(state)->u.narrow[2 * 11 + 0] = a12l; \
+		(state)->u.narrow[2 * 11 + 1] = a12h; \
+		(state)->u.narrow[2 * 12 + 0] = a22l; \
+		(state)->u.narrow[2 * 12 + 1] = a22h; \
+		(state)->u.narrow[2 * 13 + 0] = a32l; \
+		(state)->u.narrow[2 * 13 + 1] = a32h; \
+		(state)->u.narrow[2 * 14 + 0] = a42l; \
+		(state)->u.narrow[2 * 14 + 1] = a42h; \
+		(state)->u.narrow[2 * 15 + 0] = a03l; \
+		(state)->u.narrow[2 * 15 + 1] = a03h; \
+		(state)->u.narrow[2 * 16 + 0] = a13l; \
+		(state)->u.narrow[2 * 16 + 1] = a13h; \
+		(state)->u.narrow[2 * 17 + 0] = a23l; \
+		(state)->u.narrow[2 * 17 + 1] = a23h; \
+		(state)->u.narrow[2 * 18 + 0] = a33l; \
+		(state)->u.narrow[2 * 18 + 1] = a33h; \
+		(state)->u.narrow[2 * 19 + 0] = a43l; \
+		(state)->u.narrow[2 * 19 + 1] = a43h; \
+		(state)->u.narrow[2 * 20 + 0] = a04l; \
+		(state)->u.narrow[2 * 20 + 1] = a04h; \
+		(state)->u.narrow[2 * 21 + 0] = a14l; \
+		(state)->u.narrow[2 * 21 + 1] = a14h; \
+		(state)->u.narrow[2 * 22 + 0] = a24l; \
+		(state)->u.narrow[2 * 22 + 1] = a24h; \
+		(state)->u.narrow[2 * 23 + 0] = a34l; \
+		(state)->u.narrow[2 * 23 + 1] = a34h; \
+		(state)->u.narrow[2 * 24 + 0] = a44l; \
+		(state)->u.narrow[2 * 24 + 1] = a44h; \
+	} while (0)
+
+#define READ64(d, off)   do { /* despite the name this XORs: 8 bytes at buf + off are decoded, INTERLEAVEd and XORed into d##l / d##h */ \
+		sph_u32 tl, th; \
+		tl = sph_dec32le_aligned(buf + (off)); \
+		th = sph_dec32le_aligned(buf + (off) + 4); \
+		INTERLEAVE(tl, th); \
+		d ## l ^= tl; \
+		d ## h ^= th; \
+	} while (0)
+
+#define INPUT_BUF144   do { /* XOR 144 bytes of buf (18 lanes, a00..a23) into the local state via READ64 */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+		READ64(a41,  72); \
+		READ64(a02,  80); \
+		READ64(a12,  88); \
+		READ64(a22,  96); \
+		READ64(a32, 104); \
+		READ64(a42, 112); \
+		READ64(a03, 120); \
+		READ64(a13, 128); \
+		READ64(a23, 136); \
+	} while (0)
+
+#define INPUT_BUF136   do { /* XOR 136 bytes of buf (17 lanes, a00..a13) into the local state via READ64 */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+		READ64(a41,  72); \
+		READ64(a02,  80); \
+		READ64(a12,  88); \
+		READ64(a22,  96); \
+		READ64(a32, 104); \
+		READ64(a42, 112); \
+		READ64(a03, 120); \
+		READ64(a13, 128); \
+	} while (0)
+
+#define INPUT_BUF104   do { /* XOR 104 bytes of buf (13 lanes, a00..a22) into the local state via READ64 */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+		READ64(a41,  72); \
+		READ64(a02,  80); \
+		READ64(a12,  88); \
+		READ64(a22,  96); \
+	} while (0)
+
+#define INPUT_BUF72   do { /* XOR 72 bytes of buf (9 lanes, a00..a31) into the local state via READ64 */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+	} while (0)
+
+#define INPUT_BUF(lim)   do { /* generic absorb: stops (break) after 72, 104 or 136 bytes when lim matches; any other lim absorbs all 144 */ \
+		READ64(a00,   0); \
+		READ64(a10,   8); \
+		READ64(a20,  16); \
+		READ64(a30,  24); \
+		READ64(a40,  32); \
+		READ64(a01,  40); \
+		READ64(a11,  48); \
+		READ64(a21,  56); \
+		READ64(a31,  64); \
+		if ((lim) == 72) \
+			break; \
+		READ64(a41,  72); \
+		READ64(a02,  80); \
+		READ64(a12,  88); \
+		READ64(a22,  96); \
+		if ((lim) == 104) \
+			break; \
+		READ64(a32, 104); \
+		READ64(a42, 112); \
+		READ64(a03, 120); \
+		READ64(a13, 128); \
+		if ((lim) == 136) \
+			break; \
+		READ64(a23, 136); \
+	} while (0)
+
+#endif
+
+#define DECL64(x)        sph_u32 x ## l, x ## h   /* a lane is two 32-bit halves; sph_u32 (not sph_u64) so this path builds without any 64-bit type */
+#define MOV64(d, s)      (d ## l = s ## l, d ## h = s ## h)   /* all operators below act on the l / h halves independently */
+#define XOR64(d, a, b)   (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h)
+#define AND64(d, a, b)   (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h)
+#define OR64(d, a, b)    (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h)
+#define NOT64(d, s)      (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h))
+#define ROL64(d, v, n)   ROL64_ ## n(d, v)   /* n must be a literal rotation count: it is pasted into the ROL64_<n> macro name */
+
+#if SPH_KECCAK_INTERLEAVE
+
+#define ROL64_odd1(d, v)   do { /* interleaved rotate-left by 1: d_l = rotl32(v_h, 1), d_h = v_l; tmp keeps this correct when d and v are the same lane */ \
+		sph_u32 tmp; \
+		tmp = v ## l; \
+		d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); \
+		d ## h = tmp; \
+	} while (0)
+
+#define ROL64_odd63(d, v)   do { /* interleaved rotate-left by 63: d_l = v_h, d_h = rotl32(v_l, 31); tmp keeps this correct when d and v are the same lane */ \
+		sph_u32 tmp; \
+		tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); \
+		d ## l = v ## h; \
+		d ## h = tmp; \
+	} while (0)
+
+#define ROL64_odd(d, v, n)   do { /* interleaved rotate-left by 2n - 1, for n in 2..31 (n = 1 or 32 would need a shift by 32, hence ROL64_odd1 / ROL64_odd63) */ \
+		sph_u32 tmp; \
+		tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); \
+		d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \
+		d ## h = tmp; \
+	} while (0)
+
+#define ROL64_even(d, v, n)   do { /* interleaved rotate-left by 2n, for n in 1..31: each half is rotated left by n on its own */ \
+		d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \
+		d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \
+	} while (0)
+
+#define ROL64_0(d, v)   /* rotation by 0 expands to nothing: d is not assigned from v */
+#define ROL64_1(d, v)    ROL64_odd1(d, v)   /* table: even count k -> ROL64_even(k / 2), odd k -> ROL64_odd((k + 1) / 2); 1 and 63 are special-cased */
+#define ROL64_2(d, v)    ROL64_even(d, v,  1)
+#define ROL64_3(d, v)    ROL64_odd( d, v,  2)
+#define ROL64_4(d, v)    ROL64_even(d, v,  2)
+#define ROL64_5(d, v)    ROL64_odd( d, v,  3)
+#define ROL64_6(d, v)    ROL64_even(d, v,  3)
+#define ROL64_7(d, v)    ROL64_odd( d, v,  4)
+#define ROL64_8(d, v)    ROL64_even(d, v,  4)
+#define ROL64_9(d, v)    ROL64_odd( d, v,  5)
+#define ROL64_10(d, v)   ROL64_even(d, v,  5)
+#define ROL64_11(d, v)   ROL64_odd( d, v,  6)
+#define ROL64_12(d, v)   ROL64_even(d, v,  6)
+#define ROL64_13(d, v)   ROL64_odd( d, v,  7)
+#define ROL64_14(d, v)   ROL64_even(d, v,  7)
+#define ROL64_15(d, v)   ROL64_odd( d, v,  8)
+#define ROL64_16(d, v)   ROL64_even(d, v,  8)
+#define ROL64_17(d, v)   ROL64_odd( d, v,  9)
+#define ROL64_18(d, v)   ROL64_even(d, v,  9)
+#define ROL64_19(d, v)   ROL64_odd( d, v, 10)
+#define ROL64_20(d, v)   ROL64_even(d, v, 10)
+#define ROL64_21(d, v)   ROL64_odd( d, v, 11)
+#define ROL64_22(d, v)   ROL64_even(d, v, 11)
+#define ROL64_23(d, v)   ROL64_odd( d, v, 12)
+#define ROL64_24(d, v)   ROL64_even(d, v, 12)
+#define ROL64_25(d, v)   ROL64_odd( d, v, 13)
+#define ROL64_26(d, v)   ROL64_even(d, v, 13)
+#define ROL64_27(d, v)   ROL64_odd( d, v, 14)
+#define ROL64_28(d, v)   ROL64_even(d, v, 14)
+#define ROL64_29(d, v)   ROL64_odd( d, v, 15)
+#define ROL64_30(d, v)   ROL64_even(d, v, 15)
+#define ROL64_31(d, v)   ROL64_odd( d, v, 16)
+#define ROL64_32(d, v)   ROL64_even(d, v, 16)
+#define ROL64_33(d, v)   ROL64_odd( d, v, 17)
+#define ROL64_34(d, v)   ROL64_even(d, v, 17)
+#define ROL64_35(d, v)   ROL64_odd( d, v, 18)
+#define ROL64_36(d, v)   ROL64_even(d, v, 18)
+#define ROL64_37(d, v)   ROL64_odd( d, v, 19)
+#define ROL64_38(d, v)   ROL64_even(d, v, 19)
+#define ROL64_39(d, v)   ROL64_odd( d, v, 20)
+#define ROL64_40(d, v)   ROL64_even(d, v, 20)
+#define ROL64_41(d, v)   ROL64_odd( d, v, 21)
+#define ROL64_42(d, v)   ROL64_even(d, v, 21)
+#define ROL64_43(d, v)   ROL64_odd( d, v, 22)
+#define ROL64_44(d, v)   ROL64_even(d, v, 22)
+#define ROL64_45(d, v)   ROL64_odd( d, v, 23)
+#define ROL64_46(d, v)   ROL64_even(d, v, 23)
+#define ROL64_47(d, v)   ROL64_odd( d, v, 24)
+#define ROL64_48(d, v)   ROL64_even(d, v, 24)
+#define ROL64_49(d, v)   ROL64_odd( d, v, 25)
+#define ROL64_50(d, v)   ROL64_even(d, v, 25)
+#define ROL64_51(d, v)   ROL64_odd( d, v, 26)
+#define ROL64_52(d, v)   ROL64_even(d, v, 26)
+#define ROL64_53(d, v)   ROL64_odd( d, v, 27)
+#define ROL64_54(d, v)   ROL64_even(d, v, 27)
+#define ROL64_55(d, v)   ROL64_odd( d, v, 28)
+#define ROL64_56(d, v)   ROL64_even(d, v, 28)
+#define ROL64_57(d, v)   ROL64_odd( d, v, 29)
+#define ROL64_58(d, v)   ROL64_even(d, v, 29)
+#define ROL64_59(d, v)   ROL64_odd( d, v, 30)
+#define ROL64_60(d, v)   ROL64_even(d, v, 30)
+#define ROL64_61(d, v)   ROL64_odd( d, v, 31)
+#define ROL64_62(d, v)   ROL64_even(d, v, 31)
+#define ROL64_63(d, v)   ROL64_odd63(d, v)
+
+#else
+
+#define ROL64_small(d, v, n)   do { /* non-interleaved rotate-left of the pair h:l by n, for n in 1..31; tmp keeps this correct when d and v are the same lane */ \
+		sph_u32 tmp; \
+		tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); \
+		d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \
+		d ## l = tmp; \
+	} while (0)
+
+#define ROL64_0(d, v)    0   /* rotation by 0: just the constant expression 0, d is not assigned from v */
+#define ROL64_1(d, v)    ROL64_small(d, v, 1)   /* counts 1..31 map to ROL64_small with the count itself */
+#define ROL64_2(d, v)    ROL64_small(d, v, 2)
+#define ROL64_3(d, v)    ROL64_small(d, v, 3)
+#define ROL64_4(d, v)    ROL64_small(d, v, 4)
+#define ROL64_5(d, v)    ROL64_small(d, v, 5)
+#define ROL64_6(d, v)    ROL64_small(d, v, 6)
+#define ROL64_7(d, v)    ROL64_small(d, v, 7)
+#define ROL64_8(d, v)    ROL64_small(d, v, 8)
+#define ROL64_9(d, v)    ROL64_small(d, v, 9)
+#define ROL64_10(d, v)   ROL64_small(d, v, 10)
+#define ROL64_11(d, v)   ROL64_small(d, v, 11)
+#define ROL64_12(d, v)   ROL64_small(d, v, 12)
+#define ROL64_13(d, v)   ROL64_small(d, v, 13)
+#define ROL64_14(d, v)   ROL64_small(d, v, 14)
+#define ROL64_15(d, v)   ROL64_small(d, v, 15)
+#define ROL64_16(d, v)   ROL64_small(d, v, 16)
+#define ROL64_17(d, v)   ROL64_small(d, v, 17)
+#define ROL64_18(d, v)   ROL64_small(d, v, 18)
+#define ROL64_19(d, v)   ROL64_small(d, v, 19)
+#define ROL64_20(d, v)   ROL64_small(d, v, 20)
+#define ROL64_21(d, v)   ROL64_small(d, v, 21)
+#define ROL64_22(d, v)   ROL64_small(d, v, 22)
+#define ROL64_23(d, v)   ROL64_small(d, v, 23)
+#define ROL64_24(d, v)   ROL64_small(d, v, 24)
+#define ROL64_25(d, v)   ROL64_small(d, v, 25)
+#define ROL64_26(d, v)   ROL64_small(d, v, 26)
+#define ROL64_27(d, v)   ROL64_small(d, v, 27)
+#define ROL64_28(d, v)   ROL64_small(d, v, 28)
+#define ROL64_29(d, v)   ROL64_small(d, v, 29)
+#define ROL64_30(d, v)   ROL64_small(d, v, 30)
+#define ROL64_31(d, v)   ROL64_small(d, v, 31)
+
+#define ROL64_32(d, v)   do { /* rotation by 32 is a plain swap of the two 32-bit halves */ \
+		sph_u32 tmp; \
+		tmp = v ## l; /* saved first so that d and v may name the same pair */ \
+		d ## l = v ## h; \
+		d ## h = tmp; \
+	} while (0)
+
+#define ROL64_big(d, v, n)   do { /* rotate left by 32 + n, with n in 1..31 */ \
+		sph_u32 trl, trh; /* named so that ROL64_small's token pasting on "tr" finds them */ \
+		ROL64_small(tr, v, n); /* first rotate by n into the temporary pair */ \
+		d ## h = trl; /* then swap halves, which adds the remaining 32 */ \
+		d ## l = trh; \
+	} while (0)
+
+#define ROL64_33(d, v)   ROL64_big(d, v, 1) /* count 32 + k maps to ROL64_big with k */
+#define ROL64_34(d, v)   ROL64_big(d, v, 2)
+#define ROL64_35(d, v)   ROL64_big(d, v, 3)
+#define ROL64_36(d, v)   ROL64_big(d, v, 4)
+#define ROL64_37(d, v)   ROL64_big(d, v, 5)
+#define ROL64_38(d, v)   ROL64_big(d, v, 6)
+#define ROL64_39(d, v)   ROL64_big(d, v, 7)
+#define ROL64_40(d, v)   ROL64_big(d, v, 8)
+#define ROL64_41(d, v)   ROL64_big(d, v, 9)
+#define ROL64_42(d, v)   ROL64_big(d, v, 10)
+#define ROL64_43(d, v)   ROL64_big(d, v, 11)
+#define ROL64_44(d, v)   ROL64_big(d, v, 12)
+#define ROL64_45(d, v)   ROL64_big(d, v, 13)
+#define ROL64_46(d, v)   ROL64_big(d, v, 14)
+#define ROL64_47(d, v)   ROL64_big(d, v, 15)
+#define ROL64_48(d, v)   ROL64_big(d, v, 16)
+#define ROL64_49(d, v)   ROL64_big(d, v, 17)
+#define ROL64_50(d, v)   ROL64_big(d, v, 18)
+#define ROL64_51(d, v)   ROL64_big(d, v, 19)
+#define ROL64_52(d, v)   ROL64_big(d, v, 20)
+#define ROL64_53(d, v)   ROL64_big(d, v, 21)
+#define ROL64_54(d, v)   ROL64_big(d, v, 22)
+#define ROL64_55(d, v)   ROL64_big(d, v, 23)
+#define ROL64_56(d, v)   ROL64_big(d, v, 24)
+#define ROL64_57(d, v)   ROL64_big(d, v, 25)
+#define ROL64_58(d, v)   ROL64_big(d, v, 26)
+#define ROL64_59(d, v)   ROL64_big(d, v, 27)
+#define ROL64_60(d, v)   ROL64_big(d, v, 28)
+#define ROL64_61(d, v)   ROL64_big(d, v, 29)
+#define ROL64_62(d, v)   ROL64_big(d, v, 30)
+#define ROL64_63(d, v)   ROL64_big(d, v, 31)
+
+#endif
+
+#define XOR64_IOTA(d, s, k) /* d = s ^ k on split lanes: k supplies .low and .high 32-bit halves */ \
+	(d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high)
+
+#endif
+
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { /* t = (c0^..^c4) ^ ROL64(d0^..^d4, 1), going by the XOR64/ROL64 helper names */ \
+		DECL64(tt0); \
+		DECL64(tt1); \
+		DECL64(tt2); \
+		DECL64(tt3); \
+		XOR64(tt0, d0, d1); \
+		XOR64(tt1, d2, d3); \
+		XOR64(tt0, tt0, d4); \
+		XOR64(tt0, tt0, tt1); /* tt0 now combines all five d lanes */ \
+		ROL64(tt0, tt0, 1); \
+		XOR64(tt2, c0, c1); \
+		XOR64(tt3, c2, c3); \
+		XOR64(tt0, tt0, c4); /* c4 is folded in after the rotation, so it is not rotated */ \
+		XOR64(tt2, tt2, tt3); \
+		XOR64(t, tt0, tt2); \
+	} while (0)
+
+#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+	b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+	b40, b41, b42, b43, b44) \
+	do { /* for each x: combine group x-1 with the rotated group x+1 (indices mod 5), then fold that into the five lanes bx0..bx4 */ \
+		DECL64(t0); \
+		DECL64(t1); \
+		DECL64(t2); \
+		DECL64(t3); \
+		DECL64(t4); \
+		TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); /* all five t values are computed before any lane is modified */ \
+		TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
+		TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
+		TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
+		TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
+		XOR64(b00, b00, t0); /* lanes b0y take t0 */ \
+		XOR64(b01, b01, t0); \
+		XOR64(b02, b02, t0); \
+		XOR64(b03, b03, t0); \
+		XOR64(b04, b04, t0); \
+		XOR64(b10, b10, t1); /* lanes b1y take t1 */ \
+		XOR64(b11, b11, t1); \
+		XOR64(b12, b12, t1); \
+		XOR64(b13, b13, t1); \
+		XOR64(b14, b14, t1); \
+		XOR64(b20, b20, t2); /* lanes b2y take t2 */ \
+		XOR64(b21, b21, t2); \
+		XOR64(b22, b22, t2); \
+		XOR64(b23, b23, t2); \
+		XOR64(b24, b24, t2); \
+		XOR64(b30, b30, t3); /* lanes b3y take t3 */ \
+		XOR64(b31, b31, t3); \
+		XOR64(b32, b32, t3); \
+		XOR64(b33, b33, t3); \
+		XOR64(b34, b34, t3); \
+		XOR64(b40, b40, t4); /* lanes b4y take t4 */ \
+		XOR64(b41, b41, t4); \
+		XOR64(b42, b42, t4); \
+		XOR64(b43, b43, t4); \
+		XOR64(b44, b44, t4); \
+	} while (0)
+
+#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+	b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+	b40, b41, b42, b43, b44) \
+	do { /* rotate every lane in place by its own fixed, literal count */ \
+		/* ROL64(b00, b00,  0); -- zero rotation, intentionally left out */ \
+		ROL64(b01, b01, 36); \
+		ROL64(b02, b02,  3); \
+		ROL64(b03, b03, 41); \
+		ROL64(b04, b04, 18); \
+		ROL64(b10, b10,  1); \
+		ROL64(b11, b11, 44); \
+		ROL64(b12, b12, 10); \
+		ROL64(b13, b13, 45); \
+		ROL64(b14, b14,  2); \
+		ROL64(b20, b20, 62); \
+		ROL64(b21, b21,  6); \
+		ROL64(b22, b22, 43); \
+		ROL64(b23, b23, 15); \
+		ROL64(b24, b24, 61); \
+		ROL64(b30, b30, 28); \
+		ROL64(b31, b31, 55); \
+		ROL64(b32, b32, 25); \
+		ROL64(b33, b33, 21); \
+		ROL64(b34, b34, 56); \
+		ROL64(b40, b40, 27); \
+		ROL64(b41, b41, 20); \
+		ROL64(b42, b42, 39); \
+		ROL64(b43, b43,  8); \
+		ROL64(b44, b44, 14); \
+	} while (0)
+
+/*
+ * The KHI macro integrates the "lane complement" optimization. On input,
+ * some words are complemented:
+ *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
+ * On output, the following words are complemented:
+ *    a04 a10 a20 a22 a23 a31
+ *
+ * The (implicit) permutation and the theta expansion will bring back
+ * the input mask for the next round.
+ */
+
+#define KHI_XO(d, a, b, c)   do { /* d = a ^ (b | c), going by the OR64/XOR64 helper names */ \
+		DECL64(kt); /* scratch, so d may be the same lane as a, b or c */ \
+		OR64(kt, b, c); \
+		XOR64(d, a, kt); \
+	} while (0)
+
+#define KHI_XA(d, a, b, c)   do { /* d = a ^ (b & c), going by the AND64/XOR64 helper names */ \
+		DECL64(kt); /* scratch, so d may be the same lane as a, b or c */ \
+		AND64(kt, b, c); \
+		XOR64(d, a, kt); \
+	} while (0)
+
+#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+	b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+	b40, b41, b42, b43, b44) \
+	do { /* handles five groups b0y..b4y (y = 0..4); the XO/XA choice per lane differs by group */ \
+		DECL64(c0); \
+		DECL64(c1); \
+		DECL64(c2); \
+		DECL64(c3); \
+		DECL64(c4); \
+		DECL64(bnn); /* scratch for the single complemented operand each group needs */ \
+		NOT64(bnn, b20); \
+		KHI_XO(c0, b00, b10, b20); \
+		KHI_XO(c1, b10, bnn, b30); \
+		KHI_XA(c2, b20, b30, b40); \
+		KHI_XO(c3, b30, b40, b00); \
+		KHI_XA(c4, b40, b00, b10); \
+		MOV64(b00, c0); /* results go through c0..c4 so all five are derived from the old lanes */ \
+		MOV64(b10, c1); \
+		MOV64(b20, c2); \
+		MOV64(b30, c3); \
+		MOV64(b40, c4); \
+		NOT64(bnn, b41); /* group y = 1 */ \
+		KHI_XO(c0, b01, b11, b21); \
+		KHI_XA(c1, b11, b21, b31); \
+		KHI_XO(c2, b21, b31, bnn); \
+		KHI_XO(c3, b31, b41, b01); \
+		KHI_XA(c4, b41, b01, b11); \
+		MOV64(b01, c0); \
+		MOV64(b11, c1); \
+		MOV64(b21, c2); \
+		MOV64(b31, c3); \
+		MOV64(b41, c4); \
+		NOT64(bnn, b32); /* group y = 2 */ \
+		KHI_XO(c0, b02, b12, b22); \
+		KHI_XA(c1, b12, b22, b32); \
+		KHI_XA(c2, b22, bnn, b42); \
+		KHI_XO(c3, bnn, b42, b02); \
+		KHI_XA(c4, b42, b02, b12); \
+		MOV64(b02, c0); \
+		MOV64(b12, c1); \
+		MOV64(b22, c2); \
+		MOV64(b32, c3); \
+		MOV64(b42, c4); \
+		NOT64(bnn, b33); /* group y = 3 */ \
+		KHI_XA(c0, b03, b13, b23); \
+		KHI_XO(c1, b13, b23, b33); \
+		KHI_XO(c2, b23, bnn, b43); \
+		KHI_XA(c3, bnn, b43, b03); \
+		KHI_XO(c4, b43, b03, b13); \
+		MOV64(b03, c0); \
+		MOV64(b13, c1); \
+		MOV64(b23, c2); \
+		MOV64(b33, c3); \
+		MOV64(b43, c4); \
+		NOT64(bnn, b14); /* group y = 4 */ \
+		KHI_XA(c0, b04, bnn, b24); \
+		KHI_XO(c1, bnn, b24, b34); \
+		KHI_XA(c2, b24, b34, b44); \
+		KHI_XO(c3, b34, b44, b04); \
+		KHI_XA(c4, b44, b04, b14); \
+		MOV64(b04, c0); \
+		MOV64(b14, c1); \
+		MOV64(b24, c2); \
+		MOV64(b34, c3); \
+		MOV64(b44, c4); \
+	} while (0)
+
+#define IOTA(r)   XOR64_IOTA(a00, a00, r) /* folds constant r into lane a00 only; relies on a00 being in scope */
+
+#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
+              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 /* natural lane order */
+#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
+              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 /* Pn lists the same 25 lanes in a different argument order; a00 is always first */
+#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
+              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
+#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
+              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
+#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
+              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
+#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
+              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
+#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
+              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
+#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
+              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
+#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
+              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
+#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
+              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
+#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
+              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
+#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
+              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
+#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
+              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
+#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
+              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
+#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
+              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
+#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
+              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
+#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
+              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
+#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
+              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
+#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
+              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
+#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
+              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
+#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
+              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
+#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
+              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
+#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
+              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
+#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
+              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
+
+#define P1_TO_P0   do { /* moves 24 lanes along one 24-long cycle; a00 is untouched (assumes MOV64(d, s) copies s into d) */ \
+		DECL64(t); /* holds the first lane of the cycle until the chain closes */ \
+		MOV64(t, a01); \
+		MOV64(a01, a30); \
+		MOV64(a30, a33); \
+		MOV64(a33, a23); \
+		MOV64(a23, a12); \
+		MOV64(a12, a21); \
+		MOV64(a21, a02); \
+		MOV64(a02, a10); \
+		MOV64(a10, a11); \
+		MOV64(a11, a41); \
+		MOV64(a41, a24); \
+		MOV64(a24, a42); \
+		MOV64(a42, a04); \
+		MOV64(a04, a20); \
+		MOV64(a20, a22); \
+		MOV64(a22, a32); \
+		MOV64(a32, a43); \
+		MOV64(a43, a34); \
+		MOV64(a34, a03); \
+		MOV64(a03, a40); \
+		MOV64(a40, a44); \
+		MOV64(a44, a14); \
+		MOV64(a14, a31); \
+		MOV64(a31, a13); \
+		MOV64(a13, t); /* closes the cycle */ \
+	} while (0)
+
+#define P2_TO_P0   do { /* moves 24 lanes along two 12-long cycles; a00 is untouched (assumes MOV64(d, s) copies s into d) */ \
+		DECL64(t); /* reused as the saved head of each cycle */ \
+		MOV64(t, a01); /* cycle 1 of 2 */ \
+		MOV64(a01, a33); \
+		MOV64(a33, a12); \
+		MOV64(a12, a02); \
+		MOV64(a02, a11); \
+		MOV64(a11, a24); \
+		MOV64(a24, a04); \
+		MOV64(a04, a22); \
+		MOV64(a22, a43); \
+		MOV64(a43, a03); \
+		MOV64(a03, a44); \
+		MOV64(a44, a31); \
+		MOV64(a31, t); \
+		MOV64(t, a10); /* cycle 2 of 2 */ \
+		MOV64(a10, a41); \
+		MOV64(a41, a42); \
+		MOV64(a42, a20); \
+		MOV64(a20, a32); \
+		MOV64(a32, a34); \
+		MOV64(a34, a40); \
+		MOV64(a40, a14); \
+		MOV64(a14, a13); \
+		MOV64(a13, a30); \
+		MOV64(a30, a23); \
+		MOV64(a23, a21); \
+		MOV64(a21, t); \
+	} while (0)
+
+#define P4_TO_P0   do { /* moves 24 lanes along four 6-long cycles; a00 is untouched (assumes MOV64(d, s) copies s into d) */ \
+		DECL64(t); /* reused as the saved head of each cycle */ \
+		MOV64(t, a01); /* cycle 1 of 4 */ \
+		MOV64(a01, a12); \
+		MOV64(a12, a11); \
+		MOV64(a11, a04); \
+		MOV64(a04, a43); \
+		MOV64(a43, a44); \
+		MOV64(a44, t); \
+		MOV64(t, a02); /* cycle 2 of 4 */ \
+		MOV64(a02, a24); \
+		MOV64(a24, a22); \
+		MOV64(a22, a03); \
+		MOV64(a03, a31); \
+		MOV64(a31, a33); \
+		MOV64(a33, t); \
+		MOV64(t, a10); /* cycle 3 of 4 */ \
+		MOV64(a10, a42); \
+		MOV64(a42, a32); \
+		MOV64(a32, a40); \
+		MOV64(a40, a13); \
+		MOV64(a13, a23); \
+		MOV64(a23, t); \
+		MOV64(t, a14); /* cycle 4 of 4 */ \
+		MOV64(a14, a30); \
+		MOV64(a30, a21); \
+		MOV64(a21, a41); \
+		MOV64(a41, a20); \
+		MOV64(a20, a34); \
+		MOV64(a34, t); \
+	} while (0)
+
+#define P6_TO_P0   do { /* moves 24 lanes along six 4-long cycles; a00 is untouched (assumes MOV64(d, s) copies s into d) */ \
+		DECL64(t); /* reused as the saved head of each cycle */ \
+		MOV64(t, a01); /* cycle 1 of 6 */ \
+		MOV64(a01, a02); \
+		MOV64(a02, a04); \
+		MOV64(a04, a03); \
+		MOV64(a03, t); \
+		MOV64(t, a10); /* cycle 2 of 6 */ \
+		MOV64(a10, a20); \
+		MOV64(a20, a40); \
+		MOV64(a40, a30); \
+		MOV64(a30, t); \
+		MOV64(t, a11); /* cycle 3 of 6 */ \
+		MOV64(a11, a22); \
+		MOV64(a22, a44); \
+		MOV64(a44, a33); \
+		MOV64(a33, t); \
+		MOV64(t, a12); /* cycle 4 of 6 */ \
+		MOV64(a12, a24); \
+		MOV64(a24, a43); \
+		MOV64(a43, a31); \
+		MOV64(a31, t); \
+		MOV64(t, a13); /* cycle 5 of 6 */ \
+		MOV64(a13, a21); \
+		MOV64(a21, a42); \
+		MOV64(a42, a34); \
+		MOV64(a34, t); \
+		MOV64(t, a14); /* cycle 6 of 6 */ \
+		MOV64(a14, a23); \
+		MOV64(a23, a41); \
+		MOV64(a41, a32); \
+		MOV64(a32, t); \
+	} while (0)
+
+#define P8_TO_P0   do { /* moves 24 lanes along eight 3-long cycles; a00 is untouched (assumes MOV64(d, s) copies s into d) */ \
+		DECL64(t); /* reused as the saved head of each cycle */ \
+		MOV64(t, a01); /* cycle 1 of 8 */ \
+		MOV64(a01, a11); \
+		MOV64(a11, a43); \
+		MOV64(a43, t); \
+		MOV64(t, a02); /* cycle 2 of 8 */ \
+		MOV64(a02, a22); \
+		MOV64(a22, a31); \
+		MOV64(a31, t); \
+		MOV64(t, a03); /* cycle 3 of 8 */ \
+		MOV64(a03, a33); \
+		MOV64(a33, a24); \
+		MOV64(a24, t); \
+		MOV64(t, a04); /* cycle 4 of 8 */ \
+		MOV64(a04, a44); \
+		MOV64(a44, a12); \
+		MOV64(a12, t); \
+		MOV64(t, a10); /* cycle 5 of 8 */ \
+		MOV64(a10, a32); \
+		MOV64(a32, a13); \
+		MOV64(a13, t); \
+		MOV64(t, a14); /* cycle 6 of 8 */ \
+		MOV64(a14, a21); \
+		MOV64(a21, a20); \
+		MOV64(a20, t); \
+		MOV64(t, a23); /* cycle 7 of 8 */ \
+		MOV64(a23, a42); \
+		MOV64(a42, a40); \
+		MOV64(a40, t); \
+		MOV64(t, a30); /* cycle 8 of 8 */ \
+		MOV64(a30, a41); \
+		MOV64(a41, a34); \
+		MOV64(a34, t); \
+	} while (0)
+
+#define P12_TO_P0   do { /* exchanges 24 lanes as twelve pairwise swaps; a00 is untouched (assumes MOV64(d, s) copies s into d) */ \
+		DECL64(t); /* swap temporary */ \
+		MOV64(t, a01); /* swap a01 <-> a04 */ \
+		MOV64(a01, a04); \
+		MOV64(a04, t); \
+		MOV64(t, a02); /* swap a02 <-> a03 */ \
+		MOV64(a02, a03); \
+		MOV64(a03, t); \
+		MOV64(t, a10); /* swap a10 <-> a40 */ \
+		MOV64(a10, a40); \
+		MOV64(a40, t); \
+		MOV64(t, a11); /* swap a11 <-> a44 */ \
+		MOV64(a11, a44); \
+		MOV64(a44, t); \
+		MOV64(t, a12); /* swap a12 <-> a43 */ \
+		MOV64(a12, a43); \
+		MOV64(a43, t); \
+		MOV64(t, a13); /* swap a13 <-> a42 */ \
+		MOV64(a13, a42); \
+		MOV64(a42, t); \
+		MOV64(t, a14); /* swap a14 <-> a41 */ \
+		MOV64(a14, a41); \
+		MOV64(a41, t); \
+		MOV64(t, a20); /* swap a20 <-> a30 */ \
+		MOV64(a20, a30); \
+		MOV64(a30, t); \
+		MOV64(t, a21); /* swap a21 <-> a34 */ \
+		MOV64(a21, a34); \
+		MOV64(a34, t); \
+		MOV64(t, a22); /* swap a22 <-> a33 */ \
+		MOV64(a22, a33); \
+		MOV64(a33, t); \
+		MOV64(t, a23); /* swap a23 <-> a32 */ \
+		MOV64(a23, a32); \
+		MOV64(a32, t); \
+		MOV64(t, a24); /* swap a24 <-> a31 */ \
+		MOV64(a24, a31); \
+		MOV64(a31, t); \
+	} while (0)
+
+#define LPAR   ( /* parentheses hidden behind macros so THETA/RHO/KHI are not invoked until P##r has expanded to 25 arguments */
+#define RPAR   )
+
+#define KF_ELT(r, s, k)   do { /* one round: THETA and RHO see lane order P<r>, KHI sees P<s>, IOTA applies constant k */ \
+		THETA LPAR P ## r RPAR; \
+		RHO LPAR P ## r RPAR; \
+		KHI LPAR P ## s RPAR; \
+		IOTA(k); \
+	} while (0)
+
+#define DO(x)   x /* identity wrapper: passing text through a macro argument gives it one more expansion pass */
+
+#define KECCAK_F_1600   DO(KECCAK_F_1600_) /* the extra pass lets the "THETA ( ... )" sequences built by KF_ELT be invoked */
+
+#if SPH_KECCAK_UNROLL == 1
+
+#define KECCAK_F_1600_   do { /* UNROLL == 1: 24 iterations of one round each */ \
+		int j; \
+		for (j = 0; j < 24; j ++) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			P1_TO_P0; /* shuffle lanes so the next iteration can start from P0 again */ \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 2
+
+#define KECCAK_F_1600_   do { /* UNROLL == 2: 12 iterations of two rounds each */ \
+		int j; \
+		for (j = 0; j < 24; j += 2) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			KF_ELT( 1,  2, RC[j + 1]); \
+			P2_TO_P0; /* shuffle lanes so the next iteration can start from P0 again */ \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 4
+
+#define KECCAK_F_1600_   do { /* UNROLL == 4: 6 iterations of four rounds each */ \
+		int j; \
+		for (j = 0; j < 24; j += 4) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			KF_ELT( 1,  2, RC[j + 1]); \
+			KF_ELT( 2,  3, RC[j + 2]); \
+			KF_ELT( 3,  4, RC[j + 3]); \
+			P4_TO_P0; /* shuffle lanes so the next iteration can start from P0 again */ \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 6
+
+#define KECCAK_F_1600_   do { /* UNROLL == 6: 4 iterations of six rounds each */ \
+		int j; \
+		for (j = 0; j < 24; j += 6) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			KF_ELT( 1,  2, RC[j + 1]); \
+			KF_ELT( 2,  3, RC[j + 2]); \
+			KF_ELT( 3,  4, RC[j + 3]); \
+			KF_ELT( 4,  5, RC[j + 4]); \
+			KF_ELT( 5,  6, RC[j + 5]); \
+			P6_TO_P0; /* shuffle lanes so the next iteration can start from P0 again */ \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 8
+
+#define KECCAK_F_1600_   do { /* UNROLL == 8: 3 iterations of eight rounds each */ \
+		int j; \
+		for (j = 0; j < 24; j += 8) { \
+			KF_ELT( 0,  1, RC[j + 0]); \
+			KF_ELT( 1,  2, RC[j + 1]); \
+			KF_ELT( 2,  3, RC[j + 2]); \
+			KF_ELT( 3,  4, RC[j + 3]); \
+			KF_ELT( 4,  5, RC[j + 4]); \
+			KF_ELT( 5,  6, RC[j + 5]); \
+			KF_ELT( 6,  7, RC[j + 6]); \
+			KF_ELT( 7,  8, RC[j + 7]); \
+			P8_TO_P0; /* shuffle lanes so the next iteration can start from P0 again */ \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 12
+
+#define KECCAK_F_1600_   do { /* UNROLL == 12: 2 iterations of twelve rounds each */ \
+		int j; \
+		for (j = 0; j < 24; j += 12) { \
+			KF_ELT( 0,  1, RC[j +  0]); \
+			KF_ELT( 1,  2, RC[j +  1]); \
+			KF_ELT( 2,  3, RC[j +  2]); \
+			KF_ELT( 3,  4, RC[j +  3]); \
+			KF_ELT( 4,  5, RC[j +  4]); \
+			KF_ELT( 5,  6, RC[j +  5]); \
+			KF_ELT( 6,  7, RC[j +  6]); \
+			KF_ELT( 7,  8, RC[j +  7]); \
+			KF_ELT( 8,  9, RC[j +  8]); \
+			KF_ELT( 9, 10, RC[j +  9]); \
+			KF_ELT(10, 11, RC[j + 10]); \
+			KF_ELT(11, 12, RC[j + 11]); \
+			P12_TO_P0; /* shuffle lanes so the next iteration can start from P0 again */ \
+		} \
+	} while (0)
+
+#elif SPH_KECCAK_UNROLL == 0
+
+#define KECCAK_F_1600_   do { /* UNROLL == 0: all 24 rounds written out, no loop counter */ \
+		KF_ELT( 0,  1, RC[ 0]); \
+		KF_ELT( 1,  2, RC[ 1]); \
+		KF_ELT( 2,  3, RC[ 2]); \
+		KF_ELT( 3,  4, RC[ 3]); \
+		KF_ELT( 4,  5, RC[ 4]); \
+		KF_ELT( 5,  6, RC[ 5]); \
+		KF_ELT( 6,  7, RC[ 6]); \
+		KF_ELT( 7,  8, RC[ 7]); \
+		KF_ELT( 8,  9, RC[ 8]); \
+		KF_ELT( 9, 10, RC[ 9]); \
+		KF_ELT(10, 11, RC[10]); \
+		KF_ELT(11, 12, RC[11]); \
+		KF_ELT(12, 13, RC[12]); \
+		KF_ELT(13, 14, RC[13]); \
+		KF_ELT(14, 15, RC[14]); \
+		KF_ELT(15, 16, RC[15]); \
+		KF_ELT(16, 17, RC[16]); \
+		KF_ELT(17, 18, RC[17]); \
+		KF_ELT(18, 19, RC[18]); \
+		KF_ELT(19, 20, RC[19]); \
+		KF_ELT(20, 21, RC[20]); \
+		KF_ELT(21, 22, RC[21]); \
+		KF_ELT(23,  0, RC[23]); /* last round hands KHI the P0 order, so no Pn_TO_P0 shuffle follows */ \
+	} while (0)
+
+#else
+
+#error Unimplemented unroll count for Keccak.
+
+#endif
+
+static void
+keccak_init(sph_keccak_context *kc, unsigned out_size) /* out_size: digest length in bits */
+{
+	int i;
+
+#if SPH_KECCAK_64
+	for (i = 0; i < 25; i ++)
+		kc->u.wide[i] = 0;
+	/*
+	 * Initialization for the "lane complement": these six words start
+	 * as all-ones; the close functions invert the same six at the end.
+	 */
+	kc->u.wide[ 1] = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	kc->u.wide[ 2] = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	kc->u.wide[ 8] = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	kc->u.wide[12] = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	kc->u.wide[17] = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	kc->u.wide[20] = SPH_C64(0xFFFFFFFFFFFFFFFF);
+#else
+
+	for (i = 0; i < 50; i ++)
+		kc->u.narrow[i] = 0;
+	/*
+	 * Initialization for the "lane complement".
+	 * Note: since we set to all-one full 64-bit words,
+	 * interleaving (if applicable) is a no-op.
+	 */
+	kc->u.narrow[ 2] = SPH_C32(0xFFFFFFFF); /* narrow[2k] and narrow[2k+1] pair up as wide[k]: k = 1 */
+	kc->u.narrow[ 3] = SPH_C32(0xFFFFFFFF);
+	kc->u.narrow[ 4] = SPH_C32(0xFFFFFFFF); /* k = 2 */
+	kc->u.narrow[ 5] = SPH_C32(0xFFFFFFFF);
+	kc->u.narrow[16] = SPH_C32(0xFFFFFFFF); /* k = 8 */
+	kc->u.narrow[17] = SPH_C32(0xFFFFFFFF);
+	kc->u.narrow[24] = SPH_C32(0xFFFFFFFF); /* k = 12 */
+	kc->u.narrow[25] = SPH_C32(0xFFFFFFFF);
+	kc->u.narrow[34] = SPH_C32(0xFFFFFFFF); /* k = 17 */
+	kc->u.narrow[35] = SPH_C32(0xFFFFFFFF);
+	kc->u.narrow[40] = SPH_C32(0xFFFFFFFF); /* k = 20 */
+	kc->u.narrow[41] = SPH_C32(0xFFFFFFFF);
+#endif
+	kc->ptr = 0; /* no buffered input yet */
+	kc->lim = 200 - (out_size >> 2); /* 200 minus twice the digest length in bytes: 144/136/104/72 for 224/256/384/512 */
+}
+
+static void
+keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim) /* absorb len bytes; lim is the block size in bytes */
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE
+
+	buf = kc->buf; /* NOTE(review): INPUT_BUF takes only lim, so it presumably reads this local by name — keep the name */
+	ptr = kc->ptr;
+
+	if (len < (lim - ptr)) { /* input does not complete a block: just buffer it */
+		memcpy(buf + ptr, data, len);
+		kc->ptr = ptr + len;
+		return;
+	}
+
+	READ_STATE(kc); /* presumably loads the state from kc into the locals declared by DECL_STATE */
+	while (len > 0) {
+		size_t clen;
+
+		clen = (lim - ptr); /* room left in the current block */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == lim) { /* block complete: mix it in and run the permutation */
+			INPUT_BUF(lim);
+			KECCAK_F_1600;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE(kc); /* presumably stores the locals back into kc */
+	kc->ptr = ptr; /* bytes left buffered for the next call */
+}
+
+#if SPH_KECCAK_64
+
+#define DEFCLOSE(d, lim) /* 64-bit build: defines keccak_close<d>, d = digest bytes, lim = block bytes */ \
+	static void keccak_close ## d( \
+		sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \
+	{ \
+		unsigned eb; \
+		union { \
+			unsigned char tmp[lim + 1]; /* one spare byte for the n == 7 case below */ \
+			sph_u64 dummy;   /* for alignment */ \
+		} u; \
+		size_t j; \
+ \
+		eb = (0x100 | (ub & 0xFF)) >> (8 - n); /* top n bits of ub in the low bits, then a single 1 bit above them */ \
+		if (kc->ptr == (lim - 1)) { /* exactly one byte of room left in the block */ \
+			if (n == 7) { /* eb fills the whole byte, so the 0x80 ends an extra block */ \
+				u.tmp[0] = eb; \
+				memset(u.tmp + 1, 0, lim - 1); \
+				u.tmp[lim] = 0x80; \
+				j = 1 + lim; \
+			} else { /* both padding markers fit in the one remaining byte */ \
+				u.tmp[0] = eb | 0x80; \
+				j = 1; \
+			} \
+		} else { /* at least two bytes of room: eb, zero fill, then 0x80 in the block's last byte */ \
+			j = lim - kc->ptr; \
+			u.tmp[0] = eb; \
+			memset(u.tmp + 1, 0, j - 2); \
+			u.tmp[j - 1] = 0x80; \
+		} \
+		keccak_core(kc, u.tmp, j, lim); /* j always completes a block, so the permutation runs */ \
+		/* Finalize the "lane complement": invert the six words keccak_init set to all-ones */ \
+		kc->u.wide[ 1] = ~kc->u.wide[ 1]; \
+		kc->u.wide[ 2] = ~kc->u.wide[ 2]; \
+		kc->u.wide[ 8] = ~kc->u.wide[ 8]; \
+		kc->u.wide[12] = ~kc->u.wide[12]; \
+		kc->u.wide[17] = ~kc->u.wide[17]; \
+		kc->u.wide[20] = ~kc->u.wide[20]; \
+		for (j = 0; j < d; j += 8) /* may encode past d when d is not a multiple of 8; tmp has room */ \
+			sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \
+		memcpy(dst, u.tmp, d); /* only the first d bytes reach the caller */ \
+		keccak_init(kc, (unsigned)d << 3); /* reset the context; keccak_init takes the size in bits */ \
+	} \
+
+#else
+
+#define DEFCLOSE(d, lim) /* 32-bit build: defines keccak_close<d>, d = digest bytes, lim = block bytes */ \
+	static void keccak_close ## d( \
+		sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \
+	{ \
+		unsigned eb; \
+		union { \
+			unsigned char tmp[lim + 1]; /* one spare byte for the n == 7 case below */ \
+			sph_u64 dummy;   /* for alignment */ \
+		} u; \
+		size_t j; \
+ \
+		eb = (0x100 | (ub & 0xFF)) >> (8 - n); /* top n bits of ub in the low bits, then a single 1 bit above them */ \
+		if (kc->ptr == (lim - 1)) { /* exactly one byte of room left in the block */ \
+			if (n == 7) { /* eb fills the whole byte, so the 0x80 ends an extra block */ \
+				u.tmp[0] = eb; \
+				memset(u.tmp + 1, 0, lim - 1); \
+				u.tmp[lim] = 0x80; \
+				j = 1 + lim; \
+			} else { /* both padding markers fit in the one remaining byte */ \
+				u.tmp[0] = eb | 0x80; \
+				j = 1; \
+			} \
+		} else { /* at least two bytes of room: eb, zero fill, then 0x80 in the block's last byte */ \
+			j = lim - kc->ptr; \
+			u.tmp[0] = eb; \
+			memset(u.tmp + 1, 0, j - 2); \
+			u.tmp[j - 1] = 0x80; \
+		} \
+		keccak_core(kc, u.tmp, j, lim); /* j always completes a block, so the permutation runs */ \
+		/* Finalize the "lane complement": invert the twelve words keccak_init set to all-ones */ \
+		kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \
+		kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \
+		kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \
+		kc->u.narrow[ 5] = ~kc->u.narrow[ 5]; \
+		kc->u.narrow[16] = ~kc->u.narrow[16]; \
+		kc->u.narrow[17] = ~kc->u.narrow[17]; \
+		kc->u.narrow[24] = ~kc->u.narrow[24]; \
+		kc->u.narrow[25] = ~kc->u.narrow[25]; \
+		kc->u.narrow[34] = ~kc->u.narrow[34]; \
+		kc->u.narrow[35] = ~kc->u.narrow[35]; \
+		kc->u.narrow[40] = ~kc->u.narrow[40]; \
+		kc->u.narrow[41] = ~kc->u.narrow[41]; \
+		/* un-interleave each low/high word pair in place before encoding */ \
+		for (j = 0; j < 50; j += 2) \
+			UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \
+		for (j = 0; j < d; j += 4) /* d is a multiple of 4 for every instantiation in this file */ \
+			sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \
+		memcpy(dst, u.tmp, d); \
+		keccak_init(kc, (unsigned)d << 3); /* reset the context; keccak_init takes the size in bits */ \
+	} \
+
+#endif
+
+DEFCLOSE(28, 144) /* keccak_close28: 28-byte digest, 144-byte block */
+DEFCLOSE(32, 136) /* keccak_close32: 32-byte digest, 136-byte block */
+DEFCLOSE(48, 104) /* keccak_close48: 48-byte digest, 104-byte block */
+DEFCLOSE(64, 72) /* keccak_close64: 64-byte digest, 72-byte block */
+
+/* Reset the sph_keccak_context at cc for a 224-bit digest; see sph_keccak.h */
+void
+sph_keccak224_init(void *cc)
+{
+	keccak_init(cc, 224);
+}
+
+/* Absorb len bytes from data using 144-byte blocks; see sph_keccak.h */
+void
+sph_keccak224(void *cc, const void *data, size_t len)
+{
+	keccak_core(cc, data, len, 144);
+}
+
+/* Finish with no extra bits: 28 bytes go to dst and cc is reset; see sph_keccak.h */
+void
+sph_keccak224_close(void *cc, void *dst)
+{
+	sph_keccak224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Append the top n bits of ub (n expected in 0..7), pad, write 28 bytes to dst, reset cc; see sph_keccak.h */
+void
+sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	keccak_close28(cc, ub, n, dst);
+}
+
+/* Reset the sph_keccak_context at cc for a 256-bit digest; see sph_keccak.h */
+void
+sph_keccak256_init(void *cc)
+{
+	keccak_init(cc, 256);
+}
+
+/* Absorb len bytes from data using 136-byte blocks; see sph_keccak.h */
+void
+sph_keccak256(void *cc, const void *data, size_t len)
+{
+	keccak_core(cc, data, len, 136);
+}
+
+/* Finish with no extra bits: 32 bytes go to dst and cc is reset; see sph_keccak.h */
+void
+sph_keccak256_close(void *cc, void *dst)
+{
+	sph_keccak256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Append the top n bits of ub (n expected in 0..7), pad, write 32 bytes to dst, reset cc; see sph_keccak.h */
+void
+sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	keccak_close32(cc, ub, n, dst);
+}
+
+/* Reset the sph_keccak_context at cc for a 384-bit digest; see sph_keccak.h */
+void
+sph_keccak384_init(void *cc)
+{
+	keccak_init(cc, 384);
+}
+
+/* Absorb len bytes from data using 104-byte blocks; see sph_keccak.h */
+void
+sph_keccak384(void *cc, const void *data, size_t len)
+{
+	keccak_core(cc, data, len, 104);
+}
+
+/* Finish with no extra bits: 48 bytes go to dst and cc is reset; see sph_keccak.h */
+void
+sph_keccak384_close(void *cc, void *dst)
+{
+	sph_keccak384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Append the top n bits of ub (n expected in 0..7), pad, write 48 bytes to dst, reset cc; see sph_keccak.h */
+void
+sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	keccak_close48(cc, ub, n, dst);
+}
+
+/* Reset the sph_keccak_context at cc for a 512-bit digest; see sph_keccak.h */
+void
+sph_keccak512_init(void *cc)
+{
+	keccak_init(cc, 512);
+}
+
+/* Absorb len bytes from data using 72-byte blocks; see sph_keccak.h */
+void
+sph_keccak512(void *cc, const void *data, size_t len)
+{
+	keccak_core(cc, data, len, 72);
+}
+
+/* Finish with no extra bits: 64 bytes go to dst and cc is reset; see sph_keccak.h */
+void
+sph_keccak512_close(void *cc, void *dst)
+{
+	sph_keccak512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Append the top n bits of ub (n expected in 0..7), pad, write 64 bytes to dst, reset cc; see sph_keccak.h */
+void
+sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	keccak_close64(cc, ub, n, dst);
+}
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/luffa.c b/sha3/luffa.c
new file mode 100644
index 0000000..a761bea
--- /dev/null
+++ b/sha3/luffa.c
@@ -0,0 +1,1426 @@
+/* $Id: luffa.c 219 2010-06-08 17:24:41Z tp $ */
+/*
+ * Luffa implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_luffa.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_64_TRUE && !defined SPH_LUFFA_PARALLEL
+#define SPH_LUFFA_PARALLEL   1 /* default when SPH_64_TRUE is set and the build did not define it; enables the sph_u64 tables and macros */
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146) /* MSVC C4146: unary minus applied to an unsigned type */
+#endif
+
+static const sph_u32 V_INIT[5][8] = { /* five rows of eight 32-bit words; presumably the initial state values — confirm at the use site */
+	{
+		SPH_C32(0x6d251e69), SPH_C32(0x44b051e0),
+		SPH_C32(0x4eaa6fb4), SPH_C32(0xdbf78465),
+		SPH_C32(0x6e292011), SPH_C32(0x90152df4),
+		SPH_C32(0xee058139), SPH_C32(0xdef610bb)
+	}, {
+		SPH_C32(0xc3b44b95), SPH_C32(0xd9d2f256),
+		SPH_C32(0x70eee9a0), SPH_C32(0xde099fa3),
+		SPH_C32(0x5d9b0557), SPH_C32(0x8fc944b3),
+		SPH_C32(0xcf1ccf0e), SPH_C32(0x746cd581)
+	}, {
+		SPH_C32(0xf7efc89d), SPH_C32(0x5dba5781),
+		SPH_C32(0x04016ce5), SPH_C32(0xad659c05),
+		SPH_C32(0x0306194f), SPH_C32(0x666d1836),
+		SPH_C32(0x24aa230a), SPH_C32(0x8b264ae7)
+	}, {
+		SPH_C32(0x858075d5), SPH_C32(0x36d79cce),
+		SPH_C32(0xe571f7d7), SPH_C32(0x204b1f67),
+		SPH_C32(0x35870c6a), SPH_C32(0x57e9e923),
+		SPH_C32(0x14bcb808), SPH_C32(0x7cde72ce)
+	}, {
+		SPH_C32(0x6c68e9be), SPH_C32(0x5ec41e22),
+		SPH_C32(0xc825b7c7), SPH_C32(0xaffb4363),
+		SPH_C32(0xf5df3999), SPH_C32(0x0fc688f1),
+		SPH_C32(0xb07224cc), SPH_C32(0x03e86cea)
+	}
+};
+
+static const sph_u32 RC00[8] = { /* also appears as the low 32 bits of each RCW010 entry */
+	SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
+	SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
+	SPH_C32(0x1e00108f), SPH_C32(0x7800423d),
+	SPH_C32(0x8f5b7882), SPH_C32(0x96e1db12)
+};
+
+static const sph_u32 RC04[8] = { /* also appears as the low 32 bits of each RCW014 entry */
+	SPH_C32(0xe0337818), SPH_C32(0x441ba90d),
+	SPH_C32(0x7f34d442), SPH_C32(0x9389217f),
+	SPH_C32(0xe5a8bce6), SPH_C32(0x5274baf4),
+	SPH_C32(0x26889ba7), SPH_C32(0x9a226e9d)
+};
+
+static const sph_u32 RC10[8] = { /* also appears as the high 32 bits of each RCW010 entry */
+	SPH_C32(0xb6de10ed), SPH_C32(0x70f47aae),
+	SPH_C32(0x0707a3d4), SPH_C32(0x1c1e8f51),
+	SPH_C32(0x707a3d45), SPH_C32(0xaeb28562),
+	SPH_C32(0xbaca1589), SPH_C32(0x40a46f3e)
+};
+
+static const sph_u32 RC14[8] = { /* also appears as the high 32 bits of each RCW014 entry */
+	SPH_C32(0x01685f3d), SPH_C32(0x05a17cf4),
+	SPH_C32(0xbd09caca), SPH_C32(0xf4272b28),
+	SPH_C32(0x144ae5cc), SPH_C32(0xfaa7ae2b),
+	SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
+};
+
+#if SPH_LUFFA_PARALLEL
+
+static const sph_u64 RCW010[8] = { /* RCW010[i] == ((sph_u64)RC10[i] << 32) | RC00[i] */
+	SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
+	SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
+	SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
+	SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
+};
+
+static const sph_u64 RCW014[8] = { /* RCW014[i] == ((sph_u64)RC14[i] << 32) | RC04[i] */
+	SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
+	SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
+	SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
+	SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
+};
+
+#endif
+
+static const sph_u32 RC20[8] = { /* also appears as the low 32 bits of each RCW230 entry */
+	SPH_C32(0xfc20d9d2), SPH_C32(0x34552e25),
+	SPH_C32(0x7ad8818f), SPH_C32(0x8438764a),
+	SPH_C32(0xbb6de032), SPH_C32(0xedb780c8),
+	SPH_C32(0xd9847356), SPH_C32(0xa2c78434)
+};
+
+static const sph_u32 RC24[8] = { /* also appears as the low 32 bits of each RCW234 entry */
+	SPH_C32(0xe25e72c1), SPH_C32(0xe623bb72),
+	SPH_C32(0x5c58a4a4), SPH_C32(0x1e38e2e7),
+	SPH_C32(0x78e38b9d), SPH_C32(0x27586719),
+	SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
+};
+
+static const sph_u32 RC30[8] = { /* also appears as the high 32 bits of each RCW230 entry */
+	SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
+	SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
+	SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
+	SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
+};
+
+static const sph_u32 RC34[8] = { /* also appears as the high 32 bits of each RCW234 entry */
+	SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
+	SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
+	SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
+	SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
+};
+
+#if SPH_LUFFA_PARALLEL
+
+static const sph_u64 RCW230[8] = { /* RCW230[i] == ((sph_u64)RC30[i] << 32) | RC20[i] */
+	SPH_C64(0xb213afa5fc20d9d2), SPH_C64(0xc84ebe9534552e25),
+	SPH_C64(0x4e608a227ad8818f), SPH_C64(0x56d858fe8438764a),
+	SPH_C64(0x343b138fbb6de032), SPH_C64(0xd0ec4e3dedb780c8),
+	SPH_C64(0x2ceb4882d9847356), SPH_C64(0xb3ad2208a2c78434)
+};
+
+
+static const sph_u64 RCW234[8] = { /* RCW234[i] == ((sph_u64)RC34[i] << 32) | RC24[i] */
+	SPH_C64(0xe028c9bfe25e72c1), SPH_C64(0x44756f91e623bb72),
+	SPH_C64(0x7e8fce325c58a4a4), SPH_C64(0x956548be1e38e2e7),
+	SPH_C64(0xfe191be278e38b9d), SPH_C64(0x3cb226e527586719),
+	SPH_C64(0x5944a28e36eda57f), SPH_C64(0xa1c4c355703aace7)
+};
+
+#endif
+
+static const sph_u32 RC40[8] = { /* eight 32-bit constants; presumably the counterpart of RC00/RC10/RC20/RC30 for a fifth word group — confirm at the use site */
+	SPH_C32(0xf0d2e9e3), SPH_C32(0xac11d7fa),
+	SPH_C32(0x1bcb66f2), SPH_C32(0x6f2d9bc9),
+	SPH_C32(0x78602649), SPH_C32(0x8edae952),
+	SPH_C32(0x3b6ba548), SPH_C32(0xedae9520)
+};
+
+static const sph_u32 RC44[8] = { /* eight 32-bit constants; presumably the counterpart of RC04/RC14/RC24/RC34 for a fifth word group — confirm at the use site */
+	SPH_C32(0x5090d577), SPH_C32(0x2d1925ab),
+	SPH_C32(0xb46496ac), SPH_C32(0xd1925ab0),
+	SPH_C32(0x29131ab6), SPH_C32(0x0fc053c3),
+	SPH_C32(0x3f014f0c), SPH_C32(0xfc053c31)
+};
+
+#define DECL_TMP8(w) /* declares eight sph_u32 variables w0..w7 */ \
+	sph_u32 w ## 0, w ## 1, w ## 2, w ## 3, w ## 4, w ## 5, w ## 6, w ## 7; /* expansion already ends with ';' */
+
+#define M2(d, s)   do { /* d and s are name prefixes for word groups d0..d7 and s0..s7 */ \
+		sph_u32 tmp = s ## 7; /* saved first, and words are written from 7 down to 0, so d may be the same group as s */ \
+		d ## 7 = s ## 6; /* each word moves up one index ... */ \
+		d ## 6 = s ## 5; \
+		d ## 5 = s ## 4; \
+		d ## 4 = s ## 3 ^ tmp; /* ... and the old word 7 is folded into words 4, 3 and 1 */ \
+		d ## 3 = s ## 2 ^ tmp; \
+		d ## 2 = s ## 1; \
+		d ## 1 = s ## 0 ^ tmp; \
+		d ## 0 = tmp; /* old word 7 wraps around to index 0 */ \
+	} while (0)
+
+#define XOR(d, s1, s2)   do { /* word-wise d = s1 ^ s2 over eight-word groups named by prefix */ \
+		d ## 0 = s1 ## 0 ^ s2 ## 0; \
+		d ## 1 = s1 ## 1 ^ s2 ## 1; \
+		d ## 2 = s1 ## 2 ^ s2 ## 2; \
+		d ## 3 = s1 ## 3 ^ s2 ## 3; \
+		d ## 4 = s1 ## 4 ^ s2 ## 4; \
+		d ## 5 = s1 ## 5 ^ s2 ## 5; \
+		d ## 6 = s1 ## 6 ^ s2 ## 6; \
+		d ## 7 = s1 ## 7 ^ s2 ## 7; \
+	} while (0)
+
+#if SPH_LUFFA_PARALLEL
+
+#define SUB_CRUMB_GEN(a0, a1, a2, a3, width)   do { /* bitwise transform of a0..a3 in place; width (32 or 64) picks sph_u<width> and SPH_T<width> */ \
+		sph_u ## width tmp; \
+		tmp = (a0); /* keeps the original a0 for later steps */ \
+		(a0) |= (a1); \
+		(a2) ^= (a3); \
+		(a1) = SPH_T ## width(~(a1)); \
+		(a0) ^= (a3); \
+		(a3) &= tmp; \
+		(a1) ^= (a3); \
+		(a3) ^= (a2); \
+		(a2) &= (a0); \
+		(a0) = SPH_T ## width(~(a0)); \
+		(a2) ^= (a1); \
+		(a1) |= (a3); \
+		tmp ^= (a1); \
+		(a3) ^= (a2); \
+		(a2) &= (a1); \
+		(a1) ^= (a0); \
+		(a0) = tmp; /* every argument is evaluated several times: pass plain lvalues only */ \
+	} while (0)
+
+#define SUB_CRUMB(a0, a1, a2, a3)    SUB_CRUMB_GEN(a0, a1, a2, a3, 32) /* 32-bit words: one lane */
+#define SUB_CRUMBW(a0, a1, a2, a3)   SUB_CRUMB_GEN(a0, a1, a2, a3, 64) /* 64-bit words: the packed W0..W7 of the P macros */
+
+
+#if 0 /* never compiled: variant of MIX_WORDW working on whole 64-bit words; the definition after this group is the live one */
+
+#define ROL32W(x, n)   SPH_T64( \
+                       (((x) << (n)) \
+                       & ~((SPH_C64(0xFFFFFFFF) >> (32 - (n))) << 32)) \
+                       | (((x) >> (32 - (n))) \
+                       & ~((SPH_C64(0xFFFFFFFF) >> (n)) << (n))))
+
+#define MIX_WORDW(u, v)   do { \
+		(v) ^= (u); \
+		(u) = ROL32W((u), 2) ^ (v); \
+		(v) = ROL32W((v), 14) ^ (u); \
+		(u) = ROL32W((u), 10) ^ (v); \
+		(v) = ROL32W((v), 1); \
+	} while (0)
+
+#endif /* 0 -- NOTE(review): ROL32W looks meant to rotate each 32-bit half left by n; unverified dead code */
+
+#define MIX_WORDW(u, v)   do { /* the MIX_WORD rotate/XOR sequence (2, 14, 10, 1) applied separately to each 32-bit half */ \
+		sph_u32 ul, uh, vl, vh; \
+		(v) ^= (u); \
+		ul = SPH_T32((sph_u32)(u)); /* split both words into low and high halves */ \
+		uh = SPH_T32((sph_u32)((u) >> 32)); \
+		vl = SPH_T32((sph_u32)(v)); \
+		vh = SPH_T32((sph_u32)((v) >> 32)); \
+		ul = SPH_ROTL32(ul, 2) ^ vl; /* low halves */ \
+		vl = SPH_ROTL32(vl, 14) ^ ul; \
+		ul = SPH_ROTL32(ul, 10) ^ vl; \
+		vl = SPH_ROTL32(vl, 1); \
+		uh = SPH_ROTL32(uh, 2) ^ vh; /* high halves, same steps */ \
+		vh = SPH_ROTL32(vh, 14) ^ uh; \
+		uh = SPH_ROTL32(uh, 10) ^ vh; \
+		vh = SPH_ROTL32(vh, 1); \
+		(u) = (sph_u64)ul | ((sph_u64)uh << 32); /* reassemble; u and v are evaluated several times, pass plain variables */ \
+		(v) = (sph_u64)vl | ((sph_u64)vh << 32); \
+	} while (0)
+
+#else
+
+#define SUB_CRUMB(a0, a1, a2, a3)   do { /* build without SPH_LUFFA_PARALLEL: same statement sequence as SUB_CRUMB_GEN at width 32 */ \
+		sph_u32 tmp; \
+		tmp = (a0); \
+		(a0) |= (a1); \
+		(a2) ^= (a3); \
+		(a1) = SPH_T32(~(a1)); \
+		(a0) ^= (a3); \
+		(a3) &= tmp; \
+		(a1) ^= (a3); \
+		(a3) ^= (a2); \
+		(a2) &= (a0); \
+		(a0) = SPH_T32(~(a0)); \
+		(a2) ^= (a1); \
+		(a1) |= (a3); \
+		tmp ^= (a1); \
+		(a3) ^= (a2); \
+		(a2) &= (a1); \
+		(a1) ^= (a0); \
+		(a0) = tmp; /* a0 receives the value accumulated in tmp */ \
+	} while (0) /* NOTE(review): presumably the Luffa SubCrumb S-box in bitsliced form -- confirm against the spec */
+
+#endif
+
+#define MIX_WORD(u, v)   do { /* in-place mix of two 32-bit words; u and v are evaluated several times, pass plain variables */ \
+		(v) ^= (u); \
+		(u) = SPH_ROTL32((u), 2) ^ (v); /* each step rotates one word left and XORs in the other */ \
+		(v) = SPH_ROTL32((v), 14) ^ (u); \
+		(u) = SPH_ROTL32((u), 10) ^ (v); \
+		(v) = SPH_ROTL32((v), 1); /* final rotation only, no XOR */ \
+	} while (0)
+
+#define DECL_STATE3 /* 3 lanes x 8 local words; Vij is the local copy of V[i][j] used by READ_STATE3/WRITE_STATE3 */ \
+	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
+	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
+	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27;
+
+#define READ_STATE3(state)   do { /* copy (state)->V[i][j] into the local Vij for i < 3; needs DECL_STATE3 in scope */ \
+		V00 = (state)->V[0][0]; \
+		V01 = (state)->V[0][1]; \
+		V02 = (state)->V[0][2]; \
+		V03 = (state)->V[0][3]; \
+		V04 = (state)->V[0][4]; \
+		V05 = (state)->V[0][5]; \
+		V06 = (state)->V[0][6]; \
+		V07 = (state)->V[0][7]; \
+		V10 = (state)->V[1][0]; \
+		V11 = (state)->V[1][1]; \
+		V12 = (state)->V[1][2]; \
+		V13 = (state)->V[1][3]; \
+		V14 = (state)->V[1][4]; \
+		V15 = (state)->V[1][5]; \
+		V16 = (state)->V[1][6]; \
+		V17 = (state)->V[1][7]; \
+		V20 = (state)->V[2][0]; \
+		V21 = (state)->V[2][1]; \
+		V22 = (state)->V[2][2]; \
+		V23 = (state)->V[2][3]; \
+		V24 = (state)->V[2][4]; \
+		V25 = (state)->V[2][5]; \
+		V26 = (state)->V[2][6]; \
+		V27 = (state)->V[2][7]; \
+	} while (0)
+
+#define WRITE_STATE3(state)   do { /* store the local Vij back into (state)->V[i][j] for i < 3; inverse of READ_STATE3 */ \
+		(state)->V[0][0] = V00; \
+		(state)->V[0][1] = V01; \
+		(state)->V[0][2] = V02; \
+		(state)->V[0][3] = V03; \
+		(state)->V[0][4] = V04; \
+		(state)->V[0][5] = V05; \
+		(state)->V[0][6] = V06; \
+		(state)->V[0][7] = V07; \
+		(state)->V[1][0] = V10; \
+		(state)->V[1][1] = V11; \
+		(state)->V[1][2] = V12; \
+		(state)->V[1][3] = V13; \
+		(state)->V[1][4] = V14; \
+		(state)->V[1][5] = V15; \
+		(state)->V[1][6] = V16; \
+		(state)->V[1][7] = V17; \
+		(state)->V[2][0] = V20; \
+		(state)->V[2][1] = V21; \
+		(state)->V[2][2] = V22; \
+		(state)->V[2][3] = V23; \
+		(state)->V[2][4] = V24; \
+		(state)->V[2][5] = V25; \
+		(state)->V[2][6] = V26; \
+		(state)->V[2][7] = V27; \
+	} while (0)
+
+#define MI3   do { /* message injection, 3 lanes: decodes eight words from the enclosing function's buf and folds them into V0..V2 */ \
+		DECL_TMP8(M) \
+		DECL_TMP8(a) \
+		M0 = sph_dec32be_aligned(buf +  0); \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); /* this is the variable M2; the macro M2() only expands when followed by '(' */ \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		XOR(a, V0, V1); \
+		XOR(a, a, V2); \
+		M2(a, a); /* a = M2(V0 ^ V1 ^ V2) */ \
+		XOR(V0, a, V0); \
+		XOR(V0, M, V0); /* V0 ^= a ^ M */ \
+		M2(M, M); \
+		XOR(V1, a, V1); \
+		XOR(V1, M, V1); /* V1 ^= a ^ M2(M) */ \
+		M2(M, M); \
+		XOR(V2, a, V2); \
+		XOR(V2, M, V2); /* V2 ^= a ^ M2(M2(M)) */ \
+	} while (0)
+
+#define TWEAK3   do { /* rotate words 4..7 of lane i left by i bits (i = 1, 2); lane 0 is left as is */ \
+		V14 = SPH_ROTL32(V14, 1); \
+		V15 = SPH_ROTL32(V15, 1); \
+		V16 = SPH_ROTL32(V16, 1); \
+		V17 = SPH_ROTL32(V17, 1); \
+		V24 = SPH_ROTL32(V24, 2); \
+		V25 = SPH_ROTL32(V25, 2); \
+		V26 = SPH_ROTL32(V26, 2); \
+		V27 = SPH_ROTL32(V27, 2); \
+	} while (0)
+
+#if SPH_LUFFA_PARALLEL
+
+#define P3   do { /* permutation, SPH_LUFFA_PARALLEL build: lanes 0 and 1 run packed in 64-bit words, lane 2 on 32-bit words */ \
+		int r; \
+		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
+		TWEAK3; \
+		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); /* Wj = lane-0 word j in the low half, lane-1 word j in the high half */ \
+		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
+		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
+		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
+		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
+		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
+		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
+		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
+		for (r = 0; r < 8; r ++) { /* 8 rounds on lanes 0 and 1 together */ \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); /* words 4..7 are passed in the order 5, 6, 7, 4 */ \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW010[r]; /* round constants go into words 0 and 4 only */ \
+			W4 ^= RCW014[r]; \
+		} \
+		V00 = SPH_T32((sph_u32)W0); /* unpack: low half back to lane 0, high half back to lane 1 */ \
+		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V01 = SPH_T32((sph_u32)W1); \
+		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V02 = SPH_T32((sph_u32)W2); \
+		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V03 = SPH_T32((sph_u32)W3); \
+		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V04 = SPH_T32((sph_u32)W4); \
+		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V05 = SPH_T32((sph_u32)W5); \
+		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V06 = SPH_T32((sph_u32)W6); \
+		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V07 = SPH_T32((sph_u32)W7); \
+		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
+		for (r = 0; r < 8; r ++) { /* lane 2 has no partner, so it uses the 32-bit macros */ \
+			SUB_CRUMB(V20, V21, V22, V23); \
+			SUB_CRUMB(V25, V26, V27, V24); \
+			MIX_WORD(V20, V24); \
+			MIX_WORD(V21, V25); \
+			MIX_WORD(V22, V26); \
+			MIX_WORD(V23, V27); \
+			V20 ^= RC20[r]; \
+			V24 ^= RC24[r]; \
+		} \
+	} while (0)
+
+#else
+
+#define P3   do { /* permutation, 32-bit build: TWEAK3, then 8 rounds on each of the 3 lanes with its own RCi0/RCi4 constants */ \
+		int r; \
+		TWEAK3; \
+		for (r = 0; r < 8; r ++) { /* lane 0 */ \
+			SUB_CRUMB(V00, V01, V02, V03); \
+			SUB_CRUMB(V05, V06, V07, V04); /* words 4..7 are passed in the order 5, 6, 7, 4 */ \
+			MIX_WORD(V00, V04); \
+			MIX_WORD(V01, V05); \
+			MIX_WORD(V02, V06); \
+			MIX_WORD(V03, V07); \
+			V00 ^= RC00[r]; /* round constants go into words 0 and 4 only */ \
+			V04 ^= RC04[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 1 */ \
+			SUB_CRUMB(V10, V11, V12, V13); \
+			SUB_CRUMB(V15, V16, V17, V14); \
+			MIX_WORD(V10, V14); \
+			MIX_WORD(V11, V15); \
+			MIX_WORD(V12, V16); \
+			MIX_WORD(V13, V17); \
+			V10 ^= RC10[r]; \
+			V14 ^= RC14[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 2 */ \
+			SUB_CRUMB(V20, V21, V22, V23); \
+			SUB_CRUMB(V25, V26, V27, V24); \
+			MIX_WORD(V20, V24); \
+			MIX_WORD(V21, V25); \
+			MIX_WORD(V22, V26); \
+			MIX_WORD(V23, V27); \
+			V20 ^= RC20[r]; \
+			V24 ^= RC24[r]; \
+		} \
+	} while (0)
+
+#endif
+
+#define DECL_STATE4 /* 4 lanes x 8 local words; Vij is the local copy of V[i][j] used by READ_STATE4/WRITE_STATE4 */ \
+	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
+	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
+	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \
+	sph_u32 V30, V31, V32, V33, V34, V35, V36, V37;
+
+#define READ_STATE4(state)   do { /* copy (state)->V[i][j] into the local Vij for i < 4; needs DECL_STATE4 in scope */ \
+		V00 = (state)->V[0][0]; \
+		V01 = (state)->V[0][1]; \
+		V02 = (state)->V[0][2]; \
+		V03 = (state)->V[0][3]; \
+		V04 = (state)->V[0][4]; \
+		V05 = (state)->V[0][5]; \
+		V06 = (state)->V[0][6]; \
+		V07 = (state)->V[0][7]; \
+		V10 = (state)->V[1][0]; \
+		V11 = (state)->V[1][1]; \
+		V12 = (state)->V[1][2]; \
+		V13 = (state)->V[1][3]; \
+		V14 = (state)->V[1][4]; \
+		V15 = (state)->V[1][5]; \
+		V16 = (state)->V[1][6]; \
+		V17 = (state)->V[1][7]; \
+		V20 = (state)->V[2][0]; \
+		V21 = (state)->V[2][1]; \
+		V22 = (state)->V[2][2]; \
+		V23 = (state)->V[2][3]; \
+		V24 = (state)->V[2][4]; \
+		V25 = (state)->V[2][5]; \
+		V26 = (state)->V[2][6]; \
+		V27 = (state)->V[2][7]; \
+		V30 = (state)->V[3][0]; \
+		V31 = (state)->V[3][1]; \
+		V32 = (state)->V[3][2]; \
+		V33 = (state)->V[3][3]; \
+		V34 = (state)->V[3][4]; \
+		V35 = (state)->V[3][5]; \
+		V36 = (state)->V[3][6]; \
+		V37 = (state)->V[3][7]; \
+	} while (0)
+
+#define WRITE_STATE4(state)   do { /* store the local Vij back into (state)->V[i][j] for i < 4; inverse of READ_STATE4 */ \
+		(state)->V[0][0] = V00; \
+		(state)->V[0][1] = V01; \
+		(state)->V[0][2] = V02; \
+		(state)->V[0][3] = V03; \
+		(state)->V[0][4] = V04; \
+		(state)->V[0][5] = V05; \
+		(state)->V[0][6] = V06; \
+		(state)->V[0][7] = V07; \
+		(state)->V[1][0] = V10; \
+		(state)->V[1][1] = V11; \
+		(state)->V[1][2] = V12; \
+		(state)->V[1][3] = V13; \
+		(state)->V[1][4] = V14; \
+		(state)->V[1][5] = V15; \
+		(state)->V[1][6] = V16; \
+		(state)->V[1][7] = V17; \
+		(state)->V[2][0] = V20; \
+		(state)->V[2][1] = V21; \
+		(state)->V[2][2] = V22; \
+		(state)->V[2][3] = V23; \
+		(state)->V[2][4] = V24; \
+		(state)->V[2][5] = V25; \
+		(state)->V[2][6] = V26; \
+		(state)->V[2][7] = V27; \
+		(state)->V[3][0] = V30; \
+		(state)->V[3][1] = V31; \
+		(state)->V[3][2] = V32; \
+		(state)->V[3][3] = V33; \
+		(state)->V[3][4] = V34; \
+		(state)->V[3][5] = V35; \
+		(state)->V[3][6] = V36; \
+		(state)->V[3][7] = V37; \
+	} while (0)
+
+#define MI4   do { /* message injection, 4 lanes: decodes eight words from the enclosing function's buf and folds them into V0..V3 */ \
+		DECL_TMP8(M) \
+		DECL_TMP8(a) \
+		DECL_TMP8(b) \
+		M0 = sph_dec32be_aligned(buf +  0); \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); /* this is the variable M2; the macro M2() only expands when followed by '(' */ \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		XOR(a, V0, V1); \
+		XOR(b, V2, V3); \
+		XOR(a, a, b); \
+		M2(a, a); /* a = M2(V0 ^ V1 ^ V2 ^ V3) */ \
+		XOR(V0, a, V0); /* every lane ^= a */ \
+		XOR(V1, a, V1); \
+		XOR(V2, a, V2); \
+		XOR(V3, a, V3); \
+		M2(b, V0); \
+		XOR(b, b, V3); /* b = M2(V0) ^ V3 is the future V0, parked because the old V0 is still read below */ \
+		M2(V3, V3); \
+		XOR(V3, V3, V2); /* V3 = M2(V3) ^ V2, with V2 not yet updated */ \
+		M2(V2, V2); \
+		XOR(V2, V2, V1); /* V2 = M2(V2) ^ V1, with V1 not yet updated */ \
+		M2(V1, V1); \
+		XOR(V1, V1, V0); /* V1 = M2(V1) ^ V0, with V0 not yet updated */ \
+		XOR(V0, b, M); /* V0 = b ^ M */ \
+		M2(M, M); \
+		XOR(V1, V1, M); /* V1 ^= M2(M) */ \
+		M2(M, M); \
+		XOR(V2, V2, M); /* V2 ^= M2 applied twice to M */ \
+		M2(M, M); \
+		XOR(V3, V3, M); /* V3 ^= M2 applied three times to M */ \
+	} while (0)
+
+#define TWEAK4   do { /* rotate words 4..7 of lane i left by i bits (i = 1, 2, 3); lane 0 is left as is */ \
+		V14 = SPH_ROTL32(V14, 1); \
+		V15 = SPH_ROTL32(V15, 1); \
+		V16 = SPH_ROTL32(V16, 1); \
+		V17 = SPH_ROTL32(V17, 1); \
+		V24 = SPH_ROTL32(V24, 2); \
+		V25 = SPH_ROTL32(V25, 2); \
+		V26 = SPH_ROTL32(V26, 2); \
+		V27 = SPH_ROTL32(V27, 2); \
+		V34 = SPH_ROTL32(V34, 3); \
+		V35 = SPH_ROTL32(V35, 3); \
+		V36 = SPH_ROTL32(V36, 3); \
+		V37 = SPH_ROTL32(V37, 3); \
+	} while (0)
+
+#if SPH_LUFFA_PARALLEL
+
+#define P4   do { /* permutation, SPH_LUFFA_PARALLEL build: lanes 0+1, then lanes 2+3, each pair packed in 64-bit words */ \
+		int r; \
+		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
+		TWEAK4; \
+		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); /* Wj = lane-0 word j in the low half, lane-1 word j in the high half */ \
+		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
+		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
+		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
+		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
+		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
+		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
+		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
+		for (r = 0; r < 8; r ++) { /* 8 rounds on lanes 0 and 1 together */ \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); /* words 4..7 are passed in the order 5, 6, 7, 4 */ \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW010[r]; /* round constants go into words 0 and 4 only */ \
+			W4 ^= RCW014[r]; \
+		} \
+		V00 = SPH_T32((sph_u32)W0); /* unpack: low half back to lane 0, high half back to lane 1 */ \
+		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V01 = SPH_T32((sph_u32)W1); \
+		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V02 = SPH_T32((sph_u32)W2); \
+		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V03 = SPH_T32((sph_u32)W3); \
+		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V04 = SPH_T32((sph_u32)W4); \
+		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V05 = SPH_T32((sph_u32)W5); \
+		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V06 = SPH_T32((sph_u32)W6); \
+		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V07 = SPH_T32((sph_u32)W7); \
+		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
+		W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); /* repack: lane 2 in the low half, lane 3 in the high half */ \
+		W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \
+		W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \
+		W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \
+		W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \
+		W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \
+		W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \
+		W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \
+		for (r = 0; r < 8; r ++) { /* 8 rounds on lanes 2 and 3 together */ \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW230[r]; \
+			W4 ^= RCW234[r]; \
+		} \
+		V20 = SPH_T32((sph_u32)W0); /* unpack: low half back to lane 2, high half back to lane 3 */ \
+		V30 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V21 = SPH_T32((sph_u32)W1); \
+		V31 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V22 = SPH_T32((sph_u32)W2); \
+		V32 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V23 = SPH_T32((sph_u32)W3); \
+		V33 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V24 = SPH_T32((sph_u32)W4); \
+		V34 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V25 = SPH_T32((sph_u32)W5); \
+		V35 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V26 = SPH_T32((sph_u32)W6); \
+		V36 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V27 = SPH_T32((sph_u32)W7); \
+		V37 = SPH_T32((sph_u32)(W7 >> 32)); \
+	} while (0)
+
+#else
+
+#define P4   do { /* permutation, 32-bit build: TWEAK4, then 8 rounds on each of the 4 lanes with its own RCi0/RCi4 constants */ \
+		int r; \
+		TWEAK4; \
+		for (r = 0; r < 8; r ++) { /* lane 0 */ \
+			SUB_CRUMB(V00, V01, V02, V03); \
+			SUB_CRUMB(V05, V06, V07, V04); /* words 4..7 are passed in the order 5, 6, 7, 4 */ \
+			MIX_WORD(V00, V04); \
+			MIX_WORD(V01, V05); \
+			MIX_WORD(V02, V06); \
+			MIX_WORD(V03, V07); \
+			V00 ^= RC00[r]; /* round constants go into words 0 and 4 only */ \
+			V04 ^= RC04[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 1 */ \
+			SUB_CRUMB(V10, V11, V12, V13); \
+			SUB_CRUMB(V15, V16, V17, V14); \
+			MIX_WORD(V10, V14); \
+			MIX_WORD(V11, V15); \
+			MIX_WORD(V12, V16); \
+			MIX_WORD(V13, V17); \
+			V10 ^= RC10[r]; \
+			V14 ^= RC14[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 2 */ \
+			SUB_CRUMB(V20, V21, V22, V23); \
+			SUB_CRUMB(V25, V26, V27, V24); \
+			MIX_WORD(V20, V24); \
+			MIX_WORD(V21, V25); \
+			MIX_WORD(V22, V26); \
+			MIX_WORD(V23, V27); \
+			V20 ^= RC20[r]; \
+			V24 ^= RC24[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 3 */ \
+			SUB_CRUMB(V30, V31, V32, V33); \
+			SUB_CRUMB(V35, V36, V37, V34); \
+			MIX_WORD(V30, V34); \
+			MIX_WORD(V31, V35); \
+			MIX_WORD(V32, V36); \
+			MIX_WORD(V33, V37); \
+			V30 ^= RC30[r]; \
+			V34 ^= RC34[r]; \
+		} \
+	} while (0)
+
+#endif
+
+#define DECL_STATE5 /* 5 lanes x 8 local words; Vij is the local copy of V[i][j] used by READ_STATE5/WRITE_STATE5 */ \
+	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
+	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
+	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \
+	sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; \
+	sph_u32 V40, V41, V42, V43, V44, V45, V46, V47;
+
+#define READ_STATE5(state)   do { /* copy (state)->V[i][j] into the local Vij for i < 5; needs DECL_STATE5 in scope */ \
+		V00 = (state)->V[0][0]; \
+		V01 = (state)->V[0][1]; \
+		V02 = (state)->V[0][2]; \
+		V03 = (state)->V[0][3]; \
+		V04 = (state)->V[0][4]; \
+		V05 = (state)->V[0][5]; \
+		V06 = (state)->V[0][6]; \
+		V07 = (state)->V[0][7]; \
+		V10 = (state)->V[1][0]; \
+		V11 = (state)->V[1][1]; \
+		V12 = (state)->V[1][2]; \
+		V13 = (state)->V[1][3]; \
+		V14 = (state)->V[1][4]; \
+		V15 = (state)->V[1][5]; \
+		V16 = (state)->V[1][6]; \
+		V17 = (state)->V[1][7]; \
+		V20 = (state)->V[2][0]; \
+		V21 = (state)->V[2][1]; \
+		V22 = (state)->V[2][2]; \
+		V23 = (state)->V[2][3]; \
+		V24 = (state)->V[2][4]; \
+		V25 = (state)->V[2][5]; \
+		V26 = (state)->V[2][6]; \
+		V27 = (state)->V[2][7]; \
+		V30 = (state)->V[3][0]; \
+		V31 = (state)->V[3][1]; \
+		V32 = (state)->V[3][2]; \
+		V33 = (state)->V[3][3]; \
+		V34 = (state)->V[3][4]; \
+		V35 = (state)->V[3][5]; \
+		V36 = (state)->V[3][6]; \
+		V37 = (state)->V[3][7]; \
+		V40 = (state)->V[4][0]; \
+		V41 = (state)->V[4][1]; \
+		V42 = (state)->V[4][2]; \
+		V43 = (state)->V[4][3]; \
+		V44 = (state)->V[4][4]; \
+		V45 = (state)->V[4][5]; \
+		V46 = (state)->V[4][6]; \
+		V47 = (state)->V[4][7]; \
+	} while (0)
+
+#define WRITE_STATE5(state)   do { /* store the local Vij back into (state)->V[i][j] for i < 5; inverse of READ_STATE5 */ \
+		(state)->V[0][0] = V00; \
+		(state)->V[0][1] = V01; \
+		(state)->V[0][2] = V02; \
+		(state)->V[0][3] = V03; \
+		(state)->V[0][4] = V04; \
+		(state)->V[0][5] = V05; \
+		(state)->V[0][6] = V06; \
+		(state)->V[0][7] = V07; \
+		(state)->V[1][0] = V10; \
+		(state)->V[1][1] = V11; \
+		(state)->V[1][2] = V12; \
+		(state)->V[1][3] = V13; \
+		(state)->V[1][4] = V14; \
+		(state)->V[1][5] = V15; \
+		(state)->V[1][6] = V16; \
+		(state)->V[1][7] = V17; \
+		(state)->V[2][0] = V20; \
+		(state)->V[2][1] = V21; \
+		(state)->V[2][2] = V22; \
+		(state)->V[2][3] = V23; \
+		(state)->V[2][4] = V24; \
+		(state)->V[2][5] = V25; \
+		(state)->V[2][6] = V26; \
+		(state)->V[2][7] = V27; \
+		(state)->V[3][0] = V30; \
+		(state)->V[3][1] = V31; \
+		(state)->V[3][2] = V32; \
+		(state)->V[3][3] = V33; \
+		(state)->V[3][4] = V34; \
+		(state)->V[3][5] = V35; \
+		(state)->V[3][6] = V36; \
+		(state)->V[3][7] = V37; \
+		(state)->V[4][0] = V40; \
+		(state)->V[4][1] = V41; \
+		(state)->V[4][2] = V42; \
+		(state)->V[4][3] = V43; \
+		(state)->V[4][4] = V44; \
+		(state)->V[4][5] = V45; \
+		(state)->V[4][6] = V46; \
+		(state)->V[4][7] = V47; \
+	} while (0)
+
+#define MI5   do { /* message injection, 5 lanes: decodes eight words from the enclosing function's buf and folds them into V0..V4 */ \
+		DECL_TMP8(M) \
+		DECL_TMP8(a) \
+		DECL_TMP8(b) \
+		M0 = sph_dec32be_aligned(buf +  0); \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); /* this is the variable M2; the macro M2() only expands when followed by '(' */ \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		XOR(a, V0, V1); \
+		XOR(b, V2, V3); \
+		XOR(a, a, b); \
+		XOR(a, a, V4); \
+		M2(a, a); /* a = M2(V0 ^ V1 ^ V2 ^ V3 ^ V4) */ \
+		XOR(V0, a, V0); /* every lane ^= a */ \
+		XOR(V1, a, V1); \
+		XOR(V2, a, V2); \
+		XOR(V3, a, V3); \
+		XOR(V4, a, V4); \
+		M2(b, V0); /* first pass: lane i becomes M2(lane i) ^ lane i+1 (wrapping), all from pre-pass values */ \
+		XOR(b, b, V1); /* b holds the pass-1 result for lane 0 so the old V0 stays readable */ \
+		M2(V1, V1); \
+		XOR(V1, V1, V2); \
+		M2(V2, V2); \
+		XOR(V2, V2, V3); \
+		M2(V3, V3); \
+		XOR(V3, V3, V4); \
+		M2(V4, V4); \
+		XOR(V4, V4, V0); /* V0 is still its pre-pass value here */ \
+		M2(V0, b); /* second pass: lane i becomes M2(lane i) ^ lane i-1 (wrapping), all from pass-1 values */ \
+		XOR(V0, V0, V4); \
+		M2(V4, V4); \
+		XOR(V4, V4, V3); \
+		M2(V3, V3); \
+		XOR(V3, V3, V2); \
+		M2(V2, V2); \
+		XOR(V2, V2, V1); \
+		M2(V1, V1); \
+		XOR(V1, V1, b); /* b is unchanged by M2(V0, b), so this is the pass-1 lane 0 */ \
+		XOR(V0, V0, M); /* message: lane i ^= M2 applied i times to M */ \
+		M2(M, M); \
+		XOR(V1, V1, M); \
+		M2(M, M); \
+		XOR(V2, V2, M); \
+		M2(M, M); \
+		XOR(V3, V3, M); \
+		M2(M, M); \
+		XOR(V4, V4, M); \
+	} while (0)
+
+#define TWEAK5   do { /* rotate words 4..7 of lane i left by i bits (i = 1..4); lane 0 is left as is */ \
+		V14 = SPH_ROTL32(V14, 1); \
+		V15 = SPH_ROTL32(V15, 1); \
+		V16 = SPH_ROTL32(V16, 1); \
+		V17 = SPH_ROTL32(V17, 1); \
+		V24 = SPH_ROTL32(V24, 2); \
+		V25 = SPH_ROTL32(V25, 2); \
+		V26 = SPH_ROTL32(V26, 2); \
+		V27 = SPH_ROTL32(V27, 2); \
+		V34 = SPH_ROTL32(V34, 3); \
+		V35 = SPH_ROTL32(V35, 3); \
+		V36 = SPH_ROTL32(V36, 3); \
+		V37 = SPH_ROTL32(V37, 3); \
+		V44 = SPH_ROTL32(V44, 4); \
+		V45 = SPH_ROTL32(V45, 4); \
+		V46 = SPH_ROTL32(V46, 4); \
+		V47 = SPH_ROTL32(V47, 4); \
+	} while (0)
+
+#if SPH_LUFFA_PARALLEL
+
+#define P5   do { /* permutation, SPH_LUFFA_PARALLEL build: lanes 0+1 and 2+3 run packed in 64-bit words, lane 4 on 32-bit words */ \
+		int r; \
+		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
+		TWEAK5; \
+		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); /* Wj = lane-0 word j in the low half, lane-1 word j in the high half */ \
+		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
+		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
+		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
+		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
+		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
+		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
+		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
+		for (r = 0; r < 8; r ++) { /* 8 rounds on lanes 0 and 1 together */ \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); /* words 4..7 are passed in the order 5, 6, 7, 4 */ \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW010[r]; /* round constants go into words 0 and 4 only */ \
+			W4 ^= RCW014[r]; \
+		} \
+		V00 = SPH_T32((sph_u32)W0); /* unpack: low half back to lane 0, high half back to lane 1 */ \
+		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V01 = SPH_T32((sph_u32)W1); \
+		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V02 = SPH_T32((sph_u32)W2); \
+		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V03 = SPH_T32((sph_u32)W3); \
+		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V04 = SPH_T32((sph_u32)W4); \
+		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V05 = SPH_T32((sph_u32)W5); \
+		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V06 = SPH_T32((sph_u32)W6); \
+		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V07 = SPH_T32((sph_u32)W7); \
+		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
+		W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); /* repack: lane 2 in the low half, lane 3 in the high half */ \
+		W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \
+		W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \
+		W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \
+		W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \
+		W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \
+		W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \
+		W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \
+		for (r = 0; r < 8; r ++) { /* 8 rounds on lanes 2 and 3 together */ \
+			SUB_CRUMBW(W0, W1, W2, W3); \
+			SUB_CRUMBW(W5, W6, W7, W4); \
+			MIX_WORDW(W0, W4); \
+			MIX_WORDW(W1, W5); \
+			MIX_WORDW(W2, W6); \
+			MIX_WORDW(W3, W7); \
+			W0 ^= RCW230[r]; \
+			W4 ^= RCW234[r]; \
+		} \
+		V20 = SPH_T32((sph_u32)W0); /* unpack: low half back to lane 2, high half back to lane 3 */ \
+		V30 = SPH_T32((sph_u32)(W0 >> 32)); \
+		V21 = SPH_T32((sph_u32)W1); \
+		V31 = SPH_T32((sph_u32)(W1 >> 32)); \
+		V22 = SPH_T32((sph_u32)W2); \
+		V32 = SPH_T32((sph_u32)(W2 >> 32)); \
+		V23 = SPH_T32((sph_u32)W3); \
+		V33 = SPH_T32((sph_u32)(W3 >> 32)); \
+		V24 = SPH_T32((sph_u32)W4); \
+		V34 = SPH_T32((sph_u32)(W4 >> 32)); \
+		V25 = SPH_T32((sph_u32)W5); \
+		V35 = SPH_T32((sph_u32)(W5 >> 32)); \
+		V26 = SPH_T32((sph_u32)W6); \
+		V36 = SPH_T32((sph_u32)(W6 >> 32)); \
+		V27 = SPH_T32((sph_u32)W7); \
+		V37 = SPH_T32((sph_u32)(W7 >> 32)); \
+		for (r = 0; r < 8; r ++) { /* lane 4 has no partner, so it uses the 32-bit macros */ \
+			SUB_CRUMB(V40, V41, V42, V43); \
+			SUB_CRUMB(V45, V46, V47, V44); \
+			MIX_WORD(V40, V44); \
+			MIX_WORD(V41, V45); \
+			MIX_WORD(V42, V46); \
+			MIX_WORD(V43, V47); \
+			V40 ^= RC40[r]; \
+			V44 ^= RC44[r]; \
+		} \
+	} while (0)
+
+#else
+
+#define P5   do { /* permutation, 32-bit build: TWEAK5, then 8 rounds on each of the 5 lanes with its own RCi0/RCi4 constants */ \
+		int r; \
+		TWEAK5; \
+		for (r = 0; r < 8; r ++) { /* lane 0 */ \
+			SUB_CRUMB(V00, V01, V02, V03); \
+			SUB_CRUMB(V05, V06, V07, V04); /* words 4..7 are passed in the order 5, 6, 7, 4 */ \
+			MIX_WORD(V00, V04); \
+			MIX_WORD(V01, V05); \
+			MIX_WORD(V02, V06); \
+			MIX_WORD(V03, V07); \
+			V00 ^= RC00[r]; /* round constants go into words 0 and 4 only */ \
+			V04 ^= RC04[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 1 */ \
+			SUB_CRUMB(V10, V11, V12, V13); \
+			SUB_CRUMB(V15, V16, V17, V14); \
+			MIX_WORD(V10, V14); \
+			MIX_WORD(V11, V15); \
+			MIX_WORD(V12, V16); \
+			MIX_WORD(V13, V17); \
+			V10 ^= RC10[r]; \
+			V14 ^= RC14[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 2 */ \
+			SUB_CRUMB(V20, V21, V22, V23); \
+			SUB_CRUMB(V25, V26, V27, V24); \
+			MIX_WORD(V20, V24); \
+			MIX_WORD(V21, V25); \
+			MIX_WORD(V22, V26); \
+			MIX_WORD(V23, V27); \
+			V20 ^= RC20[r]; \
+			V24 ^= RC24[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 3 */ \
+			SUB_CRUMB(V30, V31, V32, V33); \
+			SUB_CRUMB(V35, V36, V37, V34); \
+			MIX_WORD(V30, V34); \
+			MIX_WORD(V31, V35); \
+			MIX_WORD(V32, V36); \
+			MIX_WORD(V33, V37); \
+			V30 ^= RC30[r]; \
+			V34 ^= RC34[r]; \
+		} \
+		for (r = 0; r < 8; r ++) { /* lane 4 */ \
+			SUB_CRUMB(V40, V41, V42, V43); \
+			SUB_CRUMB(V45, V46, V47, V44); \
+			MIX_WORD(V40, V44); \
+			MIX_WORD(V41, V45); \
+			MIX_WORD(V42, V46); \
+			MIX_WORD(V43, V47); \
+			V40 ^= RC40[r]; \
+			V44 ^= RC44[r]; \
+		} \
+	} while (0)
+
+#endif
+
+static void
+luffa3(sph_luffa224_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE3
+	/* Absorb len bytes from data: buffer them in sc->buf and run MI3 + P3 each time the buffer fills. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) { /* input cannot fill the buffer: stash it and leave sc->V untouched */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* At least one block completes from here on, so work on local copies of the state words. */
+	READ_STATE3(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr; /* room left in the buffer, capped to the remaining input */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) { /* buffer full: process one block */
+			MI3; /* MI3 and P3 use buf and the Vij locals by name */
+			P3;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE3(sc);
+	sc->ptr = ptr; /* bytes of a partial block still waiting in sc->buf */
+}
+
+static void
+luffa3_close(sph_luffa224_context *sc, unsigned ub, unsigned n,
+	void *dst, unsigned out_size_w32)
+{
+	unsigned char *buf, *out;
+	size_t ptr;
+	unsigned z;
+	int i;
+	DECL_STATE3
+	/* Pad the pending block, run two MI3 + P3 passes, write 7 words to dst (8 when out_size_w32 > 7). */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n; /* position of the padding bit, just below the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF; /* keep the bits of ub above z, force bit z to 1, clear the bits below */
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	READ_STATE3(sc);
+	for (i = 0; i < 2; i ++) { /* pass 0 absorbs the padded block, pass 1 an all-zero block */
+		MI3;
+		P3;
+		memset(buf, 0, sizeof sc->buf);
+	}
+	out = dst; /* the state is not written back to sc; the callers in this file re-initialise it */
+	sph_enc32be(out +  0, V00 ^ V10 ^ V20); /* output word j = XOR of word j across the three lanes */
+	sph_enc32be(out +  4, V01 ^ V11 ^ V21);
+	sph_enc32be(out +  8, V02 ^ V12 ^ V22);
+	sph_enc32be(out + 12, V03 ^ V13 ^ V23);
+	sph_enc32be(out + 16, V04 ^ V14 ^ V24);
+	sph_enc32be(out + 20, V05 ^ V15 ^ V25);
+	sph_enc32be(out + 24, V06 ^ V16 ^ V26);
+	if (out_size_w32 > 7) /* callers pass 7 for Luffa-224 and 8 for Luffa-256 */
+		sph_enc32be(out + 28, V07 ^ V17 ^ V27);
+}
+
+static void
+luffa4(sph_luffa384_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE4
+	/* Absorb len bytes from data: buffer them in sc->buf and run MI4 + P4 each time the buffer fills. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) { /* input cannot fill the buffer: stash it and leave sc->V untouched */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* At least one block completes from here on, so work on local copies of the state words. */
+	READ_STATE4(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr; /* room left in the buffer, capped to the remaining input */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) { /* buffer full: process one block */
+			MI4; /* MI4 and P4 use buf and the Vij locals by name */
+			P4;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE4(sc);
+	sc->ptr = ptr; /* bytes of a partial block still waiting in sc->buf */
+}
+
+static void
+luffa4_close(sph_luffa384_context *sc, unsigned ub, unsigned n, void *dst)
+{
+	unsigned char *buf, *out;
+	size_t ptr;
+	unsigned z;
+	int i;
+	DECL_STATE4
+	/* Pad the pending block, run three MI4 + P4 passes, and write 12 words (48 bytes) to dst. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	out = dst;
+	z = 0x80 >> n; /* position of the padding bit, just below the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF; /* keep the bits of ub above z, force bit z to 1, clear the bits below */
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	READ_STATE4(sc); /* the state is not written back to sc; the caller in this file re-initialises it */
+	for (i = 0; i < 3; i ++) {
+		MI4;
+		P4;
+		switch (i) {
+		case 0: /* padded block absorbed; passes 1 and 2 absorb an all-zero block */
+			memset(buf, 0, sizeof sc->buf);
+			break;
+		case 1: /* first 8 output words: XOR of word j across the four lanes */
+			sph_enc32be(out +  0, V00 ^ V10 ^ V20 ^ V30);
+			sph_enc32be(out +  4, V01 ^ V11 ^ V21 ^ V31);
+			sph_enc32be(out +  8, V02 ^ V12 ^ V22 ^ V32);
+			sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33);
+			sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34);
+			sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35);
+			sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36);
+			sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37);
+			break;
+		case 2: /* after one more pass: only words 0..3 are emitted, at byte offsets 32..47 */
+			sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30);
+			sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31);
+			sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32);
+			sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33);
+			break;
+		}
+	}
+}
+
+static void
+luffa5(sph_luffa512_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE5
+	/* Absorb len bytes from data: buffer them in sc->buf and run MI5 + P5 each time the buffer fills. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) { /* input cannot fill the buffer: stash it and leave sc->V untouched */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+	/* At least one block completes from here on, so work on local copies of the state words. */
+	READ_STATE5(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr; /* room left in the buffer, capped to the remaining input */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) { /* buffer full: process one block */
+			MI5; /* MI5 and P5 use buf and the Vij locals by name */
+			P5;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE5(sc);
+	sc->ptr = ptr; /* bytes of a partial block still waiting in sc->buf */
+}
+
+static void
+luffa5_close(sph_luffa512_context *sc, unsigned ub, unsigned n, void *dst)
+{
+	unsigned char *buf, *out;
+	size_t ptr;
+	unsigned z;
+	int i;
+	DECL_STATE5
+	/* Pad the pending block, run three MI5 + P5 passes, and write 16 words (64 bytes) to dst. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	out = dst;
+	z = 0x80 >> n; /* position of the padding bit, just below the n extra bits */
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF; /* keep the bits of ub above z, force bit z to 1, clear the bits below */
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	READ_STATE5(sc); /* the state is not written back to sc; the caller in this file re-initialises it */
+	for (i = 0; i < 3; i ++) {
+		MI5;
+		P5;
+		switch (i) {
+		case 0: /* padded block absorbed; passes 1 and 2 absorb an all-zero block */
+			memset(buf, 0, sizeof sc->buf);
+			break;
+		case 1: /* first 8 output words: XOR of word j across the five lanes */
+			sph_enc32be(out +  0, V00 ^ V10 ^ V20 ^ V30 ^ V40);
+			sph_enc32be(out +  4, V01 ^ V11 ^ V21 ^ V31 ^ V41);
+			sph_enc32be(out +  8, V02 ^ V12 ^ V22 ^ V32 ^ V42);
+			sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33 ^ V43);
+			sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34 ^ V44);
+			sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35 ^ V45);
+			sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36 ^ V46);
+			sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37 ^ V47);
+			break;
+		case 2: /* after one more pass: the second 8 output words, at byte offsets 32..63 */
+			sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30 ^ V40);
+			sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31 ^ V41);
+			sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32 ^ V42);
+			sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33 ^ V43);
+			sph_enc32be(out + 48, V04 ^ V14 ^ V24 ^ V34 ^ V44);
+			sph_enc32be(out + 52, V05 ^ V15 ^ V25 ^ V35 ^ V45);
+			sph_enc32be(out + 56, V06 ^ V16 ^ V26 ^ V36 ^ V46);
+			sph_enc32be(out + 60, V07 ^ V17 ^ V27 ^ V37 ^ V47);
+			break;
+		}
+	}
+}
+
+/* see sph_luffa.h -- copies sizeof(sc->V) bytes of V_INIT into sc->V and marks the input buffer empty */
+void
+sph_luffa224_init(void *cc)
+{
+	sph_luffa224_context *sc;
+
+	sc = cc;
+	memcpy(sc->V, V_INIT, sizeof(sc->V));
+	sc->ptr = 0;
+}
+
+/* see sph_luffa.h -- feeds len bytes of data into the context cc; thin wrapper over luffa3() */
+void
+sph_luffa224(void *cc, const void *data, size_t len)
+{
+	luffa3(cc, data, len);
+}
+
+/* see sph_luffa.h -- finalises with no extra bits (ub = 0, n = 0) and writes the digest to dst */
+void
+sph_luffa224_close(void *cc, void *dst)
+{
+	sph_luffa224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_luffa.h -- finalises with n extra bits taken from ub, writes 7 output words to dst, then re-initialises cc */
+void
+sph_luffa224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	luffa3_close(cc, ub, n, dst, 7);
+	sph_luffa224_init(cc);
+}
+
+/* see sph_luffa.h -- copies sizeof(sc->V) bytes of V_INIT into sc->V and marks the input buffer empty */
+void
+sph_luffa256_init(void *cc)
+{
+	sph_luffa256_context *sc;
+
+	sc = cc;
+	memcpy(sc->V, V_INIT, sizeof(sc->V));
+	sc->ptr = 0;
+}
+
+/* see sph_luffa.h -- wrapper over luffa3(), shared with Luffa-224; presumably both context types have one layout -- confirm in the header */
+void
+sph_luffa256(void *cc, const void *data, size_t len)
+{
+	luffa3(cc, data, len);
+}
+
+/* see sph_luffa.h -- finalises with no extra bits (ub = 0, n = 0) and writes the digest to dst */
+void
+sph_luffa256_close(void *cc, void *dst)
+{
+	sph_luffa256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_luffa.h -- finalises with n extra bits taken from ub, writes 8 output words to dst, then re-initialises cc */
+void
+sph_luffa256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	luffa3_close(cc, ub, n, dst, 8);
+	sph_luffa256_init(cc);
+}
+
+/* see sph_luffa.h -- copies sizeof(sc->V) bytes of V_INIT into sc->V and marks the input buffer empty */
+void
+sph_luffa384_init(void *cc)
+{
+	sph_luffa384_context *sc;
+
+	sc = cc;
+	memcpy(sc->V, V_INIT, sizeof(sc->V));
+	sc->ptr = 0;
+}
+
+/* see sph_luffa.h -- feeds len bytes of data into the context cc; thin wrapper over luffa4() */
+void
+sph_luffa384(void *cc, const void *data, size_t len)
+{
+	luffa4(cc, data, len);
+}
+
+/* see sph_luffa.h -- finalises with no extra bits (ub = 0, n = 0) and writes the digest to dst */
+void
+sph_luffa384_close(void *cc, void *dst)
+{
+	sph_luffa384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_luffa.h -- finalises with n extra bits taken from ub, writes 48 bytes to dst, then re-initialises cc */
+void
+sph_luffa384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	luffa4_close(cc, ub, n, dst);
+	sph_luffa384_init(cc);
+}
+
+/* see sph_luffa.h -- copies sizeof(sc->V) bytes of V_INIT into sc->V and marks the input buffer empty */
+void
+sph_luffa512_init(void *cc)
+{
+	sph_luffa512_context *sc;
+
+	sc = cc;
+	memcpy(sc->V, V_INIT, sizeof(sc->V));
+	sc->ptr = 0;
+}
+
+/* see sph_luffa.h -- feeds len bytes of data into the context cc; thin wrapper over luffa5() */
+void
+sph_luffa512(void *cc, const void *data, size_t len)
+{
+	luffa5(cc, data, len);
+}
+
+/* see sph_luffa.h -- finalises with no extra bits (ub = 0, n = 0) and writes the digest to dst */
+void
+sph_luffa512_close(void *cc, void *dst)
+{
+	sph_luffa512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_luffa.h -- finalises with n extra bits taken from ub, writes 64 bytes to dst, then re-initialises cc */
+void
+sph_luffa512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	luffa5_close(cc, ub, n, dst);
+	sph_luffa512_init(cc);
+}
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/sha3/shavite.c b/sha3/shavite.c
new file mode 100644
index 0000000..85074f3
--- /dev/null
+++ b/sha3/shavite.c
@@ -0,0 +1,1764 @@
+/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * SHAvite-3 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_shavite.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE
+#define SPH_SMALL_FOOTPRINT_SHAVITE   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#define C32   SPH_C32
+
+/*
+ * As of round 2 of the SHA-3 competition, the published reference
+ * implementation and test vectors are wrong, because they use
+ * big-endian AES tables while the internal decoding uses little-endian.
+ * The code below follows the specification. To make it match the
+ * reference implementation instead (the one called "BugFix" on the
+ * SHAvite-3 web site, published on Nov 23rd, 2009), comment out the
+ * code below (from the '#define AES_BIG_ENDIAN...' line through the
+ * definition of the AES_ROUND_NOKEY macro) and replace it with the
+ * version that is commented out afterwards.
+ */
+
+#define AES_BIG_ENDIAN   0
+#include "aes_helper.c"
+
+static const sph_u32 IV224[] = { /* 8 words, specification-conforming values; presumably the initial h[] of the 224-bit variant (use is outside this view) */
+	C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371),
+	C32(0x62B2AEA8), C32(0x4B5801D8), C32(0x1B702860), C32(0x842F3017)
+};
+
+static const sph_u32 IV256[] = { /* 8 words, specification-conforming values; presumably the initial h[] of the 256-bit variant (use is outside this view) */
+	C32(0x49BB3E47), C32(0x2674860D), C32(0xA8B392AC), C32(0x021AC4E6),
+	C32(0x409283CF), C32(0x620E5D86), C32(0x6D929DCB), C32(0x96CC2A8B)
+};
+
+static const sph_u32 IV384[] = { /* 16 words, specification-conforming values; presumably the initial h[] of the 384-bit variant (use is outside this view) */
+	C32(0x83DF1545), C32(0xF9AAEC13), C32(0xF4803CB0), C32(0x11FE1F47),
+	C32(0xDA6CD269), C32(0x4F53FCD7), C32(0x950529A2), C32(0x97908147),
+	C32(0xB0A4D7AF), C32(0x2B9132BF), C32(0x226E607D), C32(0x3C0F8D7C),
+	C32(0x487B3F0F), C32(0x04363E22), C32(0x0155C99C), C32(0xEC2E20D3)
+};
+
+static const sph_u32 IV512[] = { /* 16 words, specification-conforming values; presumably the initial h[] of the 512-bit variant (use is outside this view) */
+	C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
+	C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
+	C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
+	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
+};
+
+#define AES_ROUND_NOKEY(x0, x1, x2, x3)   do { \
+		sph_u32 t0 = (x0); /* copy the inputs first: x0..x3 are reused as the last four arguments below */ \
+		sph_u32 t1 = (x1); \
+		sph_u32 t2 = (x2); \
+		sph_u32 t3 = (x3); \
+		AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); /* from aes_helper.c (not shown); presumably t* are inputs and x* outputs */ \
+	} while (0)
+
+/*
+ * This is the code needed to match the "reference implementation" as
+ * published on Nov 23rd, 2009, instead of the published specification.
+ * 
+
+#define AES_BIG_ENDIAN   1
+#include "aes_helper.c"
+
+static const sph_u32 IV224[] = {
+	C32(0xC4C67795), C32(0xC0B1817F), C32(0xEAD88924), C32(0x1ABB1BB0),
+	C32(0xE0C29152), C32(0xBDE046BA), C32(0xAEEECF99), C32(0x58D509D8)
+};
+
+static const sph_u32 IV256[] = {
+	C32(0x3EECF551), C32(0xBF10819B), C32(0xE6DC8559), C32(0xF3E23FD5),
+	C32(0x431AEC73), C32(0x79E3F731), C32(0x98325F05), C32(0xA92A31F1)
+};
+
+static const sph_u32 IV384[] = {
+	C32(0x71F48510), C32(0xA903A8AC), C32(0xFE3216DD), C32(0x0B2D2AD4),
+	C32(0x6672900A), C32(0x41032819), C32(0x15A7D780), C32(0xB3CAB8D9),
+	C32(0x34EF4711), C32(0xDE019FE8), C32(0x4D674DC4), C32(0xE056D96B),
+	C32(0xA35C016B), C32(0xDD903BA7), C32(0x8C1B09B4), C32(0x2C3E9F25)
+};
+
+static const sph_u32 IV512[] = {
+	C32(0xD5652B63), C32(0x25F1E6EA), C32(0xB18F48FA), C32(0xA1EE3A47),
+	C32(0xC8B67B07), C32(0xBDCE48D3), C32(0xE3937B78), C32(0x05DB5186),
+	C32(0x613BE326), C32(0xA11FA303), C32(0x90C833D4), C32(0x79CEE316),
+	C32(0x1E1AF00F), C32(0x2829B165), C32(0x23B25F80), C32(0x21E11499)
+};
+
+#define AES_ROUND_NOKEY(x0, x1, x2, x3)   do { \
+		sph_u32 t0 = (x0); \
+		sph_u32 t1 = (x1); \
+		sph_u32 t2 = (x2); \
+		sph_u32 t3 = (x3); \
+		AES_ROUND_NOKEY_BE(t0, t1, t2, t3, x0, x1, x2, x3); \
+	} while (0)
+
+ */
+
+#define KEY_EXPAND_ELT(k0, k1, k2, k3)   do { \
+		sph_u32 kt; \
+		AES_ROUND_NOKEY(k1, k2, k3, k0); /* round applied to the four words taken in rotated order */ \
+		kt = (k0); /* then rotate the four variables left by one position ... */ \
+		(k0) = (k1); \
+		(k1) = (k2); \
+		(k2) = (k3); \
+		(k3) = kt; /* ... so k0..k3 end up holding the round output in order */ \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_SHAVITE
+
+/* Small-footprint c256: expands the 64-byte block "msg" into the key
+ * schedule rk[], then runs the rounds in a loop and folds the result
+ * into sc->h. "msg" must be aligned for 32-bit access. */
+static void
+c256(sph_shavite_small_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 rk[144]; /* 16 message words followed by 128 expanded words */
+	size_t u;
+	int r, s;
+
+#if SPH_LITTLE_ENDIAN
+	memcpy(rk, msg, 64); /* replaces the per-word little-endian decode of the other branch */
+#else
+	for (u = 0; u < 16; u += 4) {
+		rk[u + 0] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  0);
+		rk[u + 1] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  4);
+		rk[u + 2] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  8);
+		rk[u + 3] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) + 12);
+	}
+#endif
+	u = 16; /* expansion starts right after the message words */
+	for (r = 0; r < 4; r ++) {
+		for (s = 0; s < 2; s ++) { /* 16 words built with AES rounds (two 4-word steps per pass) */
+			sph_u32 x0, x1, x2, x3;
+
+			x0 = rk[u - 15];
+			x1 = rk[u - 14];
+			x2 = rk[u - 13];
+			x3 = rk[u - 16];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 16) { /* counter words are XORed in at four fixed positions */
+				rk[ 16] ^= sc->count0;
+				rk[ 17] ^= SPH_T32(~sc->count1);
+			} else if (u == 56) {
+				rk[ 57] ^= sc->count1;
+				rk[ 58] ^= SPH_T32(~sc->count0);
+			}
+			u += 4;
+
+			x0 = rk[u - 15];
+			x1 = rk[u - 14];
+			x2 = rk[u - 13];
+			x3 = rk[u - 16];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 84) {
+				rk[ 86] ^= sc->count1;
+				rk[ 87] ^= SPH_T32(~sc->count0);
+			} else if (u == 124) {
+				rk[124] ^= sc->count0;
+				rk[127] ^= SPH_T32(~sc->count1);
+			}
+			u += 4;
+		}
+		for (s = 0; s < 4; s ++) { /* 16 words built with XORs of earlier words only */
+			rk[u + 0] = rk[u - 16] ^ rk[u - 3];
+			rk[u + 1] = rk[u - 15] ^ rk[u - 2];
+			rk[u + 2] = rk[u - 14] ^ rk[u - 1];
+			rk[u + 3] = rk[u - 13] ^ rk[u - 0]; /* rk[u] is the word written three lines above */
+			u += 4;
+		}
+	}
+
+	p0 = sc->h[0x0];
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	u = 0; /* rk[] is now consumed sequentially, 24 words per loop pass (6 * 24 = 144) */
+	for (r = 0; r < 6; r ++) {
+		sph_u32 x0, x1, x2, x3;
+
+		x0 = p4 ^ rk[u ++]; /* first half of the pass: p0..p3 updated from p4..p7 */
+		x1 = p5 ^ rk[u ++];
+		x2 = p6 ^ rk[u ++];
+		x3 = p7 ^ rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		x0 ^= rk[u ++];
+		x1 ^= rk[u ++];
+		x2 ^= rk[u ++];
+		x3 ^= rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		x0 ^= rk[u ++];
+		x1 ^= rk[u ++];
+		x2 ^= rk[u ++];
+		x3 ^= rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p0 ^= x0;
+		p1 ^= x1;
+		p2 ^= x2;
+		p3 ^= x3;
+
+		x0 = p0 ^ rk[u ++]; /* second half of the pass: p4..p7 updated from p0..p3 */
+		x1 = p1 ^ rk[u ++];
+		x2 = p2 ^ rk[u ++];
+		x3 = p3 ^ rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		x0 ^= rk[u ++];
+		x1 ^= rk[u ++];
+		x2 ^= rk[u ++];
+		x3 ^= rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		x0 ^= rk[u ++];
+		x1 ^= rk[u ++];
+		x2 ^= rk[u ++];
+		x3 ^= rk[u ++];
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p4 ^= x0;
+		p5 ^= x1;
+		p6 ^= x2;
+		p7 ^= x3;
+	}
+	sc->h[0x0] ^= p0; /* fold the working state back into the chaining value */
+	sc->h[0x1] ^= p1;
+	sc->h[0x2] ^= p2;
+	sc->h[0x3] ^= p3;
+	sc->h[0x4] ^= p4;
+	sc->h[0x5] ^= p5;
+	sc->h[0x6] ^= p6;
+	sc->h[0x7] ^= p7;
+}
+
+#else
+
+/* Unrolled c256: folds the 64-byte block "msg" into sc->h[0..7] over
+ * 12 rounds, keeping the 16 live key words in rk0..rkF and expanding
+ * them in place. "msg" must be aligned for 32-bit access. */
+static void
+c256(sph_shavite_small_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 x0, x1, x2, x3;
+	sph_u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7;
+	sph_u32 rk8, rk9, rkA, rkB, rkC, rkD, rkE, rkF;
+
+	p0 = sc->h[0x0];
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	/* round 0: updates p0..p3 from p4..p7; message words are decoded as they are first used */
+	rk0 = sph_dec32le_aligned((const unsigned char *)msg +  0);
+	x0 = p4 ^ rk0;
+	rk1 = sph_dec32le_aligned((const unsigned char *)msg +  4);
+	x1 = p5 ^ rk1;
+	rk2 = sph_dec32le_aligned((const unsigned char *)msg +  8);
+	x2 = p6 ^ rk2;
+	rk3 = sph_dec32le_aligned((const unsigned char *)msg + 12);
+	x3 = p7 ^ rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk4 = sph_dec32le_aligned((const unsigned char *)msg + 16);
+	x0 ^= rk4;
+	rk5 = sph_dec32le_aligned((const unsigned char *)msg + 20);
+	x1 ^= rk5;
+	rk6 = sph_dec32le_aligned((const unsigned char *)msg + 24);
+	x2 ^= rk6;
+	rk7 = sph_dec32le_aligned((const unsigned char *)msg + 28);
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk8 = sph_dec32le_aligned((const unsigned char *)msg + 32);
+	x0 ^= rk8;
+	rk9 = sph_dec32le_aligned((const unsigned char *)msg + 36);
+	x1 ^= rk9;
+	rkA = sph_dec32le_aligned((const unsigned char *)msg + 40);
+	x2 ^= rkA;
+	rkB = sph_dec32le_aligned((const unsigned char *)msg + 44);
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 1: updates p4..p7 from p0..p3; first counter injection */
+	rkC = sph_dec32le_aligned((const unsigned char *)msg + 48);
+	x0 = p0 ^ rkC;
+	rkD = sph_dec32le_aligned((const unsigned char *)msg + 52);
+	x1 = p1 ^ rkD;
+	rkE = sph_dec32le_aligned((const unsigned char *)msg + 56);
+	x2 = p2 ^ rkE;
+	rkF = sph_dec32le_aligned((const unsigned char *)msg + 60);
+	x3 = p3 ^ rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk0, rk1, rk2, rk3);
+	rk0 ^= rkC ^ sc->count0; /* counter injection (1 of 4) */
+	rk1 ^= rkD ^ SPH_T32(~sc->count1);
+	rk2 ^= rkE;
+	rk3 ^= rkF;
+	x0 ^= rk0;
+	x1 ^= rk1;
+	x2 ^= rk2;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk4, rk5, rk6, rk7);
+	rk4 ^= rk0;
+	rk5 ^= rk1;
+	rk6 ^= rk2;
+	rk7 ^= rk3;
+	x0 ^= rk4;
+	x1 ^= rk5;
+	x2 ^= rk6;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 2: updates p0..p3 from p4..p7 */
+	KEY_EXPAND_ELT(rk8, rk9, rkA, rkB);
+	rk8 ^= rk4;
+	rk9 ^= rk5;
+	rkA ^= rk6;
+	rkB ^= rk7;
+	x0 = p4 ^ rk8;
+	x1 = p5 ^ rk9;
+	x2 = p6 ^ rkA;
+	x3 = p7 ^ rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rkC, rkD, rkE, rkF);
+	rkC ^= rk8;
+	rkD ^= rk9;
+	rkE ^= rkA;
+	rkF ^= rkB;
+	x0 ^= rkC;
+	x1 ^= rkD;
+	x2 ^= rkE;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk0 ^= rkD;
+	x0 ^= rk0;
+	rk1 ^= rkE;
+	x1 ^= rk1;
+	rk2 ^= rkF;
+	x2 ^= rk2;
+	rk3 ^= rk0;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 3: updates p4..p7 from p0..p3; key words come from XORs only */
+	rk4 ^= rk1;
+	x0 = p0 ^ rk4;
+	rk5 ^= rk2;
+	x1 = p1 ^ rk5;
+	rk6 ^= rk3;
+	x2 = p2 ^ rk6;
+	rk7 ^= rk4;
+	x3 = p3 ^ rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk8 ^= rk5;
+	x0 ^= rk8;
+	rk9 ^= rk6;
+	x1 ^= rk9;
+	rkA ^= rk7;
+	x2 ^= rkA;
+	rkB ^= rk8;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rkC ^= rk9;
+	x0 ^= rkC;
+	rkD ^= rkA;
+	x1 ^= rkD;
+	rkE ^= rkB;
+	x2 ^= rkE;
+	rkF ^= rkC;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 4: updates p0..p3 from p4..p7; second counter injection */
+	KEY_EXPAND_ELT(rk0, rk1, rk2, rk3);
+	rk0 ^= rkC;
+	rk1 ^= rkD;
+	rk2 ^= rkE;
+	rk3 ^= rkF;
+	x0 = p4 ^ rk0;
+	x1 = p5 ^ rk1;
+	x2 = p6 ^ rk2;
+	x3 = p7 ^ rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk4, rk5, rk6, rk7);
+	rk4 ^= rk0;
+	rk5 ^= rk1;
+	rk6 ^= rk2;
+	rk7 ^= rk3;
+	x0 ^= rk4;
+	x1 ^= rk5;
+	x2 ^= rk6;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk8, rk9, rkA, rkB);
+	rk8 ^= rk4;
+	rk9 ^= rk5 ^ sc->count1; /* counter injection (2 of 4) */
+	rkA ^= rk6 ^ SPH_T32(~sc->count0);
+	rkB ^= rk7;
+	x0 ^= rk8;
+	x1 ^= rk9;
+	x2 ^= rkA;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 5: updates p4..p7 from p0..p3 */
+	KEY_EXPAND_ELT(rkC, rkD, rkE, rkF);
+	rkC ^= rk8;
+	rkD ^= rk9;
+	rkE ^= rkA;
+	rkF ^= rkB;
+	x0 = p0 ^ rkC;
+	x1 = p1 ^ rkD;
+	x2 = p2 ^ rkE;
+	x3 = p3 ^ rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk0 ^= rkD;
+	x0 ^= rk0;
+	rk1 ^= rkE;
+	x1 ^= rk1;
+	rk2 ^= rkF;
+	x2 ^= rk2;
+	rk3 ^= rk0;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk4 ^= rk1;
+	x0 ^= rk4;
+	rk5 ^= rk2;
+	x1 ^= rk5;
+	rk6 ^= rk3;
+	x2 ^= rk6;
+	rk7 ^= rk4;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 6: updates p0..p3 from p4..p7 */
+	rk8 ^= rk5;
+	x0 = p4 ^ rk8;
+	rk9 ^= rk6;
+	x1 = p5 ^ rk9;
+	rkA ^= rk7;
+	x2 = p6 ^ rkA;
+	rkB ^= rk8;
+	x3 = p7 ^ rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rkC ^= rk9;
+	x0 ^= rkC;
+	rkD ^= rkA;
+	x1 ^= rkD;
+	rkE ^= rkB;
+	x2 ^= rkE;
+	rkF ^= rkC;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk0, rk1, rk2, rk3);
+	rk0 ^= rkC;
+	rk1 ^= rkD;
+	rk2 ^= rkE;
+	rk3 ^= rkF;
+	x0 ^= rk0;
+	x1 ^= rk1;
+	x2 ^= rk2;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 7: updates p4..p7 from p0..p3; third counter injection */
+	KEY_EXPAND_ELT(rk4, rk5, rk6, rk7);
+	rk4 ^= rk0;
+	rk5 ^= rk1;
+	rk6 ^= rk2 ^ sc->count1; /* counter injection (3 of 4) */
+	rk7 ^= rk3 ^ SPH_T32(~sc->count0);
+	x0 = p0 ^ rk4;
+	x1 = p1 ^ rk5;
+	x2 = p2 ^ rk6;
+	x3 = p3 ^ rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk8, rk9, rkA, rkB);
+	rk8 ^= rk4;
+	rk9 ^= rk5;
+	rkA ^= rk6;
+	rkB ^= rk7;
+	x0 ^= rk8;
+	x1 ^= rk9;
+	x2 ^= rkA;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rkC, rkD, rkE, rkF);
+	rkC ^= rk8;
+	rkD ^= rk9;
+	rkE ^= rkA;
+	rkF ^= rkB;
+	x0 ^= rkC;
+	x1 ^= rkD;
+	x2 ^= rkE;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 8: updates p0..p3 from p4..p7; key words come from XORs only */
+	rk0 ^= rkD;
+	x0 = p4 ^ rk0;
+	rk1 ^= rkE;
+	x1 = p5 ^ rk1;
+	rk2 ^= rkF;
+	x2 = p6 ^ rk2;
+	rk3 ^= rk0;
+	x3 = p7 ^ rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk4 ^= rk1;
+	x0 ^= rk4;
+	rk5 ^= rk2;
+	x1 ^= rk5;
+	rk6 ^= rk3;
+	x2 ^= rk6;
+	rk7 ^= rk4;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk8 ^= rk5;
+	x0 ^= rk8;
+	rk9 ^= rk6;
+	x1 ^= rk9;
+	rkA ^= rk7;
+	x2 ^= rkA;
+	rkB ^= rk8;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 9: updates p4..p7 from p0..p3 */
+	rkC ^= rk9;
+	x0 = p0 ^ rkC;
+	rkD ^= rkA;
+	x1 = p1 ^ rkD;
+	rkE ^= rkB;
+	x2 = p2 ^ rkE;
+	rkF ^= rkC;
+	x3 = p3 ^ rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk0, rk1, rk2, rk3);
+	rk0 ^= rkC;
+	rk1 ^= rkD;
+	rk2 ^= rkE;
+	rk3 ^= rkF;
+	x0 ^= rk0;
+	x1 ^= rk1;
+	x2 ^= rk2;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk4, rk5, rk6, rk7);
+	rk4 ^= rk0;
+	rk5 ^= rk1;
+	rk6 ^= rk2;
+	rk7 ^= rk3;
+	x0 ^= rk4;
+	x1 ^= rk5;
+	x2 ^= rk6;
+	x3 ^= rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	/* round 10: updates p0..p3 from p4..p7; fourth counter injection */
+	KEY_EXPAND_ELT(rk8, rk9, rkA, rkB);
+	rk8 ^= rk4;
+	rk9 ^= rk5;
+	rkA ^= rk6;
+	rkB ^= rk7;
+	x0 = p4 ^ rk8;
+	x1 = p5 ^ rk9;
+	x2 = p6 ^ rkA;
+	x3 = p7 ^ rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rkC, rkD, rkE, rkF);
+	rkC ^= rk8 ^ sc->count0; /* counter injection (4 of 4) */
+	rkD ^= rk9;
+	rkE ^= rkA;
+	rkF ^= rkB ^ SPH_T32(~sc->count1);
+	x0 ^= rkC;
+	x1 ^= rkD;
+	x2 ^= rkE;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk0 ^= rkD;
+	x0 ^= rk0;
+	rk1 ^= rkE;
+	x1 ^= rk1;
+	rk2 ^= rkF;
+	x2 ^= rk2;
+	rk3 ^= rk0;
+	x3 ^= rk3;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	/* round 11: updates p4..p7 from p0..p3; key words come from XORs only */
+	rk4 ^= rk1;
+	x0 = p0 ^ rk4;
+	rk5 ^= rk2;
+	x1 = p1 ^ rk5;
+	rk6 ^= rk3;
+	x2 = p2 ^ rk6;
+	rk7 ^= rk4;
+	x3 = p3 ^ rk7;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk8 ^= rk5;
+	x0 ^= rk8;
+	rk9 ^= rk6;
+	x1 ^= rk9;
+	rkA ^= rk7;
+	x2 ^= rkA;
+	rkB ^= rk8;
+	x3 ^= rkB;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rkC ^= rk9;
+	x0 ^= rkC;
+	rkD ^= rkA;
+	x1 ^= rkD;
+	rkE ^= rkB;
+	x2 ^= rkE;
+	rkF ^= rkC;
+	x3 ^= rkF;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	sc->h[0x0] ^= p0; /* fold the working state back into the chaining value */
+	sc->h[0x1] ^= p1;
+	sc->h[0x2] ^= p2;
+	sc->h[0x3] ^= p3;
+	sc->h[0x4] ^= p4;
+	sc->h[0x5] ^= p5;
+	sc->h[0x6] ^= p6;
+	sc->h[0x7] ^= p7;
+}
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SHAVITE
+
+/* Small-footprint c512: expands the 128-byte block "msg" into the key
+ * schedule rk[], then runs 14 rounds in a loop and folds the result
+ * into sc->h. "msg" must be aligned for 32-bit access. */
+static void
+c512(sph_shavite_big_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
+	sph_u32 rk[448]; /* 32 message words followed by 416 expanded words */
+	size_t u;
+	int r, s;
+
+#if SPH_LITTLE_ENDIAN
+	memcpy(rk, msg, 128); /* replaces the per-word little-endian decode of the other branch */
+#else
+	for (u = 0; u < 32; u += 4) {
+		rk[u + 0] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  0);
+		rk[u + 1] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  4);
+		rk[u + 2] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  8);
+		rk[u + 3] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) + 12);
+	}
+#endif
+	u = 32; /* expansion starts right after the message words */
+	for (;;) {
+		for (s = 0; s < 4; s ++) { /* 32 words built with AES rounds (two 4-word steps per pass) */
+			sph_u32 x0, x1, x2, x3;
+
+			x0 = rk[u - 31];
+			x1 = rk[u - 30];
+			x2 = rk[u - 29];
+			x3 = rk[u - 32];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 32) { /* counter words are XORed in at four fixed positions */
+				rk[ 32] ^= sc->count0;
+				rk[ 33] ^= sc->count1;
+				rk[ 34] ^= sc->count2;
+				rk[ 35] ^= SPH_T32(~sc->count3);
+			} else if (u == 440) {
+				rk[440] ^= sc->count1;
+				rk[441] ^= sc->count0;
+				rk[442] ^= sc->count3;
+				rk[443] ^= SPH_T32(~sc->count2);
+			}
+			u += 4;
+
+			x0 = rk[u - 31];
+			x1 = rk[u - 30];
+			x2 = rk[u - 29];
+			x3 = rk[u - 32];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 164) {
+				rk[164] ^= sc->count3;
+				rk[165] ^= sc->count2;
+				rk[166] ^= sc->count1;
+				rk[167] ^= SPH_T32(~sc->count0);
+			} else if (u == 316) {
+				rk[316] ^= sc->count2;
+				rk[317] ^= sc->count3;
+				rk[318] ^= sc->count0;
+				rk[319] ^= SPH_T32(~sc->count1);
+			}
+			u += 4;
+		}
+		if (u == 448) /* schedule is full; the last pass has no XOR-only part */
+			break;
+		for (s = 0; s < 8; s ++) { /* 32 words built with XORs of earlier words only */
+			rk[u + 0] = rk[u - 32] ^ rk[u - 7];
+			rk[u + 1] = rk[u - 31] ^ rk[u - 6];
+			rk[u + 2] = rk[u - 30] ^ rk[u - 5];
+			rk[u + 3] = rk[u - 29] ^ rk[u - 4];
+			u += 4;
+		}
+	}
+
+	p0 = sc->h[0x0];
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	p8 = sc->h[0x8];
+	p9 = sc->h[0x9];
+	pA = sc->h[0xA];
+	pB = sc->h[0xB];
+	pC = sc->h[0xC];
+	pD = sc->h[0xD];
+	pE = sc->h[0xE];
+	pF = sc->h[0xF];
+	u = 0; /* rk[] is now consumed sequentially, 32 words per round (14 * 32 = 448) */
+	for (r = 0; r < 14; r ++) {
+#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3)   do { \
+		sph_u32 x0, x1, x2, x3; \
+		x0 = r0 ^ rk[u ++]; \
+		x1 = r1 ^ rk[u ++]; \
+		x2 = r2 ^ rk[u ++]; \
+		x3 = r3 ^ rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		l0 ^= x0; \
+		l1 ^= x1; \
+		l2 ^= x2; \
+		l3 ^= x3; \
+	} while (0)
+
+#define WROT(a, b, c, d)   do { \
+		sph_u32 t = d; \
+		d = c; \
+		c = b; \
+		b = a; \
+		a = t; \
+	} while (0)
+
+		C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7); /* p0..p3 updated from p4..p7, using 16 key words */
+		C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF); /* p8..pB updated from pC..pF, using 16 key words */
+
+		WROT(p0, p4, p8, pC); /* (a, b, c, d) <- (d, a, b, c): rotates the four 4-word groups, one word lane at a time */
+		WROT(p1, p5, p9, pD);
+		WROT(p2, p6, pA, pE);
+		WROT(p3, p7, pB, pF);
+
+#undef C512_ELT
+#undef WROT
+	}
+	sc->h[0x0] ^= p0; /* fold the working state back into the chaining value */
+	sc->h[0x1] ^= p1;
+	sc->h[0x2] ^= p2;
+	sc->h[0x3] ^= p3;
+	sc->h[0x4] ^= p4;
+	sc->h[0x5] ^= p5;
+	sc->h[0x6] ^= p6;
+	sc->h[0x7] ^= p7;
+	sc->h[0x8] ^= p8;
+	sc->h[0x9] ^= p9;
+	sc->h[0xA] ^= pA;
+	sc->h[0xB] ^= pB;
+	sc->h[0xC] ^= pC;
+	sc->h[0xD] ^= pD;
+	sc->h[0xE] ^= pE;
+	sc->h[0xF] ^= pF;
+}
+
+#else
+
+/* Unrolled c512: folds the 128-byte block "msg" into sc->h[0..15] over
+ * 14 rounds (round 0, three passes of rounds 1-4 / 5-8 / 9-12, round 13),
+ * expanding rk00..rk1F in place. "msg" must be aligned for 32-bit access. */
+static void
+c512(sph_shavite_big_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
+	sph_u32 x0, x1, x2, x3;
+	sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07;
+	sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F;
+	sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
+	sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
+	int r;
+
+	p0 = sc->h[0x0];
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	p8 = sc->h[0x8];
+	p9 = sc->h[0x9];
+	pA = sc->h[0xA];
+	pB = sc->h[0xB];
+	pC = sc->h[0xC];
+	pD = sc->h[0xD];
+	pE = sc->h[0xE];
+	pF = sc->h[0xF];
+	/* round 0: p0..p3 from p4..p7, then p8..pB from pC..pF; message words are decoded as they are first used */
+	rk00 = sph_dec32le_aligned((const unsigned char *)msg +   0);
+	x0 = p4 ^ rk00;
+	rk01 = sph_dec32le_aligned((const unsigned char *)msg +   4);
+	x1 = p5 ^ rk01;
+	rk02 = sph_dec32le_aligned((const unsigned char *)msg +   8);
+	x2 = p6 ^ rk02;
+	rk03 = sph_dec32le_aligned((const unsigned char *)msg +  12);
+	x3 = p7 ^ rk03;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk04 = sph_dec32le_aligned((const unsigned char *)msg +  16);
+	x0 ^= rk04;
+	rk05 = sph_dec32le_aligned((const unsigned char *)msg +  20);
+	x1 ^= rk05;
+	rk06 = sph_dec32le_aligned((const unsigned char *)msg +  24);
+	x2 ^= rk06;
+	rk07 = sph_dec32le_aligned((const unsigned char *)msg +  28);
+	x3 ^= rk07;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk08 = sph_dec32le_aligned((const unsigned char *)msg +  32);
+	x0 ^= rk08;
+	rk09 = sph_dec32le_aligned((const unsigned char *)msg +  36);
+	x1 ^= rk09;
+	rk0A = sph_dec32le_aligned((const unsigned char *)msg +  40);
+	x2 ^= rk0A;
+	rk0B = sph_dec32le_aligned((const unsigned char *)msg +  44);
+	x3 ^= rk0B;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk0C = sph_dec32le_aligned((const unsigned char *)msg +  48);
+	x0 ^= rk0C;
+	rk0D = sph_dec32le_aligned((const unsigned char *)msg +  52);
+	x1 ^= rk0D;
+	rk0E = sph_dec32le_aligned((const unsigned char *)msg +  56);
+	x2 ^= rk0E;
+	rk0F = sph_dec32le_aligned((const unsigned char *)msg +  60);
+	x3 ^= rk0F;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p0 ^= x0;
+	p1 ^= x1;
+	p2 ^= x2;
+	p3 ^= x3;
+	rk10 = sph_dec32le_aligned((const unsigned char *)msg +  64);
+	x0 = pC ^ rk10;
+	rk11 = sph_dec32le_aligned((const unsigned char *)msg +  68);
+	x1 = pD ^ rk11;
+	rk12 = sph_dec32le_aligned((const unsigned char *)msg +  72);
+	x2 = pE ^ rk12;
+	rk13 = sph_dec32le_aligned((const unsigned char *)msg +  76);
+	x3 = pF ^ rk13;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk14 = sph_dec32le_aligned((const unsigned char *)msg +  80);
+	x0 ^= rk14;
+	rk15 = sph_dec32le_aligned((const unsigned char *)msg +  84);
+	x1 ^= rk15;
+	rk16 = sph_dec32le_aligned((const unsigned char *)msg +  88);
+	x2 ^= rk16;
+	rk17 = sph_dec32le_aligned((const unsigned char *)msg +  92);
+	x3 ^= rk17;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk18 = sph_dec32le_aligned((const unsigned char *)msg +  96);
+	x0 ^= rk18;
+	rk19 = sph_dec32le_aligned((const unsigned char *)msg + 100);
+	x1 ^= rk19;
+	rk1A = sph_dec32le_aligned((const unsigned char *)msg + 104);
+	x2 ^= rk1A;
+	rk1B = sph_dec32le_aligned((const unsigned char *)msg + 108);
+	x3 ^= rk1B;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	rk1C = sph_dec32le_aligned((const unsigned char *)msg + 112);
+	x0 ^= rk1C;
+	rk1D = sph_dec32le_aligned((const unsigned char *)msg + 116);
+	x1 ^= rk1D;
+	rk1E = sph_dec32le_aligned((const unsigned char *)msg + 120);
+	x2 ^= rk1E;
+	rk1F = sph_dec32le_aligned((const unsigned char *)msg + 124);
+	x3 ^= rk1F;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p8 ^= x0;
+	p9 ^= x1;
+	pA ^= x2;
+	pB ^= x3;
+
+	for (r = 0; r < 3; r ++) {
+		/* round 1, 5, 9: pC..pF from p0..p3, then p4..p7 from p8..pB; AES-based key expansion */
+		KEY_EXPAND_ELT(rk00, rk01, rk02, rk03);
+		rk00 ^= rk1C;
+		rk01 ^= rk1D;
+		rk02 ^= rk1E;
+		rk03 ^= rk1F;
+		if (r == 0) { /* round 1 only: counter injection (1 of 4) */
+			rk00 ^= sc->count0;
+			rk01 ^= sc->count1;
+			rk02 ^= sc->count2;
+			rk03 ^= SPH_T32(~sc->count3);
+		}
+		x0 = p0 ^ rk00;
+		x1 = p1 ^ rk01;
+		x2 = p2 ^ rk02;
+		x3 = p3 ^ rk03;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk04, rk05, rk06, rk07);
+		rk04 ^= rk00;
+		rk05 ^= rk01;
+		rk06 ^= rk02;
+		rk07 ^= rk03;
+		if (r == 1) { /* round 5 only: counter injection (2 of 4) */
+			rk04 ^= sc->count3;
+			rk05 ^= sc->count2;
+			rk06 ^= sc->count1;
+			rk07 ^= SPH_T32(~sc->count0);
+		}
+		x0 ^= rk04;
+		x1 ^= rk05;
+		x2 ^= rk06;
+		x3 ^= rk07;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B);
+		rk08 ^= rk04;
+		rk09 ^= rk05;
+		rk0A ^= rk06;
+		rk0B ^= rk07;
+		x0 ^= rk08;
+		x1 ^= rk09;
+		x2 ^= rk0A;
+		x3 ^= rk0B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F);
+		rk0C ^= rk08;
+		rk0D ^= rk09;
+		rk0E ^= rk0A;
+		rk0F ^= rk0B;
+		x0 ^= rk0C;
+		x1 ^= rk0D;
+		x2 ^= rk0E;
+		x3 ^= rk0F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		pC ^= x0;
+		pD ^= x1;
+		pE ^= x2;
+		pF ^= x3;
+		KEY_EXPAND_ELT(rk10, rk11, rk12, rk13);
+		rk10 ^= rk0C;
+		rk11 ^= rk0D;
+		rk12 ^= rk0E;
+		rk13 ^= rk0F;
+		x0 = p8 ^ rk10;
+		x1 = p9 ^ rk11;
+		x2 = pA ^ rk12;
+		x3 = pB ^ rk13;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk14, rk15, rk16, rk17);
+		rk14 ^= rk10;
+		rk15 ^= rk11;
+		rk16 ^= rk12;
+		rk17 ^= rk13;
+		x0 ^= rk14;
+		x1 ^= rk15;
+		x2 ^= rk16;
+		x3 ^= rk17;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B);
+		rk18 ^= rk14;
+		rk19 ^= rk15;
+		rk1A ^= rk16;
+		rk1B ^= rk17;
+		x0 ^= rk18;
+		x1 ^= rk19;
+		x2 ^= rk1A;
+		x3 ^= rk1B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F);
+		rk1C ^= rk18;
+		rk1D ^= rk19;
+		rk1E ^= rk1A;
+		rk1F ^= rk1B;
+		if (r == 2) { /* round 9 only: counter injection (3 of 4) */
+			rk1C ^= sc->count2;
+			rk1D ^= sc->count3;
+			rk1E ^= sc->count0;
+			rk1F ^= SPH_T32(~sc->count1);
+		}
+		x0 ^= rk1C;
+		x1 ^= rk1D;
+		x2 ^= rk1E;
+		x3 ^= rk1F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p4 ^= x0;
+		p5 ^= x1;
+		p6 ^= x2;
+		p7 ^= x3;
+		/* round 2, 6, 10: p8..pB from pC..pF, then p0..p3 from p4..p7; key words come from XORs only */
+		rk00 ^= rk19;
+		x0 = pC ^ rk00;
+		rk01 ^= rk1A;
+		x1 = pD ^ rk01;
+		rk02 ^= rk1B;
+		x2 = pE ^ rk02;
+		rk03 ^= rk1C;
+		x3 = pF ^ rk03;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk04 ^= rk1D;
+		x0 ^= rk04;
+		rk05 ^= rk1E;
+		x1 ^= rk05;
+		rk06 ^= rk1F;
+		x2 ^= rk06;
+		rk07 ^= rk00;
+		x3 ^= rk07;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk08 ^= rk01;
+		x0 ^= rk08;
+		rk09 ^= rk02;
+		x1 ^= rk09;
+		rk0A ^= rk03;
+		x2 ^= rk0A;
+		rk0B ^= rk04;
+		x3 ^= rk0B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk0C ^= rk05;
+		x0 ^= rk0C;
+		rk0D ^= rk06;
+		x1 ^= rk0D;
+		rk0E ^= rk07;
+		x2 ^= rk0E;
+		rk0F ^= rk08;
+		x3 ^= rk0F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p8 ^= x0;
+		p9 ^= x1;
+		pA ^= x2;
+		pB ^= x3;
+		rk10 ^= rk09;
+		x0 = p4 ^ rk10;
+		rk11 ^= rk0A;
+		x1 = p5 ^ rk11;
+		rk12 ^= rk0B;
+		x2 = p6 ^ rk12;
+		rk13 ^= rk0C;
+		x3 = p7 ^ rk13;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk14 ^= rk0D;
+		x0 ^= rk14;
+		rk15 ^= rk0E;
+		x1 ^= rk15;
+		rk16 ^= rk0F;
+		x2 ^= rk16;
+		rk17 ^= rk10;
+		x3 ^= rk17;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk18 ^= rk11;
+		x0 ^= rk18;
+		rk19 ^= rk12;
+		x1 ^= rk19;
+		rk1A ^= rk13;
+		x2 ^= rk1A;
+		rk1B ^= rk14;
+		x3 ^= rk1B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk1C ^= rk15;
+		x0 ^= rk1C;
+		rk1D ^= rk16;
+		x1 ^= rk1D;
+		rk1E ^= rk17;
+		x2 ^= rk1E;
+		rk1F ^= rk18;
+		x3 ^= rk1F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p0 ^= x0;
+		p1 ^= x1;
+		p2 ^= x2;
+		p3 ^= x3;
+		/* round 3, 7, 11: p4..p7 from p8..pB, then pC..pF from p0..p3; AES-based key expansion */
+		KEY_EXPAND_ELT(rk00, rk01, rk02, rk03);
+		rk00 ^= rk1C;
+		rk01 ^= rk1D;
+		rk02 ^= rk1E;
+		rk03 ^= rk1F;
+		x0 = p8 ^ rk00;
+		x1 = p9 ^ rk01;
+		x2 = pA ^ rk02;
+		x3 = pB ^ rk03;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk04, rk05, rk06, rk07);
+		rk04 ^= rk00;
+		rk05 ^= rk01;
+		rk06 ^= rk02;
+		rk07 ^= rk03;
+		x0 ^= rk04;
+		x1 ^= rk05;
+		x2 ^= rk06;
+		x3 ^= rk07;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B);
+		rk08 ^= rk04;
+		rk09 ^= rk05;
+		rk0A ^= rk06;
+		rk0B ^= rk07;
+		x0 ^= rk08;
+		x1 ^= rk09;
+		x2 ^= rk0A;
+		x3 ^= rk0B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F);
+		rk0C ^= rk08;
+		rk0D ^= rk09;
+		rk0E ^= rk0A;
+		rk0F ^= rk0B;
+		x0 ^= rk0C;
+		x1 ^= rk0D;
+		x2 ^= rk0E;
+		x3 ^= rk0F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p4 ^= x0;
+		p5 ^= x1;
+		p6 ^= x2;
+		p7 ^= x3;
+		KEY_EXPAND_ELT(rk10, rk11, rk12, rk13);
+		rk10 ^= rk0C;
+		rk11 ^= rk0D;
+		rk12 ^= rk0E;
+		rk13 ^= rk0F;
+		x0 = p0 ^ rk10;
+		x1 = p1 ^ rk11;
+		x2 = p2 ^ rk12;
+		x3 = p3 ^ rk13;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk14, rk15, rk16, rk17);
+		rk14 ^= rk10;
+		rk15 ^= rk11;
+		rk16 ^= rk12;
+		rk17 ^= rk13;
+		x0 ^= rk14;
+		x1 ^= rk15;
+		x2 ^= rk16;
+		x3 ^= rk17;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B);
+		rk18 ^= rk14;
+		rk19 ^= rk15;
+		rk1A ^= rk16;
+		rk1B ^= rk17;
+		x0 ^= rk18;
+		x1 ^= rk19;
+		x2 ^= rk1A;
+		x3 ^= rk1B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F);
+		rk1C ^= rk18;
+		rk1D ^= rk19;
+		rk1E ^= rk1A;
+		rk1F ^= rk1B;
+		x0 ^= rk1C;
+		x1 ^= rk1D;
+		x2 ^= rk1E;
+		x3 ^= rk1F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		pC ^= x0;
+		pD ^= x1;
+		pE ^= x2;
+		pF ^= x3;
+		/* round 4, 8, 12: p0..p3 from p4..p7, then p8..pB from pC..pF; key words come from XORs only */
+		rk00 ^= rk19;
+		x0 = p4 ^ rk00;
+		rk01 ^= rk1A;
+		x1 = p5 ^ rk01;
+		rk02 ^= rk1B;
+		x2 = p6 ^ rk02;
+		rk03 ^= rk1C;
+		x3 = p7 ^ rk03;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk04 ^= rk1D;
+		x0 ^= rk04;
+		rk05 ^= rk1E;
+		x1 ^= rk05;
+		rk06 ^= rk1F;
+		x2 ^= rk06;
+		rk07 ^= rk00;
+		x3 ^= rk07;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk08 ^= rk01;
+		x0 ^= rk08;
+		rk09 ^= rk02;
+		x1 ^= rk09;
+		rk0A ^= rk03;
+		x2 ^= rk0A;
+		rk0B ^= rk04;
+		x3 ^= rk0B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk0C ^= rk05;
+		x0 ^= rk0C;
+		rk0D ^= rk06;
+		x1 ^= rk0D;
+		rk0E ^= rk07;
+		x2 ^= rk0E;
+		rk0F ^= rk08;
+		x3 ^= rk0F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p0 ^= x0;
+		p1 ^= x1;
+		p2 ^= x2;
+		p3 ^= x3;
+		rk10 ^= rk09;
+		x0 = pC ^ rk10;
+		rk11 ^= rk0A;
+		x1 = pD ^ rk11;
+		rk12 ^= rk0B;
+		x2 = pE ^ rk12;
+		rk13 ^= rk0C;
+		x3 = pF ^ rk13;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk14 ^= rk0D;
+		x0 ^= rk14;
+		rk15 ^= rk0E;
+		x1 ^= rk15;
+		rk16 ^= rk0F;
+		x2 ^= rk16;
+		rk17 ^= rk10;
+		x3 ^= rk17;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk18 ^= rk11;
+		x0 ^= rk18;
+		rk19 ^= rk12;
+		x1 ^= rk19;
+		rk1A ^= rk13;
+		x2 ^= rk1A;
+		rk1B ^= rk14;
+		x3 ^= rk1B;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		rk1C ^= rk15;
+		x0 ^= rk1C;
+		rk1D ^= rk16;
+		x1 ^= rk1D;
+		rk1E ^= rk17;
+		x2 ^= rk1E;
+		rk1F ^= rk18;
+		x3 ^= rk1F;
+		AES_ROUND_NOKEY(x0, x1, x2, x3);
+		p8 ^= x0;
+		p9 ^= x1;
+		pA ^= x2;
+		pB ^= x3;
+	}
+	/* round 13: pC..pF from p0..p3, then p4..p7 from p8..pB; last counter injection */
+	KEY_EXPAND_ELT(rk00, rk01, rk02, rk03);
+	rk00 ^= rk1C;
+	rk01 ^= rk1D;
+	rk02 ^= rk1E;
+	rk03 ^= rk1F;
+	x0 = p0 ^ rk00;
+	x1 = p1 ^ rk01;
+	x2 = p2 ^ rk02;
+	x3 = p3 ^ rk03;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk04, rk05, rk06, rk07);
+	rk04 ^= rk00;
+	rk05 ^= rk01;
+	rk06 ^= rk02;
+	rk07 ^= rk03;
+	x0 ^= rk04;
+	x1 ^= rk05;
+	x2 ^= rk06;
+	x3 ^= rk07;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B);
+	rk08 ^= rk04;
+	rk09 ^= rk05;
+	rk0A ^= rk06;
+	rk0B ^= rk07;
+	x0 ^= rk08;
+	x1 ^= rk09;
+	x2 ^= rk0A;
+	x3 ^= rk0B;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F);
+	rk0C ^= rk08;
+	rk0D ^= rk09;
+	rk0E ^= rk0A;
+	rk0F ^= rk0B;
+	x0 ^= rk0C;
+	x1 ^= rk0D;
+	x2 ^= rk0E;
+	x3 ^= rk0F;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	pC ^= x0;
+	pD ^= x1;
+	pE ^= x2;
+	pF ^= x3;
+	KEY_EXPAND_ELT(rk10, rk11, rk12, rk13);
+	rk10 ^= rk0C;
+	rk11 ^= rk0D;
+	rk12 ^= rk0E;
+	rk13 ^= rk0F;
+	x0 = p8 ^ rk10;
+	x1 = p9 ^ rk11;
+	x2 = pA ^ rk12;
+	x3 = pB ^ rk13;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk14, rk15, rk16, rk17);
+	rk14 ^= rk10;
+	rk15 ^= rk11;
+	rk16 ^= rk12;
+	rk17 ^= rk13;
+	x0 ^= rk14;
+	x1 ^= rk15;
+	x2 ^= rk16;
+	x3 ^= rk17;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B);
+	rk18 ^= rk14 ^ sc->count1; /* counter injection (4 of 4) */
+	rk19 ^= rk15 ^ sc->count0;
+	rk1A ^= rk16 ^ sc->count3;
+	rk1B ^= rk17 ^ SPH_T32(~sc->count2);
+	x0 ^= rk18;
+	x1 ^= rk19;
+	x2 ^= rk1A;
+	x3 ^= rk1B;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F);
+	rk1C ^= rk18;
+	rk1D ^= rk19;
+	rk1E ^= rk1A;
+	rk1F ^= rk1B;
+	x0 ^= rk1C;
+	x1 ^= rk1D;
+	x2 ^= rk1E;
+	x3 ^= rk1F;
+	AES_ROUND_NOKEY(x0, x1, x2, x3);
+	p4 ^= x0;
+	p5 ^= x1;
+	p6 ^= x2;
+	p7 ^= x3;
+	sc->h[0x0] ^= p8; /* fold back with the two halves swapped: h[0..7] takes p8..pF, h[8..F] takes p0..p7 */
+	sc->h[0x1] ^= p9;
+	sc->h[0x2] ^= pA;
+	sc->h[0x3] ^= pB;
+	sc->h[0x4] ^= pC;
+	sc->h[0x5] ^= pD;
+	sc->h[0x6] ^= pE;
+	sc->h[0x7] ^= pF;
+	sc->h[0x8] ^= p0;
+	sc->h[0x9] ^= p1;
+	sc->h[0xA] ^= p2;
+	sc->h[0xB] ^= p3;
+	sc->h[0xC] ^= p4;
+	sc->h[0xD] ^= p5;
+	sc->h[0xE] ^= p6;
+	sc->h[0xF] ^= p7;
+}
+
+#endif
+
+/* Reset a small context: buffer index and both counter words to 0, chaining value h loaded from iv. */
+static void
+shavite_small_init(sph_shavite_small_context *sc, const sph_u32 *iv)
+{
+	sc->ptr = 0;
+	sc->count0 = sc->count1 = 0;
+	memcpy(sc->h, iv, sizeof sc->h);
+}
+
+/* Absorb len bytes of data; every time sc->buf fills, count 512 more bits and run c256() on it. */
+static void
+shavite_small_core(sph_shavite_small_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *const block = sc->buf;
+	size_t fill = sc->ptr;
+
+	while (len != 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (room > len) ? len : room;
+
+		memcpy(block + fill, src, take);
+		src += take;
+		fill += take;
+		len -= take;
+		if (fill != sizeof sc->buf)
+			continue;
+		/* count1:count0 act as one counter: carry when the low word wraps */
+		sc->count0 = SPH_T32(sc->count0 + 512);
+		if (sc->count0 == 0)
+			sc->count1 = SPH_T32(sc->count1 + 1);
+		c256(sc, block);
+		fill = 0;
+	}
+	sc->ptr = fill;
+}
+
+/* Append n extra bits (top bits of ub), pad, run the last compression(s), write out_size_w32 LE words to dst. */
+static void
+shavite_small_close(sph_shavite_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	unsigned char *const block = sc->buf;
+	size_t fill = sc->ptr, w;
+	sph_u32 bits_lo, bits_hi;
+	unsigned marker;
+
+	bits_lo = (sc->count0 += (fill << 3) + n);	/* saved before the counter may be zeroed */
+	bits_hi = sc->count1;
+	marker = 0x80 >> n;
+	marker = ((ub & -marker) | marker) & 0xFF;	/* top n bits of ub, then a single 1 bit */
+	if (fill != 0 || n != 0) {
+		block[fill ++] = marker;
+		if (fill <= 54) {
+			memset(block + fill, 0, 54 - fill);
+		} else {
+			memset(block + fill, 0, 64 - fill);	/* no room for the 10-byte trailer */
+			c256(sc, block);
+			memset(block, 0, 54);
+			sc->count0 = sc->count1 = 0;	/* last block carries no message bits */
+		}
+	} else {
+		block[0] = 0x80;
+		memset(block + 1, 0, 53);
+		sc->count0 = sc->count1 = 0;	/* last block carries no message bits */
+	}
+	sph_enc32le(block + 54, bits_lo);
+	sph_enc32le(block + 58, bits_hi);
+	block[62] = out_size_w32 << 5;	/* bytes 62..63: 32 * out_size_w32, low byte first */
+	block[63] = out_size_w32 >> 3;
+	c256(sc, block);
+	for (w = 0; w < out_size_w32; w ++)
+		sph_enc32le((unsigned char *)dst + (w << 2), sc->h[w]);
+}
+
+/* Reset a big context: buffer index and the four counter words to 0, chaining value h loaded from iv. */
+static void
+shavite_big_init(sph_shavite_big_context *sc, const sph_u32 *iv)
+{
+	sc->ptr = 0;
+	sc->count0 = sc->count1 = 0;
+	sc->count2 = sc->count3 = 0;
+	/* sizeof sc->h bytes are copied from iv */
+	memcpy(sc->h, iv, sizeof sc->h);
+}
+
+/* Absorb len bytes of data; every time sc->buf fills, count 1024 more bits and run c512() on it. */
+static void
+shavite_big_core(sph_shavite_big_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *const block = sc->buf;
+	size_t fill = sc->ptr;
+
+	while (len != 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (room > len) ? len : room;
+
+		memcpy(block + fill, src, take);
+		src += take;
+		fill += take;
+		len -= take;
+		if (fill != sizeof sc->buf)
+			continue;
+		/*
+		 * Counter += 1024 bits; a word that wraps to 0 carries into the next one.
+		 */
+		sc->count0 = SPH_T32(sc->count0 + 1024);
+		if (sc->count0 == 0) {
+			sc->count1 = SPH_T32(sc->count1 + 1);
+			if (sc->count1 == 0) {
+				sc->count2 = SPH_T32(sc->count2 + 1);
+				if (sc->count2 == 0)
+					sc->count3 = SPH_T32(sc->count3 + 1);
+			}
+		}
+		c512(sc, block);
+		fill = 0;
+	}
+	sc->ptr = fill;
+}
+
+/* Append n extra bits (top bits of ub), pad, run the last compression(s), write out_size_w32 LE words to dst. */
+static void
+shavite_big_close(sph_shavite_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	unsigned char *const block = sc->buf;
+	size_t fill = sc->ptr, w;
+	sph_u32 bits0, bits1, bits2, bits3;
+	unsigned marker;
+
+	bits0 = (sc->count0 += (fill << 3) + n);	/* saved before the counter may be zeroed */
+	bits1 = sc->count1;
+	bits2 = sc->count2;
+	bits3 = sc->count3;
+	marker = 0x80 >> n;
+	marker = ((ub & -marker) | marker) & 0xFF;	/* top n bits of ub, then a single 1 bit */
+	if (fill != 0 || n != 0) {
+		block[fill ++] = marker;
+		if (fill <= 110) {
+			memset(block + fill, 0, 110 - fill);
+		} else {
+			memset(block + fill, 0, 128 - fill);	/* no room for the 18-byte trailer */
+			c512(sc, block);
+			memset(block, 0, 110);
+			sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+		}
+	} else {
+		block[0] = 0x80;
+		memset(block + 1, 0, 109);
+		sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+	}
+	sph_enc32le(block + 110, bits0);
+	sph_enc32le(block + 114, bits1);
+	sph_enc32le(block + 118, bits2);
+	sph_enc32le(block + 122, bits3);
+	block[126] = out_size_w32 << 5;	/* bytes 126..127: 32 * out_size_w32, low byte first */
+	block[127] = out_size_w32 >> 3;
+	c512(sc, block);
+	for (w = 0; w < out_size_w32; w ++)
+		sph_enc32le((unsigned char *)dst + (w << 2), sc->h[w]);
+}
+
+/* Public API (see sph_shavite.h): reset the small context cc with the IV224 initial value. */
+void sph_shavite224_init(void *cc)
+{
+	sph_shavite_small_context *ctx = cc;
+	shavite_small_init(ctx, IV224);
+}
+
+/* Public API (see sph_shavite.h): absorb len bytes of data into the small context cc. */
+void sph_shavite224(void *cc, const void *data, size_t len)
+{
+	sph_shavite_small_context *ctx = cc;
+	shavite_small_core(ctx, data, len);
+}
+
+/* Public API (see sph_shavite.h): write the 7-word (28-byte) digest to dst, then reset cc with IV224. */
+void sph_shavite224_close(void *cc, void *dst)
+{
+	sph_shavite_small_context *ctx = cc;
+	shavite_small_close(ctx, 0, 0, dst, 7);	/* no extra bits */
+	shavite_small_init(ctx, IV224);
+}
+
+/* Public API (see sph_shavite.h): add n extra bits from ub, write the 7-word digest to dst, reset cc with IV224. */
+void sph_shavite224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_shavite_small_context *ctx = cc;
+	shavite_small_close(ctx, ub, n, dst, 7);
+	shavite_small_init(ctx, IV224);
+}
+
+/* Public API (see sph_shavite.h): reset the small context cc with the IV256 initial value. */
+void sph_shavite256_init(void *cc)
+{
+	sph_shavite_small_context *ctx = cc;
+	shavite_small_init(ctx, IV256);
+}
+
+/* Public API (see sph_shavite.h): absorb len bytes of data into the small context cc. */
+void sph_shavite256(void *cc, const void *data, size_t len)
+{
+	sph_shavite_small_context *ctx = cc;
+	shavite_small_core(ctx, data, len);
+}
+
+/* Public API (see sph_shavite.h): write the 8-word (32-byte) digest to dst, then reset cc with IV256. */
+void sph_shavite256_close(void *cc, void *dst)
+{
+	sph_shavite_small_context *ctx = cc;
+	shavite_small_close(ctx, 0, 0, dst, 8);	/* no extra bits */
+	shavite_small_init(ctx, IV256);
+}
+
+/* Public API (see sph_shavite.h): add n extra bits from ub, write the 8-word digest to dst, reset cc with IV256. */
+void sph_shavite256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_shavite_small_context *ctx = cc;
+	shavite_small_close(ctx, ub, n, dst, 8);
+	shavite_small_init(ctx, IV256);
+}
+
+/* Public API (see sph_shavite.h): reset the big context cc with the IV384 initial value. */
+void sph_shavite384_init(void *cc)
+{
+	sph_shavite_big_context *ctx = cc;
+	shavite_big_init(ctx, IV384);
+}
+
+/* Public API (see sph_shavite.h): absorb len bytes of data into the big context cc. */
+void sph_shavite384(void *cc, const void *data, size_t len)
+{
+	sph_shavite_big_context *ctx = cc;
+	shavite_big_core(ctx, data, len);
+}
+
+/* Public API (see sph_shavite.h): write the 12-word (48-byte) digest to dst, then reset cc with IV384. */
+void sph_shavite384_close(void *cc, void *dst)
+{
+	sph_shavite_big_context *ctx = cc;
+	shavite_big_close(ctx, 0, 0, dst, 12);	/* no extra bits */
+	shavite_big_init(ctx, IV384);
+}
+
+/* Public API (see sph_shavite.h): add n extra bits from ub, write the 12-word digest to dst, reset cc with IV384. */
+void sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_shavite_big_context *ctx = cc;
+	shavite_big_close(ctx, ub, n, dst, 12);
+	shavite_big_init(ctx, IV384);
+}
+
+/* Public API (see sph_shavite.h): reset the big context cc with the IV512 initial value. */
+void sph_shavite512_init(void *cc)
+{
+	sph_shavite_big_context *ctx = cc;
+	shavite_big_init(ctx, IV512);
+}
+
+/* Public API (see sph_shavite.h): absorb len bytes of data into the big context cc. */
+void sph_shavite512(void *cc, const void *data, size_t len)
+{
+	sph_shavite_big_context *ctx = cc;
+	shavite_big_core(ctx, data, len);
+}
+
+/* Public API (see sph_shavite.h): write the 16-word (64-byte) digest to dst, then reset cc with IV512. */
+void sph_shavite512_close(void *cc, void *dst)
+{
+	sph_shavite_big_context *ctx = cc;
+	shavite_big_close(ctx, 0, 0, dst, 16);	/* no extra bits */
+	shavite_big_init(ctx, IV512);
+}
+
+/* Public API (see sph_shavite.h): add n extra bits from ub, write the 16-word digest to dst, reset cc with IV512. */
+void sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_shavite_big_context *ctx = cc;
+	shavite_big_close(ctx, ub, n, dst, 16);
+	shavite_big_init(ctx, IV512);
+}
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/sha3/simd.c b/sha3/simd.c
new file mode 100644
index 0000000..2c80626
--- /dev/null
+++ b/sha3/simd.c
@@ -0,0 +1,1799 @@
+/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * SIMD implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_simd.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
+#define SPH_SMALL_FOOTPRINT_SIMD   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+typedef sph_u32 u32;	/* short file-local aliases for the sph_ types and helper macros */
+typedef sph_s32 s32;
+#define C32     SPH_C32
+#define T32     SPH_T32
+#define ROL32   SPH_ROTL32
+
+#define XCAT(x, y)    XCAT_(x, y)	/* token paste after x and y have been macro-expanded */
+#define XCAT_(x, y)   x ## y	/* raw token paste: arguments are not expanded here */
+
+/*
+ * The powers of 41 modulo 257: alpha_tab[i] = 41^i mod 257. We use exponents from 0 to 255, inclusive.
+ */
+static const s32 alpha_tab[] = {
+	  1,  41, 139,  45,  46,  87, 226,  14,  60, 147, 116, 130,
+	190,  80, 196,  69,   2,  82,  21,  90,  92, 174, 195,  28,
+	120,  37, 232,   3, 123, 160, 135, 138,   4, 164,  42, 180,
+	184,  91, 133,  56, 240,  74, 207,   6, 246,  63,  13,  19,
+	  8,  71,  84, 103, 111, 182,   9, 112, 223, 148, 157,  12,
+	235, 126,  26,  38,  16, 142, 168, 206, 222, 107,  18, 224,
+	189,  39,  57,  24, 213, 252,  52,  76,  32,  27,  79, 155,
+	187, 214,  36, 191, 121,  78, 114,  48, 169, 247, 104, 152,
+	 64,  54, 158,  53, 117, 171,  72, 125, 242, 156, 228,  96,
+	 81, 237, 208,  47, 128, 108,  59, 106, 234,  85, 144, 250,
+	227,  55, 199, 192, 162, 217, 159,  94, 256, 216, 118, 212,
+	211, 170,  31, 243, 197, 110, 141, 127,  67, 177,  61, 188,
+	255, 175, 236, 167, 165,  83,  62, 229, 137, 220,  25, 254,
+	134,  97, 122, 119, 253,  93, 215,  77,  73, 166, 124, 201,
+	 17, 183,  50, 251,  11, 194, 244, 238, 249, 186, 173, 154,
+	146,  75, 248, 145,  34, 109, 100, 245,  22, 131, 231, 219,
+	241, 115,  89,  51,  35, 150, 239,  33,  68, 218, 200, 233,
+	 44,   5, 205, 181, 225, 230, 178, 102,  70,  43, 221,  66,
+	136, 179, 143, 209,  88,  10, 153, 105, 193, 203,  99, 204,
+	140,  86, 185, 132,  15, 101,  29, 161, 176,  20,  49, 210,
+	129, 149, 198, 151,  23, 172, 113,   7,  30, 202,  58,  65,
+	 95,  40,  98, 163
+};
+
+/*
+ * Partial reductions mod 257 (256 = -1, 65536 = 1 mod 257); assumes ">>" on a negative s32 is arithmetic. Ranges:
+ *   REDS1: from -32768..98302 to -383..383
+ *   REDS2: from -2^31..2^31-1 to -32768..98302
+ */
+#define REDS1(x)    (((x) & 0xFF) - ((x) >> 8))
+#define REDS2(x)    (((x) & 0xFFFF) + ((x) >> 16))
+
+/*
+ * Butterfly pass: for u < hk, (q[rb+u], q[rb+u+hk]) = (m + t, m - t) with m, n
+ * the old pair and t = REDS2(n * alpha_tab[u*as]); u = 0 uses t = n, then
+ * "goto id" enters the unrolled loop at u = 1. q[] in -N..N (N >= 98302) on
+ * entry gives -2N..2N; as alpha_tab[v] <= 256, the largest allowed N is 8388608.
+ */
+#define FFT_LOOP(rb, hk, as, id)   do { \
+		size_t u, v; \
+		s32 m = q[(rb)]; \
+		s32 n = q[(rb) + (hk)]; \
+		q[(rb)] = m + n; \
+		q[(rb) + (hk)] = m - n; \
+		u = v = 0; \
+		goto id; /* id is a label name; callers pass a distinct one per expansion */ \
+		for (; u < (hk); u += 4, v += 4 * (as)) { \
+			s32 t; \
+			m = q[(rb) + u + 0]; \
+			n = q[(rb) + u + 0 + (hk)]; \
+			t = REDS2(n * alpha_tab[v + 0 * (as)]); \
+			q[(rb) + u + 0] = m + t; \
+			q[(rb) + u + 0 + (hk)] = m - t; \
+		id: \
+			m = q[(rb) + u + 1]; \
+			n = q[(rb) + u + 1 + (hk)]; \
+			t = REDS2(n * alpha_tab[v + 1 * (as)]); \
+			q[(rb) + u + 1] = m + t; \
+			q[(rb) + u + 1 + (hk)] = m - t; \
+			m = q[(rb) + u + 2]; \
+			n = q[(rb) + u + 2 + (hk)]; \
+			t = REDS2(n * alpha_tab[v + 2 * (as)]); \
+			q[(rb) + u + 2] = m + t; \
+			q[(rb) + u + 2 + (hk)] = m - t; \
+			m = q[(rb) + u + 3]; \
+			n = q[(rb) + u + 3 + (hk)]; \
+			t = REDS2(n * alpha_tab[v + 3 * (as)]); \
+			q[(rb) + u + 3] = m + t; \
+			q[(rb) + u + 3 + (hk)] = m - t; \
+		} \
+	} while (0)
+
+/*
+ * Reads x[xb + i*xs] for i = 0..3 and assigns d0..d7 (d is an identifier prefix). Output ranges:
+ *   d0:   min=    0   max= 1020
+ *   d1:   min=  -67   max= 4587
+ *   d2:   min=-4335   max= 4335
+ *   d3:   min=-4147   max=  507
+ *   d4:   min= -510   max=  510
+ *   d5:   min= -252   max= 4402
+ *   d6:   min=-4335   max= 4335
+ *   d7:   min=-4332   max=  322
+ */
+#define FFT8(xb, xs, d)   do { \
+		s32 x0 = x[(xb)]; \
+		s32 x1 = x[(xb) + (xs)]; \
+		s32 x2 = x[(xb) + 2 * (xs)]; \
+		s32 x3 = x[(xb) + 3 * (xs)]; \
+		s32 a0 = x0 + x2; \
+		s32 a1 = x0 + (x2 << 4); \
+		s32 a2 = x0 - x2; \
+		s32 a3 = x0 - (x2 << 4); \
+		s32 b0 = x1 + x3; \
+		s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
+		s32 b2 = (x1 << 4) - (x3 << 4); \
+		s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
+		d ## 0 = a0 + b0; \
+		d ## 1 = a1 + b1; \
+		d ## 2 = a2 + b2; \
+		d ## 3 = a3 + b3; \
+		d ## 4 = a0 - b0; \
+		d ## 5 = a1 - b1; \
+		d ## 6 = a2 - b2; \
+		d ## 7 = a3 - b3; \
+	} while (0)
+
+/*
+ * When k=16, we have alpha=2, so multiplying by alpha^i is a multiplication
+ * by 2^i. The d2_* values may be negative, and left-shifting a negative
+ * value is undefined behavior in C, so plain multiplications are used.
+ * Output: within -591471..591723
+ */
+#define FFT16(xb, xs, rb)   do { \
+		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
+		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
+		FFT8(xb, (xs) << 1, d1_); \
+		FFT8((xb) + (xs), (xs) << 1, d2_); \
+		q[(rb) +  0] = d1_0 + d2_0; \
+		q[(rb) +  1] = d1_1 + (d2_1 * 2); \
+		q[(rb) +  2] = d1_2 + (d2_2 * 4); \
+		q[(rb) +  3] = d1_3 + (d2_3 * 8); \
+		q[(rb) +  4] = d1_4 + (d2_4 * 16); \
+		q[(rb) +  5] = d1_5 + (d2_5 * 32); \
+		q[(rb) +  6] = d1_6 + (d2_6 * 64); \
+		q[(rb) +  7] = d1_7 + (d2_7 * 128); \
+		q[(rb) +  8] = d1_0 - d2_0; \
+		q[(rb) +  9] = d1_1 - (d2_1 * 2); \
+		q[(rb) + 10] = d1_2 - (d2_2 * 4); \
+		q[(rb) + 11] = d1_3 - (d2_3 * 8); \
+		q[(rb) + 12] = d1_4 - (d2_4 * 16); \
+		q[(rb) + 13] = d1_5 - (d2_5 * 32); \
+		q[(rb) + 14] = d1_6 - (d2_6 * 64); \
+		q[(rb) + 15] = d1_7 - (d2_7 * 128); \
+	} while (0)
+
+/*
+ * FFT16 on the even- and odd-indexed inputs, merged by one FFT_LOOP pass (id: goto label). Output range: |q| <= 1183446
+ */
+#define FFT32(xb, xs, rb, id)   do { \
+		FFT16(xb, (xs) << 1, rb); \
+		FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
+		FFT_LOOP(rb, 16, 8, id); \
+	} while (0)
+
+/*
+ * FFT32 on the even- and odd-indexed inputs (labels id##a, id##b), merged by one FFT_LOOP pass. Output range: |q| <= 2366892
+ */
+#define FFT64(xb, xs, rb, id)   do { \
+		FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
+		FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
+		FFT_LOOP(rb, 32, 4, id); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_SIMD
+
+/* Out-of-line FFT32 for the small-footprint build: fills q[0..31] from bytes of x taken with stride xs. */
+static void
+fft32(unsigned char *x, size_t xs, s32 *q)
+{
+	const size_t stride = xs << 1;	/* each FFT16 half takes every other input */
+
+	FFT16(0, stride, 0);
+	FFT16(xs, stride, 16);
+	FFT_LOOP(0, 16, 8, label_);
+}
+
+#define FFT128(xb, xs, rb, id)   do { /* four fft32() calls on inputs interleaved with stride 4*xs, merged by three FFT_LOOP passes */ \
+		fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +  0]); \
+		fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
+		FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
+		fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
+		fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
+		FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
+		FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
+	} while (0)
+
+#else
+
+/*
+ * Unrolled variant: FFT64 on the even- and odd-indexed inputs, merged by one FFT_LOOP pass. Output range: |q| <= 4733784
+ */
+#define FFT128(xb, xs, rb, id)   do { \
+		FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
+		FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
+		FFT_LOOP(rb, 64, 2, id); \
+	} while (0)
+
+#endif
+
+/*
+ * Out-of-line 64-point FFT. A fully unrolled FFT for SIMD-384 / SIMD-512
+ * gives a compression function that does not fit in the 32 kB L1 cache of
+ * a typical x86 Intel, hence this function-call layer at the FFT64 level.
+ */
+
+static void
+fft64(unsigned char *x, size_t xs, s32 *q)
+{
+	const size_t stride = xs << 1;	/* each FFT32 half takes every other input */
+
+	FFT32(0, stride, 0, label_a);
+	FFT32(xs, stride, 32, label_b);
+	FFT_LOOP(0, 32, 4, label_);
+}
+
+/*
+ * Four fft64() calls on inputs interleaved with stride 4*xs, merged by three FFT_LOOP passes. Output range: |q| <= 9467568
+ */
+#define FFT256(xb, xs, rb, id)   do { \
+		fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) +   0]); \
+		fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) +  64]); \
+		FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
+		fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
+		fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
+		FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
+		FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
+	} while (0)
+
+/*
+ * alpha^(127*i) mod 257 for i = 0..127, where alpha = 139 (= 41^2 mod 257, i.e. alpha_tab[2])
+ */
+static const unsigned short yoff_s_n[] = {
+	  1,  98,  95,  58,  30, 113,  23, 198, 129,  49, 176,  29,
+	 15, 185, 140,  99, 193, 153,  88, 143, 136, 221,  70, 178,
+	225, 205,  44, 200,  68, 239,  35,  89, 241, 231,  22, 100,
+	 34, 248, 146, 173, 249, 244,  11,  50,  17, 124,  73, 215,
+	253, 122, 134,  25, 137,  62, 165, 236, 255,  61,  67, 141,
+	197,  31, 211, 118, 256, 159, 162, 199, 227, 144, 234,  59,
+	128, 208,  81, 228, 242,  72, 117, 158,  64, 104, 169, 114,
+	121,  36, 187,  79,  32,  52, 213,  57, 189,  18, 222, 168,
+	 16,  26, 235, 157, 223,   9, 111,  84,   8,  13, 246, 207,
+	240, 133, 184,  42,   4, 135, 123, 232, 120, 195,  92,  21,
+	  2, 196, 190, 116,  60, 226,  46, 139
+};
+
+/*
+ * alpha^(127*i) + alpha^(125*i) mod 257 for i = 0..127, where alpha = 139 (= 41^2 mod 257)
+ */
+static const unsigned short yoff_s_f[] = {
+	  2, 156, 118, 107,  45, 212, 111, 162,  97, 249, 211,   3,
+	 49, 101, 151, 223, 189, 178, 253, 204,  76,  82, 232,  65,
+	 96, 176, 161,  47, 189,  61, 248, 107,   0, 131, 133, 113,
+	 17,  33,  12, 111, 251, 103,  57, 148,  47,  65, 249, 143,
+	189,   8, 204, 230, 205, 151, 187, 227, 247, 111, 140,   6,
+	 77,  10,  21, 149, 255, 101, 139, 150, 212,  45, 146,  95,
+	160,   8,  46, 254, 208, 156, 106,  34,  68,  79,   4,  53,
+	181, 175,  25, 192, 161,  81,  96, 210,  68, 196,   9, 150,
+	  0, 126, 124, 144, 240, 224, 245, 146,   6, 154, 200, 109,
+	210, 192,   8, 114,  68, 249,  53,  27,  52, 106,  70,  30,
+	 10, 146, 117, 251, 180, 247, 236, 108
+};
+
+/*
+ * beta^(255*i) mod 257 for i = 0..255, where beta = 41 (i.e. alpha_tab[1])
+ */
+static const unsigned short yoff_b_n[] = {
+	  1, 163,  98,  40,  95,  65,  58, 202,  30,   7, 113, 172,
+	 23, 151, 198, 149, 129, 210,  49,  20, 176, 161,  29, 101,
+	 15, 132, 185,  86, 140, 204,  99, 203, 193, 105, 153,  10,
+	 88, 209, 143, 179, 136,  66, 221,  43,  70, 102, 178, 230,
+	225, 181, 205,   5,  44, 233, 200, 218,  68,  33, 239, 150,
+	 35,  51,  89, 115, 241, 219, 231, 131,  22, 245, 100, 109,
+	 34, 145, 248,  75, 146, 154, 173, 186, 249, 238, 244, 194,
+	 11, 251,  50, 183,  17, 201, 124, 166,  73,  77, 215,  93,
+	253, 119, 122,  97, 134, 254,  25, 220, 137, 229,  62,  83,
+	165, 167, 236, 175, 255, 188,  61, 177,  67, 127, 141, 110,
+	197, 243,  31, 170, 211, 212, 118, 216, 256,  94, 159, 217,
+	162, 192, 199,  55, 227, 250, 144,  85, 234, 106,  59, 108,
+	128,  47, 208, 237,  81,  96, 228, 156, 242, 125,  72, 171,
+	117,  53, 158,  54,  64, 152, 104, 247, 169,  48, 114,  78,
+	121, 191,  36, 214, 187, 155,  79,  27,  32,  76,  52, 252,
+	213,  24,  57,  39, 189, 224,  18, 107, 222, 206, 168, 142,
+	 16,  38,  26, 126, 235,  12, 157, 148, 223, 112,   9, 182,
+	111, 103,  84,  71,   8,  19,  13,  63, 246,   6, 207,  74,
+	240,  56, 133,  91, 184, 180,  42, 164,   4, 138, 135, 160,
+	123,   3, 232,  37, 120,  28, 195, 174,  92,  90,  21,  82,
+	  2,  69, 196,  80, 190, 130, 116, 147,  60,  14, 226,  87,
+	 46,  45, 139,  41
+};
+
+/*
+ * beta^(255*i) + beta^(253*i) mod 257 for i = 0..255, where beta = 41
+ */
+static const unsigned short yoff_b_f[] = {
+	  2, 203, 156,  47, 118, 214, 107, 106,  45,  93, 212,  20,
+	111,  73, 162, 251,  97, 215, 249,  53, 211,  19,   3,  89,
+	 49, 207, 101,  67, 151, 130, 223,  23, 189, 202, 178, 239,
+	253, 127, 204,  49,  76, 236,  82, 137, 232, 157,  65,  79,
+	 96, 161, 176, 130, 161,  30,  47,   9, 189, 247,  61, 226,
+	248,  90, 107,  64,   0,  88, 131, 243, 133,  59, 113, 115,
+	 17, 236,  33, 213,  12, 191, 111,  19, 251,  61, 103, 208,
+	 57,  35, 148, 248,  47, 116,  65, 119, 249, 178, 143,  40,
+	189, 129,   8, 163, 204, 227, 230, 196, 205, 122, 151,  45,
+	187,  19, 227,  72, 247, 125, 111, 121, 140, 220,   6, 107,
+	 77,  69,  10, 101,  21,  65, 149, 171, 255,  54, 101, 210,
+	139,  43, 150, 151, 212, 164,  45, 237, 146, 184,  95,   6,
+	160,  42,   8, 204,  46, 238, 254, 168, 208,  50, 156, 190,
+	106, 127,  34, 234,  68,  55,  79,  18,   4, 130,  53, 208,
+	181,  21, 175, 120,  25, 100, 192, 178, 161,  96,  81, 127,
+	 96, 227, 210, 248,  68,  10, 196,  31,   9, 167, 150, 193,
+	  0, 169, 126,  14, 124, 198, 144, 142, 240,  21, 224,  44,
+	245,  66, 146, 238,   6, 196, 154,  49, 200, 222, 109,   9,
+	210, 141, 192, 138,   8,  79, 114, 217,  68, 128, 249,  94,
+	 53,  30,  27,  61,  52, 135, 106, 212,  70, 238,  30, 185,
+	 10, 132, 146, 136, 117,  37, 251, 150, 180, 188, 247, 156,
+	236, 192, 108,  86
+};
+
+#define INNER(l, h, mm)   (((u32)((l) * (mm)) & 0xFFFFU) \
+                          + ((u32)((h) * (mm)) << 16))	/* bits 0..15 from l*mm, bits 16 and up from h*mm */
+
+#define W_SMALL(sb, o1, o2, mm) /* four INNER() words built from q[8*sb + 2*k + o1/o2], k = 0..3 */ \
+	(INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
+	 INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
+	 INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
+	 INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)	/* "(" left open on purpose: STEP_SMALL_ supplies the ")" */
+
+#define WS_0_0   W_SMALL( 4,    0,    1, 185)
+#define WS_0_1   W_SMALL( 6,    0,    1, 185)
+#define WS_0_2   W_SMALL( 0,    0,    1, 185)
+#define WS_0_3   W_SMALL( 2,    0,    1, 185)
+#define WS_0_4   W_SMALL( 7,    0,    1, 185)
+#define WS_0_5   W_SMALL( 5,    0,    1, 185)
+#define WS_0_6   W_SMALL( 3,    0,    1, 185)
+#define WS_0_7   W_SMALL( 1,    0,    1, 185)
+#define WS_1_0   W_SMALL(15,    0,    1, 185)
+#define WS_1_1   W_SMALL(11,    0,    1, 185)
+#define WS_1_2   W_SMALL(12,    0,    1, 185)
+#define WS_1_3   W_SMALL( 8,    0,    1, 185)
+#define WS_1_4   W_SMALL( 9,    0,    1, 185)
+#define WS_1_5   W_SMALL(13,    0,    1, 185)
+#define WS_1_6   W_SMALL(10,    0,    1, 185)
+#define WS_1_7   W_SMALL(14,    0,    1, 185)
+#define WS_2_0   W_SMALL(17, -128,  -64, 233)
+#define WS_2_1   W_SMALL(18, -128,  -64, 233)
+#define WS_2_2   W_SMALL(23, -128,  -64, 233)
+#define WS_2_3   W_SMALL(20, -128,  -64, 233)
+#define WS_2_4   W_SMALL(22, -128,  -64, 233)
+#define WS_2_5   W_SMALL(21, -128,  -64, 233)
+#define WS_2_6   W_SMALL(16, -128,  -64, 233)
+#define WS_2_7   W_SMALL(19, -128,  -64, 233)
+#define WS_3_0   W_SMALL(30, -191, -127, 233)
+#define WS_3_1   W_SMALL(24, -191, -127, 233)
+#define WS_3_2   W_SMALL(25, -191, -127, 233)
+#define WS_3_3   W_SMALL(31, -191, -127, 233)
+#define WS_3_4   W_SMALL(27, -191, -127, 233)
+#define WS_3_5   W_SMALL(29, -191, -127, 233)
+#define WS_3_6   W_SMALL(28, -191, -127, 233)
+#define WS_3_7   W_SMALL(26, -191, -127, 233)
+
+#define W_BIG(sb, o1, o2, mm) /* eight INNER() words built from q[16*sb + 2*k + o1/o2], k = 0..7 */ \
+	(INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
+	 INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)	/* "(" left open on purpose: STEP_BIG_ supplies the ")" */
+
+#define WB_0_0   W_BIG( 4,    0,    1, 185)
+#define WB_0_1   W_BIG( 6,    0,    1, 185)
+#define WB_0_2   W_BIG( 0,    0,    1, 185)
+#define WB_0_3   W_BIG( 2,    0,    1, 185)
+#define WB_0_4   W_BIG( 7,    0,    1, 185)
+#define WB_0_5   W_BIG( 5,    0,    1, 185)
+#define WB_0_6   W_BIG( 3,    0,    1, 185)
+#define WB_0_7   W_BIG( 1,    0,    1, 185)
+#define WB_1_0   W_BIG(15,    0,    1, 185)
+#define WB_1_1   W_BIG(11,    0,    1, 185)
+#define WB_1_2   W_BIG(12,    0,    1, 185)
+#define WB_1_3   W_BIG( 8,    0,    1, 185)
+#define WB_1_4   W_BIG( 9,    0,    1, 185)
+#define WB_1_5   W_BIG(13,    0,    1, 185)
+#define WB_1_6   W_BIG(10,    0,    1, 185)
+#define WB_1_7   W_BIG(14,    0,    1, 185)
+#define WB_2_0   W_BIG(17, -256, -128, 233)
+#define WB_2_1   W_BIG(18, -256, -128, 233)
+#define WB_2_2   W_BIG(23, -256, -128, 233)
+#define WB_2_3   W_BIG(20, -256, -128, 233)
+#define WB_2_4   W_BIG(22, -256, -128, 233)
+#define WB_2_5   W_BIG(21, -256, -128, 233)
+#define WB_2_6   W_BIG(16, -256, -128, 233)
+#define WB_2_7   W_BIG(19, -256, -128, 233)
+#define WB_3_0   W_BIG(30, -383, -255, 233)
+#define WB_3_1   W_BIG(24, -383, -255, 233)
+#define WB_3_2   W_BIG(25, -383, -255, 233)
+#define WB_3_3   W_BIG(31, -383, -255, 233)
+#define WB_3_4   W_BIG(27, -383, -255, 233)
+#define WB_3_5   W_BIG(29, -383, -255, 233)
+#define WB_3_6   W_BIG(28, -383, -255, 233)
+#define WB_3_7   W_BIG(26, -383, -255, 233)
+
+#define IF(x, y, z)    ((((y) ^ (z)) & (x)) ^ (z))	/* bitwise select: y where x has a 1 bit, z elsewhere */
+#define MAJ(x, y, z)   (((x) & (y)) | (((x) | (y)) & (z)))	/* bitwise majority of x, y, z */
+
+#define PP4_0_0   1
+#define PP4_0_1   0
+#define PP4_0_2   3
+#define PP4_0_3   2
+#define PP4_1_0   2
+#define PP4_1_1   3
+#define PP4_1_2   0
+#define PP4_1_3   1
+#define PP4_2_0   3
+#define PP4_2_1   2
+#define PP4_2_2   1
+#define PP4_2_3   0
+
+#define PP8_0_0   1
+#define PP8_0_1   0
+#define PP8_0_2   3
+#define PP8_0_3   2
+#define PP8_0_4   5
+#define PP8_0_5   4
+#define PP8_0_6   7
+#define PP8_0_7   6
+
+#define PP8_1_0   6
+#define PP8_1_1   7
+#define PP8_1_2   4
+#define PP8_1_3   5
+#define PP8_1_4   2
+#define PP8_1_5   3
+#define PP8_1_6   0
+#define PP8_1_7   1
+
+#define PP8_2_0   2
+#define PP8_2_1   3
+#define PP8_2_2   0
+#define PP8_2_3   1
+#define PP8_2_4   6
+#define PP8_2_5   7
+#define PP8_2_6   4
+#define PP8_2_7   5
+
+#define PP8_3_0   3
+#define PP8_3_1   2
+#define PP8_3_2   1
+#define PP8_3_3   0
+#define PP8_3_4   7
+#define PP8_3_5   6
+#define PP8_3_6   5
+#define PP8_3_7   4
+
+#define PP8_4_0   5
+#define PP8_4_1   4
+#define PP8_4_2   7
+#define PP8_4_3   6
+#define PP8_4_4   1
+#define PP8_4_5   0
+#define PP8_4_6   3
+#define PP8_4_7   2
+
+#define PP8_5_0   7
+#define PP8_5_1   6
+#define PP8_5_2   5
+#define PP8_5_3   4
+#define PP8_5_4   3
+#define PP8_5_5   2
+#define PP8_5_6   1
+#define PP8_5_7   0
+
+#define PP8_6_0   4
+#define PP8_6_1   5
+#define PP8_6_2   6
+#define PP8_6_3   7
+#define PP8_6_4   0
+#define PP8_6_5   1
+#define PP8_6_6   2
+#define PP8_6_7   3
+
+#if SPH_SIMD_NOCOPY
+
+#define DECL_STATE_SMALL	/* under SPH_SIMD_NOCOPY all six state macros expand to nothing */
+#define READ_STATE_SMALL(sc)
+#define WRITE_STATE_SMALL(sc)
+#define DECL_STATE_BIG
+#define READ_STATE_BIG(sc)
+#define WRITE_STATE_BIG(sc)
+
+#else
+
+#define DECL_STATE_SMALL   \
+	u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;	/* local copies of the 16 state words */
+
+#define READ_STATE_SMALL(sc)   do { /* load (sc)->state[0..15] into A0..A3, B0..B3, C0..C3, D0..D3 */ \
+		A0 = (sc)->state[ 0]; \
+		A1 = (sc)->state[ 1]; \
+		A2 = (sc)->state[ 2]; \
+		A3 = (sc)->state[ 3]; \
+		B0 = (sc)->state[ 4]; \
+		B1 = (sc)->state[ 5]; \
+		B2 = (sc)->state[ 6]; \
+		B3 = (sc)->state[ 7]; \
+		C0 = (sc)->state[ 8]; \
+		C1 = (sc)->state[ 9]; \
+		C2 = (sc)->state[10]; \
+		C3 = (sc)->state[11]; \
+		D0 = (sc)->state[12]; \
+		D1 = (sc)->state[13]; \
+		D2 = (sc)->state[14]; \
+		D3 = (sc)->state[15]; \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { /* store A0..A3, B0..B3, C0..C3, D0..D3 back into (sc)->state[0..15] */ \
+		(sc)->state[ 0] = A0; \
+		(sc)->state[ 1] = A1; \
+		(sc)->state[ 2] = A2; \
+		(sc)->state[ 3] = A3; \
+		(sc)->state[ 4] = B0; \
+		(sc)->state[ 5] = B1; \
+		(sc)->state[ 6] = B2; \
+		(sc)->state[ 7] = B3; \
+		(sc)->state[ 8] = C0; \
+		(sc)->state[ 9] = C1; \
+		(sc)->state[10] = C2; \
+		(sc)->state[11] = C3; \
+		(sc)->state[12] = D0; \
+		(sc)->state[13] = D1; \
+		(sc)->state[14] = D2; \
+		(sc)->state[15] = D3; \
+	} while (0)
+
+#define DECL_STATE_BIG   /* local copies of the 32 state words */ \
+	u32 A0, A1, A2, A3, A4, A5, A6, A7; \
+	u32 B0, B1, B2, B3, B4, B5, B6, B7; \
+	u32 C0, C1, C2, C3, C4, C5, C6, C7; \
+	u32 D0, D1, D2, D3, D4, D5, D6, D7;
+
+#define READ_STATE_BIG(sc)   do { /* load (sc)->state[0..31] into A0..A7, B0..B7, C0..C7, D0..D7 */ \
+		A0 = (sc)->state[ 0]; \
+		A1 = (sc)->state[ 1]; \
+		A2 = (sc)->state[ 2]; \
+		A3 = (sc)->state[ 3]; \
+		A4 = (sc)->state[ 4]; \
+		A5 = (sc)->state[ 5]; \
+		A6 = (sc)->state[ 6]; \
+		A7 = (sc)->state[ 7]; \
+		B0 = (sc)->state[ 8]; \
+		B1 = (sc)->state[ 9]; \
+		B2 = (sc)->state[10]; \
+		B3 = (sc)->state[11]; \
+		B4 = (sc)->state[12]; \
+		B5 = (sc)->state[13]; \
+		B6 = (sc)->state[14]; \
+		B7 = (sc)->state[15]; \
+		C0 = (sc)->state[16]; \
+		C1 = (sc)->state[17]; \
+		C2 = (sc)->state[18]; \
+		C3 = (sc)->state[19]; \
+		C4 = (sc)->state[20]; \
+		C5 = (sc)->state[21]; \
+		C6 = (sc)->state[22]; \
+		C7 = (sc)->state[23]; \
+		D0 = (sc)->state[24]; \
+		D1 = (sc)->state[25]; \
+		D2 = (sc)->state[26]; \
+		D3 = (sc)->state[27]; \
+		D4 = (sc)->state[28]; \
+		D5 = (sc)->state[29]; \
+		D6 = (sc)->state[30]; \
+		D7 = (sc)->state[31]; \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store A0..A7, B0..B7, C0..C7, D0..D7 back into (sc)->state[0..31] */ \
+		(sc)->state[ 0] = A0; \
+		(sc)->state[ 1] = A1; \
+		(sc)->state[ 2] = A2; \
+		(sc)->state[ 3] = A3; \
+		(sc)->state[ 4] = A4; \
+		(sc)->state[ 5] = A5; \
+		(sc)->state[ 6] = A6; \
+		(sc)->state[ 7] = A7; \
+		(sc)->state[ 8] = B0; \
+		(sc)->state[ 9] = B1; \
+		(sc)->state[10] = B2; \
+		(sc)->state[11] = B3; \
+		(sc)->state[12] = B4; \
+		(sc)->state[13] = B5; \
+		(sc)->state[14] = B6; \
+		(sc)->state[15] = B7; \
+		(sc)->state[16] = C0; \
+		(sc)->state[17] = C1; \
+		(sc)->state[18] = C2; \
+		(sc)->state[19] = C3; \
+		(sc)->state[20] = C4; \
+		(sc)->state[21] = C5; \
+		(sc)->state[22] = C6; \
+		(sc)->state[23] = C7; \
+		(sc)->state[24] = D0; \
+		(sc)->state[25] = D1; \
+		(sc)->state[26] = D2; \
+		(sc)->state[27] = D3; \
+		(sc)->state[28] = D4; \
+		(sc)->state[29] = D5; \
+		(sc)->state[30] = D6; \
+		(sc)->state[31] = D7; \
+	} while (0)
+
+#endif
+
+#define STEP_ELT(n, w, fun, s, ppb)   do { /* one step on column n; tA<k> are the rotated A words set up by STEP_SMALL / STEP_BIG */ \
+		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
+		A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); /* ppb ## n is a PP4_ / PP8_ macro giving the column whose tA is added */ \
+		D ## n = C ## n; \
+		C ## n = B ## n; \
+		B ## n = tA ## n; \
+	} while (0)
+
+#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { /* rotate all four A words by r first, then run STEP_ELT on columns 0..3 */ \
+		u32 tA0 = ROL32(A0, r); \
+		u32 tA1 = ROL32(A1, r); \
+		u32 tA2 = ROL32(A2, r); \
+		u32 tA3 = ROL32(A3, r); \
+		STEP_ELT(0, w0, fun, s, pp4b); \
+		STEP_ELT(1, w1, fun, s, pp4b); \
+		STEP_ELT(2, w2, fun, s, pp4b); \
+		STEP_ELT(3, w3, fun, s, pp4b); \
+	} while (0)
+
+#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { /* rotate all eight A words by r first, then run STEP_ELT on columns 0..7 */ \
+		u32 tA0 = ROL32(A0, r); \
+		u32 tA1 = ROL32(A1, r); \
+		u32 tA2 = ROL32(A2, r); \
+		u32 tA3 = ROL32(A3, r); \
+		u32 tA4 = ROL32(A4, r); \
+		u32 tA5 = ROL32(A5, r); \
+		u32 tA6 = ROL32(A6, r); \
+		u32 tA7 = ROL32(A7, r); \
+		STEP_ELT(0, w0, fun, s, pp8b); \
+		STEP_ELT(1, w1, fun, s, pp8b); \
+		STEP_ELT(2, w2, fun, s, pp8b); \
+		STEP_ELT(3, w3, fun, s, pp8b); \
+		STEP_ELT(4, w4, fun, s, pp8b); \
+		STEP_ELT(5, w5, fun, s, pp8b); \
+		STEP_ELT(6, w6, fun, s, pp8b); \
+		STEP_ELT(7, w7, fun, s, pp8b); \
+	} while (0)
+
+#define M3_0_0   0_
+#define M3_1_0   1_
+#define M3_2_0   2_
+#define M3_3_0   0_
+#define M3_4_0   1_
+#define M3_5_0   2_
+#define M3_6_0   0_
+#define M3_7_0   1_
+
+#define M3_0_1   1_
+#define M3_1_1   2_
+#define M3_2_1   0_
+#define M3_3_1   1_
+#define M3_4_1   2_
+#define M3_5_1   0_
+#define M3_6_1   1_
+#define M3_7_1   2_
+
+#define M3_0_2   2_
+#define M3_1_2   0_
+#define M3_2_2   1_
+#define M3_3_2   2_
+#define M3_4_2   0_
+#define M3_5_2   1_
+#define M3_6_2   2_
+#define M3_7_2   0_
+
+#define STEP_SMALL_(w, fun, r, s, pp4b)   STEP_SMALL w, fun, r, s, pp4b)	/* w is a WS_x_y macro: it expands to "(w0, w1, w2, w3" and so opens the STEP_SMALL call */
+
+#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)   do { /* eight steps: four with IF, then four with MAJ; rotation pairs cycle through p0..p3 */ \
+		STEP_SMALL_(WS_ ## ri ## 0, \
+			IF,  p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 1, \
+			IF,  p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 2, \
+			IF,  p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 3, \
+			IF,  p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 4, \
+			MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 5, \
+			MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 6, \
+			MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
+		STEP_SMALL_(WS_ ## ri ## 7, \
+			MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
+	} while (0)
+
+#define M7_0_0   0_   /* M7_<k>_<isp> spells ((k + isp) mod 7) followed by '_'; this group is isp = 0 */
+#define M7_1_0   1_
+#define M7_2_0   2_
+#define M7_3_0   3_
+#define M7_4_0   4_
+#define M7_5_0   5_
+#define M7_6_0   6_
+#define M7_7_0   0_
+/* isp = 1 */
+#define M7_0_1   1_
+#define M7_1_1   2_
+#define M7_2_1   3_
+#define M7_3_1   4_
+#define M7_4_1   5_
+#define M7_5_1   6_
+#define M7_6_1   0_
+#define M7_7_1   1_
+/* isp = 2 */
+#define M7_0_2   2_
+#define M7_1_2   3_
+#define M7_2_2   4_
+#define M7_3_2   5_
+#define M7_4_2   6_
+#define M7_5_2   0_
+#define M7_6_2   1_
+#define M7_7_2   2_
+/* isp = 3 */
+#define M7_0_3   3_
+#define M7_1_3   4_
+#define M7_2_3   5_
+#define M7_3_3   6_
+#define M7_4_3   0_
+#define M7_5_3   1_
+#define M7_6_3   2_
+#define M7_7_3   3_
+
+#define STEP_BIG_(w, fun, r, s, pp8b)   STEP_BIG w, fun, r, s, pp8b)   /* no '(' on purpose: w presumably expands to an opening "(w0, ..., w7" list -- confirm against the WB_* definitions */
+
+#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)   do { /* eight steps: 0-3 use IF, 4-7 use MAJ */ \
+		STEP_BIG_(WB_ ## ri ## 0, /* step k takes its words from WB_<ri><k> */ \
+			IF,  p0, p1, XCAT(PP8_, M7_0_ ## isp)); /* last argument pastes to PP8_<(k + isp) mod 7>_ */ \
+		STEP_BIG_(WB_ ## ri ## 1, \
+			IF,  p1, p2, XCAT(PP8_, M7_1_ ## isp)); /* (r, s) walks (p0,p1) (p1,p2) (p2,p3) (p3,p0) */ \
+		STEP_BIG_(WB_ ## ri ## 2, \
+			IF,  p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 3, \
+			IF,  p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 4, \
+			MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); /* same (r, s) walk repeated for the MAJ half */ \
+		STEP_BIG_(WB_ ## ri ## 5, \
+			MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 6, \
+			MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
+		STEP_BIG_(WB_ ## ri ## 7, \
+			MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_SIMD
+
+#define A0   state[ 0]   /* lane names map onto whatever 'state' is in scope at the point of use */
+#define A1   state[ 1]
+#define A2   state[ 2]
+#define A3   state[ 3]
+#define B0   state[ 4]   /* B lanes: words 4..7 */
+#define B1   state[ 5]
+#define B2   state[ 6]
+#define B3   state[ 7]
+#define C0   state[ 8]   /* C lanes: words 8..11 */
+#define C1   state[ 9]
+#define C2   state[10]
+#define C3   state[11]
+#define D0   state[12]   /* D lanes: words 12..15 */
+#define D1   state[13]
+#define D2   state[14]
+#define D3   state[15]
+
+#define STEP2_ELT(n, w, fun, s, ppb)   do { /* update lane n; needs an array tA[] declared by the caller */ \
+		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
+		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); /* partner lane index is n XOR ppb (run-time value) */ \
+		D ## n = C ## n; /* then shift the lane: D <- C <- B <- tA[n] */ \
+		C ## n = B ## n; \
+		B ## n = tA[n]; \
+	} while (0)
+
+#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)   do { /* array-based counterpart of STEP_SMALL */ \
+		u32 tA[4]; /* rotated A lanes, indexed at run time by STEP2_ELT */ \
+		tA[0] = ROL32(A0, r); \
+		tA[1] = ROL32(A1, r); \
+		tA[2] = ROL32(A2, r); \
+		tA[3] = ROL32(A3, r); \
+		STEP2_ELT(0, w0, fun, s, pp4b); /* pp4b may be a run-time expression here */ \
+		STEP2_ELT(1, w1, fun, s, pp4b); \
+		STEP2_ELT(2, w2, fun, s, pp4b); \
+		STEP2_ELT(3, w3, fun, s, pp4b); \
+	} while (0)
+
+static void	/* one round on state[0..15]: four IF steps then four MAJ steps, four words of w[] per step */
+one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
+{
+	static const int xor_cycle[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };
+	const int *mask = xor_cycle + isp;	/* mask[k] is the lane XOR mask for step k */
+	STEP2_SMALL(w[0], w[1], w[2], w[3], IF, p0, p1, mask[0]);
+	STEP2_SMALL(w[4], w[5], w[6], w[7], IF, p1, p2, mask[1]);
+	STEP2_SMALL(w[8], w[9], w[10], w[11], IF, p2, p3, mask[2]);
+	STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, mask[3]);
+	STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, mask[4]);
+	STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, mask[5]);
+	STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, mask[6]);
+	STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, mask[7]);
+}
+
+static void
+compress_small(sph_simd_small_context *sc, int last)	/* variant built when SPH_SMALL_FOOTPRINT_SIMD is set */
+{
+	unsigned char *x;
+	s32 q[128];
+	int i;
+	u32 w[32];
+	u32 state[16];
+	size_t u;
+	/* wsp[k]: base index into q[] for word group k (every entry is a multiple of 8) */
+	static const size_t wsp[32] = {
+		 4 << 3,  6 << 3,  0 << 3,  2 << 3,
+		 7 << 3,  5 << 3,  3 << 3,  1 << 3,
+		15 << 3, 11 << 3, 12 << 3,  8 << 3,
+		 9 << 3, 13 << 3, 10 << 3, 14 << 3,
+		17 << 3, 18 << 3, 23 << 3, 20 << 3,
+		22 << 3, 21 << 3, 16 << 3, 19 << 3,
+		30 << 3, 24 << 3, 25 << 3, 31 << 3,
+		27 << 3, 29 << 3, 28 << 3, 26 << 3
+	};
+	/* x and q are not passed to FFT128; presumably the macro reads x and fills q by name -- confirm */
+	x = sc->buf;
+	FFT128(0, 1, 0, ll);
+	if (last) {	/* last picks the yoff_s_f offsets; otherwise yoff_s_n is used */
+		for (i = 0; i < 128; i ++) {
+			s32 tq;
+
+			tq = q[i] + yoff_s_f[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);	/* values above 128 are moved down by 257 */
+		}
+	} else {
+		for (i = 0; i < 128; i ++) {
+			s32 tq;
+
+			tq = q[i] + yoff_s_n[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);	/* values above 128 are moved down by 257 */
+		}
+	}
+	/* Working copy: chaining value XOR the first 64 buffer bytes, decoded with sph_dec32le_aligned */
+	for (i = 0; i < 16; i += 4) {
+		state[i + 0] = sc->state[i + 0]
+			^ sph_dec32le_aligned(x + 4 * (i + 0));
+		state[i + 1] = sc->state[i + 1]
+			^ sph_dec32le_aligned(x + 4 * (i + 1));
+		state[i + 2] = sc->state[i + 2]
+			^ sph_dec32le_aligned(x + 4 * (i + 2));
+		state[i + 3] = sc->state[i + 3]
+			^ sph_dec32le_aligned(x + 4 * (i + 3));
+	}
+	/* WSREAD fills w[0..31]: group g (4 words) comes from q[] at wsp[g + sb], offsets o1/o2, via INNER(.., .., mm) */
+#define WSREAD(sb, o1, o2, mm)   do { \
+		for (u = 0; u < 32; u += 4) { \
+			size_t v = wsp[(u >> 2) + (sb)]; \
+			w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
+				q[v + 2 * 0 + (o2)], mm); \
+			w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
+				q[v + 2 * 1 + (o2)], mm); \
+			w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
+				q[v + 2 * 2 + (o2)], mm); \
+			w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
+				q[v + 2 * 3 + (o2)], mm); \
+		} \
+	} while (0)
+	/* Four rounds; each one first loads its own 32 words into w[] */
+	WSREAD( 0,    0,    1, 185);
+	one_round_small(state, w, 0,  3, 23, 17, 27);
+	WSREAD( 8,    0,    1, 185);
+	one_round_small(state, w, 2, 28, 19, 22,  7);
+	WSREAD(16, -128,  -64, 233);	/* negative offsets are safe: wsp[16..31] are all >= 128 */
+	one_round_small(state, w, 1, 29,  9, 15,  5);
+	WSREAD(24, -191, -127, 233);	/* wsp[24..31] are all >= 192 */
+	one_round_small(state, w, 0,  4, 13, 10, 25);
+
+#undef WSREAD
+	/* Four closing steps use the incoming chaining value (sc->state is still unmodified) as the words */
+	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
+		IF,  4, 13, PP4_2_);
+	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
+		IF, 13, 10, PP4_0_);
+	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
+		IF, 10, 25, PP4_1_);
+	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
+		IF, 25,  4, PP4_2_);
+	/* Store the result as the new chaining value */
+	memcpy(sc->state, state, sizeof state);
+}
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+
+#else
+
+#if SPH_SIMD_NOCOPY
+#define A0   (sc->state[ 0])   /* lane names alias the context words directly: steps update sc->state in place */
+#define A1   (sc->state[ 1])
+#define A2   (sc->state[ 2])
+#define A3   (sc->state[ 3])
+#define B0   (sc->state[ 4])   /* B lanes: words 4..7 */
+#define B1   (sc->state[ 5])
+#define B2   (sc->state[ 6])
+#define B3   (sc->state[ 7])
+#define C0   (sc->state[ 8])   /* C lanes: words 8..11 */
+#define C1   (sc->state[ 9])
+#define C2   (sc->state[10])
+#define C3   (sc->state[11])
+#define D0   (sc->state[12])   /* D lanes: words 12..15 */
+#define D1   (sc->state[13])
+#define D2   (sc->state[14])
+#define D3   (sc->state[15])
+#endif
+
+static void
+compress_small(sph_simd_small_context *sc, int last)	/* variant built when SPH_SMALL_FOOTPRINT_SIMD is off */
+{
+	unsigned char *x;
+	s32 q[128];
+	int i;
+	DECL_STATE_SMALL	/* defined elsewhere; presumably declares local A0..D3 in the non-NOCOPY build -- confirm */
+#if SPH_SIMD_NOCOPY
+	sph_u32 saved[16];
+#endif
+	/* NOCOPY: A0..D3 alias sc->state, so the incoming chaining value is kept in saved[] for the closing steps */
+#if SPH_SIMD_NOCOPY
+	memcpy(saved, sc->state, sizeof saved);
+#endif
+	x = sc->buf;	/* x and q are not passed to FFT128; presumably used by name inside the macro -- confirm */
+	FFT128(0, 1, 0, ll);
+	if (last) {	/* last picks the yoff_s_f offsets; otherwise yoff_s_n is used */
+		for (i = 0; i < 128; i ++) {
+			s32 tq;
+
+			tq = q[i] + yoff_s_f[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);	/* values above 128 are moved down by 257 */
+		}
+	} else {
+		for (i = 0; i < 128; i ++) {
+			s32 tq;
+
+			tq = q[i] + yoff_s_n[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);	/* values above 128 are moved down by 257 */
+		}
+	}
+	READ_STATE_SMALL(sc);	/* defined elsewhere; presumably loads A0..D3 from sc->state -- confirm */
+	A0 ^= sph_dec32le_aligned(x +  0);	/* XOR the first 64 buffer bytes into the 16 lanes, 4 bytes each */
+	A1 ^= sph_dec32le_aligned(x +  4);
+	A2 ^= sph_dec32le_aligned(x +  8);
+	A3 ^= sph_dec32le_aligned(x + 12);
+	B0 ^= sph_dec32le_aligned(x + 16);
+	B1 ^= sph_dec32le_aligned(x + 20);
+	B2 ^= sph_dec32le_aligned(x + 24);
+	B3 ^= sph_dec32le_aligned(x + 28);
+	C0 ^= sph_dec32le_aligned(x + 32);
+	C1 ^= sph_dec32le_aligned(x + 36);
+	C2 ^= sph_dec32le_aligned(x + 40);
+	C3 ^= sph_dec32le_aligned(x + 44);
+	D0 ^= sph_dec32le_aligned(x + 48);
+	D1 ^= sph_dec32le_aligned(x + 52);
+	D2 ^= sph_dec32le_aligned(x + 56);
+	D3 ^= sph_dec32le_aligned(x + 60);
+	ONE_ROUND_SMALL(0_, 0,  3, 23, 17, 27);	/* arguments: round tag ri, isp, then p0..p3 */
+	ONE_ROUND_SMALL(1_, 2, 28, 19, 22,  7);
+	ONE_ROUND_SMALL(2_, 1, 29,  9, 15,  5);
+	ONE_ROUND_SMALL(3_, 0,  4, 13, 10, 25);
+#if SPH_SIMD_NOCOPY
+	STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],	/* closing steps: words are the saved incoming state */
+		IF,  4, 13, PP4_2_);
+	STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
+		IF, 13, 10, PP4_0_);
+	STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
+		IF, 10, 25, PP4_1_);
+	STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
+		IF, 25,  4, PP4_2_);
+#else
+	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],	/* closing steps: words are sc->state, not yet rewritten */
+		IF,  4, 13, PP4_2_);
+	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
+		IF, 13, 10, PP4_0_);
+	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
+		IF, 10, 25, PP4_1_);
+	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
+		IF, 25,  4, PP4_2_);
+	WRITE_STATE_SMALL(sc);	/* defined elsewhere; presumably stores A0..D3 back into sc->state -- confirm */
+#endif
+}
+
+#if SPH_SIMD_NOCOPY
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+#endif
+
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SIMD
+
+#define A0   state[ 0]   /* lane names map onto whatever 'state' is in scope at the point of use */
+#define A1   state[ 1]
+#define A2   state[ 2]
+#define A3   state[ 3]
+#define A4   state[ 4]
+#define A5   state[ 5]
+#define A6   state[ 6]
+#define A7   state[ 7]
+#define B0   state[ 8]   /* B lanes: words 8..15 */
+#define B1   state[ 9]
+#define B2   state[10]
+#define B3   state[11]
+#define B4   state[12]
+#define B5   state[13]
+#define B6   state[14]
+#define B7   state[15]
+#define C0   state[16]   /* C lanes: words 16..23 */
+#define C1   state[17]
+#define C2   state[18]
+#define C3   state[19]
+#define C4   state[20]
+#define C5   state[21]
+#define C6   state[22]
+#define C7   state[23]
+#define D0   state[24]   /* D lanes: words 24..31 */
+#define D1   state[25]
+#define D2   state[26]
+#define D3   state[27]
+#define D4   state[28]
+#define D5   state[29]
+#define D6   state[30]
+#define D7   state[31]
+
+/*
+ * Not needed -- already defined for SIMD-224 / SIMD-256
+ *
+#define STEP2_ELT(n, w, fun, s, ppb)   do { \
+		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
+		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
+		D ## n = C ## n; \
+		C ## n = B ## n; \
+		B ## n = tA[n]; \
+	} while (0)
+ */
+
+#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { /* array-based counterpart of STEP_BIG */ \
+		u32 tA[8]; /* rotated A lanes, indexed at run time by STEP2_ELT */ \
+		tA[0] = ROL32(A0, r); \
+		tA[1] = ROL32(A1, r); \
+		tA[2] = ROL32(A2, r); \
+		tA[3] = ROL32(A3, r); \
+		tA[4] = ROL32(A4, r); \
+		tA[5] = ROL32(A5, r); \
+		tA[6] = ROL32(A6, r); \
+		tA[7] = ROL32(A7, r); \
+		STEP2_ELT(0, w0, fun, s, pp8b); /* pp8b may be a run-time expression here */ \
+		STEP2_ELT(1, w1, fun, s, pp8b); \
+		STEP2_ELT(2, w2, fun, s, pp8b); \
+		STEP2_ELT(3, w3, fun, s, pp8b); \
+		STEP2_ELT(4, w4, fun, s, pp8b); \
+		STEP2_ELT(5, w5, fun, s, pp8b); \
+		STEP2_ELT(6, w6, fun, s, pp8b); \
+		STEP2_ELT(7, w7, fun, s, pp8b); \
+	} while (0)
+
+static void	/* one round on state[0..31]: four IF steps then four MAJ steps, eight words of w[] per step */
+one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
+{
+	static const int xor_cycle[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };
+	const int *mask = xor_cycle + isp;	/* mask[k] is the lane XOR mask for step k */
+	STEP2_BIG(w[0], w[1], w[2], w[3], w[4], w[5], w[6], w[7],
+		IF, p0, p1, mask[0]);
+	STEP2_BIG(w[8], w[9], w[10], w[11], w[12], w[13], w[14], w[15],
+		IF, p1, p2, mask[1]);
+	STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
+		IF, p2, p3, mask[2]);
+	STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
+		IF, p3, p0, mask[3]);
+	STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
+		MAJ, p0, p1, mask[4]);
+	STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
+		MAJ, p1, p2, mask[5]);
+	STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
+		MAJ, p2, p3, mask[6]);
+	STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
+		MAJ, p3, p0, mask[7]);
+}
+
+static void
+compress_big(sph_simd_big_context *sc, int last)	/* variant built when SPH_SMALL_FOOTPRINT_SIMD is set */
+{
+	unsigned char *x;
+	s32 q[256];
+	int i;
+	u32 w[64];
+	u32 state[32];
+	size_t u;
+	/* wbp[k]: base index into q[] for word group k (every entry is a multiple of 16) */
+	static const size_t wbp[32] = {
+		 4 << 4,  6 << 4,  0 << 4,  2 << 4,
+		 7 << 4,  5 << 4,  3 << 4,  1 << 4,
+		15 << 4, 11 << 4, 12 << 4,  8 << 4,
+		 9 << 4, 13 << 4, 10 << 4, 14 << 4,
+		17 << 4, 18 << 4, 23 << 4, 20 << 4,
+		22 << 4, 21 << 4, 16 << 4, 19 << 4,
+		30 << 4, 24 << 4, 25 << 4, 31 << 4,
+		27 << 4, 29 << 4, 28 << 4, 26 << 4
+	};
+	/* x and q are not passed to FFT256; presumably the macro reads x and fills q by name -- confirm */
+	x = sc->buf;
+	FFT256(0, 1, 0, ll);
+	if (last) {	/* last picks the yoff_b_f offsets; otherwise yoff_b_n is used */
+		for (i = 0; i < 256; i ++) {
+			s32 tq;
+
+			tq = q[i] + yoff_b_f[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);	/* values above 128 are moved down by 257 */
+		}
+	} else {
+		for (i = 0; i < 256; i ++) {
+			s32 tq;
+
+			tq = q[i] + yoff_b_n[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);	/* values above 128 are moved down by 257 */
+		}
+	}
+	/* Working copy: chaining value XOR the first 128 buffer bytes, decoded with sph_dec32le_aligned */
+	for (i = 0; i < 32; i += 8) {
+		state[i + 0] = sc->state[i + 0]
+			^ sph_dec32le_aligned(x + 4 * (i + 0));
+		state[i + 1] = sc->state[i + 1]
+			^ sph_dec32le_aligned(x + 4 * (i + 1));
+		state[i + 2] = sc->state[i + 2]
+			^ sph_dec32le_aligned(x + 4 * (i + 2));
+		state[i + 3] = sc->state[i + 3]
+			^ sph_dec32le_aligned(x + 4 * (i + 3));
+		state[i + 4] = sc->state[i + 4]
+			^ sph_dec32le_aligned(x + 4 * (i + 4));
+		state[i + 5] = sc->state[i + 5]
+			^ sph_dec32le_aligned(x + 4 * (i + 5));
+		state[i + 6] = sc->state[i + 6]
+			^ sph_dec32le_aligned(x + 4 * (i + 6));
+		state[i + 7] = sc->state[i + 7]
+			^ sph_dec32le_aligned(x + 4 * (i + 7));
+	}
+	/* WBREAD fills w[0..63]: group g (8 words) comes from q[] at wbp[g + sb], offsets o1/o2, via INNER(.., .., mm) */
+#define WBREAD(sb, o1, o2, mm)   do { \
+		for (u = 0; u < 64; u += 8) { \
+			size_t v = wbp[(u >> 3) + (sb)]; \
+			w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
+				q[v + 2 * 0 + (o2)], mm); \
+			w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
+				q[v + 2 * 1 + (o2)], mm); \
+			w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
+				q[v + 2 * 2 + (o2)], mm); \
+			w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
+				q[v + 2 * 3 + (o2)], mm); \
+			w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
+				q[v + 2 * 4 + (o2)], mm); \
+			w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
+				q[v + 2 * 5 + (o2)], mm); \
+			w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
+				q[v + 2 * 6 + (o2)], mm); \
+			w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
+				q[v + 2 * 7 + (o2)], mm); \
+		} \
+	} while (0)
+	/* Four rounds; each one first loads its own 64 words into w[] */
+	WBREAD( 0,    0,    1, 185);
+	one_round_big(state, w, 0,  3, 23, 17, 27);
+	WBREAD( 8,    0,    1, 185);
+	one_round_big(state, w, 1, 28, 19, 22,  7);
+	WBREAD(16, -256, -128, 233);	/* negative offsets are safe: wbp[16..31] are all >= 256 */
+	one_round_big(state, w, 2, 29,  9, 15,  5);
+	WBREAD(24, -383, -255, 233);	/* wbp[24..31] are all >= 384 */
+	one_round_big(state, w, 3,  4, 13, 10, 25);
+
+#undef WBREAD
+	/* Four closing steps use the incoming chaining value (sc->state is still unmodified) as the words */
+	STEP_BIG(
+		sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
+		sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
+		IF,  4, 13, PP8_4_);
+	STEP_BIG(
+		sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
+		sc->state[12], sc->state[13], sc->state[14], sc->state[15],
+		IF, 13, 10, PP8_5_);
+	STEP_BIG(
+		sc->state[16], sc->state[17], sc->state[18], sc->state[19],
+		sc->state[20], sc->state[21], sc->state[22], sc->state[23],
+		IF, 10, 25, PP8_6_);
+	STEP_BIG(
+		sc->state[24], sc->state[25], sc->state[26], sc->state[27],
+		sc->state[28], sc->state[29], sc->state[30], sc->state[31],
+		IF, 25,  4, PP8_0_);
+	/* Store the result as the new chaining value */
+	memcpy(sc->state, state, sizeof state);
+}
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+#undef A5
+#undef A6
+#undef A7
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef B4
+#undef B5
+#undef B6
+#undef B7
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+#undef D4
+#undef D5
+#undef D6
+#undef D7
+
+#else
+
+#if SPH_SIMD_NOCOPY
+#define A0   (sc->state[ 0])   /* lane names alias the context words directly: steps update sc->state in place */
+#define A1   (sc->state[ 1])
+#define A2   (sc->state[ 2])
+#define A3   (sc->state[ 3])
+#define A4   (sc->state[ 4])
+#define A5   (sc->state[ 5])
+#define A6   (sc->state[ 6])
+#define A7   (sc->state[ 7])
+#define B0   (sc->state[ 8])   /* B lanes: words 8..15 */
+#define B1   (sc->state[ 9])
+#define B2   (sc->state[10])
+#define B3   (sc->state[11])
+#define B4   (sc->state[12])
+#define B5   (sc->state[13])
+#define B6   (sc->state[14])
+#define B7   (sc->state[15])
+#define C0   (sc->state[16])   /* C lanes: words 16..23 */
+#define C1   (sc->state[17])
+#define C2   (sc->state[18])
+#define C3   (sc->state[19])
+#define C4   (sc->state[20])
+#define C5   (sc->state[21])
+#define C6   (sc->state[22])
+#define C7   (sc->state[23])
+#define D0   (sc->state[24])   /* D lanes: words 24..31 */
+#define D1   (sc->state[25])
+#define D2   (sc->state[26])
+#define D3   (sc->state[27])
+#define D4   (sc->state[28])
+#define D5   (sc->state[29])
+#define D6   (sc->state[30])
+#define D7   (sc->state[31])
+#endif
+
+static void
+compress_big(sph_simd_big_context *sc, int last)	/* variant built when SPH_SMALL_FOOTPRINT_SIMD is off */
+{
+	unsigned char *x;
+	s32 q[256];
+	int i;
+	DECL_STATE_BIG	/* defined elsewhere; presumably declares local A0..D7 in the non-NOCOPY build -- confirm */
+#if SPH_SIMD_NOCOPY
+	sph_u32 saved[32];
+#endif
+	/* NOCOPY: A0..D7 alias sc->state, so the incoming chaining value is kept in saved[] for the closing steps */
+#if SPH_SIMD_NOCOPY
+	memcpy(saved, sc->state, sizeof saved);
+#endif
+	/* x and q are not passed to FFT256; presumably the macro reads x and fills q by name -- confirm */
+	x = sc->buf;
+	FFT256(0, 1, 0, ll);
+	if (last) {	/* last picks the yoff_b_f offsets; otherwise yoff_b_n is used */
+		for (i = 0; i < 256; i ++) {
+			s32 tq;
+
+			tq = q[i] + yoff_b_f[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);	/* values above 128 are moved down by 257 */
+		}
+	} else {
+		for (i = 0; i < 256; i ++) {
+			s32 tq;
+
+			tq = q[i] + yoff_b_n[i];
+			tq = REDS2(tq);
+			tq = REDS1(tq);
+			tq = REDS1(tq);
+			q[i] = (tq <= 128 ? tq : tq - 257);	/* values above 128 are moved down by 257 */
+		}
+	}
+	READ_STATE_BIG(sc);	/* defined elsewhere; presumably loads A0..D7 from sc->state -- confirm */
+	A0 ^= sph_dec32le_aligned(x +   0);	/* XOR the first 128 buffer bytes into the 32 lanes, 4 bytes each */
+	A1 ^= sph_dec32le_aligned(x +   4);
+	A2 ^= sph_dec32le_aligned(x +   8);
+	A3 ^= sph_dec32le_aligned(x +  12);
+	A4 ^= sph_dec32le_aligned(x +  16);
+	A5 ^= sph_dec32le_aligned(x +  20);
+	A6 ^= sph_dec32le_aligned(x +  24);
+	A7 ^= sph_dec32le_aligned(x +  28);
+	B0 ^= sph_dec32le_aligned(x +  32);
+	B1 ^= sph_dec32le_aligned(x +  36);
+	B2 ^= sph_dec32le_aligned(x +  40);
+	B3 ^= sph_dec32le_aligned(x +  44);
+	B4 ^= sph_dec32le_aligned(x +  48);
+	B5 ^= sph_dec32le_aligned(x +  52);
+	B6 ^= sph_dec32le_aligned(x +  56);
+	B7 ^= sph_dec32le_aligned(x +  60);
+	C0 ^= sph_dec32le_aligned(x +  64);
+	C1 ^= sph_dec32le_aligned(x +  68);
+	C2 ^= sph_dec32le_aligned(x +  72);
+	C3 ^= sph_dec32le_aligned(x +  76);
+	C4 ^= sph_dec32le_aligned(x +  80);
+	C5 ^= sph_dec32le_aligned(x +  84);
+	C6 ^= sph_dec32le_aligned(x +  88);
+	C7 ^= sph_dec32le_aligned(x +  92);
+	D0 ^= sph_dec32le_aligned(x +  96);
+	D1 ^= sph_dec32le_aligned(x + 100);
+	D2 ^= sph_dec32le_aligned(x + 104);
+	D3 ^= sph_dec32le_aligned(x + 108);
+	D4 ^= sph_dec32le_aligned(x + 112);
+	D5 ^= sph_dec32le_aligned(x + 116);
+	D6 ^= sph_dec32le_aligned(x + 120);
+	D7 ^= sph_dec32le_aligned(x + 124);
+	/* Four rounds; arguments are the round tag ri, isp, then p0..p3 */
+	ONE_ROUND_BIG(0_, 0,  3, 23, 17, 27);
+	ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
+	ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
+	ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
+#if SPH_SIMD_NOCOPY
+	STEP_BIG(	/* closing steps: words are the saved incoming state */
+		saved[ 0], saved[ 1], saved[ 2], saved[ 3],
+		saved[ 4], saved[ 5], saved[ 6], saved[ 7],
+		IF,  4, 13, PP8_4_);
+	STEP_BIG(
+		saved[ 8], saved[ 9], saved[10], saved[11],
+		saved[12], saved[13], saved[14], saved[15],
+		IF, 13, 10, PP8_5_);
+	STEP_BIG(
+		saved[16], saved[17], saved[18], saved[19],
+		saved[20], saved[21], saved[22], saved[23],
+		IF, 10, 25, PP8_6_);
+	STEP_BIG(
+		saved[24], saved[25], saved[26], saved[27],
+		saved[28], saved[29], saved[30], saved[31],
+		IF, 25,  4, PP8_0_);
+#else
+	STEP_BIG(	/* closing steps: words are sc->state, not yet rewritten */
+		sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
+		sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
+		IF,  4, 13, PP8_4_);
+	STEP_BIG(
+		sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
+		sc->state[12], sc->state[13], sc->state[14], sc->state[15],
+		IF, 13, 10, PP8_5_);
+	STEP_BIG(
+		sc->state[16], sc->state[17], sc->state[18], sc->state[19],
+		sc->state[20], sc->state[21], sc->state[22], sc->state[23],
+		IF, 10, 25, PP8_6_);
+	STEP_BIG(
+		sc->state[24], sc->state[25], sc->state[26], sc->state[27],
+		sc->state[28], sc->state[29], sc->state[30], sc->state[31],
+		IF, 25,  4, PP8_0_);
+	WRITE_STATE_BIG(sc);	/* defined elsewhere; presumably stores A0..D7 back into sc->state -- confirm */
+#endif
+}
+
+#if SPH_SIMD_NOCOPY
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+#undef A5
+#undef A6
+#undef A7
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef B4
+#undef B5
+#undef B6
+#undef B7
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+#undef D4
+#undef D5
+#undef D6
+#undef D7
+#endif
+
+#endif
+
+static const u32 IV224[] = {	/* 16-word starting state that sph_simd224_init hands to init_small */
+	C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
+	C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
+	C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
+	C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
+};
+
+static const u32 IV256[] = {	/* 16-word starting state that sph_simd256_init hands to init_small */
+	C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
+	C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
+	C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
+	C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
+};
+
+static const u32 IV384[] = {	/* 32-word starting state that sph_simd384_init hands to init_big */
+	C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
+	C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
+	C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
+	C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
+	C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
+	C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
+	C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
+	C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
+};
+
+static const u32 IV512[] = {	/* 32-word starting state that sph_simd512_init hands to init_big */
+	C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
+	C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
+	C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
+	C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
+	C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
+	C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
+	C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
+	C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
+};
+
+static void
+init_small(void *cc, const u32 *iv)
+{
+	sph_simd_small_context *ctx = cc;
+	/* Fresh context: state copied from iv, nothing buffered, zero blocks counted. */
+	memcpy(ctx->state, iv, sizeof ctx->state);
+	ctx->ptr = 0;
+	ctx->count_high = 0;
+	ctx->count_low = 0;
+}
+
+static void
+init_big(void *cc, const u32 *iv)
+{
+	sph_simd_big_context *ctx = cc;
+	/* Fresh context: state copied from iv, nothing buffered, zero blocks counted. */
+	memcpy(ctx->state, iv, sizeof ctx->state);
+	ctx->ptr = 0;
+	ctx->count_high = 0;
+	ctx->count_low = 0;
+}
+
+static void
+update_small(void *cc, const void *data, size_t len)
+{
+	sph_simd_small_context *ctx = cc;
+	const unsigned char *src = data;
+
+	/* Top up the block buffer; each time it fills, compress it and count one block. */
+	while (len > 0) {
+		size_t room = (sizeof ctx->buf) - ctx->ptr;
+		size_t take = (len < room) ? len : room;
+
+		memcpy(ctx->buf + ctx->ptr, src, take);
+		src += take;
+		len -= take;
+		ctx->ptr += take;
+		if (ctx->ptr != sizeof ctx->buf)
+			continue;
+		compress_small(ctx, 0);
+		ctx->ptr = 0;
+		ctx->count_low = T32(ctx->count_low + 1);
+		if (ctx->count_low == 0)
+			ctx->count_high ++;
+	}
+}
+
+static void
+update_big(void *cc, const void *data, size_t len)
+{
+	sph_simd_big_context *ctx = cc;
+	const unsigned char *src = data;
+
+	/* Top up the block buffer; each time it fills, compress it and count one block. */
+	while (len > 0) {
+		size_t room = (sizeof ctx->buf) - ctx->ptr;
+		size_t take = (len < room) ? len : room;
+
+		memcpy(ctx->buf + ctx->ptr, src, take);
+		src += take;
+		len -= take;
+		ctx->ptr += take;
+		if (ctx->ptr != sizeof ctx->buf)
+			continue;
+		compress_big(ctx, 0);
+		ctx->ptr = 0;
+		ctx->count_low = T32(ctx->count_low + 1);
+		if (ctx->count_low == 0)
+			ctx->count_high ++;
+	}
+}
+
+static void	/* writes the message length in bits to dst[0..7]: low word at dst, high word at dst + 4 */
+encode_count_small(unsigned char *dst,
+	u32 low, u32 high, size_t ptr, unsigned n)	/* low/high: count of 512-bit blocks; ptr: buffered bytes; n: extra bits */
+{
+	high = T32(high << 9) + (low >> 23);	/* top 9 bits of low carry into high: read them BEFORE low is shifted */
+	low = T32(low << 9);	/* low word of blocks * 512 */
+	low += (ptr << 3) + n;	/* add the partial block: ptr bytes plus n bits */
+	sph_enc32le(dst, low);
+	sph_enc32le(dst + 4, high);
+}
+
+static void	/* writes the message length in bits to dst[0..7]: low word at dst, high word at dst + 4 */
+encode_count_big(unsigned char *dst,
+	u32 low, u32 high, size_t ptr, unsigned n)	/* low/high: count of 1024-bit blocks; ptr: buffered bytes; n: extra bits */
+{
+	high = T32(high << 10) + (low >> 22);	/* top 10 bits of low carry into high: read them BEFORE low is shifted */
+	low = T32(low << 10);	/* low word of blocks * 1024 */
+	low += (ptr << 3) + n;	/* add the partial block: ptr bytes plus n bits */
+	sph_enc32le(dst, low);
+	sph_enc32le(dst + 4, high);
+}
+
+static void
+finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
+{
+	sph_simd_small_context *ctx = cc;
+	unsigned char *out = dst;
+	size_t word;
+
+	/* Pending bytes or extra bits: zero-pad the block, keep the top n bits of ub, compress it. */
+	if (ctx->ptr > 0 || n > 0) {
+		memset(ctx->buf + ctx->ptr, 0,
+			(sizeof ctx->buf) - ctx->ptr);
+		ctx->buf[ctx->ptr] = ub & (0xFF << (8 - n));
+		compress_small(ctx, 0);
+	}
+	/* Closing block holds only the encoded length; dst_len is the number of 32-bit words to emit. */
+	memset(ctx->buf, 0, sizeof ctx->buf);
+	encode_count_small(ctx->buf, ctx->count_low, ctx->count_high, ctx->ptr, n);
+	compress_small(ctx, 1);
+	for (word = 0; word < dst_len; word ++)
+		sph_enc32le(out + (word << 2), ctx->state[word]);
+}
+
+static void
+finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
+{
+	sph_simd_big_context *ctx = cc;
+	unsigned char *out = dst;
+	size_t word;
+
+	/* Pending bytes or extra bits: zero-pad the block, keep the top n bits of ub, compress it. */
+	if (ctx->ptr > 0 || n > 0) {
+		memset(ctx->buf + ctx->ptr, 0,
+			(sizeof ctx->buf) - ctx->ptr);
+		ctx->buf[ctx->ptr] = ub & (0xFF << (8 - n));
+		compress_big(ctx, 0);
+	}
+	/* Closing block holds only the encoded length; dst_len is the number of 32-bit words to emit. */
+	memset(ctx->buf, 0, sizeof ctx->buf);
+	encode_count_big(ctx->buf, ctx->count_low, ctx->count_high, ctx->ptr, n);
+	compress_big(ctx, 1);
+	for (word = 0; word < dst_len; word ++)
+		sph_enc32le(out + (word << 2), ctx->state[word]);
+}
+
+void
+sph_simd224_init(void *cc)
+{
+	init_small(cc, IV224);	/* cc is treated as a sph_simd_small_context and reset to IV224 */
+}
+
+void
+sph_simd224(void *cc, const void *data, size_t len)
+{
+	update_small(cc, data, len);	/* absorb len bytes; shares the small-context update with SIMD-256 */
+}
+
+void
+sph_simd224_close(void *cc, void *dst)
+{
+	sph_simd224_addbits_and_close(cc, 0, 0, dst);	/* close with no extra bits */
+}
+
+void
+sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	finalize_small(cc, ub, n, dst, 7);	/* 7 state words -> 28 output bytes */
+	sph_simd224_init(cc);	/* context is reset so it can hash another message */
+}
+
+void
+sph_simd256_init(void *cc)
+{
+	init_small(cc, IV256);	/* cc is treated as a sph_simd_small_context and reset to IV256 */
+}
+
+void
+sph_simd256(void *cc, const void *data, size_t len)
+{
+	update_small(cc, data, len);	/* absorb len bytes; shares the small-context update with SIMD-224 */
+}
+
+void
+sph_simd256_close(void *cc, void *dst)
+{
+	sph_simd256_addbits_and_close(cc, 0, 0, dst);	/* close with no extra bits */
+}
+
+void
+sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	finalize_small(cc, ub, n, dst, 8);	/* 8 state words -> 32 output bytes */
+	sph_simd256_init(cc);	/* context is reset so it can hash another message */
+}
+
+void
+sph_simd384_init(void *cc)
+{
+	init_big(cc, IV384);	/* cc is treated as a sph_simd_big_context and reset to IV384 */
+}
+
+void
+sph_simd384(void *cc, const void *data, size_t len)
+{
+	update_big(cc, data, len);	/* absorb len bytes; shares the big-context update with SIMD-512 */
+}
+
+void
+sph_simd384_close(void *cc, void *dst)
+{
+	sph_simd384_addbits_and_close(cc, 0, 0, dst);	/* close with no extra bits */
+}
+
+void
+sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	finalize_big(cc, ub, n, dst, 12);	/* 12 state words -> 48 output bytes */
+	sph_simd384_init(cc);	/* context is reset so it can hash another message */
+}
+
+void
+sph_simd512_init(void *cc)
+{
+	init_big(cc, IV512);	/* cc is treated as a sph_simd_big_context and reset to IV512 */
+}
+
+void
+sph_simd512(void *cc, const void *data, size_t len)
+{
+	update_big(cc, data, len);	/* absorb len bytes; shares the big-context update with SIMD-384 */
+}
+
+void
+sph_simd512_close(void *cc, void *dst)
+{
+	sph_simd512_addbits_and_close(cc, 0, 0, dst);	/* close with no extra bits */
+}
+
+void
+sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	finalize_big(cc, ub, n, dst, 16);	/* 16 state words -> 64 output bytes */
+	sph_simd512_init(cc);	/* context is reset so it can hash another message */
+}
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/sha3/skein.c b/sha3/skein.c
new file mode 100644
index 0000000..7e47e35
--- /dev/null
+++ b/sha3/skein.c
@@ -0,0 +1,1254 @@
+/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */
+/*
+ * Skein implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_skein.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN	/* global small-footprint switch applies unless the Skein-specific one is already defined */
+#define SPH_SMALL_FOOTPRINT_SKEIN   1
+#endif
+
+#ifdef _MSC_VER	/* MSVC only: silence C4146 (unary minus operator applied to unsigned type) */
+#pragma warning (disable: 4146)
+#endif
+
+#if SPH_64
+
+#if 0
+/* obsolete */
+/*
+ * M5_ ## s ## _ ## i  evaluates to s+i mod 5 (0 <= s <= 18, 0 <= i <= 3).
+ */
+
+#define M5_0_0    0
+#define M5_0_1    1
+#define M5_0_2    2
+#define M5_0_3    3
+
+#define M5_1_0    1
+#define M5_1_1    2
+#define M5_1_2    3
+#define M5_1_3    4
+
+#define M5_2_0    2
+#define M5_2_1    3
+#define M5_2_2    4
+#define M5_2_3    0
+
+#define M5_3_0    3
+#define M5_3_1    4
+#define M5_3_2    0
+#define M5_3_3    1
+
+#define M5_4_0    4
+#define M5_4_1    0
+#define M5_4_2    1
+#define M5_4_3    2
+
+#define M5_5_0    0
+#define M5_5_1    1
+#define M5_5_2    2
+#define M5_5_3    3
+
+#define M5_6_0    1
+#define M5_6_1    2
+#define M5_6_2    3
+#define M5_6_3    4
+
+#define M5_7_0    2
+#define M5_7_1    3
+#define M5_7_2    4
+#define M5_7_3    0
+
+#define M5_8_0    3
+#define M5_8_1    4
+#define M5_8_2    0
+#define M5_8_3    1
+
+#define M5_9_0    4
+#define M5_9_1    0
+#define M5_9_2    1
+#define M5_9_3    2
+
+#define M5_10_0   0
+#define M5_10_1   1
+#define M5_10_2   2
+#define M5_10_3   3
+
+#define M5_11_0   1
+#define M5_11_1   2
+#define M5_11_2   3
+#define M5_11_3   4
+
+#define M5_12_0   2
+#define M5_12_1   3
+#define M5_12_2   4
+#define M5_12_3   0
+
+#define M5_13_0   3
+#define M5_13_1   4
+#define M5_13_2   0
+#define M5_13_3   1
+
+#define M5_14_0   4
+#define M5_14_1   0
+#define M5_14_2   1
+#define M5_14_3   2
+
+#define M5_15_0   0
+#define M5_15_1   1
+#define M5_15_2   2
+#define M5_15_3   3
+
+#define M5_16_0   1
+#define M5_16_1   2
+#define M5_16_2   3
+#define M5_16_3   4
+
+#define M5_17_0   2
+#define M5_17_1   3
+#define M5_17_2   4
+#define M5_17_3   0
+
+#define M5_18_0   3
+#define M5_18_1   4
+#define M5_18_2   0
+#define M5_18_3   1
+#endif
+
+/*
+ * M9_ ## s ## _ ## i  evaluates to (s + i) mod 9, for 0 <= s <= 18 and 0 <= i <= 7.
+ */
+
+#define M9_0_0    0
+#define M9_0_1    1
+#define M9_0_2    2
+#define M9_0_3    3
+#define M9_0_4    4
+#define M9_0_5    5
+#define M9_0_6    6
+#define M9_0_7    7
+
+#define M9_1_0    1
+#define M9_1_1    2
+#define M9_1_2    3
+#define M9_1_3    4
+#define M9_1_4    5
+#define M9_1_5    6
+#define M9_1_6    7
+#define M9_1_7    8
+
+#define M9_2_0    2
+#define M9_2_1    3
+#define M9_2_2    4
+#define M9_2_3    5
+#define M9_2_4    6
+#define M9_2_5    7
+#define M9_2_6    8
+#define M9_2_7    0
+
+#define M9_3_0    3
+#define M9_3_1    4
+#define M9_3_2    5
+#define M9_3_3    6
+#define M9_3_4    7
+#define M9_3_5    8
+#define M9_3_6    0
+#define M9_3_7    1
+
+#define M9_4_0    4
+#define M9_4_1    5
+#define M9_4_2    6
+#define M9_4_3    7
+#define M9_4_4    8
+#define M9_4_5    0
+#define M9_4_6    1
+#define M9_4_7    2
+
+#define M9_5_0    5
+#define M9_5_1    6
+#define M9_5_2    7
+#define M9_5_3    8
+#define M9_5_4    0
+#define M9_5_5    1
+#define M9_5_6    2
+#define M9_5_7    3
+
+#define M9_6_0    6
+#define M9_6_1    7
+#define M9_6_2    8
+#define M9_6_3    0
+#define M9_6_4    1
+#define M9_6_5    2
+#define M9_6_6    3
+#define M9_6_7    4
+
+#define M9_7_0    7
+#define M9_7_1    8
+#define M9_7_2    0
+#define M9_7_3    1
+#define M9_7_4    2
+#define M9_7_5    3
+#define M9_7_6    4
+#define M9_7_7    5
+
+#define M9_8_0    8
+#define M9_8_1    0
+#define M9_8_2    1
+#define M9_8_3    2
+#define M9_8_4    3
+#define M9_8_5    4
+#define M9_8_6    5
+#define M9_8_7    6
+
+#define M9_9_0    0
+#define M9_9_1    1
+#define M9_9_2    2
+#define M9_9_3    3
+#define M9_9_4    4
+#define M9_9_5    5
+#define M9_9_6    6
+#define M9_9_7    7
+
+#define M9_10_0   1
+#define M9_10_1   2
+#define M9_10_2   3
+#define M9_10_3   4
+#define M9_10_4   5
+#define M9_10_5   6
+#define M9_10_6   7
+#define M9_10_7   8
+
+#define M9_11_0   2
+#define M9_11_1   3
+#define M9_11_2   4
+#define M9_11_3   5
+#define M9_11_4   6
+#define M9_11_5   7
+#define M9_11_6   8
+#define M9_11_7   0
+
+#define M9_12_0   3
+#define M9_12_1   4
+#define M9_12_2   5
+#define M9_12_3   6
+#define M9_12_4   7
+#define M9_12_5   8
+#define M9_12_6   0
+#define M9_12_7   1
+
+#define M9_13_0   4
+#define M9_13_1   5
+#define M9_13_2   6
+#define M9_13_3   7
+#define M9_13_4   8
+#define M9_13_5   0
+#define M9_13_6   1
+#define M9_13_7   2
+
+#define M9_14_0   5
+#define M9_14_1   6
+#define M9_14_2   7
+#define M9_14_3   8
+#define M9_14_4   0
+#define M9_14_5   1
+#define M9_14_6   2
+#define M9_14_7   3
+
+#define M9_15_0   6
+#define M9_15_1   7
+#define M9_15_2   8
+#define M9_15_3   0
+#define M9_15_4   1
+#define M9_15_5   2
+#define M9_15_6   3
+#define M9_15_7   4
+
+#define M9_16_0   7
+#define M9_16_1   8
+#define M9_16_2   0
+#define M9_16_3   1
+#define M9_16_4   2
+#define M9_16_5   3
+#define M9_16_6   4
+#define M9_16_7   5
+
+#define M9_17_0   8
+#define M9_17_1   0
+#define M9_17_2   1
+#define M9_17_3   2
+#define M9_17_4   3
+#define M9_17_5   4
+#define M9_17_6   5
+#define M9_17_7   6
+
+#define M9_18_0   0
+#define M9_18_1   1
+#define M9_18_2   2
+#define M9_18_3   3
+#define M9_18_4   4
+#define M9_18_5   5
+#define M9_18_6   6
+#define M9_18_7   7
+
+/*
+ * M3_ ## s ## _ ## i  evaluates to (s+i) mod 3 (0 <= s <= 18, 0 <= i <= 1).
+ */
+
+#define M3_0_0    0
+#define M3_0_1    1
+#define M3_1_0    1
+#define M3_1_1    2
+#define M3_2_0    2
+#define M3_2_1    0
+#define M3_3_0    0
+#define M3_3_1    1
+#define M3_4_0    1
+#define M3_4_1    2
+#define M3_5_0    2
+#define M3_5_1    0
+#define M3_6_0    0
+#define M3_6_1    1
+#define M3_7_0    1
+#define M3_7_1    2
+#define M3_8_0    2
+#define M3_8_1    0
+#define M3_9_0    0
+#define M3_9_1    1
+#define M3_10_0   1
+#define M3_10_1   2
+#define M3_11_0   2
+#define M3_11_1   0
+#define M3_12_0   0
+#define M3_12_1   1
+#define M3_13_0   1
+#define M3_13_1   2
+#define M3_14_0   2
+#define M3_14_1   0
+#define M3_15_0   0
+#define M3_15_1   1
+#define M3_16_0   1
+#define M3_16_1   2
+#define M3_17_0   2
+#define M3_17_1   0
+#define M3_18_0   0
+#define M3_18_1   1
+
+#define XCAT(x, y)     XCAT_(x, y)   /* macro-expands x and y before pasting */
+#define XCAT_(x, y)    x ## y        /* pastes the two tokens as given */
+
+#if 0
+/* obsolete */
+#define SKSI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M5_, s), _), i))
+#define SKST(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
+#endif
+
+#define SKBI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))  /* k ## M9_s_i */
+#define SKBT(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))  /* t ## M3_s_v */
+
+#if 0
+/* obsolete */
+#define TFSMALL_KINIT(k0, k1, k2, k3, k4, t0, t1, t2)   do { \
+		k4 = (k0 ^ k1) ^ (k2 ^ k3) ^ SPH_C64(0x1BD11BDAA9FC1A22); \
+		t2 = t0 ^ t1; \
+	} while (0)
+#endif
+
+#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2)   do { \
+		k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \
+			^ SPH_C64(0x1BD11BDAA9FC1A22); /* k8: XOR of k0..k7 and a constant */ \
+		t2 = t0 ^ t1; /* t2: XOR of the two tweak words */ \
+	} while (0)
+
+#if 0
+/* obsolete */
+#define TFSMALL_ADDKEY(w0, w1, w2, w3, k, t, s)   do { \
+		w0 = SPH_T64(w0 + SKSI(k, s, 0)); \
+		w1 = SPH_T64(w1 + SKSI(k, s, 1) + SKST(t, s, 0)); \
+		w2 = SPH_T64(w2 + SKSI(k, s, 2) + SKST(t, s, 1)); \
+		w3 = SPH_T64(w3 + SKSI(k, s, 3) + (sph_u64)s); \
+	} while (0)
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+/* Key injection, array form: p0..p7 += h[s..s+7]; p5 += tt0; p6 += tt1; p7 += s. */
+#define TFBIG_ADDKEY(s, tt0, tt1)   do { \
+		p0 = SPH_T64(p0 + h[s + 0]); \
+		p1 = SPH_T64(p1 + h[s + 1]); \
+		p2 = SPH_T64(p2 + h[s + 2]); \
+		p3 = SPH_T64(p3 + h[s + 3]); \
+		p4 = SPH_T64(p4 + h[s + 4]); \
+		p5 = SPH_T64(p5 + h[s + 5] + tt0); \
+		p6 = SPH_T64(p6 + h[s + 6] + tt1); \
+		p7 = SPH_T64(p7 + h[s + 7] + (sph_u64)s); \
+	} while (0)
+
+#else
+/* Key injection, unrolled form: SKBI/SKBT pick the key/tweak variable for step s. */
+#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s)   do { \
+		w0 = SPH_T64(w0 + SKBI(k, s, 0)); \
+		w1 = SPH_T64(w1 + SKBI(k, s, 1)); \
+		w2 = SPH_T64(w2 + SKBI(k, s, 2)); \
+		w3 = SPH_T64(w3 + SKBI(k, s, 3)); \
+		w4 = SPH_T64(w4 + SKBI(k, s, 4)); \
+		w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+		w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+		w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); \
+	} while (0)
+
+#endif /* SPH_SMALL_FOOTPRINT_SKEIN */
+
+#if 0
+/* obsolete */
+#define TFSMALL_MIX(x0, x1, rc)   do { \
+		x0 = SPH_T64(x0 + x1); \
+		x1 = SPH_ROTL64(x1, rc) ^ x0; \
+	} while (0)
+#endif
+
+#define TFBIG_MIX(x0, x1, rc)   do { \
+		x0 = SPH_T64(x0 + x1); /* x0 += x1, passed through SPH_T64 */ \
+		x1 = SPH_ROTL64(x1, rc) ^ x0; /* rotate x1 by rc, XOR with new x0 */ \
+	} while (0)
+
+#if 0
+/* obsolete */
+#define TFSMALL_MIX4(w0, w1, w2, w3, rc0, rc1)  do { \
+		TFSMALL_MIX(w0, w1, rc0); \
+		TFSMALL_MIX(w2, w3, rc1); \
+	} while (0)
+#endif
+
+#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { \
+		TFBIG_MIX(w0, w1, rc0); /* one TFBIG_MIX per word pair, */ \
+		TFBIG_MIX(w2, w3, rc1); /* each with its own rotation   */ \
+		TFBIG_MIX(w4, w5, rc2); \
+		TFBIG_MIX(w6, w7, rc3); \
+	} while (0)
+
+#if 0
+/* obsolete */
+#define TFSMALL_4e(s)   do { \
+		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \
+		TFSMALL_MIX4(p0, p1, p2, p3, 14, 16); \
+		TFSMALL_MIX4(p0, p3, p2, p1, 52, 57); \
+		TFSMALL_MIX4(p0, p1, p2, p3, 23, 40); \
+		TFSMALL_MIX4(p0, p3, p2, p1,  5, 37); \
+	} while (0)
+
+#define TFSMALL_4o(s)   do { \
+		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \
+		TFSMALL_MIX4(p0, p1, p2, p3, 25, 33); \
+		TFSMALL_MIX4(p0, p3, p2, p1, 46, 12); \
+		TFSMALL_MIX4(p0, p1, p2, p3, 58, 22); \
+		TFSMALL_MIX4(p0, p3, p2, p1, 32, 32); \
+	} while (0)
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+/* One key injection, then four TFBIG_MIX8 steps (array-form ADDKEY, t0/t1). */
+#define TFBIG_4e(s)   do { \
+		TFBIG_ADDKEY(s, t0, t1); \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+	} while (0)
+/* Same shape as TFBIG_4e, with other rotation constants and tweaks t1/t2. */
+#define TFBIG_4o(s)   do { \
+		TFBIG_ADDKEY(s, t1, t2); \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+	} while (0)
+
+#else
+/* One key injection, then four TFBIG_MIX8 steps (unrolled ADDKEY on h, t). */
+#define TFBIG_4e(s)   do { \
+		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+	} while (0)
+/* Same shape as TFBIG_4e, with other rotation constants. */
+#define TFBIG_4o(s)   do { \
+		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+	} while (0)
+
+#endif /* SPH_SMALL_FOOTPRINT_SKEIN */
+
+#if 0
+/* obsolete */
+#define UBI_SMALL(etype, extra)  do { \
+		sph_u64 h4, t0, t1, t2; \
+		sph_u64 m0 = sph_dec64le(buf +  0); \
+		sph_u64 m1 = sph_dec64le(buf +  8); \
+		sph_u64 m2 = sph_dec64le(buf + 16); \
+		sph_u64 m3 = sph_dec64le(buf + 24); \
+		sph_u64 p0 = m0; \
+		sph_u64 p1 = m1; \
+		sph_u64 p2 = m2; \
+		sph_u64 p3 = m3; \
+		t0 = SPH_T64(bcount << 5) + (sph_u64)(extra); \
+		t1 = (bcount >> 59) + ((sph_u64)(etype) << 55); \
+		TFSMALL_KINIT(h0, h1, h2, h3, h4, t0, t1, t2); \
+		TFSMALL_4e(0); \
+		TFSMALL_4o(1); \
+		TFSMALL_4e(2); \
+		TFSMALL_4o(3); \
+		TFSMALL_4e(4); \
+		TFSMALL_4o(5); \
+		TFSMALL_4e(6); \
+		TFSMALL_4o(7); \
+		TFSMALL_4e(8); \
+		TFSMALL_4o(9); \
+		TFSMALL_4e(10); \
+		TFSMALL_4o(11); \
+		TFSMALL_4e(12); \
+		TFSMALL_4o(13); \
+		TFSMALL_4e(14); \
+		TFSMALL_4o(15); \
+		TFSMALL_4e(16); \
+		TFSMALL_4o(17); \
+		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, 18); \
+		h0 = m0 ^ p0; \
+		h1 = m1 ^ p1; \
+		h2 = m2 ^ p2; \
+		h3 = m3 ^ p3; \
+	} while (0)
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+/* Process the 64-byte block in buf; expects buf, h[] and bcount in scope. */
+#define UBI_BIG(etype, extra)  do { \
+		sph_u64 t0, t1, t2; \
+		unsigned u; \
+		sph_u64 m0 = sph_dec64le_aligned(buf +  0); \
+		sph_u64 m1 = sph_dec64le_aligned(buf +  8); \
+		sph_u64 m2 = sph_dec64le_aligned(buf + 16); \
+		sph_u64 m3 = sph_dec64le_aligned(buf + 24); \
+		sph_u64 m4 = sph_dec64le_aligned(buf + 32); \
+		sph_u64 m5 = sph_dec64le_aligned(buf + 40); \
+		sph_u64 m6 = sph_dec64le_aligned(buf + 48); \
+		sph_u64 m7 = sph_dec64le_aligned(buf + 56); \
+		sph_u64 p0 = m0; \
+		sph_u64 p1 = m1; \
+		sph_u64 p2 = m2; \
+		sph_u64 p3 = m3; \
+		sph_u64 p4 = m4; \
+		sph_u64 p5 = m5; \
+		sph_u64 p6 = m6; \
+		sph_u64 p7 = m7; \
+		t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); /* 64 * bcount + extra */ \
+		t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); /* top bits + etype */ \
+		TFBIG_KINIT(h[0], h[1], h[2], h[3], h[4], h[5], \
+			h[6], h[7], h[8], t0, t1, t2); \
+		for (u = 0; u <= 15; u += 3) { /* h[9..26]: two more copies of h[0..8] */ \
+			h[u +  9] = h[u + 0]; \
+			h[u + 10] = h[u + 1]; \
+			h[u + 11] = h[u + 2]; \
+		} \
+		for (u = 0; u < 9; u ++) { \
+			sph_u64 s = u << 1; \
+			sph_u64 tmp; \
+			TFBIG_4e(s); \
+			TFBIG_4o(s + 1); \
+			tmp = t2; /* (t0, t1, t2) <- (t2, t0, t1) */ \
+			t2 = t1; \
+			t1 = t0; \
+			t0 = tmp; \
+		} \
+		TFBIG_ADDKEY(18, t0, t1); \
+		h[0] = m0 ^ p0; /* new state = block word XOR processed word */ \
+		h[1] = m1 ^ p1; \
+		h[2] = m2 ^ p2; \
+		h[3] = m3 ^ p3; \
+		h[4] = m4 ^ p4; \
+		h[5] = m5 ^ p5; \
+		h[6] = m6 ^ p6; \
+		h[7] = m7 ^ p7; \
+	} while (0)
+
+#else
+/* Same processing with h0..h7 as scalars and steps 0..17 written out. */
+#define UBI_BIG(etype, extra)  do { \
+		sph_u64 h8, t0, t1, t2; \
+		sph_u64 m0 = sph_dec64le_aligned(buf +  0); \
+		sph_u64 m1 = sph_dec64le_aligned(buf +  8); \
+		sph_u64 m2 = sph_dec64le_aligned(buf + 16); \
+		sph_u64 m3 = sph_dec64le_aligned(buf + 24); \
+		sph_u64 m4 = sph_dec64le_aligned(buf + 32); \
+		sph_u64 m5 = sph_dec64le_aligned(buf + 40); \
+		sph_u64 m6 = sph_dec64le_aligned(buf + 48); \
+		sph_u64 m7 = sph_dec64le_aligned(buf + 56); \
+		sph_u64 p0 = m0; \
+		sph_u64 p1 = m1; \
+		sph_u64 p2 = m2; \
+		sph_u64 p3 = m3; \
+		sph_u64 p4 = m4; \
+		sph_u64 p5 = m5; \
+		sph_u64 p6 = m6; \
+		sph_u64 p7 = m7; \
+		t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); /* 64 * bcount + extra */ \
+		t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); /* top bits + etype */ \
+		TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
+		TFBIG_4e(0); \
+		TFBIG_4o(1); \
+		TFBIG_4e(2); \
+		TFBIG_4o(3); \
+		TFBIG_4e(4); \
+		TFBIG_4o(5); \
+		TFBIG_4e(6); \
+		TFBIG_4o(7); \
+		TFBIG_4e(8); \
+		TFBIG_4o(9); \
+		TFBIG_4e(10); \
+		TFBIG_4o(11); \
+		TFBIG_4e(12); \
+		TFBIG_4o(13); \
+		TFBIG_4e(14); \
+		TFBIG_4o(15); \
+		TFBIG_4e(16); \
+		TFBIG_4o(17); \
+		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
+		h0 = m0 ^ p0; /* new state = block word XOR processed word */ \
+		h1 = m1 ^ p1; \
+		h2 = m2 ^ p2; \
+		h3 = m3 ^ p3; \
+		h4 = m4 ^ p4; \
+		h5 = m5 ^ p5; \
+		h6 = m6 ^ p6; \
+		h7 = m7 ^ p7; \
+	} while (0)
+
+#endif /* SPH_SMALL_FOOTPRINT_SKEIN */
+
+#if 0
+/* obsolete */
+#define DECL_STATE_SMALL \
+	sph_u64 h0, h1, h2, h3; \
+	sph_u64 bcount;
+
+#define READ_STATE_SMALL(sc)   do { \
+		h0 = (sc)->h0; \
+		h1 = (sc)->h1; \
+		h2 = (sc)->h2; \
+		h3 = (sc)->h3; \
+		bcount = sc->bcount; \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { \
+		(sc)->h0 = h0; \
+		(sc)->h1 = h1; \
+		(sc)->h2 = h2; \
+		(sc)->h3 = h3; \
+		sc->bcount = bcount; \
+	} while (0)
+#endif
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+/* Local working copy of the context: state words in an array, plus bcount. */
+#define DECL_STATE_BIG \
+	sph_u64 h[27]; /* h[0..7] state; h[8..26] are filled by UBI_BIG */ \
+	sph_u64 bcount;
+
+#define READ_STATE_BIG(sc)   do { \
+		h[0] = (sc)->h0; \
+		h[1] = (sc)->h1; \
+		h[2] = (sc)->h2; \
+		h[3] = (sc)->h3; \
+		h[4] = (sc)->h4; \
+		h[5] = (sc)->h5; \
+		h[6] = (sc)->h6; \
+		h[7] = (sc)->h7; \
+		bcount = (sc)->bcount; \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { \
+		(sc)->h0 = h[0]; \
+		(sc)->h1 = h[1]; \
+		(sc)->h2 = h[2]; \
+		(sc)->h3 = h[3]; \
+		(sc)->h4 = h[4]; \
+		(sc)->h5 = h[5]; \
+		(sc)->h6 = h[6]; \
+		(sc)->h7 = h[7]; \
+		(sc)->bcount = bcount; \
+	} while (0)
+
+#else
+/* Local working copy of the context: state words as scalars, plus bcount. */
+#define DECL_STATE_BIG \
+	sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; \
+	sph_u64 bcount;
+
+#define READ_STATE_BIG(sc)   do { \
+		h0 = (sc)->h0; \
+		h1 = (sc)->h1; \
+		h2 = (sc)->h2; \
+		h3 = (sc)->h3; \
+		h4 = (sc)->h4; \
+		h5 = (sc)->h5; \
+		h6 = (sc)->h6; \
+		h7 = (sc)->h7; \
+		bcount = (sc)->bcount; \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { \
+		(sc)->h0 = h0; \
+		(sc)->h1 = h1; \
+		(sc)->h2 = h2; \
+		(sc)->h3 = h3; \
+		(sc)->h4 = h4; \
+		(sc)->h5 = h5; \
+		(sc)->h6 = h6; \
+		(sc)->h7 = h7; \
+		(sc)->bcount = bcount; \
+	} while (0)
+
+#endif /* SPH_SMALL_FOOTPRINT_SKEIN */
+
+#if 0
+/* obsolete */
+static void
+skein_small_init(sph_skein_small_context *sc, const sph_u64 *iv)
+{
+	sc->h0 = iv[0];
+	sc->h1 = iv[1];
+	sc->h2 = iv[2];
+	sc->h3 = iv[3];
+	sc->bcount = 0;
+	sc->ptr = 0;
+}
+#endif
+/* Load the eight words of iv into the state and reset bcount and ptr to 0. */
+static void
+skein_big_init(sph_skein_big_context *sc, const sph_u64 *iv)
+{
+	sc->h0 = iv[0];
+	sc->h1 = iv[1];
+	sc->h2 = iv[2];
+	sc->h3 = iv[3];
+	sc->h4 = iv[4];
+	sc->h5 = iv[5];
+	sc->h6 = iv[6];
+	sc->h7 = iv[7];
+	sc->bcount = 0;
+	sc->ptr = 0;
+}
+
+#if 0
+/* obsolete */
+static void
+skein_small_core(sph_skein_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr, clen;
+	unsigned first;
+	DECL_STATE_SMALL
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	clen = (sizeof sc->buf) - ptr;
+	if (len <= clen) {
+		memcpy(buf + ptr, data, len);
+		sc->ptr = ptr + len;
+		return;
+	}
+	if (clen != 0) {
+		memcpy(buf + ptr, data, clen);
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+	}
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+
+	READ_STATE_SMALL(sc);
+	first = (bcount == 0) << 7;
+	for (;;) {
+		bcount ++;
+		UBI_SMALL(96 + first, 0);
+		if (len <= sizeof sc->buf)
+			break;
+		first = 0;
+		memcpy(buf, data, sizeof sc->buf);
+		data = (const unsigned char *)data + sizeof sc->buf;
+		len -= sizeof sc->buf;
+	}
+	WRITE_STATE_SMALL(sc);
+	sc->ptr = len;
+	memcpy(buf, data, len);
+
+#else
+
+	/*
+	 * Unrolling the loop yields a slight performance boost, while
+	 * keeping the code size around 24 kB on 32-bit x86.
+	 */
+	READ_STATE_SMALL(sc);
+	first = (bcount == 0) << 7;
+	for (;;) {
+		bcount ++;
+		UBI_SMALL(96 + first, 0);
+		if (len <= sizeof sc->buf)
+			break;
+		buf = (unsigned char *)data;
+		bcount ++;
+		UBI_SMALL(96, 0);
+		if (len <= 2 * sizeof sc->buf) {
+			data = buf + sizeof sc->buf;
+			len -= sizeof sc->buf;
+			break;
+		}
+		buf += sizeof sc->buf;
+		data = buf + sizeof sc->buf;
+		first = 0;
+		len -= 2 * sizeof sc->buf;
+	}
+	WRITE_STATE_SMALL(sc);
+	sc->ptr = len;
+	memcpy(sc->buf, data, len);
+
+#endif
+}
+#endif
+/* Absorb len bytes; a full buffer is only processed once more data arrives. */
+static void
+skein_big_core(sph_skein_big_context *sc, const void *data, size_t len)
+{
+	/*
+	 * The Skein "final bit" in the tweak is troublesome here,
+	 * because if the input has a length which is a multiple of the
+	 * block size (512 bits) then that bit must be set for the
+	 * final block, which is full of message bits (padding in
+	 * Skein can be reduced to no extra bit at all). However, this
+	 * function cannot know whether it processes the last chunks of
+	 * the message or not. Hence we may keep a full block of buffered
+	 * data (64 bytes).
+	 */
+	unsigned char *buf;
+	size_t ptr;
+	unsigned first;
+	DECL_STATE_BIG
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len <= (sizeof sc->buf) - ptr) { /* fits: buffer it, process nothing */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE_BIG(sc);
+	first = (bcount == 0) << 7; /* 128 until the first block has been processed */
+	do {
+		size_t clen;
+
+		if (ptr == sizeof sc->buf) { /* buffer full and more input follows */
+			bcount ++;
+			UBI_BIG(96 + first, 0);
+			first = 0;
+			ptr = 0;
+		}
+		clen = (sizeof sc->buf) - ptr; /* room left, capped to len below */
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+	} while (len > 0);
+	WRITE_STATE_BIG(sc);
+	sc->ptr = ptr;
+}
+
+#if 0
+/* obsolete */
+static void
+skein_small_close(sph_skein_small_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	unsigned et;
+	int i;
+	DECL_STATE_SMALL
+
+	if (n != 0) {
+		unsigned z;
+		unsigned char x;
+
+		z = 0x80 >> n;
+		x = ((ub & -z) | z) & 0xFF;
+		skein_small_core(sc, &x, 1);
+	}
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	READ_STATE_SMALL(sc);
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	et = 352 + ((bcount == 0) << 7) + (n != 0);
+	for (i = 0; i < 2; i ++) {
+		UBI_SMALL(et, ptr);
+		if (i == 0) {
+			memset(buf, 0, sizeof sc->buf);
+			bcount = 0;
+			et = 510;
+			ptr = 8;
+		}
+	}
+
+	sph_enc64le_aligned(buf +  0, h0);
+	sph_enc64le_aligned(buf +  8, h1);
+	sph_enc64le_aligned(buf + 16, h2);
+	sph_enc64le_aligned(buf + 24, h3);
+	memcpy(dst, buf, out_len);
+}
+#endif
+/* Finalize: pad the last block, run the output pass, copy out_len bytes to dst. */
+static void
+skein_big_close(sph_skein_big_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	unsigned et;
+	int i;
+#if SPH_SMALL_FOOTPRINT_SKEIN
+	size_t u;
+#endif
+	DECL_STATE_BIG
+
+	/*
+	 * Add bit padding if necessary.
+	 */
+	if (n != 0) {
+		unsigned z;
+		unsigned char x;
+
+		z = 0x80 >> n; /* bit 7-n: the 1 that follows the n extra bits */
+		x = ((ub & -z) | z) & 0xFF; /* top n bits of ub, a 1 bit, then zeros */
+		skein_big_core(sc, &x, 1);
+	}
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+
+	/*
+	 * At that point, if ptr == 0, then the message was empty;
+	 * otherwise, there is between 1 and 64 bytes (inclusive) which
+	 * are yet to be processed. Either way, we complete the buffer
+	 * to a full block with zeros (the Skein specification mandates
+	 * that an empty message is padded so that there is at least
+	 * one block to process).
+	 *
+	 * Once this block has been processed, we do it again, with
+	 * a block full of zeros, for the output (that block contains
+	 * the encoding of "0", over 8 bytes, then padded with zeros).
+	 */
+	READ_STATE_BIG(sc);
+	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+	et = 352 + ((bcount == 0) << 7) + (n != 0); /* +128 if no block yet, +1 if bits added */
+	for (i = 0; i < 2; i ++) { /* pass 0: last data block; pass 1: output block */
+		UBI_BIG(et, ptr);
+		if (i == 0) {
+			memset(buf, 0, sizeof sc->buf);
+			bcount = 0;
+			et = 510;
+			ptr = 8;
+		}
+	}
+
+#if SPH_SMALL_FOOTPRINT_SKEIN
+
+	/*
+	 * We use a temporary buffer because we must support the case
+	 * where output size is not a multiple of 64 (namely, a 224-bit
+	 * output).
+	 */
+	for (u = 0; u < out_len; u += 8)
+		sph_enc64le_aligned(buf + u, h[u >> 3]);
+	memcpy(dst, buf, out_len);
+
+#else
+
+	sph_enc64le_aligned(buf +  0, h0); /* encode all eight words, then truncate */
+	sph_enc64le_aligned(buf +  8, h1);
+	sph_enc64le_aligned(buf + 16, h2);
+	sph_enc64le_aligned(buf + 24, h3);
+	sph_enc64le_aligned(buf + 32, h4);
+	sph_enc64le_aligned(buf + 40, h5);
+	sph_enc64le_aligned(buf + 48, h6);
+	sph_enc64le_aligned(buf + 56, h7);
+	memcpy(dst, buf, out_len);
+
+#endif
+}
+
+#if 0
+/* obsolete */
+static const sph_u64 IV224[] = {
+	SPH_C64(0xC6098A8C9AE5EA0B), SPH_C64(0x876D568608C5191C),
+	SPH_C64(0x99CB88D7D7F53884), SPH_C64(0x384BDDB1AEDDB5DE)
+};
+
+static const sph_u64 IV256[] = {
+	SPH_C64(0xFC9DA860D048B449), SPH_C64(0x2FCA66479FA7D833),
+	SPH_C64(0xB33BC3896656840F), SPH_C64(0x6A54E920FDE8DA69)
+};
+#endif
+/* Eight initial state words that sph_skein224_init passes to skein_big_init. */
+static const sph_u64 IV224[] = {
+	SPH_C64(0xCCD0616248677224), SPH_C64(0xCBA65CF3A92339EF),
+	SPH_C64(0x8CCD69D652FF4B64), SPH_C64(0x398AED7B3AB890B4),
+	SPH_C64(0x0F59D1B1457D2BD0), SPH_C64(0x6776FE6575D4EB3D),
+	SPH_C64(0x99FBC70E997413E9), SPH_C64(0x9E2CFCCFE1C41EF7)
+};
+/* Eight initial state words that sph_skein256_init passes to skein_big_init. */
+static const sph_u64 IV256[] = {
+	SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
+	SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
+	SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
+	SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
+};
+/* Eight initial state words that sph_skein384_init passes to skein_big_init. */
+static const sph_u64 IV384[] = {
+	SPH_C64(0xA3F6C6BF3A75EF5F), SPH_C64(0xB0FEF9CCFD84FAA4),
+	SPH_C64(0x9D77DD663D770CFE), SPH_C64(0xD798CBF3B468FDDA),
+	SPH_C64(0x1BC4A6668A0E4465), SPH_C64(0x7ED7D434E5807407),
+	SPH_C64(0x548FC1ACD4EC44D6), SPH_C64(0x266E17546AA18FF8)
+};
+/* Eight initial state words that sph_skein512_init passes to skein_big_init. */
+static const sph_u64 IV512[] = {
+	SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
+	SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
+	SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
+	SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
+};
+
+#if 0
+/* obsolete */
+/* see sph_skein.h */
+void
+sph_skein224_init(void *cc)
+{
+	skein_small_init(cc, IV224);
+}
+
+/* see sph_skein.h */
+void
+sph_skein224(void *cc, const void *data, size_t len)
+{
+	skein_small_core(cc, data, len);
+}
+
+/* see sph_skein.h */
+void
+sph_skein224_close(void *cc, void *dst)
+{
+	sph_skein224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_skein.h */
+void
+sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_small_close(cc, ub, n, dst, 28);
+	sph_skein224_init(cc);
+}
+
+/* see sph_skein.h */
+void
+sph_skein256_init(void *cc)
+{
+	skein_small_init(cc, IV256);
+}
+
+/* see sph_skein.h */
+void
+sph_skein256(void *cc, const void *data, size_t len)
+{
+	skein_small_core(cc, data, len);
+}
+
+/* see sph_skein.h */
+void
+sph_skein256_close(void *cc, void *dst)
+{
+	sph_skein256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_skein.h */
+void
+sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_small_close(cc, ub, n, dst, 32);
+	sph_skein256_init(cc);
+}
+#endif
+
+/* Load IV224 into cc and reset its counters; see sph_skein.h */
+void
+sph_skein224_init(void *cc)
+{
+	skein_big_init(cc, IV224);
+}
+
+/* Absorb len bytes of data through skein_big_core; see sph_skein.h */
+void
+sph_skein224(void *cc, const void *data, size_t len)
+{
+	skein_big_core(cc, data, len);
+}
+
+/* Close with no extra bits (ub = 0, n = 0); see sph_skein.h */
+void
+sph_skein224_close(void *cc, void *dst)
+{
+	sph_skein224_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Finalize into 28 bytes at dst, then re-initialize cc; see sph_skein.h */
+void
+sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_big_close(cc, ub, n, dst, 28);
+	sph_skein224_init(cc);
+}
+
+/* Load IV256 into cc and reset its counters; see sph_skein.h */
+void
+sph_skein256_init(void *cc)
+{
+	skein_big_init(cc, IV256);
+}
+
+/* Absorb len bytes of data through skein_big_core; see sph_skein.h */
+void
+sph_skein256(void *cc, const void *data, size_t len)
+{
+	skein_big_core(cc, data, len);
+}
+
+/* Close with no extra bits (ub = 0, n = 0); see sph_skein.h */
+void
+sph_skein256_close(void *cc, void *dst)
+{
+	sph_skein256_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Finalize into 32 bytes at dst, then re-initialize cc; see sph_skein.h */
+void
+sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_big_close(cc, ub, n, dst, 32);
+	sph_skein256_init(cc);
+}
+
+/* Load IV384 into cc and reset its counters; see sph_skein.h */
+void
+sph_skein384_init(void *cc)
+{
+	skein_big_init(cc, IV384);
+}
+
+/* Absorb len bytes of data through skein_big_core; see sph_skein.h */
+void
+sph_skein384(void *cc, const void *data, size_t len)
+{
+	skein_big_core(cc, data, len);
+}
+
+/* Close with no extra bits (ub = 0, n = 0); see sph_skein.h */
+void
+sph_skein384_close(void *cc, void *dst)
+{
+	sph_skein384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Finalize into 48 bytes at dst, then re-initialize cc; see sph_skein.h */
+void
+sph_skein384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_big_close(cc, ub, n, dst, 48);
+	sph_skein384_init(cc);
+}
+
+/* Load IV512 into cc and reset its counters; see sph_skein.h */
+void
+sph_skein512_init(void *cc)
+{
+	skein_big_init(cc, IV512);
+}
+
+/* Absorb len bytes of data through skein_big_core; see sph_skein.h */
+void
+sph_skein512(void *cc, const void *data, size_t len)
+{
+	skein_big_core(cc, data, len);
+}
+
+/* Close with no extra bits (ub = 0, n = 0); see sph_skein.h */
+void
+sph_skein512_close(void *cc, void *dst)
+{
+	sph_skein512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* Finalize into 64 bytes at dst, then re-initialize cc; see sph_skein.h */
+void
+sph_skein512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	skein_big_close(cc, ub, n, dst, 64);
+	sph_skein512_init(cc);
+}
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/sph_blake.h b/sha3/sph_blake.h
new file mode 100644
index 0000000..d8d7943
--- /dev/null
+++ b/sha3/sph_blake.h
@@ -0,0 +1,327 @@
+/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
+/**
+ * BLAKE interface. BLAKE is a family of functions which differ by their
+ * output size; this implementation defines BLAKE for output sizes 224,
+ * 256, 384 and 512 bits. This implementation conforms to the "third
+ * round" specification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_blake.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BLAKE_H__
+#define SPH_BLAKE_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for BLAKE-224.
+ */
+#define SPH_SIZE_blake224   224
+
+/**
+ * Output size (in bits) for BLAKE-256.
+ */
+#define SPH_SIZE_blake256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BLAKE-384.
+ */
+#define SPH_SIZE_blake384   384
+
+/**
+ * Output size (in bits) for BLAKE-512.
+ */
+#define SPH_SIZE_blake512   512
+
+#endif
+
+/**
+ * This structure is a context for BLAKE-224 and BLAKE-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;     /* NOTE(review): presumably bytes used in buf; confirm */
+	sph_u32 H[8];   /* NOTE(review): presumably the hash state; confirm */
+	sph_u32 S[4];   /* NOTE(review): presumably the salt words; confirm */
+	sph_u32 T0, T1; /* NOTE(review): presumably the input counter; confirm */
+#endif
+} sph_blake_small_context;
+
+/**
+ * This structure is a context for BLAKE-224 computations. It is
+ * identical to the common <code>sph_blake_small_context</code>.
+ */
+typedef sph_blake_small_context sph_blake224_context;
+
+/**
+ * This structure is a context for BLAKE-256 computations. It is
+ * identical to the common <code>sph_blake_small_context</code>.
+ */
+typedef sph_blake_small_context sph_blake256_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BLAKE-384 and BLAKE-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;     /* NOTE(review): presumably bytes used in buf; confirm */
+	sph_u64 H[8];   /* NOTE(review): presumably the hash state; confirm */
+	sph_u64 S[4];   /* NOTE(review): presumably the salt words; confirm */
+	sph_u64 T0, T1; /* NOTE(review): presumably the input counter; confirm */
+#endif
+} sph_blake_big_context;
+
+/**
+ * This structure is a context for BLAKE-384 computations. It is
+ * identical to the common <code>sph_blake_big_context</code>.
+ */
+typedef sph_blake_big_context sph_blake384_context;
+
+/**
+ * This structure is a context for BLAKE-512 computations. It is
+ * identical to the common <code>sph_blake_big_context</code>.
+ */
+typedef sph_blake_big_context sph_blake512_context;
+
+#endif
+
+/**
+ * Initialize a BLAKE-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-224 context (pointer to a
+ *             <code>sph_blake224_context</code>)
+ */
+void sph_blake224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param dst   the destination buffer
+ */
+void sph_blake224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BLAKE-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-256 context (pointer to a
+ *             <code>sph_blake256_context</code>)
+ */
+void sph_blake256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param dst   the destination buffer
+ */
+void sph_blake256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#if SPH_64
+
+/**
+ * Initialize a BLAKE-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-384 context (pointer to a
+ *             <code>sph_blake384_context</code>)
+ */
+void sph_blake384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param dst   the destination buffer
+ */
+void sph_blake384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BLAKE-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-512 context (pointer to a
+ *             <code>sph_blake512_context</code>)
+ */
+void sph_blake512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param dst   the destination buffer
+ */
+void sph_blake512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_bmw.h b/sha3/sph_bmw.h
new file mode 100644
index 0000000..d386b0c
--- /dev/null
+++ b/sha3/sph_bmw.h
@@ -0,0 +1,328 @@
+/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * BMW interface. BMW (aka "Blue Midnight Wish") is a family of
+ * functions which differ by their output size; this implementation
+ * defines BMW for output sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_bmw.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BMW_H__
+#define SPH_BMW_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for BMW-224.
+ */
+#define SPH_SIZE_bmw224   224
+
+/**
+ * Output size (in bits) for BMW-256.
+ */
+#define SPH_SIZE_bmw256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BMW-384.
+ */
+#define SPH_SIZE_bmw384   384
+
+/**
+ * Output size (in bits) for BMW-512.
+ */
+#define SPH_SIZE_bmw512   512
+
+#endif
+
+/**
+ * This structure is a context for BMW-224 and BMW-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BMW computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BMW
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* NOTE(review): presumably the fill offset into buf -- confirm in the implementation */
+	sph_u32 H[16];            /* NOTE(review): presumably the running "intermediate values" -- confirm */
+#if SPH_64
+	sph_u64 bit_count;        /* one counter field when SPH_64 is set */
+#else
+	sph_u32 bit_count_high, bit_count_low;   /* otherwise kept as separate high/low sph_u32 fields */
+#endif /* SPH_64 */
+#endif /* DOXYGEN_IGNORE */
+} sph_bmw_small_context;
+
+/**
+ * This structure is a context for BMW-224 computations. It is
+ * identical to the common <code>sph_bmw_small_context</code>.
+ */
+typedef sph_bmw_small_context sph_bmw224_context;
+
+/**
+ * This structure is a context for BMW-256 computations. It is
+ * identical to the common <code>sph_bmw_small_context</code>.
+ */
+typedef sph_bmw_small_context sph_bmw256_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BMW-384 and BMW-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BMW computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BMW
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;                /* NOTE(review): presumably the fill offset into buf -- confirm in the implementation */
+	sph_u64 H[16];             /* NOTE(review): presumably the running "intermediate values" -- confirm */
+	sph_u64 bit_count;         /* always sph_u64 here: this struct only exists under #if SPH_64 */
+#endif /* DOXYGEN_IGNORE */
+} sph_bmw_big_context;
+
+/**
+ * This structure is a context for BMW-384 computations. It is
+ * identical to the common <code>sph_bmw_big_context</code>.
+ */
+typedef sph_bmw_big_context sph_bmw384_context;
+
+/**
+ * This structure is a context for BMW-512 computations. It is
+ * identical to the common <code>sph_bmw_big_context</code>.
+ */
+typedef sph_bmw_big_context sph_bmw512_context;
+
+#endif
+
+/**
+ * Initialize a BMW-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-224 context (pointer to a
+ *             <code>sph_bmw224_context</code>)
+ */
+void sph_bmw224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BMW-224 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BMW-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BMW-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-256 context (pointer to a
+ *             <code>sph_bmw256_context</code>)
+ */
+void sph_bmw256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BMW-256 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BMW-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#if SPH_64
+
+/**
+ * Initialize a BMW-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-384 context (pointer to a
+ *             <code>sph_bmw384_context</code>)
+ */
+void sph_bmw384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BMW-384 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BMW-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BMW-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-512 context (pointer to a
+ *             <code>sph_bmw512_context</code>)
+ */
+void sph_bmw512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BMW-512 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BMW-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_cubehash.h b/sha3/sph_cubehash.h
new file mode 100644
index 0000000..487a194
--- /dev/null
+++ b/sha3/sph_cubehash.h
@@ -0,0 +1,292 @@
+/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */
+/**
+ * CubeHash interface. CubeHash is a family of functions which differ by
+ * their output size; this implementation defines CubeHash for output
+ * sizes 224, 256, 384 and 512 bits, with the "standard parameters"
+ * (CubeHash16/32 with the CubeHash specification notations).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_cubehash.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_CUBEHASH_H__
+#define SPH_CUBEHASH_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for CubeHash-224.
+ */
+#define SPH_SIZE_cubehash224   224
+
+/**
+ * Output size (in bits) for CubeHash-256.
+ */
+#define SPH_SIZE_cubehash256   256
+
+/**
+ * Output size (in bits) for CubeHash-384.
+ */
+#define SPH_SIZE_cubehash384   384
+
+/**
+ * Output size (in bits) for CubeHash-512.
+ */
+#define SPH_SIZE_cubehash512   512
+
+/**
+ * This structure is a context for CubeHash computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a CubeHash computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running CubeHash computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;               /* NOTE(review): presumably the fill offset into buf -- confirm in the implementation */
+	sph_u32 state[32];        /* NOTE(review): presumably the running "intermediate values" -- confirm */
+#endif /* DOXYGEN_IGNORE */
+} sph_cubehash_context;
+
+/**
+ * Type for a CubeHash-224 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash224_context;
+
+/**
+ * Type for a CubeHash-256 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash256_context;
+
+/**
+ * Type for a CubeHash-384 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash384_context;
+
+/**
+ * Type for a CubeHash-512 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash512_context;
+
+/**
+ * Initialize a CubeHash-224 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-224 context (pointer to a
+ *             <code>sph_cubehash224_context</code>)
+ */
+void sph_cubehash224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-224 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-256 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-256 context (pointer to a
+ *             <code>sph_cubehash256_context</code>)
+ */
+void sph_cubehash256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-256 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-384 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-384 context (pointer to a
+ *             <code>sph_cubehash384_context</code>)
+ */
+void sph_cubehash384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-384 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-512 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-512 context (pointer to a
+ *             <code>sph_cubehash512_context</code>)
+ */
+void sph_cubehash512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-512 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_echo.h b/sha3/sph_echo.h
new file mode 100644
index 0000000..1ae1e3d
--- /dev/null
+++ b/sha3/sph_echo.h
@@ -0,0 +1,320 @@
+/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * ECHO interface. ECHO is a family of functions which differ by
+ * their output size; this implementation defines ECHO for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_echo.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_ECHO_H__
+#define SPH_ECHO_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for ECHO-224.
+ */
+#define SPH_SIZE_echo224   224
+
+/**
+ * Output size (in bits) for ECHO-256.
+ */
+#define SPH_SIZE_echo256   256
+
+/**
+ * Output size (in bits) for ECHO-384.
+ */
+#define SPH_SIZE_echo384   384
+
+/**
+ * Output size (in bits) for ECHO-512.
+ */
+#define SPH_SIZE_echo512   512
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-224
+ * and ECHO-256.
+ *
+ * The contents of this structure are private. A running ECHO computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[192];    /* first field, for alignment */
+	size_t ptr;                /* NOTE(review): presumably the fill offset into buf -- confirm in the implementation */
+	union {                    /* Vs and Vb are alternative views of the same storage */
+		sph_u32 Vs[4][4];
+#if SPH_64
+		sph_u64 Vb[4][2];      /* only declared when SPH_64 is set */
+#endif /* SPH_64 */
+	} u;
+	sph_u32 C0, C1, C2, C3;    /* NOTE(review): presumably a counter split into four sph_u32 words -- confirm */
+#endif /* DOXYGEN_IGNORE */
+} sph_echo_small_context;
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-384
+ * and ECHO-512.
+ *
+ * The contents of this structure are private. A running ECHO computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;                /* NOTE(review): presumably the fill offset into buf -- confirm in the implementation */
+	union {                    /* Vs and Vb are alternative views of the same storage */
+		sph_u32 Vs[8][4];
+#if SPH_64
+		sph_u64 Vb[8][2];      /* only declared when SPH_64 is set */
+#endif /* SPH_64 */
+	} u;
+	sph_u32 C0, C1, C2, C3;    /* NOTE(review): presumably a counter split into four sph_u32 words -- confirm */
+#endif /* DOXYGEN_IGNORE */
+} sph_echo_big_context;
+
+/**
+ * Type for an ECHO-224 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo224_context;
+
+/**
+ * Type for an ECHO-256 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo256_context;
+
+/**
+ * Type for an ECHO-384 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo384_context;
+
+/**
+ * Type for an ECHO-512 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo512_context;
+
+/**
+ * Initialize an ECHO-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-224 context (pointer to a
+ *             <code>sph_echo224_context</code>)
+ */
+void sph_echo224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param dst   the destination buffer
+ */
+void sph_echo224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-256 context (pointer to a
+ *             <code>sph_echo256_context</code>)
+ */
+void sph_echo256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param dst   the destination buffer
+ */
+void sph_echo256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-384 context (pointer to a
+ *             <code>sph_echo384_context</code>)
+ */
+void sph_echo384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param dst   the destination buffer
+ */
+void sph_echo384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-512 context (pointer to a
+ *             <code>sph_echo512_context</code>)
+ */
+void sph_echo512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param dst   the destination buffer
+ */
+void sph_echo512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+	
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_groestl.h b/sha3/sph_groestl.h
new file mode 100644
index 0000000..495f05e
--- /dev/null
+++ b/sha3/sph_groestl.h
@@ -0,0 +1,329 @@
+/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Groestl interface. This code implements Groestl with the recommended
+ * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_groestl.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_GROESTL_H__
+#define SPH_GROESTL_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for Groestl-224.
+ */
+#define SPH_SIZE_groestl224   224
+
+/**
+ * Output size (in bits) for Groestl-256.
+ */
+#define SPH_SIZE_groestl256   256
+
+/**
+ * Output size (in bits) for Groestl-384.
+ */
+#define SPH_SIZE_groestl384   384
+
+/**
+ * Output size (in bits) for Groestl-512.
+ */
+#define SPH_SIZE_groestl512   512
+
+/**
+ * This structure is a context for Groestl-224 and Groestl-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a Groestl computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running Groestl
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE  /* fields below are skipped when DOXYGEN_IGNORE is defined */
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill level of buf -- TODO confirm in groestl.c */
+	union {                   /* overlapping views of the same state storage */
+#if SPH_64
+		sph_u64 wide[8];      /* 8-word sph_u64 view, present only when SPH_64 is nonzero */
+#endif
+		sph_u32 narrow[16];   /* 16-word sph_u32 view, always present */
+	} state;
+#if SPH_64
+	sph_u64 count;            /* count kept in a single sph_u64 */
+#else
+	sph_u32 count_high, count_low;  /* presumably the same count as high/low sph_u32 words */
+#endif  /* SPH_64 */
+#endif  /* !DOXYGEN_IGNORE */
+} sph_groestl_small_context;
+
+/**
+ * This structure is a context for Groestl-224 computations. It is
+ * identical to the common <code>sph_groestl_small_context</code>.
+ */
+typedef sph_groestl_small_context sph_groestl224_context;
+
+/**
+ * This structure is a context for Groestl-256 computations. It is
+ * identical to the common <code>sph_groestl_small_context</code>.
+ */
+typedef sph_groestl_small_context sph_groestl256_context;
+
+/**
+ * This structure is a context for Groestl-384 and Groestl-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a Groestl computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running Groestl
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE  /* fields below are skipped when DOXYGEN_IGNORE is defined */
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;                /* presumably the fill level of buf -- TODO confirm in groestl.c */
+	union {                    /* overlapping views of the same state storage */
+#if SPH_64
+		sph_u64 wide[16];      /* 16-word sph_u64 view, present only when SPH_64 is nonzero */
+#endif
+		sph_u32 narrow[32];    /* 32-word sph_u32 view, always present */
+	} state;
+#if SPH_64
+	sph_u64 count;             /* count kept in a single sph_u64 */
+#else
+	sph_u32 count_high, count_low;  /* presumably the same count as high/low sph_u32 words */
+#endif  /* SPH_64 */
+#endif  /* !DOXYGEN_IGNORE */
+} sph_groestl_big_context;
+
+/**
+ * This structure is a context for Groestl-384 computations. It is
+ * identical to the common <code>sph_groestl_big_context</code>.
+ */
+typedef sph_groestl_big_context sph_groestl384_context;
+
+/**
+ * This structure is a context for Groestl-512 computations. It is
+ * identical to the common <code>sph_groestl_big_context</code>.
+ */
+typedef sph_groestl_big_context sph_groestl512_context;
+
+/**
+ * Initialize a Groestl-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-224 context (pointer to a
+ *             <code>sph_groestl224_context</code>)
+ */
+void sph_groestl224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-224 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-256 context (pointer to a
+ *             <code>sph_groestl256_context</code>)
+ */
+void sph_groestl256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-256 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-384 context (pointer to a
+ *             <code>sph_groestl384_context</code>)
+ */
+void sph_groestl384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-384 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-512 context (pointer to a
+ *             <code>sph_groestl512_context</code>)
+ */
+void sph_groestl512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-512 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_jh.h b/sha3/sph_jh.h
new file mode 100644
index 0000000..82fae58
--- /dev/null
+++ b/sha3/sph_jh.h
@@ -0,0 +1,298 @@
+/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * JH interface. JH is a family of functions which differ by
+ * their output size; this implementation defines JH for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_jh.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_JH_H__
+#define SPH_JH_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for JH-224.
+ */
+#define SPH_SIZE_jh224   224
+
+/**
+ * Output size (in bits) for JH-256.
+ */
+#define SPH_SIZE_jh256   256
+
+/**
+ * Output size (in bits) for JH-384.
+ */
+#define SPH_SIZE_jh384   384
+
+/**
+ * Output size (in bits) for JH-512.
+ */
+#define SPH_SIZE_jh512   512
+
+/**
+ * This structure is a context for JH computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a JH computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running JH computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE  /* fields below are skipped when DOXYGEN_IGNORE is defined */
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill level of buf -- TODO confirm in jh.c */
+	union {                   /* overlapping views of the same state storage */
+#if SPH_64
+		sph_u64 wide[16];     /* 16-word sph_u64 view, present only when SPH_64 is nonzero */
+#endif
+		sph_u32 narrow[32];   /* 32-word sph_u32 view, always present */
+	} H;
+#if SPH_64
+	sph_u64 block_count;      /* block count kept in a single sph_u64 */
+#else
+	sph_u32 block_count_high, block_count_low;  /* presumably the same count as high/low words */
+#endif  /* SPH_64 */
+#endif  /* !DOXYGEN_IGNORE */
+} sph_jh_context;
+
+/**
+ * Type for a JH-224 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh224_context;
+
+/**
+ * Type for a JH-256 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh256_context;
+
+/**
+ * Type for a JH-384 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh384_context;
+
+/**
+ * Type for a JH-512 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh512_context;
+
+/**
+ * Initialize a JH-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the JH-224 context (pointer to a
+ *             <code>sph_jh224_context</code>)
+ */
+void sph_jh224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the JH-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_jh224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the JH-224 context
+ * @param dst   the destination buffer
+ */
+void sph_jh224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the JH-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_jh224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the JH-256 context (pointer to a
+ *             <code>sph_jh256_context</code>)
+ */
+void sph_jh256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the JH-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_jh256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the JH-256 context
+ * @param dst   the destination buffer
+ */
+void sph_jh256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the JH-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_jh256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the JH-384 context (pointer to a
+ *             <code>sph_jh384_context</code>)
+ */
+void sph_jh384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the JH-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_jh384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the JH-384 context
+ * @param dst   the destination buffer
+ */
+void sph_jh384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the JH-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_jh384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the JH-512 context (pointer to a
+ *             <code>sph_jh512_context</code>)
+ */
+void sph_jh512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the JH-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_jh512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the JH-512 context
+ * @param dst   the destination buffer
+ */
+void sph_jh512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the JH-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_jh512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_keccak.h b/sha3/sph_keccak.h
new file mode 100644
index 0000000..bdafdb8
--- /dev/null
+++ b/sha3/sph_keccak.h
@@ -0,0 +1,293 @@
+/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Keccak interface. This is the interface for Keccak with the
+ * recommended parameters for SHA-3, with output lengths 224, 256,
+ * 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_keccak.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_KECCAK_H__
+#define SPH_KECCAK_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for Keccak-224.
+ */
+#define SPH_SIZE_keccak224   224
+
+/**
+ * Output size (in bits) for Keccak-256.
+ */
+#define SPH_SIZE_keccak256   256
+
+/**
+ * Output size (in bits) for Keccak-384.
+ */
+#define SPH_SIZE_keccak384   384
+
+/**
+ * Output size (in bits) for Keccak-512.
+ */
+#define SPH_SIZE_keccak512   512
+
+/**
+ * This structure is a context for Keccak computations: it contains the
+ * intermediate values and some data from the last entered block. Once a
+ * Keccak computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running Keccak computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE  /* fields below are skipped when DOXYGEN_IGNORE is defined */
+	unsigned char buf[144];    /* first field, for alignment */
+	size_t ptr, lim;           /* presumably buf fill level and its per-variant limit -- TODO confirm in keccak.c */
+	union {                    /* overlapping views of the same state storage */
+#if SPH_64
+		sph_u64 wide[25];      /* 25-word sph_u64 view, present only when SPH_64 is nonzero */
+#endif
+		sph_u32 narrow[50];    /* 50-word sph_u32 view, always present */
+	} u;
+#endif  /* !DOXYGEN_IGNORE */
+} sph_keccak_context;
+
+/**
+ * Type for a Keccak-224 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak224_context;
+
+/**
+ * Type for a Keccak-256 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak256_context;
+
+/**
+ * Type for a Keccak-384 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak384_context;
+
+/**
+ * Type for a Keccak-512 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak512_context;
+
+/**
+ * Initialize a Keccak-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Keccak-224 context (pointer to a
+ *             <code>sph_keccak224_context</code>)
+ */
+void sph_keccak224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Keccak-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_keccak224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Keccak-224 context
+ * @param dst   the destination buffer
+ */
+void sph_keccak224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Keccak-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_keccak224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Keccak-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Keccak-256 context (pointer to a
+ *             <code>sph_keccak256_context</code>)
+ */
+void sph_keccak256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Keccak-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_keccak256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Keccak-256 context
+ * @param dst   the destination buffer
+ */
+void sph_keccak256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Keccak-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_keccak256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Keccak-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Keccak-384 context (pointer to a
+ *             <code>sph_keccak384_context</code>)
+ */
+void sph_keccak384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Keccak-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_keccak384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Keccak-384 context
+ * @param dst   the destination buffer
+ */
+void sph_keccak384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Keccak-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_keccak384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Keccak-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Keccak-512 context (pointer to a
+ *             <code>sph_keccak512_context</code>)
+ */
+void sph_keccak512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Keccak-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_keccak512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Keccak-512 context
+ * @param dst   the destination buffer
+ */
+void sph_keccak512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Keccak-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_keccak512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_luffa.h b/sha3/sph_luffa.h
new file mode 100644
index 0000000..a32fd7b
--- /dev/null
+++ b/sha3/sph_luffa.h
@@ -0,0 +1,296 @@
+/* $Id: sph_luffa.h 154 2010-04-26 17:00:24Z tp $ */
+/**
+ * Luffa interface. Luffa is a family of functions which differ by
+ * their output size; this implementation defines Luffa for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_luffa.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_LUFFA_H__
+#define SPH_LUFFA_H__
+
+/* Headers are included before the extern "C" block, not inside it. */
+#include <stddef.h>
+#include "sph_types.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/**
+ * Output size (in bits) for Luffa-224.
+ */
+#define SPH_SIZE_luffa224   224
+
+/**
+ * Output size (in bits) for Luffa-256.
+ */
+#define SPH_SIZE_luffa256   256
+
+/**
+ * Output size (in bits) for Luffa-384.
+ */
+#define SPH_SIZE_luffa384   384
+
+/**
+ * Output size (in bits) for Luffa-512.
+ */
+#define SPH_SIZE_luffa512   512
+
+/**
+ * This structure is a context for Luffa-224 computations: it contains
+ * the intermediate values and some data from the last entered block.
+ * Once a Luffa computation has been performed, the context can be
+ * reused for another computation.
+ *
+ * The contents of this structure are private. A running Luffa
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill level of buf -- confirm in luffa.c */
+	sph_u32 V[3][8];          /* 3 x 8 sph_u32 words; presumably the intermediate values */
+#endif
+} sph_luffa224_context;
+
+/**
+ * This structure is a context for Luffa-256 computations. It is
+ * identical to <code>sph_luffa224_context</code>.
+ */
+typedef sph_luffa224_context sph_luffa256_context;
+
+/**
+ * This structure is a context for Luffa-384 computations.
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill level of buf -- confirm in luffa.c */
+	sph_u32 V[4][8];          /* 4 x 8 sph_u32 words; presumably the intermediate values */
+#endif
+} sph_luffa384_context;
+
+/**
+ * This structure is a context for Luffa-512 computations.
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill level of buf -- confirm in luffa.c */
+	sph_u32 V[5][8];          /* 5 x 8 sph_u32 words; presumably the intermediate values */
+#endif
+} sph_luffa512_context;
+
+/**
+ * Initialize a Luffa-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Luffa-224 context (pointer to a
+ *             <code>sph_luffa224_context</code>)
+ */
+void sph_luffa224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Luffa-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_luffa224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Luffa-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Luffa-224 context
+ * @param dst   the destination buffer
+ */
+void sph_luffa224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Luffa-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_luffa224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Luffa-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Luffa-256 context (pointer to a
+ *             <code>sph_luffa256_context</code>)
+ */
+void sph_luffa256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Luffa-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_luffa256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Luffa-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Luffa-256 context
+ * @param dst   the destination buffer
+ */
+void sph_luffa256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Luffa-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_luffa256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Luffa-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Luffa-384 context (pointer to a
+ *             <code>sph_luffa384_context</code>)
+ */
+void sph_luffa384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Luffa-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_luffa384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Luffa-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Luffa-384 context
+ * @param dst   the destination buffer
+ */
+void sph_luffa384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Luffa-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_luffa384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Luffa-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Luffa-512 context (pointer to a
+ *             <code>sph_luffa512_context</code>)
+ */
+void sph_luffa512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Luffa-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_luffa512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Luffa-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Luffa-512 context
+ * @param dst   the destination buffer
+ */
+void sph_luffa512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Luffa-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_luffa512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+	
+#ifdef __cplusplus
+}
+#endif
+	
+#endif
diff --git a/sha3/sph_shavite.h b/sha3/sph_shavite.h
new file mode 100644
index 0000000..0957e42
--- /dev/null
+++ b/sha3/sph_shavite.h
@@ -0,0 +1,314 @@
+/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */
+/**
+ * SHAvite-3 interface. This code implements SHAvite-3 with the
+ * recommended parameters for SHA-3, with outputs of 224, 256, 384 and
+ * 512 bits. In the following, we call the function "SHAvite" (without
+ * the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit
+ * output".
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_shavite.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_SHAVITE_H__
+#define SPH_SHAVITE_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/**
+ * Output size (in bits) for SHAvite-224.
+ */
+#define SPH_SIZE_shavite224   224
+
+/**
+ * Output size (in bits) for SHAvite-256.
+ */
+#define SPH_SIZE_shavite256   256
+
+/**
+ * Output size (in bits) for SHAvite-384.
+ */
+#define SPH_SIZE_shavite384   384
+
+/**
+ * Output size (in bits) for SHAvite-512.
+ */
+#define SPH_SIZE_shavite512   512
+
+/**
+ * This structure is a context for SHAvite-224 and SHAvite-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a SHAvite computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running SHAvite
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill level of buf -- confirm in shavite.c */
+	sph_u32 h[8];             /* 8 sph_u32 words; presumably the intermediate values */
+	sph_u32 count0, count1;   /* presumably the input counter, split into words -- confirm */
+#endif
+} sph_shavite_small_context;
+
+/**
+ * This structure is a context for SHAvite-224 computations. It is
+ * identical to the common <code>sph_shavite_small_context</code>.
+ */
+typedef sph_shavite_small_context sph_shavite224_context;
+
+/**
+ * This structure is a context for SHAvite-256 computations. It is
+ * identical to the common <code>sph_shavite_small_context</code>.
+ */
+typedef sph_shavite_small_context sph_shavite256_context;
+
+/**
+ * This structure is a context for SHAvite-384 and SHAvite-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a SHAvite computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running SHAvite
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;                /* presumably the fill level of buf -- confirm in shavite.c */
+	sph_u32 h[16];             /* 16 sph_u32 words; presumably the intermediate values */
+	sph_u32 count0, count1, count2, count3;  /* presumably the input counter, split into words -- confirm */
+#endif
+} sph_shavite_big_context;
+
+/**
+ * This structure is a context for SHAvite-384 computations. It is
+ * identical to the common <code>sph_shavite_big_context</code>.
+ */
+typedef sph_shavite_big_context sph_shavite384_context;
+
+/**
+ * This structure is a context for SHAvite-512 computations. It is
+ * identical to the common <code>sph_shavite_big_context</code>.
+ */
+typedef sph_shavite_big_context sph_shavite512_context;
+
+/**
+ * Initialize a SHAvite-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-224 context (pointer to a
+ *             <code>sph_shavite224_context</code>)
+ */
+void sph_shavite224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-224 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a SHAvite-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-256 context (pointer to a
+ *             <code>sph_shavite256_context</code>)
+ */
+void sph_shavite256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-256 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a SHAvite-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-384 context (pointer to a
+ *             <code>sph_shavite384_context</code>)
+ */
+void sph_shavite384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-384 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a SHAvite-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-512 context (pointer to a
+ *             <code>sph_shavite512_context</code>)
+ */
+void sph_shavite512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-512 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+	
+#ifdef __cplusplus
+}
+#endif	
+	
+#endif
diff --git a/sha3/sph_simd.h b/sha3/sph_simd.h
new file mode 100644
index 0000000..92ee1e7
--- /dev/null
+++ b/sha3/sph_simd.h
@@ -0,0 +1,309 @@
+/* $Id: sph_simd.h 154 2010-04-26 17:00:24Z tp $ */
+/**
+ * SIMD interface. SIMD is a family of functions which differ by
+ * their output size; this implementation defines SIMD for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_simd.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_SIMD_H__
+#define SPH_SIMD_H__
+
+/* Headers are included before the extern "C" block, not inside it. */
+#include <stddef.h>
+#include "sph_types.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/**
+ * Output size (in bits) for SIMD-224.
+ */
+#define SPH_SIZE_simd224   224
+
+/**
+ * Output size (in bits) for SIMD-256.
+ */
+#define SPH_SIZE_simd256   256
+
+/**
+ * Output size (in bits) for SIMD-384.
+ */
+#define SPH_SIZE_simd384   384
+
+/**
+ * Output size (in bits) for SIMD-512.
+ */
+#define SPH_SIZE_simd512   512
+
+/**
+ * This structure is a context for SIMD computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an SIMD computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for SIMD-224
+ * and SIMD-256.
+ *
+ * The contents of this structure are private. A running SIMD computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill level of buf -- confirm in simd.c */
+	sph_u32 state[16];        /* 16 sph_u32 words; presumably the intermediate values */
+	sph_u32 count_low, count_high;  /* presumably low/high words of an input counter -- confirm */
+#endif
+} sph_simd_small_context;
+
+/**
+ * This structure is a context for SIMD computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an SIMD computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for SIMD-384
+ * and SIMD-512.
+ *
+ * The contents of this structure are private. A running SIMD computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;                /* presumably the fill level of buf -- confirm in simd.c */
+	sph_u32 state[32];         /* 32 sph_u32 words; presumably the intermediate values */
+	sph_u32 count_low, count_high;  /* presumably low/high words of an input counter -- confirm */
+#endif
+} sph_simd_big_context;
+
+/**
+ * Type for a SIMD-224 context (identical to the common "small" context).
+ */
+typedef sph_simd_small_context sph_simd224_context;
+
+/**
+ * Type for a SIMD-256 context (identical to the common "small" context).
+ */
+typedef sph_simd_small_context sph_simd256_context;
+
+/**
+ * Type for a SIMD-384 context (identical to the common "big" context).
+ */
+typedef sph_simd_big_context sph_simd384_context;
+
+/**
+ * Type for a SIMD-512 context (identical to the common "big" context).
+ */
+typedef sph_simd_big_context sph_simd512_context;
+
+/**
+ * Initialize an SIMD-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the SIMD-224 context (pointer to a
+ *             <code>sph_simd224_context</code>)
+ */
+void sph_simd224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SIMD-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_simd224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SIMD-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SIMD-224 context
+ * @param dst   the destination buffer
+ */
+void sph_simd224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SIMD-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_simd224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an SIMD-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the SIMD-256 context (pointer to a
+ *             <code>sph_simd256_context</code>)
+ */
+void sph_simd256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SIMD-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_simd256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SIMD-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SIMD-256 context
+ * @param dst   the destination buffer
+ */
+void sph_simd256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SIMD-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_simd256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an SIMD-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the SIMD-384 context (pointer to a
+ *             <code>sph_simd384_context</code>)
+ */
+void sph_simd384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SIMD-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_simd384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SIMD-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SIMD-384 context
+ * @param dst   the destination buffer
+ */
+void sph_simd384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SIMD-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_simd384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an SIMD-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the SIMD-512 context (pointer to a
+ *             <code>sph_simd512_context</code>)
+ */
+void sph_simd512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SIMD-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_simd512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SIMD-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SIMD-512 context
+ * @param dst   the destination buffer
+ */
+void sph_simd512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SIMD-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_simd512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_skein.h b/sha3/sph_skein.h
new file mode 100644
index 0000000..bddbc86
--- /dev/null
+++ b/sha3/sph_skein.h
@@ -0,0 +1,298 @@
+/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */
+/**
+ * Skein interface. The Skein specification defines three main
+ * functions, called Skein-256, Skein-512 and Skein-1024, which can be
+ * further parameterized with an output length. For the SHA-3
+ * competition, Skein-512 is used for output sizes of 224, 256, 384 and
+ * 512 bits; this is what this code implements. Thus, we hereafter call
+ * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein
+ * specification defines as Skein-512-224, Skein-512-256, Skein-512-384
+ * and Skein-512-512, respectively.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_skein.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_SKEIN_H__
+#define SPH_SKEIN_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for Skein-224.
+ */
+#define SPH_SIZE_skein224   224
+
+/**
+ * Output size (in bits) for Skein-256.
+ */
+#define SPH_SIZE_skein256   256
+
+/**
+ * Output size (in bits) for Skein-384.
+ */
+#define SPH_SIZE_skein384   384
+
+/**
+ * Output size (in bits) for Skein-512.
+ */
+#define SPH_SIZE_skein512   512
+
+/**
+ * This structure is a context for Skein computations (with a 224-, 256-,
+ * 384- or 512-bit output): it contains the intermediate values and some
+ * data from the last entered block. Once a Skein computation has been
+ * performed, the context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running Skein computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* presumably the fill level of buf -- confirm in skein.c */
+	sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;  /* eight sph_u64 words; presumably the chaining state */
+	sph_u64 bcount;           /* presumably a count of processed blocks -- confirm in skein.c */
+#endif
+} sph_skein_big_context;
+
+/**
+ * Type for a Skein-224 context (identical to the common "big" context).
+ */
+typedef sph_skein_big_context sph_skein224_context;
+
+/**
+ * Type for a Skein-256 context (identical to the common "big" context).
+ */
+typedef sph_skein_big_context sph_skein256_context;
+
+/**
+ * Type for a Skein-384 context (identical to the common "big" context).
+ */
+typedef sph_skein_big_context sph_skein384_context;
+
+/**
+ * Type for a Skein-512 context (identical to the common "big" context).
+ */
+typedef sph_skein_big_context sph_skein512_context;
+
+/**
+ * Initialize a Skein-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Skein-224 context (pointer to a
+ *             <code>sph_skein224_context</code>)
+ */
+void sph_skein224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Skein-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_skein224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Skein-224 context
+ * @param dst   the destination buffer
+ */
+void sph_skein224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Skein-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_skein224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Skein-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the Skein-256 context (pointer to a
+ *             <code>sph_skein256_context</code>)
+ */
+void sph_skein256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Skein-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_skein256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Skein-256 context
+ * @param dst   the destination buffer
+ */
+void sph_skein256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Skein-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_skein256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Skein-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Skein-384 context (pointer to a
+ *             <code>sph_skein384_context</code>)
+ */
+void sph_skein384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Skein-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_skein384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Skein-384 context
+ * @param dst   the destination buffer
+ */
+void sph_skein384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Skein-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_skein384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Skein-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Skein-512 context (pointer to a
+ *             <code>sph_skein512_context</code>)
+ */
+void sph_skein512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Skein-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_skein512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Skein-512 context
+ * @param dst   the destination buffer
+ */
+void sph_skein512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Skein-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_skein512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_types.h b/sha3/sph_types.h
new file mode 100644
index 0000000..7295b0b
--- /dev/null
+++ b/sha3/sph_types.h
@@ -0,0 +1,1976 @@
+/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */
+/**
+ * Basic type definitions.
+ *
+ * This header file defines the generic integer types that will be used
+ * for the implementation of hash functions; it also contains helper
+ * functions which encode and decode multi-byte integer values, using
+ * either little-endian or big-endian conventions.
+ *
+ * This file contains a compile-time test on the size of a byte
+ * (the <code>unsigned char</code> C type). If bytes are not octets,
+ * i.e. if they do not have a size of exactly 8 bits, then compilation
+ * is aborted. Architectures where bytes are not octets are relatively
+ * rare, even in the embedded devices market. We forbid non-octet bytes
+ * because there is no clear convention on how octet streams are encoded
+ * on such systems.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_types.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_TYPES_H__
+#define SPH_TYPES_H__
+
+#include <limits.h>
+
+/*
+ * All our I/O functions are defined over octet streams. We do not know
+ * how to handle input data if bytes are not octets.
+ */
+#if CHAR_BIT != 8
+#error This code requires 8-bit bytes
+#endif
+
+/* ============= BEGIN documentation block for Doxygen ============ */
+
+#ifdef DOXYGEN_IGNORE
+
+/** @mainpage sphlib C code documentation
+ *
+ * @section overview Overview
+ *
+ * <code>sphlib</code> is a library which contains implementations of
+ * various cryptographic hash functions. These pages have been generated
+ * with <a href="http://www.doxygen.org/index.html">doxygen</a> and
+ * document the API for the C implementations.
+ *
+ * The API is described in appropriate header files, which are available
+ * in the "Files" section. Each hash function family has its own header,
+ * whose name begins with <code>"sph_"</code> and contains the family
+ * name. For instance, the API for the RIPEMD hash functions is available
+ * in the header file <code>sph_ripemd.h</code>.
+ *
+ * @section principles API structure and conventions
+ *
+ * @subsection io Input/output conventions
+ *
+ * In all generality, hash functions operate over strings of bits.
+ * Individual bits are rarely encountered in C programming or actual
+ * communication protocols; most protocols converge on the ubiquitous
+ * "octet" which is a group of eight bits. Data is thus expressed as a
+ * stream of octets. The C programming language contains the notion of a
+ * "byte", which is a data unit managed under the type <code>"unsigned
+ * char"</code>. The C standard prescribes that a byte should hold at
+ * least eight bits, but possibly more. Most modern architectures, even
+ * in the embedded world, feature eight-bit bytes, i.e. map bytes to
+ * octets.
+ *
+ * Nevertheless, for some of the implemented hash functions, an extra
+ * API has been added, which allows the input of arbitrary sequences of
+ * bits: when the computation is about to be closed, 1 to 7 extra bits
+ * can be added. The functions for which this API is implemented include
+ * the SHA-2 functions and all SHA-3 candidates.
+ *
+ * <code>sphlib</code> defines hash functions which may hash octet streams,
+ * i.e. streams of bits where the number of bits is a multiple of eight.
+ * The data input functions in the <code>sphlib</code> API expect data
+ * as anonymous pointers (<code>"const void *"</code>) with a length
+ * (of type <code>"size_t"</code>) which gives the input data chunk length
+ * in bytes. A byte is assumed to be an octet; the <code>sph_types.h</code>
+ * header contains a compile-time test which prevents compilation on
+ * architectures where this property is not met.
+ *
+ * The hash function output is also converted into bytes. All currently
+ * implemented hash functions have an output width which is a multiple of
+ * eight, and this is likely to remain true for new designs.
+ *
+ * Most hash functions internally convert input data into 32-bit or 64-bit
+ * words, using either little-endian or big-endian conversion. The hash
+ * output also often consists of such words, which are encoded into output
+ * bytes with a similar endianness convention. Some hash functions have
+ * been only loosely specified on that subject; when necessary,
+ * <code>sphlib</code> has been tested against published "reference"
+ * implementations in order to use the same conventions.
+ *
+ * @subsection shortname Function short name
+ *
+ * Each implemented hash function has a "short name" which is used
+ * internally to derive the identifiers for the functions and context
+ * structures which the function uses. For instance, MD5 has the short
+ * name <code>"md5"</code>. Short names are listed in the next section,
+ * for the implemented hash functions. In subsequent sections, the
+ * short name will be assumed to be <code>"XXX"</code>: replace with the
+ * actual hash function name to get the C identifier.
+ *
+ * Note: some functions within the same family share the same core
+ * elements, such as update function or context structure. Correspondingly,
+ * some of the defined types or functions may actually be macros which
+ * transparently evaluate to another type or function name.
+ *
+ * @subsection context Context structure
+ *
+ * Each implemented hash function has its own context structure, available
+ * under the type name <code>"sph_XXX_context"</code> for the hash function
+ * with short name <code>"XXX"</code>. This structure holds all needed
+ * state for a running hash computation.
+ *
+ * The contents of these structures are meant to be opaque, and private
+ * to the implementation. However, these contents are specified in the
+ * header files so that application code which uses <code>sphlib</code>
+ * may access the size of those structures.
+ *
+ * The caller is responsible for allocating the context structure,
+ * whether by dynamic allocation (<code>malloc()</code> or equivalent),
+ * static allocation (a global permanent variable), as an automatic
+ * variable ("on the stack"), or by any other means which ensures proper
+ * structure alignment. <code>sphlib</code> code performs no dynamic
+ * allocation by itself.
+ *
+ * The context must be initialized before use, using the
+ * <code>sph_XXX_init()</code> function. This function sets the context
+ * state to proper initial values for hashing.
+ *
+ * Since all state data is contained within the context structure,
+ * <code>sphlib</code> is thread-safe and reentrant: several hash
+ * computations may be performed in parallel, provided that they do not
+ * operate on the same context. Moreover, a running computation can be
+ * cloned by copying the context (with a simple <code>memcpy()</code>):
+ * the context and its clone are then independent and may be updated
+ * with new data and/or closed without interfering with each other.
+ * Similarly, a context structure can be moved in memory at will:
+ * context structures contain no pointer, in particular no pointer to
+ * themselves.
+ *
+ * @subsection dataio Data input
+ *
+ * Hashed data is input with the <code>sph_XXX()</code> function, which
+ * takes as parameters a pointer to the context, a pointer to the data
+ * to hash, and the number of data bytes to hash. The context is updated
+ * with the new data.
+ *
+ * Data can be input in one or several calls, with arbitrary input lengths.
+ * However, it is best, performance wise, to input data by relatively big
+ * chunks (say a few kilobytes), because this allows <code>sphlib</code> to
+ * optimize things and avoid internal copying.
+ *
+ * When all data has been input, the context can be closed with
+ * <code>sph_XXX_close()</code>. The hash output is computed and written
+ * into the provided buffer. The caller must take care to provide a
+ * buffer of appropriate length; e.g., when using SHA-1, the output is
+ * a 20-byte word, therefore the output buffer must be at least 20-byte
+ * long.
+ *
+ * For some hash functions, the <code>sph_XXX_addbits_and_close()</code>
+ * function can be used instead of <code>sph_XXX_close()</code>. This
+ * function can take a few extra <strong>bits</strong> to be added at
+ * the end of the input message. This allows hashing messages with a
+ * bit length which is not a multiple of 8. The extra bits are provided
+ * as an unsigned integer value, and a bit count. The bit count must be
+ * between 0 and 7, inclusive. The extra bits are provided as bits 7 to
+ * 0 (bits of numerical value 128, 64, 32... downto 1), in that order.
+ * For instance, to add three bits of value 1, 1 and 0, the unsigned
+ * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count
+ * will be 3.
+ *
+ * The <code>SPH_SIZE_XXX</code> macro is defined for each hash function;
+ * it evaluates to the function output size, expressed in bits. For instance,
+ * <code>SPH_SIZE_sha1</code> evaluates to <code>160</code>.
+ *
+ * When closed, the context is automatically reinitialized and can be
+ * immediately used for another computation. It is not necessary to call
+ * <code>sph_XXX_init()</code> after a close. Note that
+ * <code>sph_XXX_init()</code> can still be called to "reset" a context,
+ * i.e. forget previously input data, and get back to the initial state.
+ *
+ * @subsection alignment Data alignment
+ *
+ * "Alignment" is a property of data, which is said to be "properly
+ * aligned" when its emplacement in memory is such that the data can
+ * be optimally read by full words. This depends on the type of access;
+ * basically, some hash functions will read data by 32-bit or 64-bit
+ * words. <code>sphlib</code> does not mandate such alignment for input
+ * data, but using aligned data can substantially improve performance.
+ *
+ * As a rule, it is best to input data by chunks whose length (in bytes)
+ * is a multiple of eight, and which begins at "generally aligned"
+ * addresses, such as the base address returned by a call to
+ * <code>malloc()</code>.
+ *
+ * @section functions Implemented functions
+ *
+ * We give here the list of implemented functions. They are grouped by
+ * family; to each family corresponds a specific header file. Each
+ * individual function has its associated "short name". Please refer to
+ * the documentation for that header file to get details on the hash
+ * function denomination and provenance.
+ *
+ * Note: the functions marked with a '(64)' in the list below are
+ * available only if the C compiler provides an integer type of length
+ * 64 bits or more. Such a type is mandatory in the latest C standard
+ * (ISO 9899:1999, aka "C99") and is present in several older compilers
+ * as well, so chances are that such a type is available.
+ *
+ * - HAVAL family: file <code>sph_haval.h</code>
+ *   - HAVAL-128/3 (128-bit, 3 passes): short name: <code>haval128_3</code>
+ *   - HAVAL-128/4 (128-bit, 4 passes): short name: <code>haval128_4</code>
+ *   - HAVAL-128/5 (128-bit, 5 passes): short name: <code>haval128_5</code>
+ *   - HAVAL-160/3 (160-bit, 3 passes): short name: <code>haval160_3</code>
+ *   - HAVAL-160/4 (160-bit, 4 passes): short name: <code>haval160_4</code>
+ *   - HAVAL-160/5 (160-bit, 5 passes): short name: <code>haval160_5</code>
+ *   - HAVAL-192/3 (192-bit, 3 passes): short name: <code>haval192_3</code>
+ *   - HAVAL-192/4 (192-bit, 4 passes): short name: <code>haval192_4</code>
+ *   - HAVAL-192/5 (192-bit, 5 passes): short name: <code>haval192_5</code>
+ *   - HAVAL-224/3 (224-bit, 3 passes): short name: <code>haval224_3</code>
+ *   - HAVAL-224/4 (224-bit, 4 passes): short name: <code>haval224_4</code>
+ *   - HAVAL-224/5 (224-bit, 5 passes): short name: <code>haval224_5</code>
+ *   - HAVAL-256/3 (256-bit, 3 passes): short name: <code>haval256_3</code>
+ *   - HAVAL-256/4 (256-bit, 4 passes): short name: <code>haval256_4</code>
+ *   - HAVAL-256/5 (256-bit, 5 passes): short name: <code>haval256_5</code>
+ * - MD2: file <code>sph_md2.h</code>, short name: <code>md2</code>
+ * - MD4: file <code>sph_md4.h</code>, short name: <code>md4</code>
+ * - MD5: file <code>sph_md5.h</code>, short name: <code>md5</code>
+ * - PANAMA: file <code>sph_panama.h</code>, short name: <code>panama</code>
+ * - RadioGatun family: file <code>sph_radiogatun.h</code>
+ *   - RadioGatun[32]: short name: <code>radiogatun32</code>
+ *   - RadioGatun[64]: short name: <code>radiogatun64</code> (64)
+ * - RIPEMD family: file <code>sph_ripemd.h</code>
+ *   - RIPEMD: short name: <code>ripemd</code>
+ *   - RIPEMD-128: short name: <code>ripemd128</code>
+ *   - RIPEMD-160: short name: <code>ripemd160</code>
+ * - SHA-0: file <code>sph_sha0.h</code>, short name: <code>sha0</code>
+ * - SHA-1: file <code>sph_sha1.h</code>, short name: <code>sha1</code>
+ * - SHA-2 family: file <code>sph_sha2.h</code>
+ *   - SHA-224: short name: <code>sha224</code>
+ *   - SHA-256: short name: <code>sha256</code>
+ *   - SHA-384: short name: <code>sha384</code> (64)
+ *   - SHA-512: short name: <code>sha512</code> (64)
+ * - Tiger family: file <code>sph_tiger.h</code>
+ *   - Tiger: short name: <code>tiger</code> (64)
+ *   - Tiger2: short name: <code>tiger2</code> (64)
+ * - WHIRLPOOL family: file <code>sph_whirlpool.h</code>
+ *   - WHIRLPOOL-0: short name: <code>whirlpool0</code> (64)
+ *   - WHIRLPOOL-1: short name: <code>whirlpool1</code> (64)
+ *   - WHIRLPOOL: short name: <code>whirlpool</code> (64)
+ *
+ * The fourteen second-round SHA-3 candidates are also implemented;
+ * when applicable, the implementations follow the "final" specifications
+ * as published for the third round of the SHA-3 competition (BLAKE,
+ * Groestl, JH, Keccak and Skein have been tweaked for third round).
+ *
+ * - BLAKE family: file <code>sph_blake.h</code>
+ *   - BLAKE-224: short name: <code>blake224</code>
+ *   - BLAKE-256: short name: <code>blake256</code>
+ *   - BLAKE-384: short name: <code>blake384</code>
+ *   - BLAKE-512: short name: <code>blake512</code>
+ * - BMW (Blue Midnight Wish) family: file <code>sph_bmw.h</code>
+ *   - BMW-224: short name: <code>bmw224</code>
+ *   - BMW-256: short name: <code>bmw256</code>
+ *   - BMW-384: short name: <code>bmw384</code> (64)
+ *   - BMW-512: short name: <code>bmw512</code> (64)
+ * - CubeHash family: file <code>sph_cubehash.h</code> (specified as
+ *   CubeHash16/32 in the CubeHash specification)
+ *   - CubeHash-224: short name: <code>cubehash224</code>
+ *   - CubeHash-256: short name: <code>cubehash256</code>
+ *   - CubeHash-384: short name: <code>cubehash384</code>
+ *   - CubeHash-512: short name: <code>cubehash512</code>
+ * - ECHO family: file <code>sph_echo.h</code>
+ *   - ECHO-224: short name: <code>echo224</code>
+ *   - ECHO-256: short name: <code>echo256</code>
+ *   - ECHO-384: short name: <code>echo384</code>
+ *   - ECHO-512: short name: <code>echo512</code>
+ * - Fugue family: file <code>sph_fugue.h</code>
+ *   - Fugue-224: short name: <code>fugue224</code>
+ *   - Fugue-256: short name: <code>fugue256</code>
+ *   - Fugue-384: short name: <code>fugue384</code>
+ *   - Fugue-512: short name: <code>fugue512</code>
+ * - Groestl family: file <code>sph_groestl.h</code>
+ *   - Groestl-224: short name: <code>groestl224</code>
+ *   - Groestl-256: short name: <code>groestl256</code>
+ *   - Groestl-384: short name: <code>groestl384</code>
+ *   - Groestl-512: short name: <code>groestl512</code>
+ * - Hamsi family: file <code>sph_hamsi.h</code>
+ *   - Hamsi-224: short name: <code>hamsi224</code>
+ *   - Hamsi-256: short name: <code>hamsi256</code>
+ *   - Hamsi-384: short name: <code>hamsi384</code>
+ *   - Hamsi-512: short name: <code>hamsi512</code>
+ * - JH family: file <code>sph_jh.h</code>
+ *   - JH-224: short name: <code>jh224</code>
+ *   - JH-256: short name: <code>jh256</code>
+ *   - JH-384: short name: <code>jh384</code>
+ *   - JH-512: short name: <code>jh512</code>
+ * - Keccak family: file <code>sph_keccak.h</code>
+ *   - Keccak-224: short name: <code>keccak224</code>
+ *   - Keccak-256: short name: <code>keccak256</code>
+ *   - Keccak-384: short name: <code>keccak384</code>
+ *   - Keccak-512: short name: <code>keccak512</code>
+ * - Luffa family: file <code>sph_luffa.h</code>
+ *   - Luffa-224: short name: <code>luffa224</code>
+ *   - Luffa-256: short name: <code>luffa256</code>
+ *   - Luffa-384: short name: <code>luffa384</code>
+ *   - Luffa-512: short name: <code>luffa512</code>
+ * - Shabal family: file <code>sph_shabal.h</code>
+ *   - Shabal-192: short name: <code>shabal192</code>
+ *   - Shabal-224: short name: <code>shabal224</code>
+ *   - Shabal-256: short name: <code>shabal256</code>
+ *   - Shabal-384: short name: <code>shabal384</code>
+ *   - Shabal-512: short name: <code>shabal512</code>
+ * - SHAvite-3 family: file <code>sph_shavite.h</code>
+ *   - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"):
+ *     short name: <code>shavite224</code>
+ *   - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"):
+ *     short name: <code>shavite256</code>
+ *   - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"):
+ *     short name: <code>shavite384</code>
+ *   - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"):
+ *     short name: <code>shavite512</code>
+ * - SIMD family: file <code>sph_simd.h</code>
+ *   - SIMD-224: short name: <code>simd224</code>
+ *   - SIMD-256: short name: <code>simd256</code>
+ *   - SIMD-384: short name: <code>simd384</code>
+ *   - SIMD-512: short name: <code>simd512</code>
+ * - Skein family: file <code>sph_skein.h</code>
+ *   - Skein-224 (nominally specified as Skein-512-224): short name:
+ *     <code>skein224</code> (64)
+ *   - Skein-256 (nominally specified as Skein-512-256): short name:
+ *     <code>skein256</code> (64)
+ *   - Skein-384 (nominally specified as Skein-512-384): short name:
+ *     <code>skein384</code> (64)
+ *   - Skein-512 (nominally specified as Skein-512-512): short name:
+ *     <code>skein512</code> (64)
+ *
+ * For the second-round SHA-3 candidates, the functions are as specified
+ * for round 2, i.e. with the "tweaks" that some candidates added
+ * between round 1 and round 2. Also, some of the submitted packages for
+ * round 2 contained errors, in the specification, reference code, or
+ * both. <code>sphlib</code> implements the corrected versions.
+ */
+
+/** @hideinitializer
+ * Unsigned integer type whose length is at least 32 bits; on most
+ * architectures, it will have a width of exactly 32 bits. Unsigned C
+ * types implement arithmetics modulo a power of 2; use the
+ * <code>SPH_T32()</code> macro to ensure that the value is truncated
+ * to exactly 32 bits. Unless otherwise specified, all macros and
+ * functions which accept <code>sph_u32</code> values assume that these
+ * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures
+ * where <code>sph_u32</code> is larger than that.
+ */
+typedef __arch_dependant__ sph_u32;
+
+/** @hideinitializer
+ * Signed integer type corresponding to <code>sph_u32</code>; it has
+ * width 32 bits or more.
+ */
+typedef __arch_dependant__ sph_s32;
+
+/** @hideinitializer
+ * Unsigned integer type whose length is at least 64 bits; on most
+ * architectures which feature such a type, it will have a width of
+ * exactly 64 bits. C99-compliant platforms will have this type; it
+ * is also defined when the GNU compiler (gcc) is used, and on
+ * platforms where <code>unsigned long</code> is large enough. If this
+ * type is not available, then some hash functions which depend on
+ * a 64-bit type will not be available (most notably SHA-384, SHA-512,
+ * Tiger and WHIRLPOOL).
+ */
+typedef __arch_dependant__ sph_u64;
+
+/** @hideinitializer
+ * Signed integer type corresponding to <code>sph_u64</code>; it has
+ * width 64 bits or more.
+ */
+typedef __arch_dependant__ sph_s64;
+
+/**
+ * This macro expands the token <code>x</code> into a suitable
+ * constant expression of type <code>sph_u32</code>. Depending on
+ * how this type is defined, a suffix such as <code>UL</code> may
+ * be appended to the argument.
+ *
+ * @param x   the token to expand into a suitable constant expression
+ */
+#define SPH_C32(x)
+
+/**
+ * Truncate a 32-bit value to exactly 32 bits. On most systems, this is
+ * a no-op, recognized as such by the compiler.
+ *
+ * @param x   the value to truncate (of type <code>sph_u32</code>)
+ */
+#define SPH_T32(x)
+
+/**
+ * Rotate a 32-bit value by a number of bits to the left. The rotate
+ * count must reside between 1 and 31. This macro assumes that its
+ * first argument fits in 32 bits (no extra bit allowed on machines where
+ * <code>sph_u32</code> is wider); both arguments may be evaluated
+ * several times.
+ *
+ * @param x   the value to rotate (of type <code>sph_u32</code>)
+ * @param n   the rotation count (between 1 and 31, inclusive)
+ */
+#define SPH_ROTL32(x, n)
+
+/**
+ * Rotate a 32-bit value by a number of bits to the right. The rotate
+ * count must reside between 1 and 31. This macro assumes that its
+ * first argument fits in 32 bits (no extra bit allowed on machines where
+ * <code>sph_u32</code> is wider); both arguments may be evaluated
+ * several times.
+ *
+ * @param x   the value to rotate (of type <code>sph_u32</code>)
+ * @param n   the rotation count (between 1 and 31, inclusive)
+ */
+#define SPH_ROTR32(x, n)
+
+/**
+ * This macro is defined on systems for which a 64-bit type has been
+ * detected, and is used for <code>sph_u64</code>.
+ */
+#define SPH_64
+
+/**
+ * This macro is defined on systems for which the "native" integer size is
+ * 64 bits (64-bit values fit in one register).
+ */
+#define SPH_64_TRUE
+
+/**
+ * This macro expands the token <code>x</code> into a suitable
+ * constant expression of type <code>sph_u64</code>. Depending on
+ * how this type is defined, a suffix such as <code>ULL</code> may
+ * be appended to the argument. This macro is defined only if a
+ * 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param x   the token to expand into a suitable constant expression
+ */
+#define SPH_C64(x)
+
+/**
+ * Truncate a 64-bit value to exactly 64 bits. On most systems, this is
+ * a no-op, recognized as such by the compiler. This macro is defined only
+ * if a 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to truncate (of type <code>sph_u64</code>)
+ */
+#define SPH_T64(x)
+
+/**
+ * Rotate a 64-bit value by a number of bits to the left. The rotate
+ * count must reside between 1 and 63. This macro assumes that its
+ * first argument fits in 64 bits (no extra bit allowed on machines where
+ * <code>sph_u64</code> is wider); both arguments may be evaluated
+ * several times. This macro is defined only if a 64-bit type was detected
+ * and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to rotate (of type <code>sph_u64</code>)
+ * @param n   the rotation count (between 1 and 63, inclusive)
+ */
+#define SPH_ROTL64(x, n)
+
+/**
+ * Rotate a 64-bit value by a number of bits to the right. The rotate
+ * count must reside between 1 and 63. This macro assumes that its
+ * first argument fits in 64 bits (no extra bit allowed on machines where
+ * <code>sph_u64</code> is wider); both arguments may be evaluated
+ * several times. This macro is defined only if a 64-bit type was detected
+ * and used for <code>sph_u64</code>.
+ *
+ * @param x   the value to rotate (of type <code>sph_u64</code>)
+ * @param n   the rotation count (between 1 and 63, inclusive)
+ */
+#define SPH_ROTR64(x, n)
+
+/**
+ * This macro evaluates to <code>inline</code> or an equivalent construction,
+ * if available on the compilation platform, or to nothing otherwise. This
+ * is used to declare inline functions, for which the compiler should
+ * endeavour to include the code directly in the caller. Inline functions
+ * are typically defined in header files as replacement for macros.
+ */
+#define SPH_INLINE
+
+/**
+ * This macro is defined if the platform has been detected as using
+ * little-endian convention. This implies that the <code>sph_u32</code>
+ * type (and the <code>sph_u64</code> type also, if it is defined) has
+ * an exact width (i.e. exactly 32-bit, respectively 64-bit).
+ */
+#define SPH_LITTLE_ENDIAN
+
+/**
+ * This macro is defined if the platform has been detected as using
+ * big-endian convention. This implies that the <code>sph_u32</code>
+ * type (and the <code>sph_u64</code> type also, if it is defined) has
+ * an exact width (i.e. exactly 32-bit, respectively 64-bit).
+ */
+#define SPH_BIG_ENDIAN
+
+/**
+ * This macro is defined if 32-bit words (and 64-bit words, if defined)
+ * can be read from and written to memory efficiently in little-endian
+ * convention. This is the case for little-endian platforms, and also
+ * for the big-endian platforms which have special little-endian access
+ * opcodes (e.g. Ultrasparc).
+ */
+#define SPH_LITTLE_FAST
+
+/**
+ * This macro is defined if 32-bit words (and 64-bit words, if defined)
+ * can be read from and written to memory efficiently in big-endian
+ * convention. This is the case for big-endian platforms, and also
+ * for the little-endian platforms which have special big-endian access
+ * opcodes.
+ */
+#define SPH_BIG_FAST
+
+/**
+ * On some platforms, this macro is defined to an unsigned integer type
+ * into which pointer values may be cast. The resulting value can then
+ * be tested for being a multiple of 2, 4 or 8, indicating an aligned
+ * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses.
+ */
+#define SPH_UPTR
+
+/**
+ * When defined, this macro indicates that unaligned memory accesses
+ * are possible with only a minor penalty, and thus should be preferred
+ * over strategies which first copy data to an aligned buffer.
+ */
+#define SPH_UNALIGNED
+
+/**
+ * Byte-swap a 32-bit word (i.e. <code>0x12345678</code> becomes
+ * <code>0x78563412</code>). This is an inline function which resorts
+ * to inline assembly on some platforms, for better performance.
+ *
+ * @param x   the 32-bit value to byte-swap
+ * @return  the byte-swapped value
+ */
+static inline sph_u32 sph_bswap32(sph_u32 x);
+
+/**
+ * Byte-swap a 64-bit word. This is an inline function which resorts
+ * to inline assembly on some platforms, for better performance. This
+ * function is defined only if a suitable 64-bit type was found for
+ * <code>sph_u64</code>
+ *
+ * @param x   the 64-bit value to byte-swap
+ * @return  the byte-swapped value
+ */
+static inline sph_u64 sph_bswap64(sph_u64 x);
+
+/**
+ * Decode a 16-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline unsigned sph_dec16le(const void *src);
+
+/**
+ * Encode a 16-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc16le(void *dst, unsigned val);
+
+/**
+ * Decode a 16-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline unsigned sph_dec16be(const void *src);
+
+/**
+ * Encode a 16-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc16be(void *dst, unsigned val);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32le(const void *src);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec32le()</code> function.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32le_aligned(const void *src);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32le(void *dst, sph_u32 val);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc32le()</code> function.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32le_aligned(void *dst, sph_u32 val);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32be(const void *src);
+
+/**
+ * Decode a 32-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec32be()</code> function.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u32 sph_dec32be_aligned(const void *src);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first).
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32be(void *dst, sph_u32 val);
+
+/**
+ * Encode a 32-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc32be()</code> function.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc32be_aligned(void *dst, sph_u32 val);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64le(const void *src);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec64le()</code> function. This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64le_aligned(const void *src);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64le(void *dst, sph_u64 val);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in little-endian convention
+ * (least significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc64le()</code> function. This function is defined
+ * only if a suitable 64-bit type was detected and used for
+ * <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64le_aligned(void *dst, sph_u64 val);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64be(const void *src);
+
+/**
+ * Decode a 64-bit unsigned value from memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * <code>sph_dec64be()</code> function. This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param src   the source address
+ * @return  the decoded value
+ */
+static inline sph_u64 sph_dec64be_aligned(const void *src);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64be(void *dst, sph_u64 val);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic <code>sph_enc64be()</code> function. This function is defined
+ * only if a suitable 64-bit type was detected and used for
+ * <code>sph_u64</code>.
+ *
+ * @param dst   the destination buffer
+ * @param val   the value to encode
+ */
+static inline void sph_enc64be_aligned(void *dst, sph_u64 val);
+
+#endif
+
+/* ============== END documentation block for Doxygen ============= */
+
+#ifndef DOXYGEN_IGNORE
+
+/*
+ * We want to define the types "sph_u32" and "sph_u64" which hold
+ * unsigned values of at least, respectively, 32 and 64 bits. These
+ * tests should select appropriate types for most platforms. The
+ * macro "SPH_64" is defined if a 64-bit type is supported.
+ */
+
+#undef SPH_64
+#undef SPH_64_TRUE
+
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+
+/*
+ * On C99 implementations, we can use <stdint.h> to get exact-width
+ * 32-bit and 64-bit types, if any, or otherwise use the "fast" types
+ * of at least that width (which must exist, for C99 conformance).
+ */
+
+#include <stdint.h>
+
+#ifdef UINT32_MAX
+typedef uint32_t sph_u32;
+typedef int32_t sph_s32;
+#else /* no exact-width 32-bit type */
+typedef uint_fast32_t sph_u32;
+typedef int_fast32_t sph_s32;
+#endif /* UINT32_MAX */
+#if !SPH_NO_64
+#ifdef UINT64_MAX
+typedef uint64_t sph_u64;
+typedef int64_t sph_s64;
+#else /* no exact-width 64-bit type */
+typedef uint_fast64_t sph_u64;
+typedef int_fast64_t sph_s64;
+#endif /* UINT64_MAX */
+#endif /* !SPH_NO_64 */
+
+#define SPH_C32(x)    ((sph_u32)(x))
+#if !SPH_NO_64
+#define SPH_C64(x)    ((sph_u64)(x))
+#define SPH_64  1
+#endif /* !SPH_NO_64 */
+
+#else /* pre-C99 compiler */
+
+/*
+ * On non-C99 systems, we use "unsigned int" if it is wide enough,
+ * "unsigned long" otherwise. This supports all "reasonable" architectures.
+ * We have to be cautious: pre-C99 preprocessors handle constants
+ * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
+ */
+
+#if ((UINT_MAX >> 11) >> 11) >= 0x3FF /* NOTE(review): needs <limits.h>; include not visible here */
+
+typedef unsigned int sph_u32;
+typedef int sph_s32;
+
+#define SPH_C32(x)    ((sph_u32)(x ## U))
+
+#else /* "unsigned int" is narrower than 32 bits */
+
+typedef unsigned long sph_u32;
+typedef long sph_s32;
+
+#define SPH_C32(x)    ((sph_u32)(x ## UL))
+
+#endif /* UINT_MAX width test */
+
+#if !SPH_NO_64
+
+/*
+ * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
+ * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
+ * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
+ * test whether "unsigned long long" is available; we also know that
+ * gcc features this type, even if the libc headers do not know it.
+ */
+
+#if ((ULONG_MAX >> 31) >> 31) >= 3
+
+typedef unsigned long sph_u64;
+typedef long sph_s64;
+
+#define SPH_C64(x)    ((sph_u64)(x ## UL))
+
+#define SPH_64  1
+
+#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
+
+typedef unsigned long long sph_u64;
+typedef long long sph_s64;
+
+#define SPH_C64(x)    ((sph_u64)(x ## ULL))
+
+#define SPH_64  1
+
+#else /* neither "unsigned long" nor "unsigned long long" qualifies */
+
+/*
+ * No 64-bit type: sph_u64 and SPH_64 stay undefined.
+ */
+
+#endif /* 64-bit type selection */
+
+#endif /* !SPH_NO_64 */
+
+#endif /* C99 or pre-C99 */
+
+/*
+ * A platform is a "true" 64-bit architecture when "unsigned long" is at
+ * least 64 bits wide. Visual C on amd64 (_M_X64) also qualifies, even
+ * though its "long" type is limited to 32 bits.
+ */
+#if SPH_64 && (defined(_M_X64) || ((ULONG_MAX >> 31) >> 31) >= 3)
+#define SPH_64_TRUE   1
+#endif
+
+/*
+ * Implementation note: rotations are spelled as a pair of shifts OR-ed
+ * together. Recent versions of gcc recognize these expressions and use
+ * the processor's rotation opcodes, when appropriate.
+ */
+
+#define SPH_T32(x)    (SPH_C32(0xFFFFFFFF) & (x))
+#define SPH_ROTL32(x, n)   SPH_T32(((x) >> (32 - (n))) | ((x) << (n)))
+#define SPH_ROTR32(x, n)   SPH_T32(((x) >> (n)) | ((x) << (32 - (n))))
+
+#if SPH_64
+
+#define SPH_T64(x)    (SPH_C64(0xFFFFFFFFFFFFFFFF) & (x))
+#define SPH_ROTL64(x, n)   SPH_T64(((x) >> (64 - (n))) | ((x) << (n)))
+#define SPH_ROTR64(x, n)   SPH_T64(((x) >> (n)) | ((x) << (64 - (n))))
+
+#endif
+
+#ifndef DOXYGEN_IGNORE /* NOTE(review): already inside an identical #ifndef opened earlier */
+/*
+ * Define SPH_INLINE to be an "inline" qualifier, if available. We define
+ * some small macro-like functions which benefit greatly from being inlined.
+ */
+#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__
+#define SPH_INLINE inline
+#elif defined _MSC_VER /* Visual C spelling of the keyword */
+#define SPH_INLINE __inline
+#else /* no known inline qualifier: expand to nothing */
+#define SPH_INLINE
+#endif /* inline qualifier selection */
+#endif /* DOXYGEN_IGNORE */
+
+/*
+ * We define some macros which qualify the architecture. These macros
+ * may be explicitly set externally (e.g. as compiler parameters). The
+ * code below sets those macros if they are not already defined.
+ *
+ * Most macros are boolean, thus evaluate to either zero or non-zero.
+ * The SPH_UPTR macro is special, in that it evaluates to a C type,
+ * or is not defined.
+ *
+ * SPH_UPTR             if defined: unsigned type to cast pointers into
+ *
+ * SPH_UNALIGNED        non-zero if unaligned accesses are efficient
+ * SPH_LITTLE_ENDIAN    non-zero if architecture is known to be little-endian
+ * SPH_BIG_ENDIAN       non-zero if architecture is known to be big-endian
+ * SPH_LITTLE_FAST      non-zero if little-endian decoding is fast
+ * SPH_BIG_FAST         non-zero if big-endian decoding is fast
+ *
+ * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit
+ * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN
+ * _must_ be non-zero in those situations. The 32-bit and 64-bit types
+ * _must_ also have an exact width.
+ *
+ * SPH_SPARCV9_GCC_32   UltraSPARC-compatible with gcc, 32-bit mode
+ * SPH_SPARCV9_GCC_64   UltraSPARC-compatible with gcc, 64-bit mode
+ * SPH_SPARCV9_GCC      UltraSPARC-compatible with gcc
+ * SPH_I386_GCC         x86-compatible (32-bit) with gcc
+ * SPH_I386_MSVC        x86-compatible (32-bit) with Microsoft Visual C
+ * SPH_AMD64_GCC        x86-compatible (64-bit) with gcc
+ * SPH_AMD64_MSVC       x86-compatible (64-bit) with Microsoft Visual C
+ * SPH_PPC32_GCC        PowerPC, 32-bit, with gcc
+ * SPH_PPC64_GCC        PowerPC, 64-bit, with gcc
+ *
+ * TODO: enhance automatic detection, for more architectures and compilers.
+ * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with
+ * some very fast functions (e.g. MD4) when using unaligned input data.
+ * The CPU-specific-with-GCC macros are useful only for inline assembly,
+ * normally restrained to this header file.
+ */
+
+/*
+ * 32-bit x86, aka "i386 compatible": little-endian, efficient unaligned access.
+ */
+#if defined __i386__ || defined _M_IX86
+
+#define SPH_DETECT_UNALIGNED         1
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#define SPH_DETECT_UPTR              sph_u32
+#ifdef __GNUC__
+#define SPH_DETECT_I386_GCC          1
+#endif /* __GNUC__ */
+#ifdef _MSC_VER
+#define SPH_DETECT_I386_MSVC         1
+#endif /* _MSC_VER */
+
+/*
+ * 64-bit x86, hereafter known as "amd64".
+ */
+#elif defined __x86_64 || defined _M_X64
+
+#define SPH_DETECT_UNALIGNED         1
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#define SPH_DETECT_UPTR              sph_u64
+#ifdef __GNUC__
+#define SPH_DETECT_AMD64_GCC         1
+#endif /* __GNUC__ */
+#ifdef _MSC_VER
+#define SPH_DETECT_AMD64_MSVC        1
+#endif /* _MSC_VER */
+
+/*
+ * 64-bit Sparc architecture (implies v9).
+ */
+#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \
+	|| defined __sparcv9
+
+#define SPH_DETECT_BIG_ENDIAN        1
+#define SPH_DETECT_UPTR              sph_u64
+#ifdef __GNUC__
+#define SPH_DETECT_SPARCV9_GCC_64    1
+#define SPH_DETECT_LITTLE_FAST       1
+#endif /* __GNUC__ */
+
+/*
+ * 32-bit Sparc.
+ */
+#elif (defined __sparc__ || defined __sparc) \
+	&& !(defined __sparcv9 || defined __arch64__)
+
+#define SPH_DETECT_BIG_ENDIAN        1
+#define SPH_DETECT_UPTR              sph_u32
+#if defined __GNUC__ && defined __sparc_v9__
+#define SPH_DETECT_SPARCV9_GCC_32    1
+#define SPH_DETECT_LITTLE_FAST       1
+#endif /* gcc targeting a v9 CPU in 32-bit mode */
+
+/*
+ * ARM, little-endian (only the byte order is detected here).
+ */
+#elif defined __arm__ && __ARMEL__
+
+#define SPH_DETECT_LITTLE_ENDIAN     1
+
+/*
+ * MIPS, little-endian (only the byte order is detected here).
+ */
+#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__
+
+#define SPH_DETECT_LITTLE_ENDIAN     1
+
+/*
+ * MIPS, big-endian (only the byte order is detected here).
+ */
+#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__
+
+#define SPH_DETECT_BIG_ENDIAN        1
+
+/*
+ * PowerPC (byte order is taken from the compiler's endianness macros).
+ */
+#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \
+	|| defined _ARCH_PPC
+
+/*
+ * Note: we do not declare cross-endian access to be "fast": even if
+ * using inline assembly, implementation should still assume that
+ * keeping the decoded word in a temporary is faster than decoding
+ * it again.
+ */
+#if defined __GNUC__
+#if SPH_64_TRUE
+#define SPH_DETECT_PPC64_GCC         1
+#else /* not a "true" 64-bit platform */
+#define SPH_DETECT_PPC32_GCC         1
+#endif /* SPH_64_TRUE */
+#endif /* __GNUC__ */
+
+#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
+#define SPH_DETECT_BIG_ENDIAN        1
+#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#endif /* byte order stays undetected when neither macro pair is defined */
+
+/*
+ * Itanium, 64-bit (little-endian unless a big-endian macro is defined).
+ */
+#elif defined __ia64 || defined __ia64__ \
+	|| defined __itanium__ || defined _M_IA64
+
+#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
+#define SPH_DETECT_BIG_ENDIAN        1
+#else /* no big-endian macro: little-endian */
+#define SPH_DETECT_LITTLE_ENDIAN     1
+#endif /* byte order */
+#if defined __LP64__ || defined _LP64
+#define SPH_DETECT_UPTR              sph_u64
+#else /* no LP64 macro: pointers are cast to the 32-bit type */
+#define SPH_DETECT_UPTR              sph_u32
+#endif /* pointer-sized integer type */
+
+#endif /* architecture detection */
+
+#if defined(SPH_DETECT_SPARCV9_GCC_32) || defined(SPH_DETECT_SPARCV9_GCC_64)
+#define SPH_DETECT_SPARCV9_GCC 1
+#endif
+/* Publish each detected value unless the macro was already set externally. */
+#if !defined(SPH_UNALIGNED) && defined(SPH_DETECT_UNALIGNED)
+#define SPH_UNALIGNED SPH_DETECT_UNALIGNED
+#endif
+#if !defined(SPH_UPTR) && defined(SPH_DETECT_UPTR)
+#define SPH_UPTR SPH_DETECT_UPTR
+#endif
+#if !defined(SPH_LITTLE_ENDIAN) && defined(SPH_DETECT_LITTLE_ENDIAN)
+#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN
+#endif
+#if !defined(SPH_BIG_ENDIAN) && defined(SPH_DETECT_BIG_ENDIAN)
+#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN
+#endif
+#if !defined(SPH_LITTLE_FAST) && defined(SPH_DETECT_LITTLE_FAST)
+#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST
+#endif
+#if !defined(SPH_BIG_FAST) && defined(SPH_DETECT_BIG_FAST)
+#define SPH_BIG_FAST SPH_DETECT_BIG_FAST
+#endif
+#if !defined(SPH_SPARCV9_GCC_32) && defined(SPH_DETECT_SPARCV9_GCC_32)
+#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32
+#endif
+#if !defined(SPH_SPARCV9_GCC_64) && defined(SPH_DETECT_SPARCV9_GCC_64)
+#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64
+#endif
+#if !defined(SPH_SPARCV9_GCC) && defined(SPH_DETECT_SPARCV9_GCC)
+#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC
+#endif
+#if !defined(SPH_I386_GCC) && defined(SPH_DETECT_I386_GCC)
+#define SPH_I386_GCC SPH_DETECT_I386_GCC
+#endif
+#if !defined(SPH_I386_MSVC) && defined(SPH_DETECT_I386_MSVC)
+#define SPH_I386_MSVC SPH_DETECT_I386_MSVC
+#endif
+#if !defined(SPH_AMD64_GCC) && defined(SPH_DETECT_AMD64_GCC)
+#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC
+#endif
+#if !defined(SPH_AMD64_MSVC) && defined(SPH_DETECT_AMD64_MSVC)
+#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC
+#endif
+#if !defined(SPH_PPC32_GCC) && defined(SPH_DETECT_PPC32_GCC)
+#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC
+#endif
+#if !defined(SPH_PPC64_GCC) && defined(SPH_DETECT_PPC64_GCC)
+#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC
+#endif
+/* Access in the native byte order is "fast" unless stated otherwise. */
+#if SPH_LITTLE_ENDIAN && !defined(SPH_LITTLE_FAST)
+#define SPH_LITTLE_FAST 1
+#endif
+#if SPH_BIG_ENDIAN && !defined(SPH_BIG_FAST)
+#define SPH_BIG_FAST 1
+#endif
+/* The pointer-cast code paths require a known byte order. */
+#if defined(SPH_UPTR) && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN)
+#error SPH_UPTR defined, but endianness is not known.
+#endif
+
+#if SPH_I386_GCC && !SPH_NO_ASM
+
+/*
+ * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit
+ * values; the "0" constraint makes input and output share one register.
+ */
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{
+	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); /* swaps x in place */
+	return x;
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64 sph_bswap64(sph_u64 x)
+{
+	sph_u64 upper = sph_bswap32((sph_u32)x);	/* swapped low word moves up */
+	sph_u64 lower = sph_bswap32((sph_u32)(x >> 32));
+	return (upper << 32) | lower;
+}
+
+#endif
+
+#elif SPH_AMD64_GCC && !SPH_NO_ASM
+
+/*
+ * On x86 64-bit, with gcc, we use the bswapl and bswapq opcodes to
+ * byte-swap 32-bit and 64-bit values, respectively.
+ */
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{
+	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); /* "0": input shares the output register */
+	return x;
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	__asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x)); /* 64-bit form; "0" ties input to the output register */
+	return x;
+}
+
+#endif
+
+/*
+ * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough
+ * to generate proper opcodes for endianness swapping with the pure C
+ * implementation below.
+ *
+
+#elif SPH_I386_MSVC && !SPH_NO_ASM
+
+static __inline sph_u32 __declspec(naked) __fastcall
+sph_bswap32(sph_u32 x)
+{
+	__asm {
+		bswap  ecx
+		mov    eax,ecx
+		ret
+	}
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
+		| (sph_u64)sph_bswap32((sph_u32)(x >> 32));
+}
+
+#endif
+
+ *
+ * [end of disabled code]
+ */
+
+#else
+
+/* Portable byte-swap: exchange 16-bit halves, then bytes within each half. */
+static SPH_INLINE sph_u32 sph_bswap32(sph_u32 x)
+{
+	sph_u32 swapped = SPH_T32((x >> 16) | (x << 16));
+	sph_u32 odd = swapped & SPH_C32(0xFF00FF00);
+	sph_u32 even = swapped & SPH_C32(0x00FF00FF);
+	return (odd >> 8) | (even << 8);
+}
+
+#if SPH_64
+
+/**
+ * Reverse the byte order of a 64-bit value (portable implementation).
+ *
+ * @param x   the input value
+ * @return  the byte-swapped value
+ */
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	const sph_u64 pairs = SPH_C64(0x0000FFFF0000FFFF);
+	const sph_u64 bytes = SPH_C64(0x00FF00FF00FF00FF);
+	sph_u64 w = SPH_T64((x >> 32) | (x << 32));	/* swap 32-bit halves */
+	w = ((w >> 16) & pairs) | ((w & pairs) << 16);	/* swap 16-bit pairs */
+	w = ((w >> 8) & bytes) | ((w & bytes) << 8);	/* swap adjacent bytes */
+	return w;
+}
+
+#endif
+
+#endif
+
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+
+/*
+ * On UltraSPARC systems, native ordering is big-endian, but it is
+ * possible to perform little-endian read accesses by specifying the
+ * address space 0x88 (ASI_PRIMARY_LITTLE). Either we use the opcode
+ * "lda [%reg]0x88,%dst" (%reg holds the source address, %dst is the
+ * destination register), or "lda [%reg+imm]%asi,%dst", which takes the
+ * address space from the %asi register. The latter combines an addition
+ * and the access in a single opcode, but setting (and subsequently
+ * resetting) %asi is slow. Some operations (e.g. the MD5 compression
+ * function) combine many successive little-endian read accesses, which
+ * may share the same %asi setting; hence the macros below:
+ *   SPH_SPARCV9_SET_ASI    saves %asi in a new local, then sets it to 0x88
+ *   SPH_SPARCV9_RESET_ASI  writes the saved value back to %asi
+ *   SPH_SPARCV9_DEC32LE    loads the 32-bit word at base + 4*idx via %asi
+ */
+
+#define SPH_SPARCV9_SET_ASI   \
+	sph_u32 sph_sparcv9_asi; \
+	__asm__ __volatile__ ( \
+		"rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi));
+
+#define SPH_SPARCV9_RESET_ASI  \
+	__asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi));
+
+#define SPH_SPARCV9_DEC32LE(base, idx)   ({ \
+		sph_u32 sph_sparcv9_tmp; \
+		__asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \
+			: "=r" (sph_sparcv9_tmp) : "r" (base)); \
+		sph_sparcv9_tmp; \
+	})
+
+#endif /* SPH_SPARCV9_GCC && !SPH_NO_ASM */
+
+static SPH_INLINE void sph_enc16be(void *dst, unsigned val)
+{
+	unsigned char *out = (unsigned char *)dst;	/* big-endian: high byte first */
+	out[0] = (val >> 8);
+	out[1] = val;
+}
+
+static SPH_INLINE unsigned sph_dec16be(const void *src)
+{
+	const unsigned char *in = (const unsigned char *)src;	/* in[0] is the high byte */
+	unsigned hi = in[0], lo = in[1];
+	return (hi << 8) | lo;
+}
+
+static SPH_INLINE void sph_enc16le(void *dst, unsigned val)
+{
+	unsigned char *out = (unsigned char *)dst;	/* little-endian: low byte first */
+	out[0] = val;
+	out[1] = (val >> 8);
+}
+
+static SPH_INLINE unsigned sph_dec16le(const void *src)
+{
+	const unsigned char *in = (const unsigned char *)src;	/* in[0] is the low byte */
+	unsigned lo = in[0], hi = in[1];
+	return lo | (hi << 8);
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (big endian convention).
+ * No alignment is assumed for dst; see sph_enc32be_aligned() otherwise.
+ * @param dst   the destination buffer
+ * @param val   the 32-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc32be(void *dst, sph_u32 val)
+{
+#if defined SPH_UPTR /* word stores possible; endianness is known here */
+#if SPH_UNALIGNED /* unaligned access is efficient: always store a word */
+#if SPH_LITTLE_ENDIAN
+	val = sph_bswap32(val); /* native order is the reverse of big-endian */
+#endif
+	*(sph_u32 *)dst = val; /* NOTE(review): store through a cast pointer, dst may be misaligned */
+#else /* !SPH_UNALIGNED: word store only when dst is 4-byte aligned */
+	if (((SPH_UPTR)dst & 3) == 0) {
+#if SPH_LITTLE_ENDIAN
+		val = sph_bswap32(val);
+#endif
+		*(sph_u32 *)dst = val;
+	} else { /* misaligned: emit bytes, most significant first */
+		((unsigned char *)dst)[0] = (val >> 24);
+		((unsigned char *)dst)[1] = (val >> 16);
+		((unsigned char *)dst)[2] = (val >> 8);
+		((unsigned char *)dst)[3] = val;
+	}
+#endif /* SPH_UNALIGNED */
+#else /* no SPH_UPTR: always emit bytes, most significant first */
+	((unsigned char *)dst)[0] = (val >> 24);
+	((unsigned char *)dst)[1] = (val >> 16);
+	((unsigned char *)dst)[2] = (val >> 8);
+	((unsigned char *)dst)[3] = val;
+#endif /* SPH_UPTR */
+}
+
+/**
+ * Store a 32-bit value in big-endian byte order at a word-aligned address.
+ * When the platform byte order is known, a single word store is used.
+ *
+ * @param dst   the destination buffer (32-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void sph_enc32be_aligned(void *dst, sph_u32 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u32 *)dst = sph_bswap32(val);	/* reverse the native order */
+#elif SPH_BIG_ENDIAN
+	*(sph_u32 *)dst = val;			/* native order already matches */
+#else
+	unsigned char *out = (unsigned char *)dst;
+	out[0] = (val >> 24);
+	out[1] = (val >> 16);
+	out[2] = (val >> 8);
+	out[3] = val;
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (big endian convention).
+ * No alignment is assumed for src; see sph_dec32be_aligned() otherwise.
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32be(const void *src)
+{
+#if defined SPH_UPTR /* word loads possible; endianness is known here */
+#if SPH_UNALIGNED /* unaligned access is efficient: always load a word */
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#else /* big-endian: memory order already matches */
+	return *(const sph_u32 *)src;
+#endif
+#else /* !SPH_UNALIGNED: word load only when src is 4-byte aligned */
+	if (((SPH_UPTR)src & 3) == 0) {
+#if SPH_LITTLE_ENDIAN
+		return sph_bswap32(*(const sph_u32 *)src);
+#else /* big-endian: memory order already matches */
+		return *(const sph_u32 *)src;
+#endif
+	} else { /* misaligned: assemble from bytes, most significant first */
+		return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
+			| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
+			| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
+			| (sph_u32)(((const unsigned char *)src)[3]);
+	}
+#endif /* SPH_UNALIGNED */
+#else /* no SPH_UPTR: always assemble from bytes */
+	return ((sph_u32)(((const unsigned char *)src)[0]) << 24)
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 8)
+		| (sph_u32)(((const unsigned char *)src)[3]);
+#endif /* SPH_UPTR */
+}
+
+/**
+ * Load a 32-bit big-endian value from a word-aligned address.
+ * When the platform byte order is known, a single word load is used.
+ *
+ * @param src   the source buffer (32-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32be_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);	/* reverse the native order */
+#elif SPH_BIG_ENDIAN
+	return *(const sph_u32 *)src;			/* native order already matches */
+#else
+	const unsigned char *in = (const unsigned char *)src;
+
+	return ((sph_u32)in[0] << 24) | ((sph_u32)in[1] << 16)
+		| ((sph_u32)in[2] << 8) | (sph_u32)in[3];
+#endif
+}
+
+/**
+ * Write a 32-bit value to dst least significant byte first (little
+ * endian convention). The address is not assumed to be aligned.
+ *
+ * @param dst   the destination buffer
+ * @param val   the 32-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc32le(void *dst, sph_u32 val)
+{
+#if defined SPH_UPTR && SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	val = sph_bswap32(val);
+#endif
+	*(sph_u32 *)dst = val;
+#else
+	unsigned char *out = (unsigned char *)dst;
+
+#if defined SPH_UPTR
+	/* Word store only when the address is a multiple of 4. */
+	if (((SPH_UPTR)dst & 3) == 0) {
+#if SPH_BIG_ENDIAN
+		val = sph_bswap32(val);
+#endif
+		*(sph_u32 *)dst = val;
+		return;
+	}
+#endif
+	/* Otherwise emit the bytes one at a time, lowest first. */
+	out[0] = (unsigned char)val;
+	out[1] = (unsigned char)(val >> 8);
+	out[2] = (unsigned char)(val >> 16);
+	out[3] = (unsigned char)(val >> 24);
+#endif
+}
+
+/**
+ * Store a 32-bit value at dst, least significant byte first. The caller
+ * guarantees that dst is suitably aligned for a 32-bit store.
+ * @param dst   the destination buffer (32-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc32le_aligned(void *dst, sph_u32 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u32 *)dst = val;
+#elif SPH_BIG_ENDIAN
+	*(sph_u32 *)dst = sph_bswap32(val);
+#else
+	unsigned char *out = (unsigned char *)dst;
+	out[0] = (unsigned char)val;
+	out[1] = (unsigned char)(val >> 8);
+	out[2] = (unsigned char)(val >> 16);
+	out[3] = (unsigned char)(val >> 24);
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ * Unless SPH_UNALIGNED is set, a misaligned src is read byte by byte.
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32le(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#else
+	return *(const sph_u32 *)src;
+#endif
+#else
+	if (((SPH_UPTR)src & 3) == 0) {
+#if SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		/*
+		 * "__volatile__" is needed here because without it,
+		 * gcc-3.4.3 miscompiles the code and performs the
+		 * access before the test on the address, thus triggering
+		 * a bus error...
+		 */
+		__asm__ __volatile__ (
+			"lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+		return tmp;
+/*
+ * On PowerPC, this turns out not to be worth the effort: the inline
+ * assembly makes the GCC optimizer uncomfortable, which tends to nullify
+ * the decoding gains.
+ *
+ * For most hash functions, using this inline assembly trick changes
+ * hashing speed by less than 5% and often _reduces_ it. The biggest
+ * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is
+ * less than 10%. The speed gain on CubeHash is probably due to the
+ * chronic shortage of registers that CubeHash endures; for the other
+ * functions, the generic code appears to be efficient enough already.
+ * The PowerPC variant is kept below, disabled inside this comment:
+#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		__asm__ __volatile__ (
+			"lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+		return tmp;
+ */
+#else
+		return sph_bswap32(*(const sph_u32 *)src);
+#endif
+#else
+		return *(const sph_u32 *)src;
+#endif
+	} else {
+		return (sph_u32)(((const unsigned char *)src)[0])
+			| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+			| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+			| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+	}
+#endif
+#else
+	return (sph_u32)(((const unsigned char *)src)[0])
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ * The source buffer must be properly aligned: when SPH_LITTLE_ENDIAN or
+ * SPH_BIG_ENDIAN is set, the value is fetched with one 32-bit access.
+ * @param src   the source buffer (32-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32le_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return *(const sph_u32 *)src;
+#elif SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+	sph_u32 tmp;
+
+	__asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+	return tmp;
+/*
+ * PowerPC variant, disabled inside this comment: not worth it generally.
+ *
+#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
+	sph_u32 tmp;
+
+	__asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+	return tmp;
+ */
+#else
+	return sph_bswap32(*(const sph_u32 *)src);
+#endif
+#else
+	return (sph_u32)(((const unsigned char *)src)[0])
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+#endif
+}
+
+#if SPH_64
+
+/**
+ * Write a 64-bit value to dst most significant byte first (big endian
+ * convention). The address is not assumed to be aligned; a word-sized
+ * store is used only where the configuration macros allow it.
+ *
+ * @param dst   the destination buffer
+ * @param val   the 64-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc64be(void *dst, sph_u64 val)
+{
+#if defined SPH_UPTR && SPH_UNALIGNED
+	/* SPH_UNALIGNED: store the word directly, whatever the address. */
+#if SPH_LITTLE_ENDIAN
+	val = sph_bswap64(val);
+#endif
+	*(sph_u64 *)dst = val;
+#else
+	unsigned char *out = (unsigned char *)dst;
+
+#if defined SPH_UPTR
+	/* Word store only when the address is a multiple of 8. */
+	if (((SPH_UPTR)dst & 7) == 0) {
+#if SPH_LITTLE_ENDIAN
+		val = sph_bswap64(val);
+#endif
+		*(sph_u64 *)dst = val;
+		return;
+	}
+#endif
+
+	/* Misaligned address, or SPH_UPTR not defined: emit the bytes
+	 * one at a time, highest first. */
+	out[0] = (unsigned char)(val >> 56);
+	out[1] = (unsigned char)(val >> 48);
+	out[2] = (unsigned char)(val >> 40);
+	out[3] = (unsigned char)(val >> 32);
+	out[4] = (unsigned char)(val >> 24);
+	out[5] = (unsigned char)(val >> 16);
+	out[6] = (unsigned char)(val >> 8);
+	out[7] = (unsigned char)val;
+#endif
+}
+
+/**
+ * Store a 64-bit value at dst, most significant byte first. The caller
+ * guarantees that dst is suitably aligned for a 64-bit store.
+ * @param dst   the destination buffer (64-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc64be_aligned(void *dst, sph_u64 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u64 *)dst = sph_bswap64(val);
+#elif SPH_BIG_ENDIAN
+	*(sph_u64 *)dst = val;
+#else
+	unsigned char *out = (unsigned char *)dst;
+	out[0] = (unsigned char)(val >> 56);
+	out[1] = (unsigned char)(val >> 48);
+	out[2] = (unsigned char)(val >> 40);
+	out[3] = (unsigned char)(val >> 32);
+	out[4] = (unsigned char)(val >> 24);
+	out[5] = (unsigned char)(val >> 16);
+	out[6] = (unsigned char)(val >> 8);
+	out[7] = (unsigned char)val;
+#endif
+}
+
+/**
+ * Read a 64-bit value stored at src most significant byte first (big
+ * endian convention). The address is not assumed to be aligned; a
+ * word-sized load is used only where the configuration macros allow
+ * it.
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64be(const void *src)
+{
+#if defined SPH_UPTR && SPH_UNALIGNED
+	/* SPH_UNALIGNED: load the word directly, whatever the address. */
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);
+#else
+	return *(const sph_u64 *)src;
+#endif
+#else
+	const unsigned char *buf = (const unsigned char *)src;
+
+#if defined SPH_UPTR
+	/* Word load only when the address is a multiple of 8. */
+	if (((SPH_UPTR)src & 7) == 0) {
+#if SPH_LITTLE_ENDIAN
+		return sph_bswap64(*(const sph_u64 *)src);
+#else
+		return *(const sph_u64 *)src;
+#endif
+	}
+#endif
+
+	/* Misaligned address, or SPH_UPTR not defined: assemble the
+	 * value one byte at a time. */
+	return ((sph_u64)buf[0] << 56)
+		| ((sph_u64)buf[1] << 48)
+		| ((sph_u64)buf[2] << 40)
+		| ((sph_u64)buf[3] << 32)
+		| ((sph_u64)buf[4] << 24)
+		| ((sph_u64)buf[5] << 16)
+		| ((sph_u64)buf[6] << 8)
+		| (sph_u64)buf[7];
+#endif
+}
+
+/**
+ * Read a 64-bit big endian value from src, which the caller guarantees
+ * to be suitably aligned for a 64-bit load.
+ * @param src   the source buffer (64-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64be_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);
+#elif SPH_BIG_ENDIAN
+	return *(const sph_u64 *)src;
+#else
+	const unsigned char *buf = (const unsigned char *)src;
+	return ((sph_u64)buf[0] << 56)
+		| ((sph_u64)buf[1] << 48)
+		| ((sph_u64)buf[2] << 40)
+		| ((sph_u64)buf[3] << 32)
+		| ((sph_u64)buf[4] << 24)
+		| ((sph_u64)buf[5] << 16)
+		| ((sph_u64)buf[6] << 8)
+		| (sph_u64)buf[7];
+#endif
+}
+
+/**
+ * Write a 64-bit value to dst least significant byte first (little
+ * endian convention). The address is not assumed to be aligned; a
+ * word-sized store is used only where the configuration macros allow it.
+ *
+ * @param dst   the destination buffer
+ * @param val   the 64-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc64le(void *dst, sph_u64 val)
+{
+#if defined SPH_UPTR && SPH_UNALIGNED
+	/* SPH_UNALIGNED: store the word directly, whatever the address. */
+#if SPH_BIG_ENDIAN
+	val = sph_bswap64(val);
+#endif
+	*(sph_u64 *)dst = val;
+#else
+	unsigned char *out = (unsigned char *)dst;
+
+#if defined SPH_UPTR
+	/* Word store only when the address is a multiple of 8. */
+	if (((SPH_UPTR)dst & 7) == 0) {
+#if SPH_BIG_ENDIAN
+		val = sph_bswap64(val);
+#endif
+		*(sph_u64 *)dst = val;
+		return;
+	}
+#endif
+
+	/* Misaligned address, or SPH_UPTR not defined: emit the bytes
+	 * one at a time, lowest first. */
+	out[0] = (unsigned char)val;
+	out[1] = (unsigned char)(val >> 8);
+	out[2] = (unsigned char)(val >> 16);
+	out[3] = (unsigned char)(val >> 24);
+	out[4] = (unsigned char)(val >> 32);
+	out[5] = (unsigned char)(val >> 40);
+	out[6] = (unsigned char)(val >> 48);
+	out[7] = (unsigned char)(val >> 56);
+#endif
+}
+
+/**
+ * Store a 64-bit value at dst, least significant byte first. The caller
+ * guarantees that dst is suitably aligned for a 64-bit store.
+ * @param dst   the destination buffer (64-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc64le_aligned(void *dst, sph_u64 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u64 *)dst = val;
+#elif SPH_BIG_ENDIAN
+	*(sph_u64 *)dst = sph_bswap64(val);
+#else
+	unsigned char *out = (unsigned char *)dst;
+	out[0] = (unsigned char)val;
+	out[1] = (unsigned char)(val >> 8);
+	out[2] = (unsigned char)(val >> 16);
+	out[3] = (unsigned char)(val >> 24);
+	out[4] = (unsigned char)(val >> 32);
+	out[5] = (unsigned char)(val >> 40);
+	out[6] = (unsigned char)(val >> 48);
+	out[7] = (unsigned char)(val >> 56);
+#endif
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (little endian convention).
+ * Unless SPH_UNALIGNED is set, a misaligned src is read byte by byte.
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64le(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	return sph_bswap64(*(const sph_u64 *)src);
+#else
+	return *(const sph_u64 *)src;
+#endif
+#else
+	if (((SPH_UPTR)src & 7) == 0) {
+#if SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
+		sph_u64 tmp;
+
+		__asm__ __volatile__ (
+			"ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+		return tmp;
+/*
+ * PowerPC variants, disabled inside this comment: not worth it generally.
+ *
+#elif SPH_PPC32_GCC && !SPH_NO_ASM
+		return (sph_u64)sph_dec32le_aligned(src)
+			| ((sph_u64)sph_dec32le_aligned(
+				(const char *)src + 4) << 32);
+#elif SPH_PPC64_GCC && !SPH_NO_ASM
+		sph_u64 tmp;
+
+		__asm__ __volatile__ (
+			"ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+		return tmp;
+ */
+#else
+		return sph_bswap64(*(const sph_u64 *)src);
+#endif
+#else
+		return *(const sph_u64 *)src;
+#endif
+	} else {
+		return (sph_u64)(((const unsigned char *)src)[0])
+			| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+			| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+			| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+			| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+			| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+			| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+			| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+	}
+#endif
+#else
+	return (sph_u64)(((const unsigned char *)src)[0])
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+#endif
+}
+
+/**
+ * Decode a 64-bit value from the provided buffer (little endian convention).
+ * The source buffer must be properly aligned: when SPH_LITTLE_ENDIAN or
+ * SPH_BIG_ENDIAN is set, the value is fetched with one 64-bit access.
+ * @param src   the source buffer (64-bit aligned)
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u64
+sph_dec64le_aligned(const void *src)
+{
+#if SPH_LITTLE_ENDIAN
+	return *(const sph_u64 *)src;
+#elif SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM
+	sph_u64 tmp;
+
+	__asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+	return tmp;
+/*
+ * PowerPC variants, disabled inside this comment: not worth it generally.
+ *
+#elif SPH_PPC32_GCC && !SPH_NO_ASM
+	return (sph_u64)sph_dec32le_aligned(src)
+		| ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32);
+#elif SPH_PPC64_GCC && !SPH_NO_ASM
+	sph_u64 tmp;
+
+	__asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+	return tmp;
+ */
+#else
+	return sph_bswap64(*(const sph_u64 *)src);
+#endif
+#else
+	return (sph_u64)(((const unsigned char *)src)[0])
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+#endif
+}
+
+#endif
+
+#endif /* Doxygen excluded block */
+
+#endif
diff --git a/skein.c b/skein.c
new file mode 100644
index 0000000..4aefa2a
--- /dev/null
+++ b/skein.c
@@ -0,0 +1,18 @@
+#include "skein.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "sha3/sph_skein.h"
+
+/* Compute the Skein-512 digest of the first 64 bytes of input.
+ * output must have room for the 512-bit (64-byte) digest. */
+void skein_hash(const char* input, char* output)
+{
+    sph_skein512_context    ctx_skein;
+    sph_skein512_init(&ctx_skein);
+    sph_skein512 (&ctx_skein, input, 64);
+    sph_skein512_close(&ctx_skein, output);
+}
+
diff --git a/skein.h b/skein.h
new file mode 100644
index 0000000..bc4867a
--- /dev/null
+++ b/skein.h
@@ -0,0 +1,14 @@
+#ifndef SKEIN_H
+#define SKEIN_H
+/* Declares the Skein hash entry point, with C linkage for C++ callers. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Hashes input into output; no buffer lengths are passed. */
+void skein_hash(const char* input, char* output);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SKEIN_H */
diff --git a/stdint.h b/stdint.h
new file mode 100644
index 0000000..4fe0ef9
--- /dev/null
+++ b/stdint.h
@@ -0,0 +1,259 @@
+// ISO C9x  compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
+// 
+//  Copyright (c) 2006-2013 Alexander Chemeris
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// 
+//   1. Redistributions of source code must retain the above copyright notice,
+//      this list of conditions and the following disclaimer.
+// 
+//   2. Redistributions in binary form must reproduce the above copyright
+//      notice, this list of conditions and the following disclaimer in the
+//      documentation and/or other materials provided with the distribution.
+// 
+//   3. Neither the name of the product nor the names of its contributors may
+//      be used to endorse or promote products derived from this software
+//      without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// 
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#if _MSC_VER >= 1600 // [
+#include <stdint.h>
+#else // ] _MSC_VER >= 1600 [
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we should wrap the <wchar.h> include with 'extern "C++" {}'
+// (NOTE(review): the code below uses extern "C" - confirm) or the compiler gives many errors like this:
+//   error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#ifdef __cplusplus
+extern "C" {
+#endif
+#  include <wchar.h>
+#ifdef __cplusplus
+}
+#endif
+
+// Define the _W64 macro, used to mark types that change size (like intptr_t); it expands to __w64 or to nothing.
+#ifndef _W64
+#  if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+#     define _W64 __w64
+#  else
+#     define _W64
+#  endif
+#endif // _W64
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 don't
+// realize that, e.g., char has the same size as __int8,
+// so we give up on __intX for them (64-bit types excepted).
+#if (_MSC_VER < 1300)
+   typedef signed char       int8_t;
+   typedef signed short      int16_t;
+   typedef signed int        int32_t;
+   typedef unsigned char     uint8_t;
+   typedef unsigned short    uint16_t;
+   typedef unsigned int      uint32_t;
+#else
+   typedef signed __int8     int8_t;
+   typedef signed __int16    int16_t;
+   typedef signed __int32    int32_t;
+   typedef unsigned __int8   uint8_t;
+   typedef unsigned __int16  uint16_t;
+   typedef unsigned __int32  uint32_t;
+#endif
+typedef signed __int64       int64_t;
+typedef unsigned __int64     uint64_t;
+
+
+// 7.18.1.2 Minimum-width integer types (aliases of the exact-width types)
+typedef int8_t    int_least8_t;
+typedef int16_t   int_least16_t;
+typedef int32_t   int_least32_t;
+typedef int64_t   int_least64_t;
+typedef uint8_t   uint_least8_t;
+typedef uint16_t  uint_least16_t;
+typedef uint32_t  uint_least32_t;
+typedef uint64_t  uint_least64_t;
+
+// 7.18.1.3 Fastest minimum-width integer types (also aliases of the exact-width types)
+typedef int8_t    int_fast8_t;
+typedef int16_t   int_fast16_t;
+typedef int32_t   int_fast32_t;
+typedef int64_t   int_fast64_t;
+typedef uint8_t   uint_fast8_t;
+typedef uint16_t  uint_fast16_t;
+typedef uint32_t  uint_fast32_t;
+typedef uint64_t  uint_fast64_t;
+
+// 7.18.1.4 Integer types capable of holding object pointers (__int64 under _WIN64, int otherwise)
+#ifdef _WIN64 // [
+   typedef signed __int64    intptr_t;
+   typedef unsigned __int64  uintptr_t;
+#else // _WIN64 ][
+   typedef _W64 signed int   intptr_t;
+   typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+
+// 7.18.1.5 Greatest-width integer types (the 64-bit types)
+typedef int64_t   intmax_t;
+typedef uint64_t  uintmax_t;
+
+
+// 7.18.2 Limits of specified-width integer types
+
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [   C++ gets these only with __STDC_LIMIT_MACROS; see footnote 220 at page 257 and footnote 221 at page 259
+
+// 7.18.2.1 Limits of exact-width integer types (built on the _I*/_UI* constants)
+#define INT8_MIN     ((int8_t)_I8_MIN)
+#define INT8_MAX     _I8_MAX
+#define INT16_MIN    ((int16_t)_I16_MIN)
+#define INT16_MAX    _I16_MAX
+#define INT32_MIN    ((int32_t)_I32_MIN)
+#define INT32_MAX    _I32_MAX
+#define INT64_MIN    ((int64_t)_I64_MIN)
+#define INT64_MAX    _I64_MAX
+#define UINT8_MAX    _UI8_MAX
+#define UINT16_MAX   _UI16_MAX
+#define UINT32_MAX   _UI32_MAX
+#define UINT64_MAX   _UI64_MAX
+
+// 7.18.2.2 Limits of minimum-width integer types (same as the exact-width limits)
+#define INT_LEAST8_MIN    INT8_MIN
+#define INT_LEAST8_MAX    INT8_MAX
+#define INT_LEAST16_MIN   INT16_MIN
+#define INT_LEAST16_MAX   INT16_MAX
+#define INT_LEAST32_MIN   INT32_MIN
+#define INT_LEAST32_MAX   INT32_MAX
+#define INT_LEAST64_MIN   INT64_MIN
+#define INT_LEAST64_MAX   INT64_MAX
+#define UINT_LEAST8_MAX   UINT8_MAX
+#define UINT_LEAST16_MAX  UINT16_MAX
+#define UINT_LEAST32_MAX  UINT32_MAX
+#define UINT_LEAST64_MAX  UINT64_MAX
+
+// 7.18.2.3 Limits of fastest minimum-width integer types (same as the exact-width limits)
+#define INT_FAST8_MIN    INT8_MIN
+#define INT_FAST8_MAX    INT8_MAX
+#define INT_FAST16_MIN   INT16_MIN
+#define INT_FAST16_MAX   INT16_MAX
+#define INT_FAST32_MIN   INT32_MIN
+#define INT_FAST32_MAX   INT32_MAX
+#define INT_FAST64_MIN   INT64_MIN
+#define INT_FAST64_MAX   INT64_MAX
+#define UINT_FAST8_MAX   UINT8_MAX
+#define UINT_FAST16_MAX  UINT16_MAX
+#define UINT_FAST32_MAX  UINT32_MAX
+#define UINT_FAST64_MAX  UINT64_MAX
+
+// 7.18.2.4 Limits of integer types capable of holding object pointers (64-bit limits under _WIN64, 32-bit otherwise)
+#ifdef _WIN64 // [
+#  define INTPTR_MIN   INT64_MIN
+#  define INTPTR_MAX   INT64_MAX
+#  define UINTPTR_MAX  UINT64_MAX
+#else // _WIN64 ][
+#  define INTPTR_MIN   INT32_MIN
+#  define INTPTR_MAX   INT32_MAX
+#  define UINTPTR_MAX  UINT32_MAX
+#endif // _WIN64 ]
+
+// 7.18.2.5 Limits of greatest-width integer types (the 64-bit limits)
+#define INTMAX_MIN   INT64_MIN
+#define INTMAX_MAX   INT64_MAX
+#define UINTMAX_MAX  UINT64_MAX
+
+// 7.18.3 Limits of other integer types (ptrdiff_t, sig_atomic_t, size_t, wchar_t, wint_t)
+
+#ifdef _WIN64 // [
+#  define PTRDIFF_MIN  _I64_MIN
+#  define PTRDIFF_MAX  _I64_MAX
+#else  // _WIN64 ][
+#  define PTRDIFF_MIN  _I32_MIN
+#  define PTRDIFF_MAX  _I32_MAX
+#endif  // _WIN64 ]
+
+#define SIG_ATOMIC_MIN  INT_MIN
+#define SIG_ATOMIC_MAX  INT_MAX
+
+#ifndef SIZE_MAX // [
+#  ifdef _WIN64 // [
+#     define SIZE_MAX  _UI64_MAX
+#  else // _WIN64 ][
+#     define SIZE_MAX  _UI32_MAX
+#  endif // _WIN64 ]
+#endif // SIZE_MAX ]
+
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>, hence the #ifndef guards
+#ifndef WCHAR_MIN // [
+#  define WCHAR_MIN  0
+#endif  // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+#  define WCHAR_MAX  _UI16_MAX
+#endif  // WCHAR_MAX ]
+
+#define WINT_MIN  0
+#define WINT_MAX  _UI16_MAX
+
+#endif // __STDC_LIMIT_MACROS ]
+
+
+// 7.18.4 Macros for integer constants
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [   C++ gets these only with __STDC_CONSTANT_MACROS; see footnote 224 at page 260
+
+// 7.18.4.1 Macros for minimum-width integer constants (paste a sized integer suffix onto val)
+
+#define INT8_C(val)  val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val)  val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+// These #ifndef's are needed to prevent collisions with <boost/cstdint.hpp>.
+// Check out Issue 9 for the details.
+#ifndef INTMAX_C //   [
+#  define INTMAX_C   INT64_C
+#endif // INTMAX_C    ]
+#ifndef UINTMAX_C //  [
+#  define UINTMAX_C  UINT64_C
+#endif // UINTMAX_C   ]
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+#endif // _MSC_VER >= 1600 ]
+
+#endif // _MSC_STDINT_H_ ]
diff --git a/x11.c b/x11.c
new file mode 100644
index 0000000..9290587
--- /dev/null
+++ b/x11.c
@@ -0,0 +1,85 @@
+#include "x11.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "sha3/sph_blake.h"
+#include "sha3/sph_bmw.h"
+#include "sha3/sph_groestl.h"
+#include "sha3/sph_jh.h"
+#include "sha3/sph_keccak.h"
+#include "sha3/sph_skein.h"
+#include "sha3/sph_luffa.h"
+#include "sha3/sph_cubehash.h"
+#include "sha3/sph_shavite.h"
+#include "sha3/sph_simd.h"
+#include "sha3/sph_echo.h"
+
+
+void x11_hash(const char* input, char* output)
+{
+    sph_blake512_context     blake_ctx;
+    sph_bmw512_context       bmw_ctx;
+    sph_groestl512_context   groestl_ctx;
+    sph_skein512_context     skein_ctx;
+    sph_jh512_context        jh_ctx;
+    sph_keccak512_context    keccak_ctx;
+    sph_luffa512_context     luffa_ctx;
+    sph_cubehash512_context  cubehash_ctx;
+    sph_shavite512_context   shavite_ctx;
+    sph_simd512_context      simd_ctx;
+    sph_echo512_context      echo_ctx;
+
+    // Two 64-byte scratch digests, used alternately by the stages below. The
+    // uint512 values in the C++ client source are backed by uint32 arrays too.
+    uint32_t ping[16], pong[16];
+
+    sph_blake512_init(&blake_ctx);
+    sph_blake512(&blake_ctx, input, 80);
+    sph_blake512_close(&blake_ctx, ping);
+
+    sph_bmw512_init(&bmw_ctx);
+    sph_bmw512(&bmw_ctx, ping, 64);
+    sph_bmw512_close(&bmw_ctx, pong);
+
+    sph_groestl512_init(&groestl_ctx);
+    sph_groestl512(&groestl_ctx, pong, 64);
+    sph_groestl512_close(&groestl_ctx, ping);
+
+    sph_skein512_init(&skein_ctx);
+    sph_skein512(&skein_ctx, ping, 64);
+    sph_skein512_close(&skein_ctx, pong);
+
+    sph_jh512_init(&jh_ctx);
+    sph_jh512(&jh_ctx, pong, 64);
+    sph_jh512_close(&jh_ctx, ping);
+
+    sph_keccak512_init(&keccak_ctx);
+    sph_keccak512(&keccak_ctx, ping, 64);
+    sph_keccak512_close(&keccak_ctx, pong);
+
+    sph_luffa512_init(&luffa_ctx);
+    sph_luffa512(&luffa_ctx, pong, 64);
+    sph_luffa512_close(&luffa_ctx, ping);
+
+    sph_cubehash512_init(&cubehash_ctx);
+    sph_cubehash512(&cubehash_ctx, ping, 64);
+    sph_cubehash512_close(&cubehash_ctx, pong);
+
+    sph_shavite512_init(&shavite_ctx);
+    sph_shavite512(&shavite_ctx, pong, 64);
+    sph_shavite512_close(&shavite_ctx, ping);
+
+    sph_simd512_init(&simd_ctx);
+    sph_simd512(&simd_ctx, ping, 64);
+    sph_simd512_close(&simd_ctx, pong);
+
+    sph_echo512_init(&echo_ctx);
+    sph_echo512(&echo_ctx, pong, 64);
+    sph_echo512_close(&echo_ctx, ping);
+
+    // The result is the first 32 bytes of the last (ECHO) digest.
+    memcpy(output, ping, 32);
+}
+
diff --git a/x11.h b/x11.h
new file mode 100644
index 0000000..aefdc8c
--- /dev/null
+++ b/x11.h
@@ -0,0 +1,14 @@
+#ifndef X11_H
+#define X11_H
+/* Declares the X11 chained hash entry point, with C linkage for C++ callers. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Reads 80 bytes from input and writes 32 bytes to output. */
+void x11_hash(const char* input, char* output);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* X11_H */
diff --git a/xcoin.c b/xcoin.c
new file mode 100644
index 0000000..e69de29