diff --git a/build/Release/.deps/Release/obj.target/multihashing/multihashing.o.d b/build/Release/.deps/Release/obj.target/multihashing/multihashing.o.d new file mode 100644 index 0000000..126cad4 --- /dev/null +++ b/build/Release/.deps/Release/obj.target/multihashing/multihashing.o.d @@ -0,0 +1,32 @@ +cmd_Release/obj.target/multihashing/multihashing.o := g++ '-D_LARGEFILE_SOURCE' '-D_FILE_OFFSET_BITS=64' '-DBUILDING_NODE_EXTENSION' -I/root/.node-gyp/0.10.26/src -I/root/.node-gyp/0.10.26/deps/uv/include -I/root/.node-gyp/0.10.26/deps/v8/include -fPIC -Wall -Wextra -Wno-unused-parameter -pthread -m64 -O2 -fno-strict-aliasing -fno-tree-vrp -fno-omit-frame-pointer -fno-rtti -fno-exceptions -MMD -MF ./Release/.deps/Release/obj.target/multihashing/multihashing.o.d.raw -c -o Release/obj.target/multihashing/multihashing.o ../multihashing.cc +Release/obj.target/multihashing/multihashing.o: ../multihashing.cc \ + /root/.node-gyp/0.10.26/src/node.h \ + /root/.node-gyp/0.10.26/deps/uv/include/uv.h \ + /root/.node-gyp/0.10.26/deps/uv/include/uv-private/uv-unix.h \ + /root/.node-gyp/0.10.26/deps/uv/include/uv-private/ngx-queue.h \ + /root/.node-gyp/0.10.26/deps/uv/include/uv-private/uv-linux.h \ + /root/.node-gyp/0.10.26/deps/v8/include/v8.h \ + /root/.node-gyp/0.10.26/deps/v8/include/v8stdint.h \ + /root/.node-gyp/0.10.26/src/node_object_wrap.h \ + /root/.node-gyp/0.10.26/src/node.h \ + /root/.node-gyp/0.10.26/src/node_buffer.h ../bcrypt.h ../keccak.h \ + ../quark.h ../scrypt.h ../scryptjane.h ../scryptn.h ../skein.h ../x11.h +../multihashing.cc: +/root/.node-gyp/0.10.26/src/node.h: +/root/.node-gyp/0.10.26/deps/uv/include/uv.h: +/root/.node-gyp/0.10.26/deps/uv/include/uv-private/uv-unix.h: +/root/.node-gyp/0.10.26/deps/uv/include/uv-private/ngx-queue.h: +/root/.node-gyp/0.10.26/deps/uv/include/uv-private/uv-linux.h: +/root/.node-gyp/0.10.26/deps/v8/include/v8.h: +/root/.node-gyp/0.10.26/deps/v8/include/v8stdint.h: +/root/.node-gyp/0.10.26/src/node_object_wrap.h: +/root/.node-gyp/0.10.26/src/node.h: +/root/.node-gyp/0.10.26/src/node_buffer.h: +../bcrypt.h: +../keccak.h: +../quark.h: +../scrypt.h: +../scryptjane.h: +../scryptn.h: +../skein.h: +../x11.h: diff --git a/build/Release/.deps/Release/obj.target/multihashing/scrypt.o.d b/build/Release/.deps/Release/obj.target/multihashing/scrypt.o.d new file mode 100644 index 0000000..edadb2b --- /dev/null +++ b/build/Release/.deps/Release/obj.target/multihashing/scrypt.o.d @@ -0,0 +1,4 @@ +cmd_Release/obj.target/multihashing/scrypt.o := cc '-D_LARGEFILE_SOURCE' '-D_FILE_OFFSET_BITS=64' '-DBUILDING_NODE_EXTENSION' -I/root/.node-gyp/0.10.26/src -I/root/.node-gyp/0.10.26/deps/uv/include -I/root/.node-gyp/0.10.26/deps/v8/include -fPIC -Wall -Wextra -Wno-unused-parameter -pthread -m64 -O2 -fno-strict-aliasing -fno-tree-vrp -fno-omit-frame-pointer -MMD -MF ./Release/.deps/Release/obj.target/multihashing/scrypt.o.d.raw -c -o Release/obj.target/multihashing/scrypt.o ../scrypt.c +Release/obj.target/multihashing/scrypt.o: ../scrypt.c ../scrypt.h +../scrypt.c: +../scrypt.h: diff --git a/build/Release/.deps/Release/obj.target/multihashing/scryptjane.o.d b/build/Release/.deps/Release/obj.target/multihashing/scryptjane.o.d new file mode 100644 index 0000000..579a6db --- /dev/null +++ b/build/Release/.deps/Release/obj.target/multihashing/scryptjane.o.d @@ -0,0 +1,30 @@ +cmd_Release/obj.target/multihashing/scryptjane.o := cc '-D_LARGEFILE_SOURCE' '-D_FILE_OFFSET_BITS=64' '-DBUILDING_NODE_EXTENSION' -I/root/.node-gyp/0.10.26/src -I/root/.node-gyp/0.10.26/deps/uv/include 
-I/root/.node-gyp/0.10.26/deps/v8/include -fPIC -Wall -Wextra -Wno-unused-parameter -pthread -m64 -O2 -fno-strict-aliasing -fno-tree-vrp -fno-omit-frame-pointer -MMD -MF ./Release/.deps/Release/obj.target/multihashing/scryptjane.o.d.raw -c -o Release/obj.target/multihashing/scryptjane.o ../scryptjane.c +Release/obj.target/multihashing/scryptjane.o: ../scryptjane.c \ + ../scryptjane.h ../scryptjane/scrypt-jane-portable.h \ + ../scryptjane/scrypt-jane-portable-x86.h \ + ../scryptjane/scrypt-jane-hash.h ../scryptjane/scrypt-jane-hash_keccak.h \ + ../scryptjane/scrypt-jane-pbkdf2.h ../scryptjane/scrypt-jane-romix.h \ + ../scryptjane/scrypt-jane-chacha.h \ + ../scryptjane/scrypt-jane-romix-basic.h \ + ../scryptjane/scrypt-jane-mix_chacha-avx.h \ + ../scryptjane/scrypt-jane-mix_chacha-ssse3.h \ + ../scryptjane/scrypt-jane-mix_chacha-sse2.h \ + ../scryptjane/scrypt-jane-mix_chacha.h \ + ../scryptjane/scrypt-jane-romix-template.h \ + ../scryptjane/scrypt-jane-test-vectors.h +../scryptjane.c: +../scryptjane.h: +../scryptjane/scrypt-jane-portable.h: +../scryptjane/scrypt-jane-portable-x86.h: +../scryptjane/scrypt-jane-hash.h: +../scryptjane/scrypt-jane-hash_keccak.h: +../scryptjane/scrypt-jane-pbkdf2.h: +../scryptjane/scrypt-jane-romix.h: +../scryptjane/scrypt-jane-chacha.h: +../scryptjane/scrypt-jane-romix-basic.h: +../scryptjane/scrypt-jane-mix_chacha-avx.h: +../scryptjane/scrypt-jane-mix_chacha-ssse3.h: +../scryptjane/scrypt-jane-mix_chacha-sse2.h: +../scryptjane/scrypt-jane-mix_chacha.h: +../scryptjane/scrypt-jane-romix-template.h: +../scryptjane/scrypt-jane-test-vectors.h: diff --git a/build/Release/obj.target/multihashing/multihashing.o b/build/Release/obj.target/multihashing/multihashing.o new file mode 100644 index 0000000..bb4068c Binary files /dev/null and b/build/Release/obj.target/multihashing/multihashing.o differ diff --git a/build/Release/obj.target/multihashing/scrypt.o b/build/Release/obj.target/multihashing/scrypt.o new file mode 100644 index 0000000..94073fa Binary files /dev/null and b/build/Release/obj.target/multihashing/scrypt.o differ diff --git a/build/Release/obj.target/multihashing/scryptjane.o b/build/Release/obj.target/multihashing/scryptjane.o new file mode 100644 index 0000000..75ad453 Binary files /dev/null and b/build/Release/obj.target/multihashing/scryptjane.o differ diff --git a/multihashing.cc b/multihashing.cc index fad8023..74f974d 100644 --- a/multihashing.cc +++ b/multihashing.cc @@ -193,8 +193,8 @@ Handle<Value> scryptjane(const Arguments& args) { Local<Number> num = args[1]->ToNumber(); int timestamp = num->Value(); - Local<Number> num = args[2]->ToNumber(); - int nChainStarTime = num->Value(); + Local<Number> num2 = args[2]->ToNumber(); + int nChainStarTime = num2->Value(); char * input = Buffer::Data(target); diff --git a/package.json b/package.json index 6160b2f..dacd698 100644 --- a/package.json +++ b/package.json @@ -15,7 +15,7 @@ }, "keywords": [ "scrypt", - "scrypt-jane", + "scryptjane", "script-n", "x11", "quark", diff --git a/scryptjane/scrypt-jane-chacha.h b/scryptjane/scrypt-jane-chacha.h new file mode 100644 index 0000000..41d96e5 --- /dev/null +++ b/scryptjane/scrypt-jane-chacha.h @@ -0,0 +1,132 @@ +#define SCRYPT_MIX_BASE "ChaCha20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include
"scrypt-jane-mix_chacha-avx.h" +#include "scrypt-jane-mix_chacha-ssse3.h" +#include "scrypt-jane-mix_chacha-sse2.h" +#include "scrypt-jane-mix_chacha.h" + +#if defined(SCRYPT_CHACHA_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_MIX_FN chacha_core_avx + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 + #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 + #define SCRYPT_MIX_FN chacha_core_ssse3 + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_MIX_FN chacha_core_sse2 + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN chacha_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_CHACHA_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + if (cpuflags & cpu_ssse3) + return scrypt_ROMix_ssse3; + else +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t flags = 0; + +#if defined(SCRYPT_CHACHA_AVX) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + flags |= cpu_ssse3; +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_CHACHA_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + if (cpuflags & cpu_ssse3) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/scryptjane/scrypt-jane-hash.h b/scryptjane/scrypt-jane-hash.h new file mode 100644 index 0000000..db5c1db --- /dev/null +++ b/scryptjane/scrypt-jane-hash.h @@ -0,0 +1,48 @@ +#if defined(SCRYPT_BLAKE512) +#include "scrypt-jane-hash_blake512.h" +#elif defined(SCRYPT_BLAKE256) +#include "scrypt-jane-hash_blake256.h" +#elif defined(SCRYPT_SHA512) +#include "scrypt-jane-hash_sha512.h" +#elif defined(SCRYPT_SHA256) +#include 
"scrypt-jane-hash_sha256.h" +#elif defined(SCRYPT_SKEIN512) +#include "scrypt-jane-hash_skein512.h" +#elif defined(SCRYPT_KECCAK512) || defined(SCRYPT_KECCAK256) +#include "scrypt-jane-hash_keccak.h" +#else + #define SCRYPT_HASH "ERROR" + #define SCRYPT_HASH_BLOCK_SIZE 64 + #define SCRYPT_HASH_DIGEST_SIZE 64 + typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state; + typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + static void scrypt_hash_init(scrypt_hash_state *S) {} + static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {} + static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {} + static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0}; + #error must define a hash function! +#endif + +#include "scrypt-jane-pbkdf2.h" + +#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ + +static int +scrypt_test_hash() { + scrypt_hash_state st; + scrypt_hash_digest hash, final; + uint8_t msg[SCRYPT_TEST_HASH_LEN]; + size_t i; + + for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++) + msg[i] = (uint8_t)i; + + scrypt_hash_init(&st); + for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) { + scrypt_hash(hash, msg, i); + scrypt_hash_update(&st, hash, sizeof(hash)); + } + scrypt_hash_finish(&st, final); + return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); +} + diff --git a/scryptjane/scrypt-jane-hash_keccak.h b/scryptjane/scrypt-jane-hash_keccak.h new file mode 100644 index 0000000..7ed5574 --- /dev/null +++ b/scryptjane/scrypt-jane-hash_keccak.h @@ -0,0 +1,168 @@ +#if defined(SCRYPT_KECCAK256) + #define SCRYPT_HASH "Keccak-256" + #define SCRYPT_HASH_DIGEST_SIZE 32 +#else + #define SCRYPT_HASH "Keccak-512" + #define SCRYPT_HASH_DIGEST_SIZE 64 +#endif +#define SCRYPT_KECCAK_F 1600 +#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 256=512, 512=1024 */ +#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 256=1088, 512=576 */ +#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t state[SCRYPT_KECCAK_F / 64]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +static void +keccak_block(scrypt_hash_state *S, const uint8_t *in) { + size_t i; + uint64_t *s = S->state, t[5], u[5], v, w; + + /* absorb input */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8) + s[i] ^= U8TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + memset(S, 0, sizeof(*S)); +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + keccak_block(S, S->buffer); + } + + /* handle the current data */ + while (inlen >= SCRYPT_HASH_BLOCK_SIZE) { + keccak_block(S, in); + in += SCRYPT_HASH_BLOCK_SIZE; + inlen -= SCRYPT_HASH_BLOCK_SIZE; + } + + /* handle leftover data */ + S->leftover = (uint32_t)inlen; + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + size_t i; + + S->buffer[S->leftover] = 0x01; + memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1)); + S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80; + keccak_block(S, S->buffer); + + for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) { + U64TO8_LE(&hash[i], S->state[i / 8]); + } +} + +#if defined(SCRYPT_KECCAK256) +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x26,0xb7,0x10,0xb3,0x66,0xb1,0xd1,0xb1,0x25,0xfc,0x3e,0xe3,0x1e,0x33,0x1d,0x19, + 0x94,0xaa,0x63,0x7a,0xd5,0x77,0x29,0xb4,0x27,0xe9,0xe0,0xf4,0x19,0xba,0x68,0xea, +}; +#else +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x17,0xc7,0x8c,0xa0,0xd9,0x08,0x1d,0xba,0x8a,0xc8,0x3e,0x07,0x90,0xda,0x91,0x88, + 0x25,0xbd,0xd3,0xf8,0x78,0x4a,0x8d,0x5e,0xe4,0x96,0x9c,0x01,0xf3,0xeb,0xdc,0x12, + 0xea,0x35,0x57,0xba,0x94,0xb8,0xe9,0xb9,0x27,0x45,0x0a,0x48,0x5c,0x3d,0x69,0xf0, + 0xdb,0x22,0x38,0xb5,0x52,0x22,0x29,0xea,0x7a,0xb2,0xe6,0x07,0xaa,0x37,0x4d,0xe6, +}; +#endif + diff --git a/scryptjane/scrypt-jane-hash_sha256.h b/scryptjane/scrypt-jane-hash_sha256.h new file mode 100644 index 0000000..d06d3e1 --- /dev/null +++ b/scryptjane/scrypt-jane-hash_sha256.h @@ -0,0 +1,135 @@ +#define SCRYPT_HASH "SHA-2-256" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint32_t H[8]; + uint64_t T; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint32_t sha256_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) +#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) +#define W0(in,i) (U8TO32_BE(&in[i * 4])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \ 
+ r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + +static void +sha256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + uint32_t r[8], w[64], t0, t1; + size_t i; + + for (i = 0; i < 8; i++) r[i] = S->H[i]; + + while (blocks--) { + for (i = 0; i < 16; i++) { w[i] = W0(in, i); } + for (i = 16; i < 64; i++) { w[i] = W1(i); } + for (i = 0; i < 64; i++) { STEP(i); } + for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } + S->T += SCRYPT_HASH_BLOCK_SIZE * 8; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667; + S->H[1] = 0xbb67ae85; + S->H[2] = 0x3c6ef372; + S->H[3] = 0xa54ff53a; + S->H[4] = 0x510e527f; + S->H[5] = 0x9b05688c; + S->H[6] = 0x1f83d9ab; + S->H[7] = 0x5be0cd19; + S->T = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xee,0x36,0xae,0xa6,0x65,0xf0,0x28,0x7d,0xc9,0xde,0xd8,0xad,0x48,0x33,0x7d,0xbf, + 0xcb,0xc0,0x48,0xfa,0x5f,0x92,0xfd,0x0a,0x95,0x6f,0x34,0x8e,0x8c,0x1e,0x73,0xad, +}; diff --git a/scryptjane/scrypt-jane-mix_chacha-avx.h b/scryptjane/scrypt-jane-mix_chacha-avx.h new file mode 100644 index 0000000..50d6e2d --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha-avx.h @@ -0,0 +1,340 @@ +/* x86 */ +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,64) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) + a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + 
a2(vmovdqa xmm0,[ecx+esi+0]) + a2(vmovdqa xmm1,[ecx+esi+16]) + a2(vmovdqa xmm2,[ecx+esi+32]) + a2(vmovdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_avx_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa [esp+32],xmm2) + a2(vmovdqa [esp+48],xmm3) + a2(mov eax,8) + a1(scrypt_chacha_avx_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm6,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm6) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpsrld xmm6,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm6) + a2(sub eax,2) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm6,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm6) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x39) + a3(vpaddd xmm2,xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a3(vpsrld xmm6,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm6) + a1(ja scrypt_chacha_avx_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,[esp+32]) + a3(vpaddd xmm3,xmm3,[esp+48]) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_avx_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) + a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r8,r8) + a2(xor r9,r9) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + 
a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_avx_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm12,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpsrld xmm12,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm12) + a2(sub rax,2) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm12,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x39) + a3(vpaddd xmm2,xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a3(vpsrld xmm12,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm12) + a1(ja scrypt_chacha_avx_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_avx_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +static void NOINLINE +scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = 
_mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_AVX) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-AVX" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_chacha-sse2.h b/scryptjane/scrypt-jane-mix_chacha-sse2.h new file mode 100644 index 0000000..d2192c8 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha-sse2.h @@ -0,0 +1,371 @@ +/* x86 */ +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,16) + a2(and esp,~15) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa xmm4,xmm1) + a2(movdqa xmm5,xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_chacha_sse2_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa 
xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub eax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_sse2_loop) + a2(paddd xmm0,[esp+0]) + a2(paddd xmm1,xmm4) + a2(paddd xmm2,xmm5) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_sse2_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_sse2_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub rax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + 
a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_sse2_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_sse2_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +static void NOINLINE +scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = 
_mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-SSE2" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_chacha-ssse3.h b/scryptjane/scrypt-jane-mix_chacha-ssse3.h new file mode 100644 index 0000000..b25e356 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha-ssse3.h @@ -0,0 +1,348 @@ +/* x86 */ +#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,64) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm4,[ssse3_rotl16_32bit]) + a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa [esp+16],xmm1) + a2(movdqa [esp+32],xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_chacha_ssse3_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub eax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_ssse3_loop) + a2(paddd xmm0,[esp+0]) + a2(paddd xmm1,[esp+16]) + a2(paddd xmm2,[esp+32]) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + 
a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_ssse3_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm4,[ssse3_rotl16_32bit]) + a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor r8,r8) + a2(xor r9,r9) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_ssse3_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm12,20) + a2(pxor xmm1,xmm12) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm12,25) + a2(pxor xmm1,xmm12) + a2(sub rax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm12,20) + a2(pxor xmm1,xmm12) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm12,25) + a2(pxor xmm1,xmm12) + a1(ja scrypt_chacha_ssse3_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_ssse3_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +static void NOINLINE +scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi 
*)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-SSSE3" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_chacha.h b/scryptjane/scrypt-jane-mix_chacha.h new file mode 100644 index 0000000..85ee9c1 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha.h @@ -0,0 +1,69 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "ChaCha20/8 Ref" + +#undef SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_BASIC + +static void +chacha_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + a += b; t = d^a; d = ROTL32(t,16); \ + c += d; t = b^c; b = ROTL32(t,12); \ + a += b; t = d^a; d 
= ROTL32(t, 8); \ + c += d; t = b^c; b = ROTL32(t, 7); + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x1, x5, x9,x13) + quarter( x2, x6,x10,x14) + quarter( x3, x7,x11,x15) + quarter( x0, x5,x10,x15) + quarter( x1, x6,x11,x12) + quarter( x2, x7, x8,x13) + quarter( x3, x4, x9,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; + state[15] += x15; + + #undef quarter +} + +#endif \ No newline at end of file diff --git a/scryptjane/scrypt-jane-mix_salsa-avx.h b/scryptjane/scrypt-jane-mix_salsa-avx.h new file mode 100644 index 0000000..15fb48e --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa-avx.h @@ -0,0 +1,381 @@ +/* x86 */ +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_avx_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_avx_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub eax, 2) + a3(vpaddd xmm4, xmm3, xmm0) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpxor xmm3, 
xmm3, xmm4) + a3(vpaddd xmm4, xmm2, xmm3) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a1(ja scrypt_salsa_avx_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_avx_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_avx_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub rax, 2) + a3(vpaddd xmm4, xmm3, xmm0) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm2, xmm3) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a1(ja scrypt_salsa_avx_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd 
xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_avx_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +static void NOINLINE +scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = x1; + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x3 = _mm_xor_si128(x3, x4); + x4 = x0; + x3 = _mm_xor_si128(x3, x5); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x3; + x2 = _mm_xor_si128(x2, x5); + x3 = _mm_shuffle_epi32(x3, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x1 = _mm_xor_si128(x1, x4); + x4 = x2; + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x4 = x3; + x0 = _mm_xor_si128(x0, x5); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x1 = _mm_xor_si128(x1, x4); + x4 = x0; + x1 = _mm_xor_si128(x1, x5); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x1; + x2 = _mm_xor_si128(x2, x5); + x1 = _mm_shuffle_epi32(x1, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x3 = _mm_xor_si128(x3, x4); + x4 = x2; + x3 = _mm_xor_si128(x3, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x39); + x0 = _mm_xor_si128(x0, x5); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even 
*/ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_AVX) + /* uses salsa_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-AVX" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa-sse2.h b/scryptjane/scrypt-jane-mix_salsa-sse2.h new file mode 100644 index 0000000..4898659 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa-sse2.h @@ -0,0 +1,443 @@ +/* x86 */ +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa [esp+16],xmm1) + a2(movdqa xmm6,xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_sse2_loop: ) + a2(movdqa xmm4, xmm1) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm3, xmm5) + a2(paddd xmm4, xmm3) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm2, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm1, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm0, xmm5) + a3(pshufd xmm1, xmm1, 0x39) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm1, xmm5) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm1) + a2(pxor xmm2, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm3, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm3) + a2(sub eax, 2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a2(pxor xmm0, xmm5) + a1(ja scrypt_salsa_sse2_loop) + a2(paddd 
xmm0,[esp+0]) + a2(paddd xmm1,[esp+16]) + a2(paddd xmm2,xmm6) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_sse2_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_sse2_loop: ) + a2(movdqa xmm4, xmm1) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm3, xmm5) + a2(paddd xmm4, xmm3) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm2, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm1, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm0, xmm5) + a3(pshufd xmm1, xmm1, 0x39) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm1, xmm5) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm1) + a2(pxor xmm2, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm3, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm3) + a2(sub rax, 2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a2(pxor xmm0, xmm5) + a1(ja scrypt_salsa_sse2_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa 
[rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_sse2_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +static void NOINLINE +scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = x1; + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x3 = _mm_xor_si128(x3, x4); + x4 = x0; + x3 = _mm_xor_si128(x3, x5); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x3; + x2 = _mm_xor_si128(x2, x5); + x3 = _mm_shuffle_epi32(x3, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x1 = _mm_xor_si128(x1, x4); + x4 = x2; + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x4 = x3; + x0 = _mm_xor_si128(x0, x5); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x1 = _mm_xor_si128(x1, x4); + x4 = x0; + x1 = _mm_xor_si128(x1, x5); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x1; + x2 = _mm_xor_si128(x2, x5); + x1 = _mm_shuffle_epi32(x1, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x3 = _mm_xor_si128(x3, x4); + x4 = x2; + x3 = _mm_xor_si128(x3, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x39); + x0 = _mm_xor_si128(x0, x5); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-SSE2" + #undef 
SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif + +/* used by avx,etc as well */ +#if defined(SCRYPT_SALSA_INCLUDED) + /* + Default layout: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + + SSE2 layout: + 0 5 10 15 + 12 1 6 11 + 8 13 2 7 + 4 9 14 3 + */ + + static void STDCALL + salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { + uint32_t t; + while (count--) { + t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; + t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; + t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; + t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; + t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; + t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; + blocks += 16; + } + } +#endif + diff --git a/scryptjane/scrypt-jane-mix_salsa.h b/scryptjane/scrypt-jane-mix_salsa.h new file mode 100644 index 0000000..33f3340 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa.h @@ -0,0 +1,70 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "Salsa20/8 Ref" + +#undef SCRYPT_SALSA_INCLUDED +#define SCRYPT_SALSA_INCLUDED +#define SCRYPT_SALSA_BASIC + +static void +salsa_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + t = a+d; t = ROTL32(t, 7); b ^= t; \ + t = b+a; t = ROTL32(t, 9); c ^= t; \ + t = c+b; t = ROTL32(t, 13); d ^= t; \ + t = d+c; t = ROTL32(t, 18); a ^= t; \ + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x5, x9,x13, x1) + quarter(x10,x14, x2, x6) + quarter(x15, x3, x7,x11) + quarter( x0, x1, x2, x3) + quarter( x5, x6, x7, x4) + quarter(x10,x11, x8, x9) + quarter(x15,x12,x13,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; + state[15] += x15; + + #undef quarter +} + +#endif + diff --git a/scryptjane/scrypt-jane-pbkdf2.h b/scryptjane/scrypt-jane-pbkdf2.h new file mode 100644 index 0000000..711e3d6 --- /dev/null +++ b/scryptjane/scrypt-jane-pbkdf2.h @@ -0,0 +1,112 @@ +typedef struct scrypt_hmac_state_t { + scrypt_hash_state inner, outer; +} scrypt_hmac_state; + + +static void +scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) { + scrypt_hash_state st; + scrypt_hash_init(&st); + scrypt_hash_update(&st, m, mlen); + scrypt_hash_finish(&st, hash); +} + +/* hmac */ +static void +scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + scrypt_hash_init(&st->inner); + scrypt_hash_init(&st->outer); + + if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + scrypt_hash(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) 
*/ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); + + scrypt_ensure_zero(pad, sizeof(pad)); +} + +static void +scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) */ + scrypt_hash_update(&st->inner, m, mlen); +} + +static void +scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { + /* h(inner || m) */ + scrypt_hash_digest innerhash; + scrypt_hash_finish(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); + scrypt_hash_finish(&st->outer, mac); + + scrypt_ensure_zero(st, sizeof(*st)); +} + +static void +scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) { + scrypt_hmac_state hmac_pw, hmac_pw_salt, work; + scrypt_hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, blocks; + uint64_t c; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for (i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + scrypt_hmac_update(&work, be, 4); + scrypt_hmac_finish(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for (c = 0; c < N - 1; c++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE); + scrypt_hmac_finish(&work, u); + + /* T[i] ^= UX */ + for (j = 0; j < sizeof(u); j++) + ti[j] ^= u[j]; + } + + memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : bytes); + out += SCRYPT_HASH_DIGEST_SIZE; + bytes -= SCRYPT_HASH_DIGEST_SIZE; + } + + scrypt_ensure_zero(ti, sizeof(ti)); + scrypt_ensure_zero(u, sizeof(u)); + scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); + scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); +} diff --git a/scryptjane/scrypt-jane-portable-x86.h b/scryptjane/scrypt-jane-portable-x86.h new file mode 100644 index 0000000..03282fa --- /dev/null +++ b/scryptjane/scrypt-jane-portable-x86.h @@ -0,0 +1,364 @@ +#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) + #define X86ASM + /* gcc 2.95 royally screws up stack alignments on variables */ + #if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) + #define X86ASM_SSE + #define X86ASM_SSE2 + #endif + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) + #define X86ASM_SSSE3 + #endif + #if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) + #define X86ASM_AVX + #endif +#endif + +#if defined(CPU_X86_64) && defined(COMPILER_GCC) + #define X86_64ASM + #define X86_64ASM_SSE2 + #if (COMPILER_GCC >= 40102) + #define X86_64ASM_SSSE3 + #endif + #if (COMPILER_GCC >= 40400) + #define X86_64ASM_AVX + #endif +#endif + +#if defined(COMPILER_MSVC) + #define X86_INTRINSIC + #if defined(CPU_X86_64) || defined(X86ASM_SSE) + #define X86_INTRINSIC_SSE + #endif + #if defined(CPU_X86_64) || defined(X86ASM_SSE2) + #define X86_INTRINSIC_SSE2 + #endif + #if (COMPILER_MSVC >= 1400) + #define X86_INTRINSIC_SSSE3 + #endif +#endif + +#if defined(COMPILER_MSVC) && defined(CPU_X86_64) + #define X86_64USE_INTRINSIC +#endif + +#if defined(COMPILER_MSVC) && defined(CPU_X86_64) + #define X86_64USE_INTRINSIC +#endif + +#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) + #define X86_INTRINSIC + #if defined(__SSE__) + #define X86_INTRINSIC_SSE + #endif + #if defined(__SSE2__) + #define X86_INTRINSIC_SSE2 + #endif + #if defined(__SSSE3__) + #define X86_INTRINSIC_SSSE3 + #endif + #if defined(__AVX__) + #define X86_INTRINSIC_AVX + #endif +#endif + +/* only use simd on windows (or SSE2 on gcc)! 
*/ +#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) + #if defined(X86_INTRINSIC_SSE) + #define X86_INTRINSIC + #include + #include + typedef __m64 qmm; + typedef __m128 xmm; + typedef __m128d xmmd; + #endif + #if defined(X86_INTRINSIC_SSE2) + #define X86_INTRINSIC_SSE2 + #include + typedef __m128i xmmi; + #endif + #if defined(X86_INTRINSIC_SSSE3) + #define X86_INTRINSIC_SSSE3 + #include + #endif +#endif + + +#if defined(X86_INTRINSIC_SSE2) + typedef union packedelem8_t { + uint8_t u[16]; + xmmi v; + } packedelem8; + + typedef union packedelem32_t { + uint32_t u[4]; + xmmi v; + } packedelem32; + + typedef union packedelem64_t { + uint64_t u[2]; + xmmi v; + } packedelem64; +#else + typedef union packedelem8_t { + uint8_t u[16]; + uint32_t dw[4]; + } packedelem8; + + typedef union packedelem32_t { + uint32_t u[4]; + uint8_t b[16]; + } packedelem32; + + typedef union packedelem64_t { + uint64_t u[2]; + uint8_t b[16]; + } packedelem64; +#endif + +#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3) + const packedelem8 MM16 ssse3_rotr16_64bit = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}}; + const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; + const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; + const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}}; +#endif + +/* + x86 inline asm for gcc/msvc. usage: + + asm_naked_fn_proto(return_type, name) (type parm1, type parm2..) + asm_naked_fn(name) + a1(..) + a2(.., ..) + a3(.., .., ..) + a1(ret) + asm_naked_fn_end(name) +*/ + +#if defined(X86ASM) || defined(X86_64ASM) + +#if defined(COMPILER_MSVC) + #pragma warning(disable : 4731) /* frame pointer modified by inline assembly */ + #define a1(x) __asm {x} + #define a2(x, y) __asm {x, y} + #define a3(x, y, z) __asm {x, y, z} + #define a4(x, y, z, w) __asm {x, y, z, w} + #define al(x) __asm {label##x:} + #define aj(x, y, z) __asm {x label##y} + #define asm_align8 a1(ALIGN 8) + #define asm_align16 a1(ALIGN 16) + + #define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn + #define asm_naked_fn(fn) { + #define asm_naked_fn_end(fn) } +#elif defined(COMPILER_GCC) + #define GNU_AS1(x) #x ";\n" + #define GNU_AS2(x, y) #x ", " #y ";\n" + #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" + #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" + #define GNU_ASL(x) "\n" #x ":\n" + #define GNU_ASJ(x, y, z) #x " " #y #z ";" + + #define a1(x) GNU_AS1(x) + #define a2(x, y) GNU_AS2(x, y) + #define a3(x, y, z) GNU_AS3(x, y, z) + #define a4(x, y, z, w) GNU_AS4(x, y, z, w) + #define al(x) GNU_ASL(x) + #define aj(x, y, z) GNU_ASJ(x, y, z) + #define asm_align8 a1(.align 8) + #define asm_align16 a1(.align 16) + + #define asm_naked_fn_proto(type, fn) extern type STDCALL fn + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn) + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); + #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" + #define asm_gcc_parms() ".att_syntax prefix;" + #define asm_gcc_trashed() __asm__ __volatile__("" ::: + #define asm_gcc_end() ); +#else + need x86 asm +#endif + +#endif /* X86ASM || X86_64ASM */ + + +#if defined(CPU_X86) || defined(CPU_X86_64) + +typedef enum cpu_flags_x86_t { + cpu_mmx = 1 << 0, + cpu_sse = 1 << 1, + cpu_sse2 = 1 << 2, + cpu_sse3 = 1 << 3, + cpu_ssse3 = 1 << 4, + cpu_sse4_1 = 1 << 5, + cpu_sse4_2 = 1 << 6, 
+ cpu_avx = 1 << 7 +} cpu_flags_x86; + +typedef enum cpu_vendors_x86_t { + cpu_nobody, + cpu_intel, + cpu_amd +} cpu_vendors_x86; + +typedef struct x86_regs_t { + uint32_t eax, ebx, ecx, edx; +} x86_regs; + +#if defined(X86ASM) +asm_naked_fn_proto(int, has_cpuid)(void) +asm_naked_fn(has_cpuid) + a1(pushfd) + a1(pop eax) + a2(mov ecx, eax) + a2(xor eax, 0x200000) + a1(push eax) + a1(popfd) + a1(pushfd) + a1(pop eax) + a2(xor eax, ecx) + a2(shr eax, 21) + a2(and eax, 1) + a1(push ecx) + a1(popfd) + a1(ret) +asm_naked_fn_end(has_cpuid) +#endif /* X86ASM */ + + +static void NOINLINE +get_cpuid(x86_regs *regs, uint32_t flags) { +#if defined(COMPILER_MSVC) + __cpuid((int *)regs, (int)flags); +#else + #if defined(CPU_X86_64) + #define cpuid_bx rbx + #else + #define cpuid_bx ebx + #endif + + asm_gcc() + a1(push cpuid_bx) + a1(cpuid) + a2(mov [%1 + 0], eax) + a2(mov [%1 + 4], ebx) + a2(mov [%1 + 8], ecx) + a2(mov [%1 + 12], edx) + a1(pop cpuid_bx) + asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc" + asm_gcc_end() +#endif +} + +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) +static uint64_t NOINLINE +get_xgetbv(uint32_t flags) { +#if defined(COMPILER_MSVC) + return _xgetbv(flags); +#else + uint32_t lo, hi; + asm_gcc() + a1(xgetbv) + asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi) + asm_gcc_end() + return ((uint64_t)lo | ((uint64_t)hi << 32)); +#endif +} +#endif // AVX support + +#if defined(SCRYPT_TEST_SPEED) +size_t cpu_detect_mask = (size_t)-1; +#endif + +static size_t +detect_cpu(void) { + union { uint8_t s[12]; uint32_t i[3]; } vendor_string; + cpu_vendors_x86 vendor = cpu_nobody; + x86_regs regs; + uint32_t max_level; + size_t cpu_flags = 0; +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) + uint64_t xgetbv_flags; +#endif + +#if defined(CPU_X86) + if (!has_cpuid()) + return cpu_flags; +#endif + + get_cpuid(&regs, 0); + max_level = regs.eax; + vendor_string.i[0] = regs.ebx; + vendor_string.i[1] = regs.edx; + vendor_string.i[2] = regs.ecx; + + if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12)) + vendor = cpu_intel; + else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12)) + vendor = cpu_amd; + + if (max_level & 0x00000500) { + /* "Intel P5 pre-B0" */ + cpu_flags |= cpu_mmx; + return cpu_flags; + } + + if (max_level < 1) + return cpu_flags; + + get_cpuid(&regs, 1); +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) + /* xsave/xrestore */ + if (regs.ecx & (1 << 27)) { + xgetbv_flags = get_xgetbv(0); + if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx; + } +#endif + if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2; + if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1; + if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3; + if (regs.ecx & (1 )) cpu_flags |= cpu_sse3; + if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2; + if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse; + if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx; + +#if defined(SCRYPT_TEST_SPEED) + cpu_flags &= cpu_detect_mask; +#endif + + return cpu_flags; +} + +#if defined(SCRYPT_TEST_SPEED) +static const char * +get_top_cpuflag_desc(size_t flag) { + if (flag & cpu_avx) return "AVX"; + else if (flag & cpu_sse4_2) return "SSE4.2"; + else if (flag & cpu_sse4_1) return "SSE4.1"; + else if (flag & cpu_ssse3) return "SSSE3"; + else if (flag & cpu_sse2) return "SSE2"; + else if (flag & cpu_sse) return "SSE"; + else if (flag & cpu_mmx) return "MMX"; + else return "Basic"; +} +#endif + +/* enable the highest system-wide option */ +#if defined(SCRYPT_CHOOSE_COMPILETIME) + 
#if !defined(__AVX__) + #undef X86_64ASM_AVX + #undef X86ASM_AVX + #undef X86_INTRINSIC_AVX + #endif + #if !defined(__SSSE3__) + #undef X86_64ASM_SSSE3 + #undef X86ASM_SSSE3 + #undef X86_INTRINSIC_SSSE3 + #endif + #if !defined(__SSE2__) + #undef X86_64ASM_SSE2 + #undef X86ASM_SSE2 + #undef X86_INTRINSIC_SSE2 + #endif +#endif + +#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ \ No newline at end of file diff --git a/scryptjane/scrypt-jane-portable.h b/scryptjane/scrypt-jane-portable.h new file mode 100644 index 0000000..33c8c2c --- /dev/null +++ b/scryptjane/scrypt-jane-portable.h @@ -0,0 +1,281 @@ +/* determine os */ +#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) + #include + #include + #define OS_WINDOWS +#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) + #include + #include + #include + + #define OS_SOLARIS +#else + #include + #include + #include /* need this to define BSD */ + #include + #include + + #define OS_NIX + #if defined(__linux__) + #include + #define OS_LINUX + #elif defined(BSD) + #define OS_BSD + + #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) + #define OS_OSX + #elif defined(macintosh) || defined(Macintosh) + #define OS_MAC + #elif defined(__OpenBSD__) + #define OS_OPENBSD + #endif + #endif +#endif + + +/* determine compiler */ +#if defined(_MSC_VER) + #define COMPILER_MSVC _MSC_VER + #if ((COMPILER_MSVC > 1200) || defined(_mm_free)) + #define COMPILER_MSVC6PP_AND_LATER + #endif + #if (COMPILER_MSVC >= 1500) + #define COMPILER_HAS_TMMINTRIN + #endif + + #pragma warning(disable : 4127) /* conditional expression is constant */ + #pragma warning(disable : 4100) /* unreferenced formal parameter */ + + #define _CRT_SECURE_NO_WARNINGS + #include + #include /* _rotl */ + #include + + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef signed int int32_t; + typedef unsigned __int64 uint64_t; + typedef signed __int64 int64_t; + + #define ROTL32(a,b) _rotl(a,b) + #define ROTR32(a,b) _rotr(a,b) + #define ROTL64(a,b) _rotl64(a,b) + #define ROTR64(a,b) _rotr64(a,b) + #undef NOINLINE + #define NOINLINE __declspec(noinline) + #undef INLINE + #define INLINE __forceinline + #undef FASTCALL + #define FASTCALL __fastcall + #undef CDECL + #define CDECL __cdecl + #undef STDCALL + #define STDCALL __stdcall + #undef NAKED + #define NAKED __declspec(naked) + #define MM16 __declspec(align(16)) +#endif +#if defined(__ICC) + #define COMPILER_INTEL +#endif +#if defined(__GNUC__) + #if (__GNUC__ >= 3) + #define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__ + #else + #define COMPILER_GCC_PATCHLEVEL 0 + #endif + #define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) + #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) + #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) + #define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b))) + #undef NOINLINE + #if (COMPILER_GCC >= 30000) + #define NOINLINE __attribute__((noinline)) + #else + #define NOINLINE + #endif + #undef INLINE + #if (COMPILER_GCC >= 30000) + #define INLINE __attribute__((always_inline)) + #else + #define INLINE inline + #endif + #undef FASTCALL + #if (COMPILER_GCC >= 30400) + #define FASTCALL __attribute__((fastcall)) + #else + #define FASTCALL + #endif + #undef CDECL + #define CDECL __attribute__((cdecl)) + #undef STDCALL + #define STDCALL __attribute__((stdcall)) + #define MM16 
__attribute__((aligned(16))) + #include +#endif +#if defined(__MINGW32__) || defined(__MINGW64__) + #define COMPILER_MINGW +#endif +#if defined(__PATHCC__) + #define COMPILER_PATHCC +#endif + +#define OPTIONAL_INLINE +#if defined(OPTIONAL_INLINE) + #undef OPTIONAL_INLINE + #define OPTIONAL_INLINE INLINE +#else + #define OPTIONAL_INLINE +#endif + +#define CRYPTO_FN NOINLINE STDCALL + +/* determine cpu */ +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64) + #define CPU_X86_64 +#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) + #define CPU_X86 500 +#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400)) + #define CPU_X86 400 +#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__) + #define CPU_X86 300 +#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64) + #define CPU_IA64 +#endif + +#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9) + #define CPU_SPARC + #if defined(__sparcv9) + #define CPU_SPARC64 + #endif +#endif + +#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64)) + #define CPU_64BITS + #undef FASTCALL + #define FASTCALL + #undef CDECL + #define CDECL + #undef STDCALL + #define STDCALL +#endif + +#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC) + #define CPU_PPC + #if defined(_ARCH_PWR7) + #define CPU_POWER7 + #elif defined(__64BIT__) + #define CPU_PPC64 + #else + #define CPU_PPC32 + #endif +#endif + +#if defined(__hppa__) || defined(__hppa) + #define CPU_HPPA +#endif + +#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) + #define CPU_ALPHA +#endif + +/* endian */ + +#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \ + (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \ + (defined(CPU_X86) || defined(CPU_X86_64)) || \ + (defined(vax) || defined(MIPSEL) || defined(_MIPSEL))) +#define CPU_LE +#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \ + (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \ + (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB)) +#define CPU_BE +#else + /* unknown endian! 
*/ +#endif + + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U8TO32_LE(p) \ + (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U32TO8_LE(p, v) \ + (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); + +#define U8TO64_BE(p) \ + (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) + +#define U8TO64_LE(p) \ + (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (uint32_t)((v) )); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); + +#define U32_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ + (v) = ((v) << 16) | ((v) >> 16); \ +} + +#define U64_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \ + (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ + (v) = ((v) << 32) | ((v) >> 32); \ +} + +static int +scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { + uint32_t differentbits = 0; + while (len--) + differentbits |= (*x++ ^ *y++); + return (1 & ((differentbits - 1) >> 8)); +} + +void +scrypt_ensure_zero(void *p, size_t len) { +#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) + __stosb((unsigned char *)p, 0, len); +#elif (defined(CPU_X86) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushl %%edi;\n" + "pushl %%ecx;\n" + "rep stosb;\n" + "popl %%ecx;\n" + "popl %%edi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushq %%rdi;\n" + "pushq %%rcx;\n" + "rep stosb;\n" + "popq %%rcx;\n" + "popq %%rdi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#else + volatile uint8_t *b = (volatile uint8_t *)p; + size_t i; + for (i = 0; i < len; i++) + b[i] = 0; +#endif +} + +#include "scrypt-jane-portable-x86.h" + diff --git a/scryptjane/scrypt-jane-romix-basic.h b/scryptjane/scrypt-jane-romix-basic.h new file mode 100644 index 0000000..ca1df02 --- /dev/null +++ b/scryptjane/scrypt-jane-romix-basic.h @@ -0,0 +1,67 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +/* function type returned by scrypt_getROMix, used with cpu detection */ +typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); +#endif + +/* romix pre/post nop function */ +static void STDCALL +scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { +} + +/* romix pre/post endian conversion function */ +static void STDCALL +scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { +#if !defined(CPU_LE) + static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; + size_t i; + if (endian_test.w == 0x100) { + nblocks *= SCRYPT_BLOCK_WORDS; + for (i = 0; i < nblocks; i++) { + SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); + } + } +#endif +} + +/* chunkmix test function */ +typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t 
*Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); + +static int +scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { + /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ + const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; + scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; + uint8_t final[16]; + size_t i; + + for (i = 0; i < words; i++) { + v = (scrypt_mix_word_t)i; + v = (v << 8) | v; + v = (v << 16) | v; + chunk[0][i] = v; + } + + prefn(chunk[0], blocks); + mixfn(chunk[1], chunk[0], NULL, r); + postfn(chunk[1], blocks); + + /* grab the last 16 bytes of the final block */ + for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { + SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); + } + + return scrypt_verify(expected, final, 16); +} + +/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ +static scrypt_mix_word_t * +scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { + return base + (i * len); +} + +/* returns a pointer to block i */ +static scrypt_mix_word_t * +scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { + return base + (i * SCRYPT_BLOCK_WORDS); +} diff --git a/scryptjane/scrypt-jane-romix-template.h b/scryptjane/scrypt-jane-romix-template.h new file mode 100644 index 0000000..2fd7674 --- /dev/null +++ b/scryptjane/scrypt-jane-romix-template.h @@ -0,0 +1,118 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) + +#if defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_ROMIX_FN +#define SCRYPT_ROMIX_FN scrypt_ROMix +#endif + +#undef SCRYPT_HAVE_ROMIX +#define SCRYPT_HAVE_ROMIX + +#if !defined(SCRYPT_CHUNKMIX_FN) + +#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic + +/* + Bout = ChunkMix(Bin) + + 2*r: number of blocks in the chunk +*/ +static void STDCALL +SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { + scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; + uint32_t i, j, blocksPerChunk = r * 2, half = 0; + + /* 1: X = B_{2r - 1} */ + block = scrypt_block(Bin, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] = block[i]; + + if (Bxor) { + block = scrypt_block(Bxor, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] ^= block[i]; + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + block = scrypt_block(Bin, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + + if (Bxor) { + block = scrypt_block(Bxor, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + } + SCRYPT_MIX_FN(X); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + block = scrypt_block(Bout, (i / 2) + half); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + block[j] = X[j]; + } +} +#endif + +/* + X = ROMix(X) + + X: chunk to mix + Y: scratch chunk + N: number of rounds + V[N]: array of chunks to randomly index in to + 2*r: number of blocks in a chunk +*/ + +static void NOINLINE FASTCALL +SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { + uint32_t i, j, chunkWords = 
SCRYPT_BLOCK_WORDS * r * 2; + scrypt_mix_word_t *block = V; + + SCRYPT_ROMIX_TANGLE_FN(X, r * 2); + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); + for (i = 0; i < N - 1; i++, block += chunkWords) { + /* 3: V_i = X */ + /* 4: X = H(X) */ + SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); + } + SCRYPT_CHUNKMIX_FN(X, block, NULL, r); + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j = Integerify(X) % N */ + j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); + + /* 7: j = Integerify(Y) % N */ + j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); + } + + /* 10: B' = X */ + /* implicit */ + + SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); +} + +#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ + + +#undef SCRYPT_CHUNKMIX_FN +#undef SCRYPT_ROMIX_FN +#undef SCRYPT_MIX_FN +#undef SCRYPT_ROMIX_TANGLE_FN +#undef SCRYPT_ROMIX_UNTANGLE_FN + diff --git a/scryptjane/scrypt-jane-romix.h b/scryptjane/scrypt-jane-romix.h new file mode 100644 index 0000000..faa655a --- /dev/null +++ b/scryptjane/scrypt-jane-romix.h @@ -0,0 +1,27 @@ +#if defined(SCRYPT_CHACHA) +#include "scrypt-jane-chacha.h" +#elif defined(SCRYPT_SALSA) +#include "scrypt-jane-salsa.h" +#elif defined(SCRYPT_SALSA64) +#include "scrypt-jane-salsa64.h" +#else + #define SCRYPT_MIX_BASE "ERROR" + typedef uint32_t scrypt_mix_word_t; + #define SCRYPT_WORDTO8_LE U32TO8_LE + #define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + #define SCRYPT_BLOCK_BYTES 64 + #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + #if !defined(SCRYPT_CHOOSE_COMPILETIME) + static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} + static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; } + #else + static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} + #endif + static int scrypt_test_mix() { return 0; } + #error must define a mix function! 
+#endif + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_MIX +#define SCRYPT_MIX SCRYPT_MIX_BASE +#endif diff --git a/scryptjane/scrypt-jane-salsa.h b/scryptjane/scrypt-jane-salsa.h new file mode 100644 index 0000000..0c1604b --- /dev/null +++ b/scryptjane/scrypt-jane-salsa.h @@ -0,0 +1,106 @@ +#define SCRYPT_MIX_BASE "Salsa20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_salsa-avx.h" +#include "scrypt-jane-mix_salsa-sse2.h" +#include "scrypt-jane-mix_salsa.h" + +#if defined(SCRYPT_SALSA_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_MIX_FN salsa_core_sse2 + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN salsa_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t flags = 0; + +#if defined(SCRYPT_SALSA_AVX) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_SALSA_SSE2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} diff --git a/scryptjane/scrypt-jane-test-vectors.h b/scryptjane/scrypt-jane-test-vectors.h new file mode 100644 index 0000000..a1e4c61 --- /dev/null +++ b/scryptjane/scrypt-jane-test-vectors.h @@ -0,0 +1,261 @@ +typedef struct scrypt_test_setting_t { + const char *pw, *salt; + uint8_t Nfactor, rfactor, pfactor; +} scrypt_test_setting; + +static const scrypt_test_setting post_settings[] = { + {"", "", 3, 0, 0}, + {"password", "NaCl", 9, 3, 4}, + {0} +}; + +#if 
defined(SCRYPT_SHA256) + #if defined(SCRYPT_SALSA) + /* sha256 + salsa20/8, the only 'official' test vectors! */ + static const uint8_t post_vectors[][64] = { + {0x77,0xd6,0x57,0x62,0x38,0x65,0x7b,0x20,0x3b,0x19,0xca,0x42,0xc1,0x8a,0x04,0x97, + 0xf1,0x6b,0x48,0x44,0xe3,0x07,0x4a,0xe8,0xdf,0xdf,0xfa,0x3f,0xed,0xe2,0x14,0x42, + 0xfc,0xd0,0x06,0x9d,0xed,0x09,0x48,0xf8,0x32,0x6a,0x75,0x3a,0x0f,0xc8,0x1f,0x17, + 0xe8,0xd3,0xe0,0xfb,0x2e,0x0d,0x36,0x28,0xcf,0x35,0xe2,0x0c,0x38,0xd1,0x89,0x06}, + {0xfd,0xba,0xbe,0x1c,0x9d,0x34,0x72,0x00,0x78,0x56,0xe7,0x19,0x0d,0x01,0xe9,0xfe, + 0x7c,0x6a,0xd7,0xcb,0xc8,0x23,0x78,0x30,0xe7,0x73,0x76,0x63,0x4b,0x37,0x31,0x62, + 0x2e,0xaf,0x30,0xd9,0x2e,0x22,0xa3,0x88,0x6f,0xf1,0x09,0x27,0x9d,0x98,0x30,0xda, + 0xc7,0x27,0xaf,0xb9,0x4a,0x83,0xee,0x6d,0x83,0x60,0xcb,0xdf,0xa2,0xcc,0x06,0x40} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xef,0x8f,0x44,0x8f,0xc3,0xef,0x78,0x13,0xb2,0x26,0xa7,0x2a,0x40,0xa1,0x98,0x7f, + 0xc8,0x7f,0x0d,0x5f,0x40,0x66,0xa2,0x05,0x07,0x4f,0xc7,0xac,0x3b,0x47,0x07,0x0c, + 0xf5,0x20,0x46,0x76,0x20,0x7b,0xee,0x51,0x6d,0x5f,0xfa,0x9c,0x27,0xac,0xa9,0x36, + 0x62,0xbd,0xde,0x0b,0xa3,0xc0,0x66,0x84,0xde,0x82,0xd0,0x1a,0xb4,0xd1,0xb5,0xfe}, + {0xf1,0x94,0xf7,0x5f,0x15,0x12,0x10,0x4d,0x6e,0xfb,0x04,0x8c,0x35,0xc4,0x51,0xb6, + 0x11,0x04,0xa7,0x9b,0xb0,0x46,0xaf,0x7b,0x47,0x39,0xf0,0xac,0xb2,0x8a,0xfa,0x45, + 0x09,0x86,0x8f,0x10,0x4b,0xc6,0xee,0x00,0x11,0x38,0x73,0x7a,0x6a,0xd8,0x25,0x67, + 0x85,0xa4,0x10,0x4e,0xa9,0x2f,0x15,0xfe,0xcf,0x63,0xe1,0xe8,0xcf,0xab,0xe8,0xbd} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xf4,0x87,0x29,0xf4,0xc3,0x31,0x8c,0xe8,0xdf,0xe5,0xd8,0x73,0xff,0xca,0x32,0xcf, + 0xd8,0xac,0xe7,0xf7,0x15,0xda,0x84,0x41,0x60,0x23,0x26,0x4a,0xc8,0x3e,0xee,0xa6, + 0xa5,0x6e,0x52,0xd6,0x64,0x55,0x16,0x31,0x3e,0x66,0x7b,0x65,0xd5,0xe2,0xc9,0x95, + 0x1b,0xf0,0x81,0x40,0xb7,0x2f,0xff,0xa6,0xe6,0x02,0xcc,0x63,0x08,0x4a,0x74,0x31}, + {0x7a,0xd8,0xad,0x02,0x9c,0xa5,0xf4,0x42,0x6a,0x29,0xd2,0xb5,0x53,0xf1,0x6d,0x1d, + 0x25,0xc8,0x70,0x48,0x80,0xb9,0xa3,0xf6,0x94,0xf8,0xfa,0xb8,0x52,0x42,0xcd,0x14, + 0x26,0x46,0x28,0x06,0xc7,0xf6,0x1f,0xa7,0x89,0x6d,0xc5,0xa0,0x36,0xcc,0xde,0xcb, + 0x73,0x0b,0xa4,0xe2,0xd3,0xd1,0x44,0x06,0x35,0x08,0xe0,0x35,0x5b,0xf8,0xd7,0xe7} + }; + #endif +#elif defined(SCRYPT_SHA512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0xae,0x54,0xe7,0x74,0xe4,0x51,0x6b,0x0f,0xe1,0xe7,0x28,0x03,0x17,0xe4,0x8c,0xfa, + 0x2f,0x66,0x55,0x7f,0xdc,0x3b,0x40,0xab,0x47,0x84,0xc9,0x63,0x36,0x07,0x9d,0xe5, + 0x86,0x43,0x95,0x89,0xb6,0xc0,0x6c,0x72,0x64,0x00,0xc1,0x2a,0xd7,0x69,0x21,0x92, + 0x8e,0xba,0xa4,0x59,0x9f,0x00,0x14,0x3a,0x7c,0x12,0x58,0x91,0x09,0xa0,0x32,0xfe}, + {0xc5,0xb3,0xd6,0xea,0x0a,0x4b,0x1e,0xcc,0x40,0x00,0xe5,0x98,0x5c,0xdc,0x06,0x06, + 0x78,0x34,0x92,0x16,0xcf,0xe4,0x9f,0x03,0x96,0x2d,0x41,0x35,0x00,0x9b,0xff,0x74, + 0x60,0x19,0x6e,0xe6,0xa6,0x46,0xf7,0x37,0xcb,0xfa,0xd0,0x9f,0x80,0x72,0x2e,0x85, + 0x13,0x3e,0x1a,0x91,0x90,0x53,0xa1,0x33,0x85,0x51,0xdc,0x62,0x1c,0x0e,0x4d,0x30} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xe2,0x05,0x7c,0x44,0xf9,0x55,0x9f,0x64,0xbe,0xd5,0x7f,0x85,0x69,0xc7,0x8c,0x7f, + 0x2b,0x91,0xd6,0x9a,0x6c,0xf8,0x57,0x55,0x61,0x25,0x3d,0xee,0xb8,0xd5,0x8c,0xdc, + 0x2d,0xd5,0x53,0x84,0x8c,0x06,0xaa,0x37,0x77,0xa6,0xf0,0xf1,0x35,0xfe,0xb5,0xcb, + 0x61,0xd7,0x2c,0x67,0xf3,0x7e,0x8a,0x1b,0x04,0xa3,0xa3,0x43,0xa2,0xb2,0x29,0xf2}, + 
{0x82,0xda,0x29,0xb2,0x08,0x27,0xfc,0x78,0x22,0xc4,0xb8,0x7e,0xbc,0x36,0xcf,0xcd,
+ 0x17,0x4b,0xa1,0x30,0x16,0x4a,0x25,0x70,0xc7,0xcb,0xe0,0x2b,0x56,0xd3,0x16,0x4e,
+ 0x85,0xb6,0x84,0xe7,0x9b,0x7f,0x8b,0xb5,0x94,0x33,0xcf,0x33,0x44,0x65,0xc8,0xa1,
+ 0x46,0xf9,0xf5,0xfc,0x74,0x29,0x7e,0xd5,0x46,0xec,0xbd,0x95,0xc1,0x80,0x24,0xe4}
+ };
+ #elif defined(SCRYPT_SALSA64)
+ static const uint8_t post_vectors[][64] = {
+ {0xa6,0xcb,0x77,0x9a,0x64,0x1f,0x95,0x02,0x53,0xe7,0x5c,0x78,0xdb,0xa3,0x43,0xff,
+ 0xbe,0x10,0x4c,0x7b,0xe4,0xe1,0x91,0xcf,0x67,0x69,0x5a,0x2c,0x12,0xd6,0x99,0x49,
+ 0x92,0xfd,0x5a,0xaa,0x12,0x4c,0x2e,0xf6,0x95,0x46,0x8f,0x5e,0x77,0x62,0x16,0x29,
+ 0xdb,0xe7,0xab,0x02,0x2b,0x9c,0x35,0x03,0xf8,0xd4,0x04,0x7d,0x2d,0x73,0x85,0xf1},
+ {0x54,0xb7,0xca,0xbb,0xaf,0x0f,0xb0,0x5f,0xb7,0x10,0x63,0x48,0xb3,0x15,0xd8,0xb5,
+ 0x62,0x64,0x89,0x6a,0x59,0xc6,0x0f,0x86,0x96,0x38,0xf0,0xcf,0xd4,0x62,0x90,0x61,
+ 0x7d,0xce,0xd6,0x13,0x85,0x67,0x4a,0xf5,0x32,0x03,0x74,0x30,0x0b,0x5a,0x2f,0x86,
+ 0x82,0x6e,0x0c,0x3e,0x40,0x7a,0xde,0xbe,0x42,0x6e,0x80,0x2b,0xaf,0xdb,0xcc,0x94}
+ };
+ #endif
+#elif defined(SCRYPT_BLAKE512)
+ #if defined(SCRYPT_SALSA)
+ static const uint8_t post_vectors[][64] = {
+ {0x4a,0x48,0xb3,0xfa,0xdc,0xb0,0xb8,0xdb,0x54,0xee,0xf3,0x5c,0x27,0x65,0x6c,0x20,
+ 0xab,0x61,0x9a,0x5b,0xd5,0x1d,0xd9,0x95,0xab,0x88,0x0e,0x4d,0x1e,0x71,0x2f,0x11,
+ 0x43,0x2e,0xef,0x23,0xca,0x8a,0x49,0x3b,0x11,0x38,0xa5,0x28,0x61,0x2f,0xb7,0x89,
+ 0x5d,0xef,0x42,0x4c,0xc1,0x74,0xea,0x8a,0x56,0xbe,0x4a,0x82,0x76,0x15,0x1a,0x87},
+ {0x96,0x24,0xbf,0x40,0xeb,0x03,0x8e,0xfe,0xc0,0xd5,0xa4,0x81,0x85,0x7b,0x09,0x88,
+ 0x52,0xb5,0xcb,0xc4,0x48,0xe1,0xb9,0x1d,0x3f,0x8b,0x3a,0xc6,0x38,0x32,0xc7,0x55,
+ 0x30,0x28,0x7a,0x42,0xa9,0x5d,0x54,0x33,0x62,0xf3,0xd9,0x3c,0x96,0x40,0xd1,0x80,
+ 0xe4,0x0e,0x7e,0xf0,0x64,0x53,0xfe,0x7b,0xd7,0x15,0xba,0xad,0x16,0x80,0x01,0xb5}
+ };
+ #elif defined(SCRYPT_CHACHA)
+ static const uint8_t post_vectors[][64] = {
+ {0x45,0x42,0x22,0x31,0x26,0x13,0x5f,0x94,0xa4,0x00,0x04,0x47,0xe8,0x50,0x6d,0xd6,
+ 0xdd,0xd5,0x08,0xd4,0x90,0x64,0xe0,0x59,0x70,0x46,0xff,0xfc,0x29,0xb3,0x6a,0xc9,
+ 0x4d,0x45,0x97,0x95,0xa8,0xf0,0x53,0xe7,0xee,0x4b,0x6b,0x5d,0x1e,0xa5,0xb2,0x58,
+ 0x4b,0x93,0xc9,0x89,0x4c,0xa8,0xab,0x03,0x74,0x38,0xbd,0x54,0x97,0x6b,0xab,0x4a},
+ {0x4b,0x4a,0x63,0x96,0x73,0x34,0x9f,0x39,0x64,0x51,0x0e,0x2e,0x3b,0x07,0xd5,0x1c,
+ 0xd2,0xf7,0xce,0x60,0xab,0xac,0x89,0xa4,0x16,0x0c,0x58,0x82,0xb3,0xd3,0x25,0x5b,
+ 0xd5,0x62,0x32,0xf4,0x86,0x5d,0xb2,0x4b,0xbf,0x8e,0xc6,0xc0,0xac,0x40,0x48,0xb4,
+ 0x69,0x08,0xba,0x40,0x4b,0x07,0x2a,0x13,0x9c,0x98,0x3b,0x8b,0x20,0x0c,0xac,0x9e}
+ };
+ #elif defined(SCRYPT_SALSA64)
+ static const uint8_t post_vectors[][64] = {
+ {0xcb,0x4b,0xc2,0xd1,0xf4,0x77,0x32,0x3c,0x42,0x9d,0xf7,0x7d,0x1f,0x22,0x64,0xa4,
+ 0xe2,0x88,0x30,0x2d,0x54,0x9d,0xb6,0x26,0x89,0x25,0x30,0xc3,0x3d,0xdb,0xba,0x99,
+ 0xe9,0x8e,0x1e,0x5e,0x57,0x66,0x75,0x7c,0x24,0xda,0x00,0x6f,0x79,0xf7,0x47,0xf5,
+ 0xea,0x40,0x70,0x37,0xd2,0x91,0xc7,0x4d,0xdf,0x46,0xb6,0x3e,0x95,0x7d,0xcb,0xc1},
+ {0x25,0xc2,0xcb,0x7f,0xc8,0x50,0xb7,0x0b,0x11,0x9e,0x1d,0x10,0xb2,0xa8,0x35,0x23,
+ 0x91,0x39,0xfb,0x45,0xf2,0xbf,0xe4,0xd0,0x84,0xec,0x72,0x33,0x6d,0x09,0xed,0x41,
+ 0x9a,0x7e,0x4f,0x10,0x73,0x97,0x22,0x76,0x58,0x93,0x39,0x24,0xdf,0xd2,0xaa,0x2f,
+ 0x6b,0x2b,0x64,0x48,0xa5,0xb7,0xf5,0x56,0x77,0x02,0xa7,0x71,0x46,0xe5,0x0e,0x8d},
+ };
+ #endif
+#elif defined(SCRYPT_BLAKE256)
+ #if defined(SCRYPT_SALSA)
+ static const uint8_t post_vectors[][64] = {
+ {0xf1,0xf1,0x91,0x1a,0x81,0xe6,0x9f,0xc1,0xce,0x43,0xab,0xb1,0x1a,0x02,0x1e,0x16,
+ 0x08,0xc6,0xf9,0x00,0x50,0x1b,0x6d,0xf1,0x31,0x06,0x95,0x48,0x5d,0xf7,0x6c,0x00,
+ 0xa2,0x4c,0xb1,0x0e,0x52,0x66,0x94,0x7e,0x84,0xfc,0xa5,0x34,0xfd,0xf0,0xe9,0x57,
+ 0x85,0x2d,0x8c,0x05,0x5c,0x0f,0x04,0xd4,0x8d,0x3e,0x13,0x52,0x3d,0x90,0x2d,0x2c},
+ {0xd5,0x42,0xd2,0x7b,0x06,0xae,0x63,0x90,0x9e,0x30,0x00,0x0e,0xd8,0xa4,0x3a,0x0b,
+ 0xee,0x4a,0xef,0xb2,0xc4,0x95,0x0d,0x72,0x07,0x70,0xcc,0xa3,0xf9,0x1e,0xc2,0x75,
+ 0xcf,0xaf,0xe1,0x44,0x1c,0x8c,0xe2,0x3e,0x0c,0x81,0xf3,0x92,0xe1,0x13,0xe6,0x4f,
+ 0x2d,0x27,0xc3,0x87,0xe5,0xb6,0xf9,0xd7,0x02,0x04,0x37,0x64,0x78,0x36,0x6e,0xb3}
+ };
+ #elif defined(SCRYPT_CHACHA)
+ static const uint8_t post_vectors[][64] = {
+ {0xad,0x1b,0x4b,0xca,0xe3,0x26,0x1a,0xfd,0xb7,0x77,0x8c,0xde,0x8d,0x26,0x14,0xe1,
+ 0x54,0x38,0x42,0xf3,0xb3,0x66,0x29,0xf9,0x90,0x04,0xf1,0x82,0x7c,0x5a,0x6f,0xa8,
+ 0x7d,0xd6,0x08,0x0d,0x8b,0x78,0x04,0xad,0x31,0xea,0xd4,0x87,0x2d,0xf7,0x74,0x9a,
+ 0xe5,0xce,0x97,0xef,0xa3,0xbb,0x90,0x46,0x7c,0xf4,0x51,0x38,0xc7,0x60,0x53,0x21},
+ {0x39,0xbb,0x56,0x3d,0x0d,0x7b,0x74,0x82,0xfe,0x5a,0x78,0x3d,0x66,0xe8,0x3a,0xdf,
+ 0x51,0x6f,0x3e,0xf4,0x86,0x20,0x8d,0xe1,0x81,0x22,0x02,0xf7,0x0d,0xb5,0x1a,0x0f,
+ 0xfc,0x59,0xb6,0x60,0xc9,0xdb,0x38,0x0b,0x5b,0x95,0xa5,0x94,0xda,0x42,0x2d,0x90,
+ 0x47,0xeb,0x73,0x31,0x9f,0x20,0xf6,0x81,0xc2,0xef,0x33,0x77,0x51,0xd8,0x2c,0xe4}
+ };
+ #elif defined(SCRYPT_SALSA64)
+ static const uint8_t post_vectors[][64] = {
+ {0x9e,0xf2,0x60,0x7c,0xbd,0x7c,0x19,0x5c,0x79,0xc6,0x1b,0x7e,0xb0,0x65,0x1b,0xc3,
+ 0x70,0x0d,0x89,0xfc,0x72,0xb2,0x03,0x72,0x15,0xcb,0x8e,0x8c,0x49,0x50,0x4c,0x27,
+ 0x99,0xda,0x47,0x32,0x5e,0xb4,0xa2,0x07,0x83,0x51,0x6b,0x06,0x37,0x60,0x42,0xc4,
+ 0x59,0x49,0x99,0xdd,0xc0,0xd2,0x08,0x94,0x7f,0xe3,0x9e,0x4e,0x43,0x8e,0x5b,0xba},
+ {0x86,0x6f,0x3b,0x11,0xb8,0xca,0x4b,0x6e,0xa7,0x6f,0xc2,0xc9,0x33,0xb7,0x8b,0x9f,
+ 0xa3,0xb9,0xf5,0xb5,0x62,0xa6,0x17,0x66,0xe4,0xc3,0x9d,0x9b,0xca,0x51,0xb0,0x2f,
+ 0xda,0x09,0xc1,0x77,0xed,0x8b,0x89,0xc2,0x69,0x5a,0x34,0x05,0x4a,0x1f,0x4d,0x76,
+ 0xcb,0xd5,0xa4,0x78,0xfa,0x1b,0xb9,0x5b,0xbc,0x3d,0xce,0x04,0x63,0x99,0xad,0x54}
+ };
+ #endif
+#elif defined(SCRYPT_SKEIN512)
+ #if defined(SCRYPT_SALSA)
+ static const uint8_t post_vectors[][64] = {
+ {0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69,
+ 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87,
+ 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f,
+ 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e},
+ {0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e,
+ 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b,
+ 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb,
+ 0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00}
+ };
+ #elif defined(SCRYPT_CHACHA)
+ static const uint8_t post_vectors[][64] = {
+ {0xd1,0x12,0x6d,0x64,0x10,0x0e,0x98,0x6c,0xbe,0x70,0x21,0xd9,0xc6,0x04,0x62,0xa4,
+ 0x29,0x13,0x9a,0x3c,0xf8,0xe9,0x1e,0x87,0x9f,0x88,0xf4,0x98,0x01,0x41,0x8e,0xce,
+ 0x60,0xf7,0xbe,0x17,0x0a,0xec,0xd6,0x30,0x80,0xcf,0x6b,0x1e,0xcf,0x95,0xa0,0x4d,
+ 0x37,0xed,0x3a,0x09,0xd1,0xeb,0x0c,0x80,0x82,0x22,0x8e,0xd3,0xb1,0x7f,0xd6,0xa8},
+ {0x5c,0x5c,0x05,0xe2,0x75,0xa5,0xa4,0xec,0x81,0x97,0x9c,0x5b,0xd7,0x26,0xb3,0x16,
+ 0xb4,0x02,0x8c,0x56,0xe6,0x32,0x57,0x33,0x47,0x19,0x06,0x6c,0xde,0x68,0x41,0x37,
+ 0x5b,0x7d,0xa7,0xb3,0x73,0xeb,0x82,0xca,0x0f,0x86,0x2e,0x6b,0x47,0xa2,0x70,0x39,
+ 0x35,0xfd,0x2d,0x2e,0x7b,0xc3,0x68,0xbb,0x52,0x42,0x19,0x3b,0x78,0x96,0xe7,0xc8}
+ };
+ #elif defined(SCRYPT_SALSA64)
+ static const uint8_t post_vectors[][64] = {
+ {0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
+ 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
+ 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
+ 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
+ {0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
+ 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
+ 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
+ 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
+ };
+ #endif
+#elif defined(SCRYPT_KECCAK512)
+ #if defined(SCRYPT_SALSA)
+ static const uint8_t post_vectors[][64] = {
+ {0xc2,0x7b,0xbe,0x1d,0xf1,0x99,0xd8,0xe7,0x1b,0xac,0xe0,0x9d,0xeb,0x5a,0xfe,0x21,
+ 0x71,0xff,0x41,0x51,0x4f,0xbe,0x41,0x01,0x15,0xe2,0xb7,0xb9,0x55,0x15,0x25,0xa1,
+ 0x40,0x4c,0x66,0x29,0x32,0xb7,0xc9,0x62,0x60,0x88,0xe0,0x99,0x39,0xae,0xce,0x25,
+ 0x3c,0x11,0x89,0xdd,0xc6,0x14,0xd7,0x3e,0xa3,0x6d,0x07,0x2e,0x56,0xa0,0xff,0x97},
+ {0x3c,0x91,0x12,0x4a,0x37,0x7d,0xd6,0x96,0xd2,0x9b,0x5d,0xea,0xb8,0xb9,0x82,0x4e,
+ 0x4f,0x6b,0x60,0x4c,0x59,0x01,0xe5,0x73,0xfd,0xf6,0xb8,0x9a,0x5a,0xd3,0x7c,0x7a,
+ 0xd2,0x4f,0x8e,0x74,0xc1,0x90,0x88,0xa0,0x3f,0x55,0x75,0x79,0x10,0xd0,0x09,0x79,
+ 0x0f,0x6c,0x74,0x0c,0x05,0x08,0x3c,0x8c,0x94,0x7b,0x30,0x56,0xca,0xdf,0xdf,0x34}
+ };
+ #elif defined(SCRYPT_CHACHA)
+ static const uint8_t post_vectors[][64] = {
+ {0x77,0xcb,0x70,0xbf,0xae,0xd4,0x4c,0x5b,0xbc,0xd3,0xec,0x8a,0x82,0x43,0x8d,0xb3,
+ 0x7f,0x1f,0xfb,0x70,0x36,0x32,0x4d,0xa6,0xb7,0x13,0x37,0x77,0x30,0x0c,0x3c,0xfb,
+ 0x2c,0x20,0x8f,0x2a,0xf4,0x47,0x4d,0x69,0x8e,0xae,0x2d,0xad,0xba,0x35,0xe9,0x2f,
+ 0xe6,0x99,0x7a,0xf8,0xcf,0x70,0x78,0xbb,0x0c,0x72,0x64,0x95,0x8b,0x36,0x77,0x3d},
+ {0xc6,0x43,0x17,0x16,0x87,0x09,0x5f,0x12,0xed,0x21,0xe2,0xb4,0xad,0x55,0xa1,0xa1,
+ 0x49,0x50,0x90,0x70,0xab,0x81,0x83,0x7a,0xcd,0xdf,0x23,0x52,0x19,0xc0,0xa2,0xd8,
+ 0x8e,0x98,0xeb,0xf0,0x37,0xab,0xad,0xfd,0x1c,0x04,0x97,0x18,0x42,0x85,0xf7,0x4b,
+ 0x18,0x2c,0x55,0xd3,0xa9,0xe6,0x89,0xfb,0x58,0x0a,0xb2,0x37,0xb9,0xf8,0xfb,0xc5}
+ };
+ #elif defined(SCRYPT_SALSA64)
+ static const uint8_t post_vectors[][64] = {
+ {0xc7,0x34,0x95,0x02,0x5e,0x31,0x0d,0x1f,0x10,0x38,0x9c,0x3f,0x04,0x53,0xed,0x05,
+ 0x27,0x38,0xc1,0x3f,0x6a,0x0f,0xc5,0xa3,0x9b,0x73,0x8a,0x28,0x7e,0x5d,0x3c,0xdc,
+ 0x9d,0x5a,0x09,0xbf,0x8c,0x0a,0xad,0xe4,0x73,0x52,0xe3,0x6d,0xaa,0xd1,0x8b,0xbf,
+ 0xa3,0xb7,0xf0,0x58,0xad,0x22,0x24,0xc9,0xaa,0x96,0xb7,0x5d,0xfc,0x5f,0xb0,0xcf},
+ {0x76,0x22,0xfd,0xe8,0xa2,0x79,0x8e,0x9d,0x43,0x8c,0x7a,0xba,0x78,0xb7,0x84,0xf1,
+ 0xc8,0xee,0x3b,0xae,0x31,0x89,0xbf,0x7e,0xd0,0x4b,0xc1,0x2d,0x58,0x5d,0x84,0x6b,
+ 0xec,0x86,0x56,0xe0,0x87,0x94,0x7f,0xbc,0xf9,0x48,0x92,0xef,0x54,0x7f,0x23,0x8d,
+ 0x4f,0x8b,0x0a,0x75,0xa7,0x39,0x0e,0x46,0x6e,0xee,0x58,0xc8,0xfa,0xea,0x90,0x53}
+ };
+ #endif
+#elif defined(SCRYPT_KECCAK256)
+ #if defined(SCRYPT_SALSA)
+ static const uint8_t post_vectors[][64] = {
+ {0x2e,0x96,0xd8,0x87,0x45,0xcd,0xd6,0xc8,0xf6,0xd2,0x87,0x33,0x50,0xc7,0x04,0xe5,
+ 0x3c,0x4b,0x48,0x44,0x57,0xc1,0x74,0x09,0x76,0x02,0xaa,0xd3,0x7b,0xf3,0xbf,0xed,
+ 0x4b,0x72,0xd7,0x1b,0x49,0x6b,0xe0,0x44,0x83,0xee,0x8f,0xaf,0xa1,0xb5,0x33,0xa9,
+ 0x9e,0x86,0xab,0xe2,0x9f,0xcf,0x68,0x6e,0x7e,0xbd,0xf5,0x7a,0x83,0x4b,0x1c,0x10},
+ {0x42,0x7e,0xf9,0x4b,0x72,0x61,0xda,0x2d,0xb3,0x27,0x0e,0xe1,0xd9,0xde,0x5f,0x3e,
+ 0x64,0x2f,0xd6,0xda,0x90,0x59,0xce,0xbf,0x02,0x5b,0x32,0xf7,0x6d,0x94,0x51,0x7b,
+ 0xb6,0xa6,0x0d,0x99,0x3e,0x7f,0x39,0xbe,0x1b,0x1d,0x6c,0x97,0x12,0xd8,0xb7,0xfd,
+ 0x5b,0xb5,0xf3,0x73,0x5a,0x89,0xb2,0xdd,0xcc,0x3d,0x74,0x2e,0x3d,0x9e,0x3c,0x22}
+ };
+ #elif defined(SCRYPT_CHACHA)
+ static const uint8_t post_vectors[][64] = {
+ {0x76,0x1d,0x5b,0x8f,0xa9,0xe1,0xa6,0x01,0xcb,0xc5,0x7a,0x5f,0x02,0x23,0xb6,0x82,
+ 0x57,0x79,0x60,0x2f,0x05,0x7f,0xb8,0x0a,0xcb,0x5e,0x54,0x11,0x49,0x2e,0xdd,0x85,
+ 0x83,0x30,0x67,0xb3,0x24,0x5c,0xce,0xfc,0x32,0xcf,0x12,0xc3,0xff,0xe0,0x79,0x36,
+ 0x74,0x17,0xa6,0x3e,0xcd,0xa0,0x7e,0xcb,0x37,0xeb,0xcb,0xb6,0xe1,0xb9,0xf5,0x15},
+ {0xf5,0x66,0xa7,0x4c,0xe4,0xdc,0x18,0x56,0x2f,0x3e,0x86,0x4d,0x92,0xa5,0x5c,0x5a,
+ 0x8f,0xc3,0x6b,0x32,0xdb,0xe5,0x72,0x50,0x84,0xfc,0x6e,0x5d,0x15,0x77,0x3d,0xca,
+ 0xc5,0x2b,0x20,0x3c,0x78,0x37,0x80,0x78,0x23,0x56,0x91,0xa0,0xce,0xa4,0x06,0x5a,
+ 0x7f,0xe3,0xbf,0xab,0x51,0x57,0x32,0x2c,0x0a,0xf0,0xc5,0x6f,0xf4,0xcb,0xff,0x42}
+ };
+ #elif defined(SCRYPT_SALSA64)
+ static const uint8_t post_vectors[][64] = {
+ {0xb0,0xb7,0x10,0xb5,0x1f,0x2b,0x7f,0xaf,0x9d,0x95,0x5f,0x4c,0x2d,0x98,0x7c,0xc1,
+ 0xbc,0x37,0x2f,0x50,0x8d,0xb2,0x9f,0xfd,0x48,0x0d,0xe0,0x44,0x19,0xdf,0x28,0x6c,
+ 0xab,0xbf,0x1e,0x17,0x26,0xcc,0x57,0x95,0x18,0x17,0x83,0x4c,0x12,0x48,0xd9,0xee,
+ 0x4b,0x00,0x29,0x06,0x31,0x01,0x6b,0x8c,0x26,0x39,0xbf,0xe4,0xe4,0xd4,0x6a,0x26},
+ {0xa0,0x40,0xb2,0xf2,0x11,0xb6,0x5f,0x3d,0x4c,0x1e,0xef,0x59,0xd4,0x98,0xdb,0x14,
+ 0x01,0xff,0xe3,0x34,0xd7,0x19,0xcd,0xeb,0xde,0x52,0x1c,0xf4,0x86,0x43,0xc9,0xe2,
+ 0xfb,0xf9,0x4f,0x0a,0xbb,0x1f,0x5c,0x6a,0xdf,0xb9,0x28,0xfa,0xac,0xc4,0x48,0xed,
+ 0xcc,0xd2,0x2e,0x25,0x5f,0xf3,0x56,0x1d,0x2d,0x23,0x22,0xc1,0xbc,0xff,0x78,0x80}
+ };
+ #endif
+#else
+ static const uint8_t post_vectors[][64] = {{0}};
+#endif
+
diff --git a/scryptn.h b/scryptn.h
index ba461a2..f14a826 100644
--- a/scryptn.h
+++ b/scryptn.h
@@ -1,6 +1,6 @@
-#ifndef SCRYPT_H
-#define SCRYPT_H
-#include 
+#ifndef SCRYPTN_H
+#define SCRYPTN_H
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -13,4 +13,4 @@ void scrypt_N_1_1_256_sp(const char* input, char* output, char* scratchpad, uint
 }
 #endif
 
-#endif
+#endif
\ No newline at end of file
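
Note on the scryptn.h hunk above: the only substantive change is renaming the include guard from SCRYPT_H to SCRYPTN_H (and dropping the now-empty include line). The sketch below is purely illustrative, with hypothetical header names that are not files from this patch; it shows the failure mode a duplicated guard macro causes when two headers end up in the same translation unit, which is what a unique guard avoids.

    /* sketch only -- hypothetical headers, not code from this repository */

    /* hash_a.h */
    #ifndef SCRYPT_H
    #define SCRYPT_H
    void hash_a(const char *input, char *output);
    #endif

    /* hash_b.h -- mistakenly reuses SCRYPT_H; if hash_a.h was included
       first, everything between #ifndef and #endif here is skipped, so
       hash_b() is never declared and call sites fail to compile (C++)
       or fall back to implicit declarations (C89) */
    #ifndef SCRYPT_H
    #define SCRYPT_H
    void hash_b(const char *input, char *output);
    #endif

    /* giving each header its own guard, as the patch does with
       SCRYPTN_H, keeps both sets of declarations visible */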