Cryptonight optimizations

master
lucas 2014-05-18 15:00:56 +01:00
parent edd4fa7b5c
commit 1760c4476b
6 changed files with 266 additions and 83 deletions

View File

@ -38,7 +38,8 @@
"crypto/c_blake256.c",
"crypto/c_jh.c",
"crypto/c_skein.c",
"crypto/hash.c"
"crypto/hash.c",
"crypto/aesb.c"
]
}
]

177
crypto/aesb.c 100644
View File

@ -0,0 +1,177 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation.
This software is provided 'as is' with no explicit or implied warranties
in respect of its operation, including, but not limited to, correctness
and fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
*/
#include <stdint.h>
#if defined(__cplusplus)
extern "C"
{
#endif
#define TABLE_ALIGN 32
#define WPOLY 0x011b
#define N_COLS 4
#define AES_BLOCK_SIZE 16
#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2))
#if defined(_MSC_VER)
#define ALIGN __declspec(align(TABLE_ALIGN))
#elif defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(16)))
#else
#define ALIGN
#endif
#define rf1(r,c) (r)
#define word_in(x,c) (*((uint32_t*)(x)+(c)))
#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v))
#define s(x,c) x[c]
#define si(y,x,c) (s(y,c) = word_in(x, c))
#define so(y,x,c) word_out(y, c, s(x,c))
#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3)
#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3)
#define to_byte(x) ((x) & 0xff)
#define bval(x,n) to_byte((x) >> (8 * (n)))
#define fwd_var(x,r,c)\
( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
: r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\
: r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
: ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2)))
#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c))
#define sb_data(w) {\
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
#define rc_data(w) {\
w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\
w(0x1b), w(0x36) }
#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
#define h0(x) (x)
#define w0(p) bytes2word(p, 0, 0, 0)
#define w1(p) bytes2word(0, p, 0, 0)
#define w2(p) bytes2word(0, 0, p, 0)
#define w3(p) bytes2word(0, 0, 0, p)
#define u0(p) bytes2word(f2(p), p, p, f3(p))
#define u1(p) bytes2word(f3(p), f2(p), p, p)
#define u2(p) bytes2word(p, f3(p), f2(p), p)
#define u3(p) bytes2word(p, p, f3(p), f2(p))
#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p))
#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p))
#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p))
#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p))
#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY))
#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ (((x>>5) & 4) * WPOLY))
#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
#define t_dec(m,n) t_##m##n
#define t_set(m,n) t_##m##n
#define t_use(m,n) t_##m##n
#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) }
#define four_tables(x,tab,vf,rf,c) \
(tab[0][bval(vf(x,0,c),rf(0,c))] \
^ tab[1][bval(vf(x,1,c),rf(1,c))] \
^ tab[2][bval(vf(x,2,c),rf(2,c))] \
^ tab[3][bval(vf(x,3,c),rf(3,c))])
d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3);
void aesb_single_round(const uint8_t *in, uint8_t *out, uint8_t *expandedKey)
{
uint32_t b0[4], b1[4];
const uint32_t *kp = (uint32_t *) expandedKey;
state_in(b0, in);
round(fwd_rnd, b1, b0, kp);
state_out(out, b1);
}
void aesb_pseudo_round(const uint8_t *in, uint8_t *out, uint8_t *expandedKey)
{
uint32_t b0[4], b1[4];
const uint32_t *kp = (uint32_t *) expandedKey;
state_in(b0, in);
round(fwd_rnd, b1, b0, kp);
round(fwd_rnd, b0, b1, kp + 1 * N_COLS);
round(fwd_rnd, b1, b0, kp + 2 * N_COLS);
round(fwd_rnd, b0, b1, kp + 3 * N_COLS);
round(fwd_rnd, b1, b0, kp + 4 * N_COLS);
round(fwd_rnd, b0, b1, kp + 5 * N_COLS);
round(fwd_rnd, b1, b0, kp + 6 * N_COLS);
round(fwd_rnd, b0, b1, kp + 7 * N_COLS);
round(fwd_rnd, b1, b0, kp + 8 * N_COLS);
round(fwd_rnd, b0, b1, kp + 9 * N_COLS);
state_out(out, b0);
}
#if defined(__cplusplus)
}
#endif

View File

@ -39,9 +39,9 @@ extern "C" {
//#define OAES_HAVE_ISAAC 1
//#endif // OAES_HAVE_ISAAC
#ifndef OAES_DEBUG
#define OAES_DEBUG 0
#endif // OAES_DEBUG
//#ifndef OAES_DEBUG
//#define OAES_DEBUG 0
//#endif // OAES_DEBUG
#ifdef __cplusplus
}

View File

@ -64,31 +64,6 @@ static const char _NR[] = {
# define min(a,b) (((a)<(b)) ? (a) : (b))
#endif /* min */
typedef struct _oaes_key
{
size_t data_len;
uint8_t *data;
size_t exp_data_len;
uint8_t *exp_data;
size_t num_keys;
size_t key_base;
} oaes_key;
typedef struct _oaes_ctx
{
#ifdef OAES_HAVE_ISAAC
randctx * rctx;
#endif // OAES_HAVE_ISAAC
#ifdef OAES_DEBUG
oaes_step_cb step_cb;
#endif // OAES_DEBUG
oaes_key * key;
OAES_OPTION options;
uint8_t iv[OAES_BLOCK_SIZE];
} oaes_ctx;
// "OAES<8-bit header version><8-bit type><16-bit options><8-bit flags><56-bit reserved>"
static uint8_t oaes_header[OAES_BLOCK_SIZE] = {
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f,

View File

@ -102,6 +102,30 @@ typedef int ( * oaes_step_cb ) (
typedef uint16_t OAES_OPTION;
typedef struct _oaes_key
{
size_t data_len;
uint8_t *data;
size_t exp_data_len;
uint8_t *exp_data;
size_t num_keys;
size_t key_base;
} oaes_key;
typedef struct _oaes_ctx
{
#ifdef OAES_HAVE_ISAAC
randctx * rctx;
#endif // OAES_HAVE_ISAAC
#ifdef OAES_DEBUG
oaes_step_cb step_cb;
#endif // OAES_DEBUG
oaes_key * key;
OAES_OPTION options;
uint8_t iv[OAES_BLOCK_SIZE];
} oaes_ctx;
/*
* // usage:
*

View File

@ -50,19 +50,25 @@ static void (* const extra_hashes[4])(const void *, size_t, char *) = {
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
};
static size_t e2i(const uint8_t* a, size_t count) {
return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (count - 1);
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
static inline size_t e2i(const uint8_t* a) {
return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (MEMORY / AES_BLOCK_SIZE - 1);
}
static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) {
uint64_t a0, b0;
uint64_t hi, lo;
((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res);
}
a0 = SWAP64LE(((uint64_t*) a)[0]);
b0 = SWAP64LE(((uint64_t*) b)[0]);
lo = mul128(a0, b0, &hi);
((uint64_t*) res)[0] = SWAP64LE(hi);
((uint64_t*) res)[1] = SWAP64LE(lo);
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
}
static void sum_half_blocks(uint8_t* a, const uint8_t* b) {
@ -78,8 +84,9 @@ static void sum_half_blocks(uint8_t* a, const uint8_t* b) {
((uint64_t*) a)[1] = SWAP64LE(a1);
}
static void copy_block(uint8_t* dst, const uint8_t* src) {
memcpy(dst, src, AES_BLOCK_SIZE);
static inline void copy_block(uint8_t* dst, const uint8_t* src) {
((uint64_t*) dst)[0] = ((uint64_t*) src)[0];
((uint64_t*) dst)[1] = ((uint64_t*) src)[1];
}
static void swap_blocks(uint8_t* a, uint8_t* b) {
@ -92,41 +99,48 @@ static void swap_blocks(uint8_t* a, uint8_t* b) {
}
}
static void xor_blocks(uint8_t* a, const uint8_t* b) {
size_t i;
for (i = 0; i < AES_BLOCK_SIZE; i++) {
a[i] ^= b[i];
}
static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
}
void cryptonight_hash(const char* input, char* output, uint32_t len) {
static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}
struct cryptonight_ctx {
uint8_t long_state[MEMORY];
union cn_slow_hash_state state;
uint8_t text[INIT_SIZE_BYTE];
uint8_t a[AES_BLOCK_SIZE];
uint8_t b[AES_BLOCK_SIZE];
uint8_t c[AES_BLOCK_SIZE];
uint8_t d[AES_BLOCK_SIZE];
size_t i, j;
uint8_t aes_key[AES_KEY_SIZE];
OAES_CTX* aes_ctx;
oaes_ctx* aes_ctx;
};
hash_process(&state.hs, (const uint8_t*) input, len);
memcpy(text, state.init, INIT_SIZE_BYTE);
memcpy(aes_key, state.hs.b, AES_KEY_SIZE);
aes_ctx = oaes_alloc();
void cryptonight_hash(const char* input, char* output, uint32_t len) {
struct cryptonight_ctx *ctx = alloca(sizeof(struct cryptonight_ctx));
hash_process(&ctx->state.hs, (const uint8_t*) input, len);
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
memcpy(ctx->aes_key, ctx->state.hs.b, AES_KEY_SIZE);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
size_t i, j;
oaes_key_import_data(aes_ctx, aes_key, AES_KEY_SIZE);
oaes_key_import_data(ctx->aes_ctx, ctx->aes_key, AES_KEY_SIZE);
for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) {
for (j = 0; j < INIT_SIZE_BLK; j++) {
oaes_pseudo_encrypt_ecb(aes_ctx, &text[AES_BLOCK_SIZE * j]);
aesb_pseudo_round(&ctx->text[AES_BLOCK_SIZE * j],
&ctx->text[AES_BLOCK_SIZE * j],
ctx->aes_ctx->key->exp_data);
}
memcpy(&long_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
memcpy(&ctx->long_state[i * INIT_SIZE_BYTE], ctx->text, INIT_SIZE_BYTE);
}
for (i = 0; i < 16; i++) {
a[i] = state.k[i] ^ state.k[32 + i];
b[i] = state.k[16 + i] ^ state.k[48 + i];
ctx->a[i] = ctx->state.k[i] ^ ctx->state.k[32 + i];
ctx->b[i] = ctx->state.k[16 + i] ^ ctx->state.k[48 + i];
}
for (i = 0; i < ITER / 2; i++) {
@ -135,39 +149,31 @@ void cryptonight_hash(const char* input, char* output, uint32_t len) {
* next address <-+
*/
/* Iteration 1 */
j = e2i(a, MEMORY / AES_BLOCK_SIZE);
copy_block(c, &long_state[j * AES_BLOCK_SIZE]);
oaes_encryption_round(a, c);
xor_blocks(b, c);
swap_blocks(b, c);
copy_block(&long_state[j * AES_BLOCK_SIZE], c);
assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE));
swap_blocks(a, b);
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j * AES_BLOCK_SIZE], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j * AES_BLOCK_SIZE]);
/* Iteration 2 */
j = e2i(a, MEMORY / AES_BLOCK_SIZE);
copy_block(c, &long_state[j * AES_BLOCK_SIZE]);
mul(a, c, d);
sum_half_blocks(b, d);
swap_blocks(b, c);
xor_blocks(b, c);
copy_block(&long_state[j * AES_BLOCK_SIZE], c);
swap_blocks(a, b);
mul_sum_xor_dst(ctx->c, ctx->a,
&ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);
copy_block(ctx->b, ctx->c);
}
memcpy(text, state.init, INIT_SIZE_BYTE);
oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) {
for (j = 0; j < INIT_SIZE_BLK; j++) {
xor_blocks(&text[j * AES_BLOCK_SIZE],
&long_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
oaes_pseudo_encrypt_ecb(aes_ctx, &text[j * AES_BLOCK_SIZE]);
xor_blocks(&ctx->text[j * AES_BLOCK_SIZE],
&ctx->long_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
aesb_pseudo_round(&ctx->text[j * AES_BLOCK_SIZE],
&ctx->text[j * AES_BLOCK_SIZE],
ctx->aes_ctx->key->exp_data);
}
}
memcpy(state.init, text, INIT_SIZE_BYTE);
hash_permutation(&state.hs);
memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
hash_permutation(&ctx->state.hs);
/*memcpy(hash, &state, 32);*/
extra_hashes[state.hs.b[0] & 3](&state, 200, output);
oaes_free(&aes_ctx);
extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}
void cryptonight_fast_hash(const char* input, char* output, uint32_t len) {