commit 5e9699704bc4327eb2523276cce6f9c90ed618db
Author: zone117x@gmail.com
Date:   Sun Mar 30 04:04:48 2014 -0400

    Init

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5abaed8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,32 @@
+node-multi-hashing
+==================
+
+Cryptocurrency hashing functions for node.js.
+
+Usage
+-----
+
+Install
+
+```bash
+npm install multi-hashing
+```
+
+
+Hash your data
+
+```javascript
+var multiHashing = require('multi-hashing');
+
+var data = new Buffer("hash me good bro");
+var hashed = multiHashing.x11(data); // returns a 32-byte buffer
+
+console.log(hashed);
+//
+```
+
+Credits
+-------
+
+* Creators of the SHA2 and SHA3 hashing algorithms used here
+* X11 & Quark creators
\ No newline at end of file
diff --git a/bcrypt.c b/bcrypt.c
new file mode 100644
index 0000000..068bcb9
--- /dev/null
+++ b/bcrypt.c
@@ -0,0 +1,566 @@
+/*
+ * This code comes from John the Ripper password cracker, with reentrant
+ * and crypt(3) interfaces added, but optimizations specific to password
+ * cracking removed.
+ *
+ * Written by Solar Designer <solar at openwall.com> in 1998-2002 and
+ * placed in the public domain.
+ *
+ * There's absolutely no warranty.
+ *
+ * It is my intent that you should be able to use this on your system,
+ * as a part of a software package, or anywhere else to improve security,
+ * ensure compatibility, or for any other purpose. I would appreciate
+ * it if you give credit where it is due and keep your modifications in
+ * the public domain as well, but I don't require that in order to let
+ * you place this code and any modifications you make under a license
+ * of your choice.
+ *
+ * This implementation is compatible with OpenBSD bcrypt.c (version 2a)
+ * by Niels Provos <provos at citi.umich.edu>, and uses some of his
+ * ideas. The password hashing algorithm was designed by David Mazieres
+ * <dm at lcs.mit.edu>.
+ *
+ * There's a paper on the algorithm that explains its design decisions:
+ *
+ *    http://www.usenix.org/events/usenix99/provos.html
+ *
+ * Some of the tricks in BF_ROUND might be inspired by Eric Young's
+ * Blowfish library (I can't be sure if I would think of something if I
+ * hadn't seen his code).
+ */
+// Copyright (c) 2009-2015 The Tjcoin developers
+/*
+ * Modified 2011/9/4 by Brendan Younger
+ * removed unneeded code, changed test for endianness, added wrappers for _crypt_blowfish_rn worker function
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "bcrypt.h"
+
+#ifndef __set_errno
+#define __set_errno(val) errno = (val)
+#endif
+
+typedef uint32_t BF_word;
+
+/* Number of Blowfish rounds, this is also hardcoded into a few places */
+#define BF_N 16
+
+typedef BF_word BF_key[BF_N + 2];
+
+typedef struct {
+    BF_word S[4][0x100];
+    BF_key P;
+} BF_ctx;
+
+/*
+ * Magic IV for 64 Blowfish encryptions that we do at the end.
+ * The string is "OrpheanBeholderS" on big-endian.
+ */
+static BF_word BF_magic_w[4] = { //---------------:-)
+    0x4F727068, 0x65616E42,
+    0x65686F6C, 0x64657253
+};
+
+/*
+ * P-box and S-box tables initialized with digits of Pi.
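+ * These hexadecimal digits of pi serve as the cipher's "nothing up my
+ * sleeve" constant; BF_set_key() below XORs the user key into the P
+ * array before the expensive key schedule runs.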
+ */ +static BF_ctx BF_init_state = { + { + { + 0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7, + 0xb8e1afed, 0x6a267e96, 0xba7c9045, 0xf12c7f99, + 0x24a19947, 0xb3916cf7, 0x0801f2e2, 0x858efc16, + 0x636920d8, 0x71574e69, 0xa458fea3, 0xf4933d7e, + 0x0d95748f, 0x728eb658, 0x718bcd58, 0x82154aee, + 0x7b54a41d, 0xc25a59b5, 0x9c30d539, 0x2af26013, + 0xc5d1b023, 0x286085f0, 0xca417918, 0xb8db38ef, + 0x8e79dcb0, 0x603a180e, 0x6c9e0e8b, 0xb01e8a3e, + 0xd71577c1, 0xbd314b27, 0x78af2fda, 0x55605c60, + 0xe65525f3, 0xaa55ab94, 0x57489862, 0x63e81440, + 0x55ca396a, 0x2aab10b6, 0xb4cc5c34, 0x1141e8ce, + 0xa15486af, 0x7c72e993, 0xb3ee1411, 0x636fbc2a, + 0x2ba9c55d, 0x741831f6, 0xce5c3e16, 0x9b87931e, + 0xafd6ba33, 0x6c24cf5c, 0x7a325381, 0x28958677, + 0x3b8f4898, 0x6b4bb9af, 0xc4bfe81b, 0x66282193, + 0x61d809cc, 0xfb21a991, 0x487cac60, 0x5dec8032, + 0xef845d5d, 0xe98575b1, 0xdc262302, 0xeb651b88, + 0x23893e81, 0xd396acc5, 0x0f6d6ff3, 0x83f44239, + 0x2e0b4482, 0xa4842004, 0x69c8f04a, 0x9e1f9b5e, + 0x21c66842, 0xf6e96c9a, 0x670c9c61, 0xabd388f0, + 0x6a51a0d2, 0xd8542f68, 0x960fa728, 0xab5133a3, + 0x6eef0b6c, 0x137a3be4, 0xba3bf050, 0x7efb2a98, + 0xa1f1651d, 0x39af0176, 0x66ca593e, 0x82430e88, + 0x8cee8619, 0x456f9fb4, 0x7d84a5c3, 0x3b8b5ebe, + 0xe06f75d8, 0x85c12073, 0x401a449f, 0x56c16aa6, + 0x4ed3aa62, 0x363f7706, 0x1bfedf72, 0x429b023d, + 0x37d0d724, 0xd00a1248, 0xdb0fead3, 0x49f1c09b, + 0x075372c9, 0x80991b7b, 0x25d479d8, 0xf6e8def7, + 0xe3fe501a, 0xb6794c3b, 0x976ce0bd, 0x04c006ba, + 0xc1a94fb6, 0x409f60c4, 0x5e5c9ec2, 0x196a2463, + 0x68fb6faf, 0x3e6c53b5, 0x1339b2eb, 0x3b52ec6f, + 0x6dfc511f, 0x9b30952c, 0xcc814544, 0xaf5ebd09, + 0xbee3d004, 0xde334afd, 0x660f2807, 0x192e4bb3, + 0xc0cba857, 0x45c8740f, 0xd20b5f39, 0xb9d3fbdb, + 0x5579c0bd, 0x1a60320a, 0xd6a100c6, 0x402c7279, + 0x679f25fe, 0xfb1fa3cc, 0x8ea5e9f8, 0xdb3222f8, + 0x3c7516df, 0xfd616b15, 0x2f501ec8, 0xad0552ab, + 0x323db5fa, 0xfd238760, 0x53317b48, 0x3e00df82, + 0x9e5c57bb, 0xca6f8ca0, 0x1a87562e, 0xdf1769db, + 0xd542a8f6, 0x287effc3, 0xac6732c6, 0x8c4f5573, + 0x695b27b0, 0xbbca58c8, 0xe1ffa35d, 0xb8f011a0, + 0x10fa3d98, 0xfd2183b8, 0x4afcb56c, 0x2dd1d35b, + 0x9a53e479, 0xb6f84565, 0xd28e49bc, 0x4bfb9790, + 0xe1ddf2da, 0xa4cb7e33, 0x62fb1341, 0xcee4c6e8, + 0xef20cada, 0x36774c01, 0xd07e9efe, 0x2bf11fb4, + 0x95dbda4d, 0xae909198, 0xeaad8e71, 0x6b93d5a0, + 0xd08ed1d0, 0xafc725e0, 0x8e3c5b2f, 0x8e7594b7, + 0x8ff6e2fb, 0xf2122b64, 0x8888b812, 0x900df01c, + 0x4fad5ea0, 0x688fc31c, 0xd1cff191, 0xb3a8c1ad, + 0x2f2f2218, 0xbe0e1777, 0xea752dfe, 0x8b021fa1, + 0xe5a0cc0f, 0xb56f74e8, 0x18acf3d6, 0xce89e299, + 0xb4a84fe0, 0xfd13e0b7, 0x7cc43b81, 0xd2ada8d9, + 0x165fa266, 0x80957705, 0x93cc7314, 0x211a1477, + 0xe6ad2065, 0x77b5fa86, 0xc75442f5, 0xfb9d35cf, + 0xebcdaf0c, 0x7b3e89a0, 0xd6411bd3, 0xae1e7e49, + 0x00250e2d, 0x2071b35e, 0x226800bb, 0x57b8e0af, + 0x2464369b, 0xf009b91e, 0x5563911d, 0x59dfa6aa, + 0x78c14389, 0xd95a537f, 0x207d5ba2, 0x02e5b9c5, + 0x83260376, 0x6295cfa9, 0x11c81968, 0x4e734a41, + 0xb3472dca, 0x7b14a94a, 0x1b510052, 0x9a532915, + 0xd60f573f, 0xbc9bc6e4, 0x2b60a476, 0x81e67400, + 0x08ba6fb5, 0x571be91f, 0xf296ec6b, 0x2a0dd915, + 0xb6636521, 0xe7b9f9b6, 0xff34052e, 0xc5855664, + 0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a + }, { + 0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623, + 0xad6ea6b0, 0x49a7df7d, 0x9cee60b8, 0x8fedb266, + 0xecaa8c71, 0x699a17ff, 0x5664526c, 0xc2b19ee1, + 0x193602a5, 0x75094c29, 0xa0591340, 0xe4183a3e, + 0x3f54989a, 0x5b429d65, 0x6b8fe4d6, 0x99f73fd6, + 0xa1d29c07, 0xefe830f5, 0x4d2d38e6, 0xf0255dc1, + 
0x4cdd2086, 0x8470eb26, 0x6382e9c6, 0x021ecc5e, + 0x09686b3f, 0x3ebaefc9, 0x3c971814, 0x6b6a70a1, + 0x687f3584, 0x52a0e286, 0xb79c5305, 0xaa500737, + 0x3e07841c, 0x7fdeae5c, 0x8e7d44ec, 0x5716f2b8, + 0xb03ada37, 0xf0500c0d, 0xf01c1f04, 0x0200b3ff, + 0xae0cf51a, 0x3cb574b2, 0x25837a58, 0xdc0921bd, + 0xd19113f9, 0x7ca92ff6, 0x94324773, 0x22f54701, + 0x3ae5e581, 0x37c2dadc, 0xc8b57634, 0x9af3dda7, + 0xa9446146, 0x0fd0030e, 0xecc8c73e, 0xa4751e41, + 0xe238cd99, 0x3bea0e2f, 0x3280bba1, 0x183eb331, + 0x4e548b38, 0x4f6db908, 0x6f420d03, 0xf60a04bf, + 0x2cb81290, 0x24977c79, 0x5679b072, 0xbcaf89af, + 0xde9a771f, 0xd9930810, 0xb38bae12, 0xdccf3f2e, + 0x5512721f, 0x2e6b7124, 0x501adde6, 0x9f84cd87, + 0x7a584718, 0x7408da17, 0xbc9f9abc, 0xe94b7d8c, + 0xec7aec3a, 0xdb851dfa, 0x63094366, 0xc464c3d2, + 0xef1c1847, 0x3215d908, 0xdd433b37, 0x24c2ba16, + 0x12a14d43, 0x2a65c451, 0x50940002, 0x133ae4dd, + 0x71dff89e, 0x10314e55, 0x81ac77d6, 0x5f11199b, + 0x043556f1, 0xd7a3c76b, 0x3c11183b, 0x5924a509, + 0xf28fe6ed, 0x97f1fbfa, 0x9ebabf2c, 0x1e153c6e, + 0x86e34570, 0xeae96fb1, 0x860e5e0a, 0x5a3e2ab3, + 0x771fe71c, 0x4e3d06fa, 0x2965dcb9, 0x99e71d0f, + 0x803e89d6, 0x5266c825, 0x2e4cc978, 0x9c10b36a, + 0xc6150eba, 0x94e2ea78, 0xa5fc3c53, 0x1e0a2df4, + 0xf2f74ea7, 0x361d2b3d, 0x1939260f, 0x19c27960, + 0x5223a708, 0xf71312b6, 0xebadfe6e, 0xeac31f66, + 0xe3bc4595, 0xa67bc883, 0xb17f37d1, 0x018cff28, + 0xc332ddef, 0xbe6c5aa5, 0x65582185, 0x68ab9802, + 0xeecea50f, 0xdb2f953b, 0x2aef7dad, 0x5b6e2f84, + 0x1521b628, 0x29076170, 0xecdd4775, 0x619f1510, + 0x13cca830, 0xeb61bd96, 0x0334fe1e, 0xaa0363cf, + 0xb5735c90, 0x4c70a239, 0xd59e9e0b, 0xcbaade14, + 0xeecc86bc, 0x60622ca7, 0x9cab5cab, 0xb2f3846e, + 0x648b1eaf, 0x19bdf0ca, 0xa02369b9, 0x655abb50, + 0x40685a32, 0x3c2ab4b3, 0x319ee9d5, 0xc021b8f7, + 0x9b540b19, 0x875fa099, 0x95f7997e, 0x623d7da8, + 0xf837889a, 0x97e32d77, 0x11ed935f, 0x16681281, + 0x0e358829, 0xc7e61fd6, 0x96dedfa1, 0x7858ba99, + 0x57f584a5, 0x1b227263, 0x9b83c3ff, 0x1ac24696, + 0xcdb30aeb, 0x532e3054, 0x8fd948e4, 0x6dbc3128, + 0x58ebf2ef, 0x34c6ffea, 0xfe28ed61, 0xee7c3c73, + 0x5d4a14d9, 0xe864b7e3, 0x42105d14, 0x203e13e0, + 0x45eee2b6, 0xa3aaabea, 0xdb6c4f15, 0xfacb4fd0, + 0xc742f442, 0xef6abbb5, 0x654f3b1d, 0x41cd2105, + 0xd81e799e, 0x86854dc7, 0xe44b476a, 0x3d816250, + 0xcf62a1f2, 0x5b8d2646, 0xfc8883a0, 0xc1c7b6a3, + 0x7f1524c3, 0x69cb7492, 0x47848a0b, 0x5692b285, + 0x095bbf00, 0xad19489d, 0x1462b174, 0x23820e00, + 0x58428d2a, 0x0c55f5ea, 0x1dadf43e, 0x233f7061, + 0x3372f092, 0x8d937e41, 0xd65fecf1, 0x6c223bdb, + 0x7cde3759, 0xcbee7460, 0x4085f2a7, 0xce77326e, + 0xa6078084, 0x19f8509e, 0xe8efd855, 0x61d99735, + 0xa969a7aa, 0xc50c06c2, 0x5a04abfc, 0x800bcadc, + 0x9e447a2e, 0xc3453484, 0xfdd56705, 0x0e1e9ec9, + 0xdb73dbd3, 0x105588cd, 0x675fda79, 0xe3674340, + 0xc5c43465, 0x713e38d8, 0x3d28f89e, 0xf16dff20, + 0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7 + }, { + 0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934, + 0x411520f7, 0x7602d4f7, 0xbcf46b2e, 0xd4a20068, + 0xd4082471, 0x3320f46a, 0x43b7d4b7, 0x500061af, + 0x1e39f62e, 0x97244546, 0x14214f74, 0xbf8b8840, + 0x4d95fc1d, 0x96b591af, 0x70f4ddd3, 0x66a02f45, + 0xbfbc09ec, 0x03bd9785, 0x7fac6dd0, 0x31cb8504, + 0x96eb27b3, 0x55fd3941, 0xda2547e6, 0xabca0a9a, + 0x28507825, 0x530429f4, 0x0a2c86da, 0xe9b66dfb, + 0x68dc1462, 0xd7486900, 0x680ec0a4, 0x27a18dee, + 0x4f3ffea2, 0xe887ad8c, 0xb58ce006, 0x7af4d6b6, + 0xaace1e7c, 0xd3375fec, 0xce78a399, 0x406b2a42, + 0x20fe9e35, 0xd9f385b9, 0xee39d7ab, 0x3b124e8b, + 0x1dc9faf7, 0x4b6d1856, 0x26a36631, 0xeae397b2, 
+ 0x3a6efa74, 0xdd5b4332, 0x6841e7f7, 0xca7820fb, + 0xfb0af54e, 0xd8feb397, 0x454056ac, 0xba489527, + 0x55533a3a, 0x20838d87, 0xfe6ba9b7, 0xd096954b, + 0x55a867bc, 0xa1159a58, 0xcca92963, 0x99e1db33, + 0xa62a4a56, 0x3f3125f9, 0x5ef47e1c, 0x9029317c, + 0xfdf8e802, 0x04272f70, 0x80bb155c, 0x05282ce3, + 0x95c11548, 0xe4c66d22, 0x48c1133f, 0xc70f86dc, + 0x07f9c9ee, 0x41041f0f, 0x404779a4, 0x5d886e17, + 0x325f51eb, 0xd59bc0d1, 0xf2bcc18f, 0x41113564, + 0x257b7834, 0x602a9c60, 0xdff8e8a3, 0x1f636c1b, + 0x0e12b4c2, 0x02e1329e, 0xaf664fd1, 0xcad18115, + 0x6b2395e0, 0x333e92e1, 0x3b240b62, 0xeebeb922, + 0x85b2a20e, 0xe6ba0d99, 0xde720c8c, 0x2da2f728, + 0xd0127845, 0x95b794fd, 0x647d0862, 0xe7ccf5f0, + 0x5449a36f, 0x877d48fa, 0xc39dfd27, 0xf33e8d1e, + 0x0a476341, 0x992eff74, 0x3a6f6eab, 0xf4f8fd37, + 0xa812dc60, 0xa1ebddf8, 0x991be14c, 0xdb6e6b0d, + 0xc67b5510, 0x6d672c37, 0x2765d43b, 0xdcd0e804, + 0xf1290dc7, 0xcc00ffa3, 0xb5390f92, 0x690fed0b, + 0x667b9ffb, 0xcedb7d9c, 0xa091cf0b, 0xd9155ea3, + 0xbb132f88, 0x515bad24, 0x7b9479bf, 0x763bd6eb, + 0x37392eb3, 0xcc115979, 0x8026e297, 0xf42e312d, + 0x6842ada7, 0xc66a2b3b, 0x12754ccc, 0x782ef11c, + 0x6a124237, 0xb79251e7, 0x06a1bbe6, 0x4bfb6350, + 0x1a6b1018, 0x11caedfa, 0x3d25bdd8, 0xe2e1c3c9, + 0x44421659, 0x0a121386, 0xd90cec6e, 0xd5abea2a, + 0x64af674e, 0xda86a85f, 0xbebfe988, 0x64e4c3fe, + 0x9dbc8057, 0xf0f7c086, 0x60787bf8, 0x6003604d, + 0xd1fd8346, 0xf6381fb0, 0x7745ae04, 0xd736fccc, + 0x83426b33, 0xf01eab71, 0xb0804187, 0x3c005e5f, + 0x77a057be, 0xbde8ae24, 0x55464299, 0xbf582e61, + 0x4e58f48f, 0xf2ddfda2, 0xf474ef38, 0x8789bdc2, + 0x5366f9c3, 0xc8b38e74, 0xb475f255, 0x46fcd9b9, + 0x7aeb2661, 0x8b1ddf84, 0x846a0e79, 0x915f95e2, + 0x466e598e, 0x20b45770, 0x8cd55591, 0xc902de4c, + 0xb90bace1, 0xbb8205d0, 0x11a86248, 0x7574a99e, + 0xb77f19b6, 0xe0a9dc09, 0x662d09a1, 0xc4324633, + 0xe85a1f02, 0x09f0be8c, 0x4a99a025, 0x1d6efe10, + 0x1ab93d1d, 0x0ba5a4df, 0xa186f20f, 0x2868f169, + 0xdcb7da83, 0x573906fe, 0xa1e2ce9b, 0x4fcd7f52, + 0x50115e01, 0xa70683fa, 0xa002b5c4, 0x0de6d027, + 0x9af88c27, 0x773f8641, 0xc3604c06, 0x61a806b5, + 0xf0177a28, 0xc0f586e0, 0x006058aa, 0x30dc7d62, + 0x11e69ed7, 0x2338ea63, 0x53c2dd94, 0xc2c21634, + 0xbbcbee56, 0x90bcb6de, 0xebfc7da1, 0xce591d76, + 0x6f05e409, 0x4b7c0188, 0x39720a3d, 0x7c927c24, + 0x86e3725f, 0x724d9db9, 0x1ac15bb4, 0xd39eb8fc, + 0xed545578, 0x08fca5b5, 0xd83d7cd3, 0x4dad0fc4, + 0x1e50ef5e, 0xb161e6f8, 0xa28514d9, 0x6c51133c, + 0x6fd5c7e7, 0x56e14ec4, 0x362abfce, 0xddc6c837, + 0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0 + }, { + 0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b, + 0x5cb0679e, 0x4fa33742, 0xd3822740, 0x99bc9bbe, + 0xd5118e9d, 0xbf0f7315, 0xd62d1c7e, 0xc700c47b, + 0xb78c1b6b, 0x21a19045, 0xb26eb1be, 0x6a366eb4, + 0x5748ab2f, 0xbc946e79, 0xc6a376d2, 0x6549c2c8, + 0x530ff8ee, 0x468dde7d, 0xd5730a1d, 0x4cd04dc6, + 0x2939bbdb, 0xa9ba4650, 0xac9526e8, 0xbe5ee304, + 0xa1fad5f0, 0x6a2d519a, 0x63ef8ce2, 0x9a86ee22, + 0xc089c2b8, 0x43242ef6, 0xa51e03aa, 0x9cf2d0a4, + 0x83c061ba, 0x9be96a4d, 0x8fe51550, 0xba645bd6, + 0x2826a2f9, 0xa73a3ae1, 0x4ba99586, 0xef5562e9, + 0xc72fefd3, 0xf752f7da, 0x3f046f69, 0x77fa0a59, + 0x80e4a915, 0x87b08601, 0x9b09e6ad, 0x3b3ee593, + 0xe990fd5a, 0x9e34d797, 0x2cf0b7d9, 0x022b8b51, + 0x96d5ac3a, 0x017da67d, 0xd1cf3ed6, 0x7c7d2d28, + 0x1f9f25cf, 0xadf2b89b, 0x5ad6b472, 0x5a88f54c, + 0xe029ac71, 0xe019a5e6, 0x47b0acfd, 0xed93fa9b, + 0xe8d3c48d, 0x283b57cc, 0xf8d56629, 0x79132e28, + 0x785f0191, 0xed756055, 0xf7960e44, 0xe3d35e8c, + 0x15056dd4, 0x88f46dba, 0x03a16125, 
0x0564f0bd, + 0xc3eb9e15, 0x3c9057a2, 0x97271aec, 0xa93a072a, + 0x1b3f6d9b, 0x1e6321f5, 0xf59c66fb, 0x26dcf319, + 0x7533d928, 0xb155fdf5, 0x03563482, 0x8aba3cbb, + 0x28517711, 0xc20ad9f8, 0xabcc5167, 0xccad925f, + 0x4de81751, 0x3830dc8e, 0x379d5862, 0x9320f991, + 0xea7a90c2, 0xfb3e7bce, 0x5121ce64, 0x774fbe32, + 0xa8b6e37e, 0xc3293d46, 0x48de5369, 0x6413e680, + 0xa2ae0810, 0xdd6db224, 0x69852dfd, 0x09072166, + 0xb39a460a, 0x6445c0dd, 0x586cdecf, 0x1c20c8ae, + 0x5bbef7dd, 0x1b588d40, 0xccd2017f, 0x6bb4e3bb, + 0xdda26a7e, 0x3a59ff45, 0x3e350a44, 0xbcb4cdd5, + 0x72eacea8, 0xfa6484bb, 0x8d6612ae, 0xbf3c6f47, + 0xd29be463, 0x542f5d9e, 0xaec2771b, 0xf64e6370, + 0x740e0d8d, 0xe75b1357, 0xf8721671, 0xaf537d5d, + 0x4040cb08, 0x4eb4e2cc, 0x34d2466a, 0x0115af84, + 0xe1b00428, 0x95983a1d, 0x06b89fb4, 0xce6ea048, + 0x6f3f3b82, 0x3520ab82, 0x011a1d4b, 0x277227f8, + 0x611560b1, 0xe7933fdc, 0xbb3a792b, 0x344525bd, + 0xa08839e1, 0x51ce794b, 0x2f32c9b7, 0xa01fbac9, + 0xe01cc87e, 0xbcc7d1f6, 0xcf0111c3, 0xa1e8aac7, + 0x1a908749, 0xd44fbd9a, 0xd0dadecb, 0xd50ada38, + 0x0339c32a, 0xc6913667, 0x8df9317c, 0xe0b12b4f, + 0xf79e59b7, 0x43f5bb3a, 0xf2d519ff, 0x27d9459c, + 0xbf97222c, 0x15e6fc2a, 0x0f91fc71, 0x9b941525, + 0xfae59361, 0xceb69ceb, 0xc2a86459, 0x12baa8d1, + 0xb6c1075e, 0xe3056a0c, 0x10d25065, 0xcb03a442, + 0xe0ec6e0e, 0x1698db3b, 0x4c98a0be, 0x3278e964, + 0x9f1f9532, 0xe0d392df, 0xd3a0342b, 0x8971f21e, + 0x1b0a7441, 0x4ba3348c, 0xc5be7120, 0xc37632d8, + 0xdf359f8d, 0x9b992f2e, 0xe60b6f47, 0x0fe3f11d, + 0xe54cda54, 0x1edad891, 0xce6279cf, 0xcd3e7e6f, + 0x1618b166, 0xfd2c1d05, 0x848fd2c5, 0xf6fb2299, + 0xf523f357, 0xa6327623, 0x93a83531, 0x56cccd02, + 0xacf08162, 0x5a75ebb5, 0x6e163697, 0x88d273cc, + 0xde966292, 0x81b949d0, 0x4c50901b, 0x71c65614, + 0xe6c6c7bd, 0x327a140a, 0x45e1d006, 0xc3f27b9a, + 0xc9aa53fd, 0x62a80f00, 0xbb25bfe2, 0x35bdd2f6, + 0x71126905, 0xb2040222, 0xb6cbcf7c, 0xcd769c2b, + 0x53113ec0, 0x1640e3d3, 0x38abbd60, 0x2547adf0, + 0xba38209c, 0xf746ce76, 0x77afa1c5, 0x20756060, + 0x85cbfe4e, 0x8ae88dd8, 0x7aaaf9b0, 0x4cf9aa7e, + 0x1948c25c, 0x02fb8a8c, 0x01c36ae4, 0xd6ebe1f9, + 0x90d4f869, 0xa65cdea0, 0x3f09252d, 0xc208e69f, + 0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6 + } + }, { + 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, + 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, + 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c, + 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917, + 0x9216d5d9, 0x8979fb1b + } +}; + +static void clean(void *data, int size) { + memset(data, 0, size); +} + +static void BF_swap(BF_word *x, int count) { + static union { + uint16_t i; + uint8_t c[2]; + } is_little = { 0x0001 }; + BF_word tmp; + + if(is_little.c[0]) { + do { + tmp = *x; + tmp = (tmp << 16) | (tmp >> 16); + *x++ = ((tmp & 0x00FF00FF) << 8) | ((tmp >> 8) & 0x00FF00FF); + } while(--count); + } +} + +#define BF_INDEX(S, i) \ + (*((BF_word *)(((unsigned char *)S) + (i)))) + +#define BF_ROUND(L, R, N) \ + tmp1 = L & 0xFF; \ + tmp1 <<= 2; \ + tmp2 = L >> 6; \ + tmp2 &= 0x3FC; \ + tmp3 = L >> 14; \ + tmp3 &= 0x3FC; \ + tmp4 = L >> 22; \ + tmp4 &= 0x3FC; \ + tmp1 = BF_INDEX(data.ctx.S[3], tmp1); \ + tmp2 = BF_INDEX(data.ctx.S[2], tmp2); \ + tmp3 = BF_INDEX(data.ctx.S[1], tmp3); \ + tmp3 += BF_INDEX(data.ctx.S[0], tmp4); \ + tmp3 ^= tmp2; \ + R ^= data.ctx.P[N + 1]; \ + tmp3 += tmp1; \ + R ^= tmp3; + +/* + * Encrypt one block, BF_N is hardcoded here. 
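+ * (The 16 BF_ROUND invocations below unroll Blowfish's Feistel rounds:
+ * L and R trade roles each round, and the final swap plus the XOR with
+ * P[17] completes the block.)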
+ */ +#define BF_ENCRYPT \ + L ^= data.ctx.P[0]; \ + BF_ROUND(L, R, 0); \ + BF_ROUND(R, L, 1); \ + BF_ROUND(L, R, 2); \ + BF_ROUND(R, L, 3); \ + BF_ROUND(L, R, 4); \ + BF_ROUND(R, L, 5); \ + BF_ROUND(L, R, 6); \ + BF_ROUND(R, L, 7); \ + BF_ROUND(L, R, 8); \ + BF_ROUND(R, L, 9); \ + BF_ROUND(L, R, 10); \ + BF_ROUND(R, L, 11); \ + BF_ROUND(L, R, 12); \ + BF_ROUND(R, L, 13); \ + BF_ROUND(L, R, 14); \ + BF_ROUND(R, L, 15); \ + tmp4 = R; \ + R = L; \ + L = tmp4 ^ data.ctx.P[BF_N + 1]; + + +#define BF_body() \ + L = R = 0; \ + ptr = data.ctx.P; \ + do { \ + ptr += 2; \ + BF_ENCRYPT; \ + *(ptr - 2) = L; \ + *(ptr - 1) = R; \ + } while (ptr < &data.ctx.P[BF_N + 2]); \ +\ + ptr = data.ctx.S[0]; \ + do { \ + ptr += 2; \ + BF_ENCRYPT; \ + *(ptr - 2) = L; \ + *(ptr - 1) = R; \ + } while (ptr < &data.ctx.S[3][0xFF]); + +static void BF_set_key(const char *key, BF_key expanded, BF_key initial) { + const char *ptr = key; + int i, j; + BF_word tmp; + + for (i = 0; i < BF_N + 2; i++) { + tmp = 0; + for (j = 0; j < 4; j++) { + tmp <<= 8; + tmp |= (unsigned char)*ptr; + + if (!*ptr) ptr = key; else ptr++; + } + + expanded[i] = tmp; + initial[i] = BF_init_state.P[i] ^ tmp; + } +} + +static void _crypt_blowfish_rn(const char *key, const char *salt, char *output) { + struct { + BF_ctx ctx; + BF_key expanded_key; + union { + BF_word salt[4]; + BF_word output[4]; + } binary; + } data; + BF_word L, R; + BF_word tmp1, tmp2, tmp3, tmp4; + BF_word *ptr; + BF_word count = (BF_word)1 << 12; //work factor + int i; + + memcpy(data.binary.salt, salt, 16); + BF_swap(data.binary.salt, 4); + + BF_set_key(key, data.expanded_key, data.ctx.P); + + memcpy(data.ctx.S, BF_init_state.S, sizeof(data.ctx.S)); + + L = R = 0; + for (i = 0; i < BF_N + 2; i += 2) { + L ^= data.binary.salt[i & 2]; + R ^= data.binary.salt[(i & 2) + 1]; + BF_ENCRYPT; + data.ctx.P[i] = L; + data.ctx.P[i + 1] = R; + } + + ptr = data.ctx.S[0]; + do { + ptr += 4; + L ^= data.binary.salt[(BF_N + 2) & 3]; + R ^= data.binary.salt[(BF_N + 3) & 3]; + BF_ENCRYPT; + *(ptr - 4) = L; + *(ptr - 3) = R; + + L ^= data.binary.salt[(BF_N + 4) & 3]; + R ^= data.binary.salt[(BF_N + 5) & 3]; + BF_ENCRYPT; + *(ptr - 2) = L; + *(ptr - 1) = R; + } while (ptr < &data.ctx.S[3][0xFF]); + + do { + data.ctx.P[0] ^= data.expanded_key[0]; + data.ctx.P[1] ^= data.expanded_key[1]; + data.ctx.P[2] ^= data.expanded_key[2]; + data.ctx.P[3] ^= data.expanded_key[3]; + data.ctx.P[4] ^= data.expanded_key[4]; + data.ctx.P[5] ^= data.expanded_key[5]; + data.ctx.P[6] ^= data.expanded_key[6]; + data.ctx.P[7] ^= data.expanded_key[7]; + data.ctx.P[8] ^= data.expanded_key[8]; + data.ctx.P[9] ^= data.expanded_key[9]; + data.ctx.P[10] ^= data.expanded_key[10]; + data.ctx.P[11] ^= data.expanded_key[11]; + data.ctx.P[12] ^= data.expanded_key[12]; + data.ctx.P[13] ^= data.expanded_key[13]; + data.ctx.P[14] ^= data.expanded_key[14]; + data.ctx.P[15] ^= data.expanded_key[15]; + data.ctx.P[16] ^= data.expanded_key[16]; + data.ctx.P[17] ^= data.expanded_key[17]; + + BF_body(); + + tmp1 = data.binary.salt[0]; + tmp2 = data.binary.salt[1]; + tmp3 = data.binary.salt[2]; + tmp4 = data.binary.salt[3]; + data.ctx.P[0] ^= tmp1; + data.ctx.P[1] ^= tmp2; + data.ctx.P[2] ^= tmp3; + data.ctx.P[3] ^= tmp4; + data.ctx.P[4] ^= tmp1; + data.ctx.P[5] ^= tmp2; + data.ctx.P[6] ^= tmp3; + data.ctx.P[7] ^= tmp4; + data.ctx.P[8] ^= tmp1; + data.ctx.P[9] ^= tmp2; + data.ctx.P[10] ^= tmp3; + data.ctx.P[11] ^= tmp4; + data.ctx.P[12] ^= tmp1; + data.ctx.P[13] ^= tmp2; + data.ctx.P[14] ^= tmp3; + data.ctx.P[15] ^= tmp4; + data.ctx.P[16] 
^= tmp1; + data.ctx.P[17] ^= tmp2; + + BF_body(); + } while (--count); + + for (i = 0; i < 4; i += 2) { + L = BF_magic_w[i]; + R = BF_magic_w[i + 1]; + + count = 64; + do { + BF_ENCRYPT; + } while (--count); + + data.binary.output[i] = L; + data.binary.output[i + 1] = R; + } + + memcpy(output, data.binary.output, 16); + + clean(&data, sizeof(data)); +} + +void bcrypt_hash(const char *in, char *out) +{ + _crypt_blowfish_rn(&in[0 * BF_N], &in[1 * BF_N], &out[0 * BF_N]); + _crypt_blowfish_rn(&in[2 * BF_N], &in[3 * BF_N], &out[1 * BF_N]); + _crypt_blowfish_rn(&in[4 * BF_N], &out[1 * BF_N], &out[1 * BF_N]); +} \ No newline at end of file diff --git a/bcrypt.h b/bcrypt.h new file mode 100644 index 0000000..7a9d8cb --- /dev/null +++ b/bcrypt.h @@ -0,0 +1,14 @@ +#ifndef BCRYPT_H +#define BCRYPT_H + +#ifdef __cplusplus +extern "C" { +#endif + +void bcrypt_hash(const char *input, char *output); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/binding.gyp b/binding.gyp new file mode 100644 index 0000000..dce7a11 --- /dev/null +++ b/binding.gyp @@ -0,0 +1,29 @@ +{ + "targets": [ + { + "target_name": "multihashing", + "sources": [ + "multihashing.cc", + "scrypt.c", + "scryptjane.c", + "scryptn.c", + "keccak.c", + "skein.c", + "x11.c", + "quark.c", + "sha3/aes_helper.c", + "sha3/blake.c", + "sha3/bmw.c", + "sha3/cubehash.c", + "sha3/echo.c", + "sha3/groestl.c", + "sha3/jh.c", + "sha3/keccak.c", + "sha3/luffa.c", + "sha3/shavite.c", + "sha3/simd.c", + "sha3/skein.c" + ] + } + ] +} diff --git a/build/Makefile b/build/Makefile new file mode 100644 index 0000000..9f17b5f --- /dev/null +++ b/build/Makefile @@ -0,0 +1,332 @@ +# We borrow heavily from the kernel build setup, though we are simpler since +# we don't have Kconfig tweaking settings on us. + +# The implicit make rules have it looking for RCS files, among other things. +# We instead explicitly write all the rules we care about. +# It's even quicker (saves ~200ms) to pass -r on the command line. +MAKEFLAGS=-r + +# The source directory tree. +srcdir := .. +abs_srcdir := $(abspath $(srcdir)) + +# The name of the builddir. +builddir_name ?= . + +# The V=1 flag on command line makes us verbosely print command lines. +ifdef V + quiet= +else + quiet=quiet_ +endif + +# Specify BUILDTYPE=Release on the command line for a release build. +BUILDTYPE ?= Release + +# Directory all our build output goes into. +# Note that this must be two directories beneath src/ for unit tests to pass, +# as they reach into the src/ directory for data with relative paths. +builddir ?= $(builddir_name)/$(BUILDTYPE) +abs_builddir := $(abspath $(builddir)) +depsdir := $(builddir)/.deps + +# Object output directory. +obj := $(builddir)/obj +abs_obj := $(abspath $(obj)) + +# We build up a list of every single one of the targets so we can slurp in the +# generated dependency rule Makefiles in one pass. +all_deps := + + + +CC.target ?= $(CC) +CFLAGS.target ?= $(CFLAGS) +CXX.target ?= $(CXX) +CXXFLAGS.target ?= $(CXXFLAGS) +LINK.target ?= $(LINK) +LDFLAGS.target ?= $(LDFLAGS) +AR.target ?= $(AR) + +# C++ apps need to be linked with g++. +# +# Note: flock is used to seralize linking. Linking is a memory-intensive +# process so running parallel links can often lead to thrashing. To disable +# the serialization, override LINK via an envrionment variable as follows: +# +# export LINK=g++ +# +# This will allow make to invoke N linker processes as specified in -jN. 
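+# For example, a verbose parallel release build with link serialization
+# disabled would be (assumed invocation, not part of this commit):
+#
+#   export LINK=g++
+#   make -j4 BUILDTYPE=Release V=1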
+LINK ?= flock $(builddir)/linker.lock $(CXX.target) + +# TODO(evan): move all cross-compilation logic to gyp-time so we don't need +# to replicate this environment fallback in make as well. +CC.host ?= gcc +CFLAGS.host ?= +CXX.host ?= g++ +CXXFLAGS.host ?= +LINK.host ?= $(CXX.host) +LDFLAGS.host ?= +AR.host ?= ar + +# Define a dir function that can handle spaces. +# http://www.gnu.org/software/make/manual/make.html#Syntax-of-Functions +# "leading spaces cannot appear in the text of the first argument as written. +# These characters can be put into the argument value by variable substitution." +empty := +space := $(empty) $(empty) + +# http://stackoverflow.com/questions/1189781/using-make-dir-or-notdir-on-a-path-with-spaces +replace_spaces = $(subst $(space),?,$1) +unreplace_spaces = $(subst ?,$(space),$1) +dirx = $(call unreplace_spaces,$(dir $(call replace_spaces,$1))) + +# Flags to make gcc output dependency info. Note that you need to be +# careful here to use the flags that ccache and distcc can understand. +# We write to a dep file on the side first and then rename at the end +# so we can't end up with a broken dep file. +depfile = $(depsdir)/$(call replace_spaces,$@).d +DEPFLAGS = -MMD -MF $(depfile).raw + +# We have to fixup the deps output in a few ways. +# (1) the file output should mention the proper .o file. +# ccache or distcc lose the path to the target, so we convert a rule of +# the form: +# foobar.o: DEP1 DEP2 +# into +# path/to/foobar.o: DEP1 DEP2 +# (2) we want missing files not to cause us to fail to build. +# We want to rewrite +# foobar.o: DEP1 DEP2 \ +# DEP3 +# to +# DEP1: +# DEP2: +# DEP3: +# so if the files are missing, they're just considered phony rules. +# We have to do some pretty insane escaping to get those backslashes +# and dollar signs past make, the shell, and sed at the same time. +# Doesn't work with spaces, but that's fine: .d files have spaces in +# their names replaced with other characters. +define fixup_dep +# The depfile may not exist if the input file didn't have any #includes. +touch $(depfile).raw +# Fixup path as in (1). +sed -e "s|^$(notdir $@)|$@|" $(depfile).raw >> $(depfile) +# Add extra rules as in (2). +# We remove slashes and replace spaces with new lines; +# remove blank lines; +# delete the first line and append a colon to the remaining lines. +sed -e 's|\\||' -e 'y| |\n|' $(depfile).raw |\ + grep -v '^$$' |\ + sed -e 1d -e 's|$$|:|' \ + >> $(depfile) +rm $(depfile).raw +endef + +# Command definitions: +# - cmd_foo is the actual command to run; +# - quiet_cmd_foo is the brief-output summary of the command. + +quiet_cmd_cc = CC($(TOOLSET)) $@ +cmd_cc = $(CC.$(TOOLSET)) $(GYP_CFLAGS) $(DEPFLAGS) $(CFLAGS.$(TOOLSET)) -c -o $@ $< + +quiet_cmd_cxx = CXX($(TOOLSET)) $@ +cmd_cxx = $(CXX.$(TOOLSET)) $(GYP_CXXFLAGS) $(DEPFLAGS) $(CXXFLAGS.$(TOOLSET)) -c -o $@ $< + +quiet_cmd_touch = TOUCH $@ +cmd_touch = touch $@ + +quiet_cmd_copy = COPY $@ +# send stderr to /dev/null to ignore messages when linking directories. +cmd_copy = rm -rf "$@" && cp -af "$<" "$@" + +quiet_cmd_alink = AR($(TOOLSET)) $@ +cmd_alink = rm -f $@ && $(AR.$(TOOLSET)) crs $@ $(filter %.o,$^) + +quiet_cmd_alink_thin = AR($(TOOLSET)) $@ +cmd_alink_thin = rm -f $@ && $(AR.$(TOOLSET)) crsT $@ $(filter %.o,$^) + +# Due to circular dependencies between libraries :(, we wrap the +# special "figure out circular dependencies" flags around the entire +# input list during linking. 
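+# (Concretely, cmd_link below wraps $(LD_INPUTS) in -Wl,--start-group and
+# -Wl,--end-group, making the linker re-scan the grouped archives until no
+# new undefined symbols can be resolved.)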
+quiet_cmd_link = LINK($(TOOLSET)) $@ +cmd_link = $(LINK.$(TOOLSET)) $(GYP_LDFLAGS) $(LDFLAGS.$(TOOLSET)) -o $@ -Wl,--start-group $(LD_INPUTS) -Wl,--end-group $(LIBS) + +# We support two kinds of shared objects (.so): +# 1) shared_library, which is just bundling together many dependent libraries +# into a link line. +# 2) loadable_module, which is generating a module intended for dlopen(). +# +# They differ only slightly: +# In the former case, we want to package all dependent code into the .so. +# In the latter case, we want to package just the API exposed by the +# outermost module. +# This means shared_library uses --whole-archive, while loadable_module doesn't. +# (Note that --whole-archive is incompatible with the --start-group used in +# normal linking.) + +# Other shared-object link notes: +# - Set SONAME to the library filename so our binaries don't reference +# the local, absolute paths used on the link command-line. +quiet_cmd_solink = SOLINK($(TOOLSET)) $@ +cmd_solink = $(LINK.$(TOOLSET)) -shared $(GYP_LDFLAGS) $(LDFLAGS.$(TOOLSET)) -Wl,-soname=$(@F) -o $@ -Wl,--whole-archive $(LD_INPUTS) -Wl,--no-whole-archive $(LIBS) + +quiet_cmd_solink_module = SOLINK_MODULE($(TOOLSET)) $@ +cmd_solink_module = $(LINK.$(TOOLSET)) -shared $(GYP_LDFLAGS) $(LDFLAGS.$(TOOLSET)) -Wl,-soname=$(@F) -o $@ -Wl,--start-group $(filter-out FORCE_DO_CMD, $^) -Wl,--end-group $(LIBS) + + +# Define an escape_quotes function to escape single quotes. +# This allows us to handle quotes properly as long as we always use +# use single quotes and escape_quotes. +escape_quotes = $(subst ','\'',$(1)) +# This comment is here just to include a ' to unconfuse syntax highlighting. +# Define an escape_vars function to escape '$' variable syntax. +# This allows us to read/write command lines with shell variables (e.g. +# $LD_LIBRARY_PATH), without triggering make substitution. +escape_vars = $(subst $$,$$$$,$(1)) +# Helper that expands to a shell command to echo a string exactly as it is in +# make. This uses printf instead of echo because printf's behaviour with respect +# to escape sequences is more portable than echo's across different shells +# (e.g., dash, bash). +exact_echo = printf '%s\n' '$(call escape_quotes,$(1))' + +# Helper to compare the command we're about to run against the command +# we logged the last time we ran the command. Produces an empty +# string (false) when the commands match. +# Tricky point: Make has no string-equality test function. +# The kernel uses the following, but it seems like it would have false +# positives, where one string reordered its arguments. +# arg_check = $(strip $(filter-out $(cmd_$(1)), $(cmd_$@)) \ +# $(filter-out $(cmd_$@), $(cmd_$(1)))) +# We instead substitute each for the empty string into the other, and +# say they're equal if both substitutions produce the empty string. +# .d files contain ? instead of spaces, take that into account. +command_changed = $(or $(subst $(cmd_$(1)),,$(cmd_$(call replace_spaces,$@))),\ + $(subst $(cmd_$(call replace_spaces,$@)),,$(cmd_$(1)))) + +# Helper that is non-empty when a prerequisite changes. +# Normally make does this implicitly, but we force rules to always run +# so we can check their command lines. +# $? -- new prerequisites +# $| -- order-only dependencies +prereq_changed = $(filter-out FORCE_DO_CMD,$(filter-out $|,$?)) + +# Helper that executes all postbuilds until one fails. 
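+# On the first non-zero exit status the loop stops, the partially-built
+# target is removed, and the failing status is propagated.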
+define do_postbuilds + @E=0;\ + for p in $(POSTBUILDS); do\ + eval $$p;\ + E=$$?;\ + if [ $$E -ne 0 ]; then\ + break;\ + fi;\ + done;\ + if [ $$E -ne 0 ]; then\ + rm -rf "$@";\ + exit $$E;\ + fi +endef + +# do_cmd: run a command via the above cmd_foo names, if necessary. +# Should always run for a given target to handle command-line changes. +# Second argument, if non-zero, makes it do asm/C/C++ dependency munging. +# Third argument, if non-zero, makes it do POSTBUILDS processing. +# Note: We intentionally do NOT call dirx for depfile, since it contains ? for +# spaces already and dirx strips the ? characters. +define do_cmd +$(if $(or $(command_changed),$(prereq_changed)), + @$(call exact_echo, $($(quiet)cmd_$(1))) + @mkdir -p "$(call dirx,$@)" "$(dir $(depfile))" + $(if $(findstring flock,$(word 1,$(cmd_$1))), + @$(cmd_$(1)) + @echo " $(quiet_cmd_$(1)): Finished", + @$(cmd_$(1)) + ) + @$(call exact_echo,$(call escape_vars,cmd_$(call replace_spaces,$@) := $(cmd_$(1)))) > $(depfile) + @$(if $(2),$(fixup_dep)) + $(if $(and $(3), $(POSTBUILDS)), + $(call do_postbuilds) + ) +) +endef + +# Declare the "all" target first so it is the default, +# even though we don't have the deps yet. +.PHONY: all +all: + +# make looks for ways to re-generate included makefiles, but in our case, we +# don't have a direct way. Explicitly telling make that it has nothing to do +# for them makes it go faster. +%.d: ; + +# Use FORCE_DO_CMD to force a target to run. Should be coupled with +# do_cmd. +.PHONY: FORCE_DO_CMD +FORCE_DO_CMD: + +TOOLSET := target +# Suffix rules, putting all outputs into $(obj). +$(obj).$(TOOLSET)/%.o: $(srcdir)/%.c FORCE_DO_CMD + @$(call do_cmd,cc,1) +$(obj).$(TOOLSET)/%.o: $(srcdir)/%.cc FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(srcdir)/%.cpp FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(srcdir)/%.cxx FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(srcdir)/%.S FORCE_DO_CMD + @$(call do_cmd,cc,1) +$(obj).$(TOOLSET)/%.o: $(srcdir)/%.s FORCE_DO_CMD + @$(call do_cmd,cc,1) + +# Try building from generated source, too. +$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.c FORCE_DO_CMD + @$(call do_cmd,cc,1) +$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.cc FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.cpp FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.cxx FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.S FORCE_DO_CMD + @$(call do_cmd,cc,1) +$(obj).$(TOOLSET)/%.o: $(obj).$(TOOLSET)/%.s FORCE_DO_CMD + @$(call do_cmd,cc,1) + +$(obj).$(TOOLSET)/%.o: $(obj)/%.c FORCE_DO_CMD + @$(call do_cmd,cc,1) +$(obj).$(TOOLSET)/%.o: $(obj)/%.cc FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(obj)/%.cpp FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(obj)/%.cxx FORCE_DO_CMD + @$(call do_cmd,cxx,1) +$(obj).$(TOOLSET)/%.o: $(obj)/%.S FORCE_DO_CMD + @$(call do_cmd,cc,1) +$(obj).$(TOOLSET)/%.o: $(obj)/%.s FORCE_DO_CMD + @$(call do_cmd,cc,1) + + +ifeq ($(strip $(foreach prefix,$(NO_LOAD),\ + $(findstring $(join ^,$(prefix)),\ + $(join ^,multihashing.target.mk)))),) + include multihashing.target.mk +endif + +quiet_cmd_regen_makefile = ACTION Regenerating $@ +cmd_regen_makefile = cd $(srcdir); /usr/lib/node_modules/node-gyp/gyp/gyp_main.py -fmake --ignore-environment "--toplevel-dir=." -I/root/multi-hashing/build/config.gypi -I/usr/lib/node_modules/node-gyp/addon.gypi -I/root/.node-gyp/0.10.26/common.gypi "--depth=." "-Goutput_dir=." 
"--generator-output=build" "-Dlibrary=shared_library" "-Dvisibility=default" "-Dnode_root_dir=/root/.node-gyp/0.10.26" "-Dmodule_root_dir=/root/multi-hashing" binding.gyp +Makefile: $(srcdir)/../../usr/lib/node_modules/node-gyp/addon.gypi $(srcdir)/../.node-gyp/0.10.26/common.gypi $(srcdir)/build/config.gypi $(srcdir)/binding.gyp + $(call do_cmd,regen_makefile) + +# "all" is a concatenation of the "all" targets from all the included +# sub-makefiles. This is just here to clarify. +all: + +# Add in dependency-tracking rules. $(all_deps) is the list of every single +# target in our tree. Only consider the ones with .d (dependency) info: +d_files := $(wildcard $(foreach f,$(all_deps),$(depsdir)/$(f).d)) +ifneq ($(d_files),) + include $(d_files) +endif diff --git a/build/binding.Makefile b/build/binding.Makefile new file mode 100644 index 0000000..3dfc8e4 --- /dev/null +++ b/build/binding.Makefile @@ -0,0 +1,6 @@ +# This file is generated by gyp; do not edit. + +export builddir_name ?= ./build/. +.PHONY: all +all: + $(MAKE) multihashing diff --git a/build/config.gypi b/build/config.gypi new file mode 100644 index 0000000..9f4ebd4 --- /dev/null +++ b/build/config.gypi @@ -0,0 +1,38 @@ +# Do not edit. File was generated by node-gyp's "configure" step +{ + "target_defaults": { + "cflags": [], + "default_configuration": "Release", + "defines": [], + "include_dirs": [], + "libraries": [] + }, + "variables": { + "clang": 0, + "gcc_version": 48, + "host_arch": "x64", + "node_install_npm": "true", + "node_prefix": "/usr", + "node_shared_cares": "false", + "node_shared_http_parser": "false", + "node_shared_libuv": "false", + "node_shared_openssl": "false", + "node_shared_v8": "false", + "node_shared_zlib": "false", + "node_tag": "", + "node_unsafe_optimizations": 0, + "node_use_dtrace": "false", + "node_use_etw": "false", + "node_use_openssl": "true", + "node_use_perfctr": "false", + "node_use_systemtap": "false", + "python": "/usr/bin/python", + "target_arch": "x64", + "v8_enable_gdbjit": 0, + "v8_no_strict_aliasing": 1, + "v8_use_snapshot": "false", + "nodedir": "/root/.node-gyp/0.10.26", + "copy_dev_lib": "true", + "standalone_static_library": 1 + } +} diff --git a/build/multihashing.target.mk b/build/multihashing.target.mk new file mode 100644 index 0000000..de2a674 --- /dev/null +++ b/build/multihashing.target.mk @@ -0,0 +1,158 @@ +# This file is generated by gyp; do not edit. + +TOOLSET := target +TARGET := multihashing +DEFS_Debug := \ + '-D_LARGEFILE_SOURCE' \ + '-D_FILE_OFFSET_BITS=64' \ + '-DBUILDING_NODE_EXTENSION' \ + '-DDEBUG' \ + '-D_DEBUG' + +# Flags passed to all source files. +CFLAGS_Debug := \ + -fPIC \ + -Wall \ + -Wextra \ + -Wno-unused-parameter \ + -pthread \ + -m64 \ + -g \ + -O0 + +# Flags passed to only C files. +CFLAGS_C_Debug := + +# Flags passed to only C++ files. +CFLAGS_CC_Debug := \ + -fno-rtti \ + -fno-exceptions + +INCS_Debug := \ + -I/root/.node-gyp/0.10.26/src \ + -I/root/.node-gyp/0.10.26/deps/uv/include \ + -I/root/.node-gyp/0.10.26/deps/v8/include + +DEFS_Release := \ + '-D_LARGEFILE_SOURCE' \ + '-D_FILE_OFFSET_BITS=64' \ + '-DBUILDING_NODE_EXTENSION' + +# Flags passed to all source files. +CFLAGS_Release := \ + -fPIC \ + -Wall \ + -Wextra \ + -Wno-unused-parameter \ + -pthread \ + -m64 \ + -O2 \ + -fno-strict-aliasing \ + -fno-tree-vrp \ + -fno-omit-frame-pointer + +# Flags passed to only C files. +CFLAGS_C_Release := + +# Flags passed to only C++ files. 
+CFLAGS_CC_Release := \ + -fno-rtti \ + -fno-exceptions + +INCS_Release := \ + -I/root/.node-gyp/0.10.26/src \ + -I/root/.node-gyp/0.10.26/deps/uv/include \ + -I/root/.node-gyp/0.10.26/deps/v8/include + +OBJS := \ + $(obj).target/$(TARGET)/multihashing.o \ + $(obj).target/$(TARGET)/scrypt.o \ + $(obj).target/$(TARGET)/scryptjane.o \ + $(obj).target/$(TARGET)/scryptn.o \ + $(obj).target/$(TARGET)/keccak.o \ + $(obj).target/$(TARGET)/skein.o \ + $(obj).target/$(TARGET)/x11.o \ + $(obj).target/$(TARGET)/quark.o \ + $(obj).target/$(TARGET)/sha3/aes_helper.o \ + $(obj).target/$(TARGET)/sha3/blake.o \ + $(obj).target/$(TARGET)/sha3/bmw.o \ + $(obj).target/$(TARGET)/sha3/cubehash.o \ + $(obj).target/$(TARGET)/sha3/echo.o \ + $(obj).target/$(TARGET)/sha3/groestl.o \ + $(obj).target/$(TARGET)/sha3/jh.o \ + $(obj).target/$(TARGET)/sha3/keccak.o \ + $(obj).target/$(TARGET)/sha3/luffa.o \ + $(obj).target/$(TARGET)/sha3/shavite.o \ + $(obj).target/$(TARGET)/sha3/simd.o \ + $(obj).target/$(TARGET)/sha3/skein.o + +# Add to the list of files we specially track dependencies for. +all_deps += $(OBJS) + +# CFLAGS et al overrides must be target-local. +# See "Target-specific Variable Values" in the GNU Make manual. +$(OBJS): TOOLSET := $(TOOLSET) +$(OBJS): GYP_CFLAGS := $(DEFS_$(BUILDTYPE)) $(INCS_$(BUILDTYPE)) $(CFLAGS_$(BUILDTYPE)) $(CFLAGS_C_$(BUILDTYPE)) +$(OBJS): GYP_CXXFLAGS := $(DEFS_$(BUILDTYPE)) $(INCS_$(BUILDTYPE)) $(CFLAGS_$(BUILDTYPE)) $(CFLAGS_CC_$(BUILDTYPE)) + +# Suffix rules, putting all outputs into $(obj). + +$(obj).$(TOOLSET)/$(TARGET)/%.o: $(srcdir)/%.cc FORCE_DO_CMD + @$(call do_cmd,cxx,1) + +$(obj).$(TOOLSET)/$(TARGET)/%.o: $(srcdir)/%.c FORCE_DO_CMD + @$(call do_cmd,cc,1) + +# Try building from generated source, too. + +$(obj).$(TOOLSET)/$(TARGET)/%.o: $(obj).$(TOOLSET)/%.cc FORCE_DO_CMD + @$(call do_cmd,cxx,1) + +$(obj).$(TOOLSET)/$(TARGET)/%.o: $(obj).$(TOOLSET)/%.c FORCE_DO_CMD + @$(call do_cmd,cc,1) + +$(obj).$(TOOLSET)/$(TARGET)/%.o: $(obj)/%.cc FORCE_DO_CMD + @$(call do_cmd,cxx,1) + +$(obj).$(TOOLSET)/$(TARGET)/%.o: $(obj)/%.c FORCE_DO_CMD + @$(call do_cmd,cc,1) + +# End of this set of suffix rules +### Rules for final target. +LDFLAGS_Debug := \ + -pthread \ + -rdynamic \ + -m64 + +LDFLAGS_Release := \ + -pthread \ + -rdynamic \ + -m64 + +LIBS := + +$(obj).target/multihashing.node: GYP_LDFLAGS := $(LDFLAGS_$(BUILDTYPE)) +$(obj).target/multihashing.node: LIBS := $(LIBS) +$(obj).target/multihashing.node: TOOLSET := $(TOOLSET) +$(obj).target/multihashing.node: $(OBJS) FORCE_DO_CMD + $(call do_cmd,solink_module) + +all_deps += $(obj).target/multihashing.node +# Add target alias +.PHONY: multihashing +multihashing: $(builddir)/multihashing.node + +# Copy this to the executable output path. +$(builddir)/multihashing.node: TOOLSET := $(TOOLSET) +$(builddir)/multihashing.node: $(obj).target/multihashing.node FORCE_DO_CMD + $(call do_cmd,copy) + +all_deps += $(builddir)/multihashing.node +# Short alias for building this executable. +.PHONY: multihashing.node +multihashing.node: $(obj).target/multihashing.node $(builddir)/multihashing.node + +# Add executable to "all" target. 
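+# These build files are node-gyp output (config.gypi above is marked as
+# generated by its "configure" step), so a typical rebuild from the package
+# root is the assumed invocation:
+#
+#   node-gyp configure   # regenerates build/Makefile and build/config.gypi
+#   node-gyp build       # runs make here and produces multihashing.node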
+.PHONY: all
+all: $(builddir)/multihashing.node
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..be19772
--- /dev/null
+++ b/index.js
@@ -0,0 +1 @@
+module.exports = require('bindings')('multihashing.node')
\ No newline at end of file
diff --git a/keccak.c b/keccak.c
new file mode 100644
index 0000000..f41312f
--- /dev/null
+++ b/keccak.c
@@ -0,0 +1,18 @@
+#include "keccak.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "sha3/sph_keccak.h"
+
+
+void keccak_hash(const char* input, char* output)
+{
+    sph_keccak256_context ctx_keccak;
+    sph_keccak256_init(&ctx_keccak);
+    sph_keccak256(&ctx_keccak, input, 80);
+    sph_keccak256_close(&ctx_keccak, output);
+}
+
diff --git a/keccak.h b/keccak.h
new file mode 100644
index 0000000..4442ae9
--- /dev/null
+++ b/keccak.h
@@ -0,0 +1,14 @@
+#ifndef KECCAK_H
+#define KECCAK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void keccak_hash(const char* input, char* output);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/multihashing.cc b/multihashing.cc
new file mode 100644
index 0000000..fad8023
--- /dev/null
+++ b/multihashing.cc
@@ -0,0 +1,260 @@
+#include <node.h>
+#include <node_buffer.h>
+#include <v8.h>
+
+extern "C" {
+    #include "bcrypt.h"
+    #include "keccak.h"
+    #include "quark.h"
+    #include "scrypt.h"
+    #include "scryptjane.h"
+    #include "scryptn.h"
+    #include "skein.h"
+    #include "x11.h"
+
+
+    static unsigned char getNfactor(char* blockheader) {
+        int n, l = 0;
+        unsigned long nTimestamp = *(unsigned int*)(&blockheader[68]);
+        unsigned char minNfactor = 10;
+        unsigned char maxNfactor = 30;
+        unsigned char N;
+        uint64_t s;
+
+        if (nTimestamp <= 1389306217) {
+            return minNfactor;
+        }
+
+        s = nTimestamp - 1389306217;
+        while ((s >> 1) > 3) {
+            l += 1;
+            s >>= 1;
+        }
+
+        s &= 3;
+
+        n = (l * 158 + s * 28 - 2670) / 100;
+
+        if (n < 0) n = 0;
+
+        N = (unsigned char) n;
+        n = N > minNfactor ? N : minNfactor;
+        N = n < maxNfactor ? n : maxNfactor;
+
+        return N;
+    }
+
+    #define max(a,b) (((a) > (b)) ? (a) : (b))
+    #define min(a,b) (((a) < (b)) ? (a) : (b))
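+    // GetNfactorJane below derives scrypt-jane's N-factor from elapsed chain
+    // time: l counts how many halvings bring the elapsed seconds under 8, the
+    // remaining low bits land in s, and the linear formula maps (l, s) to an
+    // N-factor clamped to [minNfactor, maxNfactor] = [4, 30].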
+    unsigned char GetNfactorJane(int nTimestamp, int nChainStartTime) {
+
+        const unsigned char minNfactor = 4;
+        const unsigned char maxNfactor = 30;
+
+        int l = 0, s, n;
+        unsigned char N;
+
+        if (nTimestamp <= nChainStartTime)
+            return 4;
+
+        s = nTimestamp - nChainStartTime;
+        while ((s >> 1) > 3) {
+            l += 1;
+            s >>= 1;
+        }
+
+        s &= 3;
+
+        n = (l * 170 + s * 25 - 2320) / 100;
+
+        if (n < 0) n = 0;
+
+        if (n > 255)
+            printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n);
+
+        N = (unsigned char)n;
+        //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfactor), maxNfactor));
+
+        return min(max(N, minNfactor), maxNfactor);
+    }
+
+    void scryptjane_hash(const void* input, size_t inputlen, uint32_t *res, unsigned char Nfactor)
+    {
+        return scrypt((const unsigned char*)input, inputlen,
+                      (const unsigned char*)input, inputlen,
+                      Nfactor, 0, 0, (unsigned char*)res, 32);
+    }
+}
+
+using namespace node;
+using namespace v8;
+
+Handle<Value> except(const char* msg) {
+    return ThrowException(Exception::Error(String::New(msg)));
+}
+
+Handle<Value> quark(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+
+    char * input = Buffer::Data(target);
+    char * output = new char[32];
+
+    quark_hash(input, output);
+
+    Buffer* buff = Buffer::New(output, 32);
+    return scope.Close(buff->handle_);
+}
+
+Handle<Value> x11(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+
+    char * input = Buffer::Data(target);
+    char * output = new char[32];
+
+    x11_hash(input, output);
+
+    Buffer* buff = Buffer::New(output, 32);
+    return scope.Close(buff->handle_);
+}
+
+Handle<Value> scrypt(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+
+    char * input = Buffer::Data(target);
+    char * output = new char[32];
+
+    scrypt_1024_1_1_256(input, output);
+
+    Buffer* buff = Buffer::New(output, 32);
+    return scope.Close(buff->handle_);
+}
+
+
+
+Handle<Value> scryptn(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+
+    char * input = Buffer::Data(target);
+    char * output = new char[32];
+
+    unsigned int N = 1 << (getNfactor(input) + 1);
+
+    scrypt_N_1_1_256(input, output, N);
+
+    Buffer* buff = Buffer::New(output, 32);
+    return scope.Close(buff->handle_);
+}
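+// Note: for block timestamps at or below 1389306217, getNfactor() above
+// returns its minimum of 10, so scryptn runs scrypt with N = 1 << (10 + 1) = 2048.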
+Handle<Value> scryptjane(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 3)
+        return except("You must provide three arguments: buffer, timestamp as a number, and nChainStartTime as a number.");
+
+    Local<Object> target = args[0]->ToObject();
+
+    if(!Buffer::HasInstance(target))
+        return except("First argument should be a buffer object.");
+
+    Local<Number> num = args[1]->ToNumber();
+    int timestamp = num->Value();
+
+    Local<Number> num2 = args[2]->ToNumber();
+    int nChainStartTime = num2->Value();
+
+    char * input = Buffer::Data(target);
+    char * output = new char[32];
+
+    scryptjane_hash(input, 80, (uint32_t *)output, GetNfactorJane(timestamp, nChainStartTime));
+
+    Buffer* buff = Buffer::New(output, 32);
+    return scope.Close(buff->handle_);
+}
+
+Handle<Value> keccak(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+
+    char * input = Buffer::Data(target);
+    char * output = new char[32];
+
+    keccak_hash(input, output);
+
+    Buffer* buff = Buffer::New(output, 32);
+    return scope.Close(buff->handle_);
+}
+
+
+Handle<Value> bcrypt(const Arguments& args) {
+    HandleScope scope;
+
+    if (args.Length() < 1)
+        return except("You must provide one argument.");
+
+    Local<Object> target = args[0]->ToObject();
+
+    if(!Buffer::HasInstance(target))
+        return except("Argument should be a buffer object.");
+
+    char * input = Buffer::Data(target);
+    char * output = new char[32];
+
+    bcrypt_hash(input, output);
+
+    Buffer* buff = Buffer::New(output, 32);
+    return scope.Close(buff->handle_);
+}
+
+void init(Handle<Object> exports) {
+    exports->Set(String::NewSymbol("quark"), FunctionTemplate::New(quark)->GetFunction());
+    exports->Set(String::NewSymbol("x11"), FunctionTemplate::New(x11)->GetFunction());
+    exports->Set(String::NewSymbol("scrypt"), FunctionTemplate::New(scrypt)->GetFunction());
+    exports->Set(String::NewSymbol("scryptn"), FunctionTemplate::New(scryptn)->GetFunction());
+    exports->Set(String::NewSymbol("scryptjane"), FunctionTemplate::New(scryptjane)->GetFunction());
+    exports->Set(String::NewSymbol("keccak"), FunctionTemplate::New(keccak)->GetFunction());
+    exports->Set(String::NewSymbol("bcrypt"), FunctionTemplate::New(bcrypt)->GetFunction());
+}
+
+NODE_MODULE(multihashing, init)
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..6160b2f
--- /dev/null
+++ b/package.json
@@ -0,0 +1,30 @@
+{
+  "name": "multi-hashing",
+  "version": "0.0.1",
+  "main": "multihashing",
+  "author": {
+    "name": "Matthew Little",
+    "email": "zone117x@gmail.com"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/zone117x/node-multi-hashing.git"
+  },
+  "dependencies" : {
+    "bindings" : "*"
+  },
+  "keywords": [
+    "scrypt",
+    "scrypt-jane",
+    "scrypt-n",
+    "x11",
+    "quark",
+    "keccak_hash",
+    "skein",
+    "bcrypt",
+    "keccak",
+    "blake",
+    "shavite",
+    "fugue"
+  ]
+}
\ No newline at end of file
diff --git a/quark.c b/quark.c
new file mode 100644
index 0000000..fc6a022
--- /dev/null
+++ b/quark.c
@@ -0,0 +1,211 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2013 Neisklar,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include "quark.h" +#include +#include +#include +#include +#include "sha3/sph_blake.h" +#include "sha3/sph_bmw.h" +#include "sha3/sph_groestl.h" +#include "sha3/sph_jh.h" +#include "sha3/sph_keccak.h" +#include "sha3/sph_skein.h" + + +static __inline uint32_t +be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} + +static __inline void +be32enc(void *pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} + +static __inline uint32_t +le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} + +static __inline void +le32enc(void *pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. 
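+ * (This is the inverse of be32enc_vect above; quark_hash below feeds the
+ * sph_* functions directly, so both helpers are carried along as utilities.)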
+ */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +void quark_hash(const char* input, char* output) +{ + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + static unsigned char pblank[1]; + + uint32_t mask = 8; + uint32_t zero = 0; + + uint32_t hashA[16], hashB[16]; + + + + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, input, 80); + sph_blake512_close (&ctx_blake, hashA); //0 + + + sph_bmw512_init(&ctx_bmw); + sph_bmw512 (&ctx_bmw, hashA, 64); //0 + sph_bmw512_close(&ctx_bmw, hashB); //1 + + + if ((hashB[0] & mask) != zero) //1 + { + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, hashB, 64); //1 + sph_groestl512_close(&ctx_groestl, hashA); //2 + } + else + { + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, hashB, 64); //1 + sph_skein512_close(&ctx_skein, hashA); //2 + } + + + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, hashA, 64); //2 + sph_groestl512_close(&ctx_groestl, hashB); //3 + + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, hashB, 64); //3 + sph_jh512_close(&ctx_jh, hashA); //4 + + if ((hashA[0] & mask) != zero) //4 + { + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, hashA, 64); // + sph_blake512_close(&ctx_blake, hashB); //5 + } + else + { + sph_bmw512_init(&ctx_bmw); + sph_bmw512 (&ctx_bmw, hashA, 64); //4 + sph_bmw512_close(&ctx_bmw, hashB); //5 + } + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak,hashB, 64); //5 + sph_keccak512_close(&ctx_keccak, hashA); //6 + + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, hashA, 64); //6 + sph_skein512_close(&ctx_skein, hashB); //7 + + if ((hashB[0] & mask) != zero) //7 + { + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, hashB, 64); // + sph_keccak512_close(&ctx_keccak, hashA); //8 + } + else + { + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, hashB, 64); //7 + sph_jh512_close(&ctx_jh, hashA); //8 + } + + + + memcpy(output, hashA, 32); + + +/* + printf("result: "); + for (ii=0; ii < 32; ii++) + { + printf ("%.2x",((uint8_t*)output)[ii]); + } + printf ("\n"); +*/ + + + + +} + + diff --git a/quark.h b/quark.h new file mode 100644 index 0000000..5b76749 --- /dev/null +++ b/quark.h @@ -0,0 +1,6 @@ +#ifndef QUARK_H +#define QUARK_H + +void quark_hash(const char* input, char* output); + +#endif diff --git a/scrypt-jane/scrypt-jane-chacha.h b/scrypt-jane/scrypt-jane-chacha.h new file mode 100644 index 0000000..41d96e5 --- /dev/null +++ b/scrypt-jane/scrypt-jane-chacha.h @@ -0,0 +1,132 @@ +#define SCRYPT_MIX_BASE "ChaCha20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_chacha-avx.h" +#include "scrypt-jane-mix_chacha-ssse3.h" +#include "scrypt-jane-mix_chacha-sse2.h" +#include "scrypt-jane-mix_chacha.h" + +#if defined(SCRYPT_CHACHA_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_MIX_FN chacha_core_avx + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define 
SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 + #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 + #define SCRYPT_MIX_FN chacha_core_ssse3 + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_MIX_FN chacha_core_sse2 + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN chacha_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_CHACHA_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + if (cpuflags & cpu_ssse3) + return scrypt_ROMix_ssse3; + else +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t flags = 0; + +#if defined(SCRYPT_CHACHA_AVX) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + flags |= cpu_ssse3; +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_CHACHA_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + if (cpuflags & cpu_ssse3) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/scrypt-jane/scrypt-jane-hash.h b/scrypt-jane/scrypt-jane-hash.h new file mode 100644 index 0000000..db5c1db --- /dev/null +++ b/scrypt-jane/scrypt-jane-hash.h @@ -0,0 +1,48 @@ +#if defined(SCRYPT_BLAKE512) +#include "scrypt-jane-hash_blake512.h" +#elif defined(SCRYPT_BLAKE256) +#include "scrypt-jane-hash_blake256.h" +#elif defined(SCRYPT_SHA512) +#include "scrypt-jane-hash_sha512.h" +#elif defined(SCRYPT_SHA256) +#include "scrypt-jane-hash_sha256.h" +#elif defined(SCRYPT_SKEIN512) +#include "scrypt-jane-hash_skein512.h" +#elif defined(SCRYPT_KECCAK512) || defined(SCRYPT_KECCAK256) +#include "scrypt-jane-hash_keccak.h" +#else + #define SCRYPT_HASH "ERROR" + #define SCRYPT_HASH_BLOCK_SIZE 64 + #define SCRYPT_HASH_DIGEST_SIZE 64 + typedef struct scrypt_hash_state_t { size_t dummy; } 
scrypt_hash_state; + typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + static void scrypt_hash_init(scrypt_hash_state *S) {} + static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {} + static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {} + static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0}; + #error must define a hash function! +#endif + +#include "scrypt-jane-pbkdf2.h" + +#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ + +static int +scrypt_test_hash() { + scrypt_hash_state st; + scrypt_hash_digest hash, final; + uint8_t msg[SCRYPT_TEST_HASH_LEN]; + size_t i; + + for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++) + msg[i] = (uint8_t)i; + + scrypt_hash_init(&st); + for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) { + scrypt_hash(hash, msg, i); + scrypt_hash_update(&st, hash, sizeof(hash)); + } + scrypt_hash_finish(&st, final); + return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); +} + diff --git a/scrypt-jane/scrypt-jane-hash_keccak.h b/scrypt-jane/scrypt-jane-hash_keccak.h new file mode 100644 index 0000000..7ed5574 --- /dev/null +++ b/scrypt-jane/scrypt-jane-hash_keccak.h @@ -0,0 +1,168 @@ +#if defined(SCRYPT_KECCAK256) + #define SCRYPT_HASH "Keccak-256" + #define SCRYPT_HASH_DIGEST_SIZE 32 +#else + #define SCRYPT_HASH "Keccak-512" + #define SCRYPT_HASH_DIGEST_SIZE 64 +#endif +#define SCRYPT_KECCAK_F 1600 +#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 256=512, 512=1024 */ +#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 256=1088, 512=576 */ +#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t state[SCRYPT_KECCAK_F / 64]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +static void +keccak_block(scrypt_hash_state *S, const uint8_t *in) { + size_t i; + uint64_t *s = S->state, t[5], u[5], v, w; + + /* absorb input */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8) + s[i] ^= U8TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. 
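
The `SCRYPT_KECCAK_*` macros above fix the sponge capacity at twice the digest size in bits, with the rate being whatever is left of the 1600-bit permutation width. A worked instance of that arithmetic, assuming the Keccak-512 configuration (64-byte digest):

```c
/* Keccak-512: C = 64*8*2 = 1024, R = 1600-1024 = 576, block = 72 bytes.
   (Keccak-256 analogously gives C = 512, R = 1088, block = 136 bytes.) */
enum {
    F      = 1600,           /* permutation width in bits   */
    DIGEST = 64,             /* digest size in bytes        */
    C      = DIGEST * 8 * 2, /* capacity = 1024 bits        */
    R      = F - C,          /* rate     = 576 bits         */
    BLOCK  = R / 8           /* absorb block = 72 bytes     */
};
```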
a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + memset(S, 0, sizeof(*S)); +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + keccak_block(S, S->buffer); + } + + /* handle the current data */ + while (inlen >= SCRYPT_HASH_BLOCK_SIZE) { + keccak_block(S, in); + in += SCRYPT_HASH_BLOCK_SIZE; + inlen -= SCRYPT_HASH_BLOCK_SIZE; + } + + /* handle leftover data */ + S->leftover = (uint32_t)inlen; + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + size_t i; + + S->buffer[S->leftover] = 0x01; + memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1)); + S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80; + keccak_block(S, S->buffer); + + for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) { + U64TO8_LE(&hash[i], S->state[i / 8]); + } +} + +#if defined(SCRYPT_KECCAK256) +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x26,0xb7,0x10,0xb3,0x66,0xb1,0xd1,0xb1,0x25,0xfc,0x3e,0xe3,0x1e,0x33,0x1d,0x19, + 0x94,0xaa,0x63,0x7a,0xd5,0x77,0x29,0xb4,0x27,0xe9,0xe0,0xf4,0x19,0xba,0x68,0xea, +}; +#else +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x17,0xc7,0x8c,0xa0,0xd9,0x08,0x1d,0xba,0x8a,0xc8,0x3e,0x07,0x90,0xda,0x91,0x88, + 0x25,0xbd,0xd3,0xf8,0x78,0x4a,0x8d,0x5e,0xe4,0x96,0x9c,0x01,0xf3,0xeb,0xdc,0x12, + 0xea,0x35,0x57,0xba,0x94,0xb8,0xe9,0xb9,0x27,0x45,0x0a,0x48,0x5c,0x3d,0x69,0xf0, + 0xdb,0x22,0x38,0xb5,0x52,0x22,0x29,0xea,0x7a,0xb2,0xe6,0x07,0xaa,0x37,0x4d,0xe6, +}; +#endif + diff --git a/scrypt-jane/scrypt-jane-hash_sha256.h b/scrypt-jane/scrypt-jane-hash_sha256.h new file mode 100644 index 0000000..d06d3e1 --- /dev/null +++ b/scrypt-jane/scrypt-jane-hash_sha256.h @@ -0,0 +1,135 @@ +#define SCRYPT_HASH "SHA-2-256" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint32_t H[8]; + uint64_t T; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint32_t sha256_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) +#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) +#define W0(in,i) (U8TO32_BE(&in[i * 4])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; 
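
The padding applied in `scrypt_hash_finish()` above is the original Keccak pad10*1 rule — append a 0x01 marker byte, zero-fill, and set the top bit of the last rate byte (this predates the 0x06 domain byte of standardized SHA-3, so the digests differ from SHA3-256/512). A standalone sketch of just that step:

```c
#include <stdint.h>
#include <string.h>

/* pad10*1 over one rate-sized block; `used` is the number of message
   bytes already in the block (0 <= used < block_size). When the message
   fills all but one byte, the two markers merge into a single 0x81. */
static void keccak_pad(uint8_t *block, size_t block_size, size_t used) {
    block[used] = 0x01;                                 /* first padding bit */
    memset(block + used + 1, 0, block_size - used - 1); /* zero fill         */
    block[block_size - 1] |= 0x80;                      /* final padding bit */
}
```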
\ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + +static void +sha256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + uint32_t r[8], w[64], t0, t1; + size_t i; + + for (i = 0; i < 8; i++) r[i] = S->H[i]; + + while (blocks--) { + for (i = 0; i < 16; i++) { w[i] = W0(in, i); } + for (i = 16; i < 64; i++) { w[i] = W1(i); } + for (i = 0; i < 64; i++) { STEP(i); } + for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } + S->T += SCRYPT_HASH_BLOCK_SIZE * 8; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667; + S->H[1] = 0xbb67ae85; + S->H[2] = 0x3c6ef372; + S->H[3] = 0xa54ff53a; + S->H[4] = 0x510e527f; + S->H[5] = 0x9b05688c; + S->H[6] = 0x1f83d9ab; + S->H[7] = 0x5be0cd19; + S->T = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xee,0x36,0xae,0xa6,0x65,0xf0,0x28,0x7d,0xc9,0xde,0xd8,0xad,0x48,0x33,0x7d,0xbf, + 0xcb,0xc0,0x48,0xfa,0x5f,0x92,0xfd,0x0a,0x95,0x6f,0x34,0x8e,0x8c,0x1e,0x73,0xad, +}; diff --git a/scrypt-jane/scrypt-jane-mix_chacha-avx.h b/scrypt-jane/scrypt-jane-mix_chacha-avx.h new file mode 100644 index 0000000..50d6e2d --- /dev/null +++ b/scrypt-jane/scrypt-jane-mix_chacha-avx.h @@ -0,0 +1,340 @@ +/* x86 */ +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,64) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) + a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + 
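
The SHA-256 backend above (scrypt-jane-hash_sha256.h) exposes the same three-call streaming interface as the Keccak one. A usage sketch, assuming a translation unit that has selected `SCRYPT_SHA256` and included the header; splitting the input across two updates yields the same digest as a single call, since `scrypt_hash_update()` buffers partial blocks in `S->buffer`:

```c
#include <stdint.h>
#include <stdio.h>

static void demo(void) {
    scrypt_hash_state st;
    scrypt_hash_digest digest;
    const uint8_t msg[] = "hash me in two pieces";

    scrypt_hash_init(&st);
    scrypt_hash_update(&st, msg, 7);                   /* "hash me"        */
    scrypt_hash_update(&st, msg + 7, sizeof(msg) - 8); /* " in two pieces" */
    scrypt_hash_finish(&st, digest);

    for (size_t i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);
    printf("\n");
}
```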
a2(vmovdqa xmm0,[ecx+esi+0]) + a2(vmovdqa xmm1,[ecx+esi+16]) + a2(vmovdqa xmm2,[ecx+esi+32]) + a2(vmovdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_avx_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa [esp+32],xmm2) + a2(vmovdqa [esp+48],xmm3) + a2(mov eax,8) + a1(scrypt_chacha_avx_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm6,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm6) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpsrld xmm6,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm6) + a2(sub eax,2) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm6,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm6) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x39) + a3(vpaddd xmm2,xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a3(vpsrld xmm6,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm6) + a1(ja scrypt_chacha_avx_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,[esp+32]) + a3(vpaddd xmm3,xmm3,[esp+48]) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_avx_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) + a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r8,r8) + a2(xor r9,r9) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + 
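
In the x64 variants the `a1`/`a2`/`a3` macros each emit one instruction, and the operand usage matches the System V AMD64 argument registers (the code scribbles on r8/r9, which would be incoming arguments on Windows x64, so this path is Unix-style only). The mapping below is an observation from the code, not a documented contract of the macro layer:

```c
#include <stdint.h>

/* Apparent register mapping for scrypt_ChunkMix_*(Bout, Bin, Bxor, r):
     rdi = Bout  (destination chunk)
     rsi = Bin   (source chunk)
     rdx = Bxor  (optional second chunk; may be NULL, hence "and rdx,rdx; jz")
     rcx = r     (scrypt block-size parameter; a chunk is r * 2 * 64 bytes) */
void scrypt_ChunkMix_avx(uint32_t *Bout, uint32_t *Bin, uint32_t *Bxor, uint32_t r);
```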
a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_avx_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm12,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpsrld xmm12,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm12) + a2(sub rax,2) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm12,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x39) + a3(vpaddd xmm2,xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a3(vpsrld xmm12,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm12) + a1(ja scrypt_chacha_avx_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_avx_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +static void NOINLINE +scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = 
_mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_AVX) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-AVX" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scrypt-jane/scrypt-jane-mix_chacha-sse2.h b/scrypt-jane/scrypt-jane-mix_chacha-sse2.h new file mode 100644 index 0000000..d2192c8 --- /dev/null +++ b/scrypt-jane/scrypt-jane-mix_chacha-sse2.h @@ -0,0 +1,371 @@ +/* x86 */ +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,16) + a2(and esp,~15) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa xmm4,xmm1) + a2(movdqa xmm5,xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_chacha_sse2_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa 
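
The store index `(i / 2) + half` in the intrinsic version implements steps 4 and 6 of scrypt's BlockMix: with `half` toggling between 0 and `r` on every block, even-numbered 64-byte blocks land in the first half of the output chunk and odd-numbered ones in the second half. A standalone check of the index arithmetic:

```c
#include <stdio.h>

/* Even blocks map to slots 0..r-1, odd blocks to r..2r-1. */
int main(void) {
    unsigned r = 4, half = 0;
    for (unsigned i = 0; i < 2 * r; i++, half ^= r)
        printf("block %u -> output slot %u\n", i, i / 2 + half);
    return 0;
}
```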
xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub eax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_sse2_loop) + a2(paddd xmm0,[esp+0]) + a2(paddd xmm1,xmm4) + a2(paddd xmm2,xmm5) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_sse2_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_sse2_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub rax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + 
a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_sse2_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_sse2_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +static void NOINLINE +scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = 
_mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-SSE2" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scrypt-jane/scrypt-jane-mix_chacha-ssse3.h b/scrypt-jane/scrypt-jane-mix_chacha-ssse3.h new file mode 100644 index 0000000..b25e356 --- /dev/null +++ b/scrypt-jane/scrypt-jane-mix_chacha-ssse3.h @@ -0,0 +1,348 @@ +/* x86 */ +#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,64) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm4,[ssse3_rotl16_32bit]) + a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa [esp+16],xmm1) + a2(movdqa [esp+32],xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_chacha_ssse3_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub eax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_ssse3_loop) + a2(paddd xmm0,[esp+0]) + a2(paddd xmm1,[esp+16]) + a2(paddd xmm2,[esp+32]) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) 
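
SSE2 has no vector rotate instruction, so every 32-bit rotation in the code above is synthesized as a shift pair (`pslld`/`psrld` then `pxor`, or `_mm_slli_epi32`/`_mm_srli_epi32` then `_mm_or_si128`). The SSSE3 variant that follows replaces only the byte-multiple rotations (16 and 8) with a single `pshufb` byte shuffle against the `ssse3_rotl*_32bit` masks; the 12- and 7-bit rotations still need the shift pair. The SSE2 idiom in isolation:

```c
#include <emmintrin.h>   /* SSE2 */

/* Rotate each 32-bit lane left by n (1 <= n <= 31). */
static inline __m128i rotl32_sse2(__m128i x, int n) {
    return _mm_or_si128(_mm_slli_epi32(x, n), _mm_srli_epi32(x, 32 - n));
}
```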
+ a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_ssse3_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm4,[ssse3_rotl16_32bit]) + a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor r8,r8) + a2(xor r9,r9) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_ssse3_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm12,20) + a2(pxor xmm1,xmm12) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm12,25) + a2(pxor xmm1,xmm12) + a2(sub rax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm12,20) + a2(pxor xmm1,xmm12) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm12,25) + a2(pxor xmm1,xmm12) + a1(ja scrypt_chacha_ssse3_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_ssse3_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +static void NOINLINE +scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi 
*)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-SSSE3" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scrypt-jane/scrypt-jane-mix_chacha.h b/scrypt-jane/scrypt-jane-mix_chacha.h new file mode 100644 index 0000000..85ee9c1 --- /dev/null +++ b/scrypt-jane/scrypt-jane-mix_chacha.h @@ -0,0 +1,69 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "ChaCha20/8 Ref" + +#undef SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_BASIC + +static void +chacha_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + a += b; t = d^a; d = ROTL32(t,16); \ + c += d; t = b^c; b = ROTL32(t,12); \ + a += b; t = 
d^a; d = ROTL32(t, 8); \ + c += d; t = b^c; b = ROTL32(t, 7); + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x1, x5, x9,x13) + quarter( x2, x6,x10,x14) + quarter( x3, x7,x11,x15) + quarter( x0, x5,x10,x15) + quarter( x1, x6,x11,x12) + quarter( x2, x7, x8,x13) + quarter( x3, x4, x9,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; + state[15] += x15; + + #undef quarter +} + +#endif \ No newline at end of file diff --git a/scrypt-jane/scrypt-jane-mix_salsa-avx.h b/scrypt-jane/scrypt-jane-mix_salsa-avx.h new file mode 100644 index 0000000..15fb48e --- /dev/null +++ b/scrypt-jane/scrypt-jane-mix_salsa-avx.h @@ -0,0 +1,381 @@ +/* x86 */ +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_avx_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_avx_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub eax, 2) + a3(vpaddd xmm4, xmm3, xmm0) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpxor 
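
`chacha_core_basic()` above runs its 8 rounds as 4 double rounds — a column round over (0,4,8,12)…(3,7,11,15) followed by a diagonal round over (0,5,10,15)…(3,4,9,14) — and the final `state[i] += xi` feed-forward is what makes the permutation one-way. A usage sketch, assuming a translation unit that includes scrypt-jane-mix_chacha.h (the function is `static`); the counter-pattern state is illustrative, as a real scrypt run feeds it the 64-byte block being mixed:

```c
#include <stdint.h>
#include <stdio.h>

static void demo_chacha_core(void) {
    uint32_t state[16];
    for (int i = 0; i < 16; i++)
        state[i] = (uint32_t)i;

    chacha_core_basic(state);   /* 4 double rounds + feed-forward addition */

    for (int i = 0; i < 16; i++)
        printf("%08x%c", state[i], (i % 4 == 3) ? '\n' : ' ');
}
```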
xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm2, xmm3) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a1(ja scrypt_salsa_avx_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_avx_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_avx_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub rax, 2) + a3(vpaddd xmm4, xmm3, xmm0) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm2, xmm3) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a1(ja scrypt_salsa_avx_loop) + a3(vpaddd xmm0,xmm0,xmm8) + 
a3(vpaddd xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_avx_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +static void NOINLINE +scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = x1; + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x3 = _mm_xor_si128(x3, x4); + x4 = x0; + x3 = _mm_xor_si128(x3, x5); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x3; + x2 = _mm_xor_si128(x2, x5); + x3 = _mm_shuffle_epi32(x3, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x1 = _mm_xor_si128(x1, x4); + x4 = x2; + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x4 = x3; + x0 = _mm_xor_si128(x0, x5); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x1 = _mm_xor_si128(x1, x4); + x4 = x0; + x1 = _mm_xor_si128(x1, x5); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x1; + x2 = _mm_xor_si128(x2, x5); + x1 = _mm_shuffle_epi32(x1, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x3 = _mm_xor_si128(x3, x4); + x4 = x2; + x3 = _mm_xor_si128(x3, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x39); + x0 = _mm_xor_si128(x0, x5); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] 
= Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_AVX) + /* uses salsa_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-AVX" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif diff --git a/scrypt-jane/scrypt-jane-mix_salsa-sse2.h b/scrypt-jane/scrypt-jane-mix_salsa-sse2.h new file mode 100644 index 0000000..4898659 --- /dev/null +++ b/scrypt-jane/scrypt-jane-mix_salsa-sse2.h @@ -0,0 +1,443 @@ +/* x86 */ +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa [esp+16],xmm1) + a2(movdqa xmm6,xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_sse2_loop: ) + a2(movdqa xmm4, xmm1) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm3, xmm5) + a2(paddd xmm4, xmm3) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm2, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm1, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm0, xmm5) + a3(pshufd xmm1, xmm1, 0x39) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm1, xmm5) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm1) + a2(pxor xmm2, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm3, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm3) + a2(sub eax, 2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a2(pxor xmm0, xmm5) + a1(ja scrypt_salsa_sse2_loop) + 
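
The Salsa paths follow the same skeleton as the ChaCha ones but with Salsa's rotation set (7, 9, 13, 18), and the `pshufd` immediates 0x93/0x4e/0x39 rotate rows by one, two, and three lanes so each vectorized quarter-round keeps operating on a diagonal. The scalar statement of the quarter-round that each `pslld`/`psrld`/`pxor` cluster above implements, matching the `quarter` macro in `salsa_core_basic` later in this commit:

```c
#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* One Salsa20 quarter-round (rotations 7, 9, 13, 18). */
static inline void salsa_quarter(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d) {
    *b ^= ROTL32(*a + *d, 7);
    *c ^= ROTL32(*b + *a, 9);
    *d ^= ROTL32(*c + *b, 13);
    *a ^= ROTL32(*d + *c, 18);
}
```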
a2(paddd xmm0,[esp+0]) + a2(paddd xmm1,[esp+16]) + a2(paddd xmm2,xmm6) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_sse2_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_sse2_loop: ) + a2(movdqa xmm4, xmm1) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm3, xmm5) + a2(paddd xmm4, xmm3) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm2, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm1, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm0, xmm5) + a3(pshufd xmm1, xmm1, 0x39) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm1, xmm5) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm1) + a2(pxor xmm2, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm3, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm3) + a2(sub rax, 2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a2(pxor xmm0, xmm5) + a1(ja scrypt_salsa_sse2_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + 
a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_sse2_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +static void NOINLINE +scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = x1; + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x3 = _mm_xor_si128(x3, x4); + x4 = x0; + x3 = _mm_xor_si128(x3, x5); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x3; + x2 = _mm_xor_si128(x2, x5); + x3 = _mm_shuffle_epi32(x3, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x1 = _mm_xor_si128(x1, x4); + x4 = x2; + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x4 = x3; + x0 = _mm_xor_si128(x0, x5); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x1 = _mm_xor_si128(x1, x4); + x4 = x0; + x1 = _mm_xor_si128(x1, x5); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x1; + x2 = _mm_xor_si128(x2, x5); + x1 = _mm_shuffle_epi32(x1, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x3 = _mm_xor_si128(x3, x4); + x4 = x2; + x3 = _mm_xor_si128(x3, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x39); + x0 = _mm_xor_si128(x0, x5); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX 
"Salsa/8-SSE2" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif + +/* used by avx,etc as well */ +#if defined(SCRYPT_SALSA_INCLUDED) + /* + Default layout: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + + SSE2 layout: + 0 5 10 15 + 12 1 6 11 + 8 13 2 7 + 4 9 14 3 + */ + + static void STDCALL + salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { + uint32_t t; + while (count--) { + t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; + t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; + t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; + t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; + t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; + t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; + blocks += 16; + } + } +#endif + diff --git a/scrypt-jane/scrypt-jane-mix_salsa.h b/scrypt-jane/scrypt-jane-mix_salsa.h new file mode 100644 index 0000000..33f3340 --- /dev/null +++ b/scrypt-jane/scrypt-jane-mix_salsa.h @@ -0,0 +1,70 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "Salsa20/8 Ref" + +#undef SCRYPT_SALSA_INCLUDED +#define SCRYPT_SALSA_INCLUDED +#define SCRYPT_SALSA_BASIC + +static void +salsa_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + t = a+d; t = ROTL32(t, 7); b ^= t; \ + t = b+a; t = ROTL32(t, 9); c ^= t; \ + t = c+b; t = ROTL32(t, 13); d ^= t; \ + t = d+c; t = ROTL32(t, 18); a ^= t; \ + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x5, x9,x13, x1) + quarter(x10,x14, x2, x6) + quarter(x15, x3, x7,x11) + quarter( x0, x1, x2, x3) + quarter( x5, x6, x7, x4) + quarter(x10,x11, x8, x9) + quarter(x15,x12,x13,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; + state[15] += x15; + + #undef quarter +} + +#endif + diff --git a/scrypt-jane/scrypt-jane-pbkdf2.h b/scrypt-jane/scrypt-jane-pbkdf2.h new file mode 100644 index 0000000..711e3d6 --- /dev/null +++ b/scrypt-jane/scrypt-jane-pbkdf2.h @@ -0,0 +1,112 @@ +typedef struct scrypt_hmac_state_t { + scrypt_hash_state inner, outer; +} scrypt_hmac_state; + + +static void +scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) { + scrypt_hash_state st; + scrypt_hash_init(&st); + scrypt_hash_update(&st, m, mlen); + scrypt_hash_finish(&st, hash); +} + +/* hmac */ +static void +scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + scrypt_hash_init(&st->inner); + scrypt_hash_init(&st->outer); + + if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + scrypt_hash(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) 
*/ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); + + scrypt_ensure_zero(pad, sizeof(pad)); +} + +static void +scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) */ + scrypt_hash_update(&st->inner, m, mlen); +} + +static void +scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { + /* h(inner || m) */ + scrypt_hash_digest innerhash; + scrypt_hash_finish(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); + scrypt_hash_finish(&st->outer, mac); + + scrypt_ensure_zero(st, sizeof(*st)); +} + +static void +scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) { + scrypt_hmac_state hmac_pw, hmac_pw_salt, work; + scrypt_hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, blocks; + uint64_t c; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for (i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + scrypt_hmac_update(&work, be, 4); + scrypt_hmac_finish(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for (c = 0; c < N - 1; c++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE); + scrypt_hmac_finish(&work, u); + + /* T[i] ^= UX */ + for (j = 0; j < sizeof(u); j++) + ti[j] ^= u[j]; + } + + memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
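/* the final block may be shorter than a full digest */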
SCRYPT_HASH_DIGEST_SIZE : bytes);
+		out += SCRYPT_HASH_DIGEST_SIZE;
+		bytes -= SCRYPT_HASH_DIGEST_SIZE;
+	}
+
+	scrypt_ensure_zero(ti, sizeof(ti));
+	scrypt_ensure_zero(u, sizeof(u));
+	scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
+	scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
+}
diff --git a/scrypt-jane/scrypt-jane-portable-x86.h b/scrypt-jane/scrypt-jane-portable-x86.h
new file mode 100644
index 0000000..03282fa
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-portable-x86.h
@@ -0,0 +1,364 @@
+#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
+	#define X86ASM
+	/* gcc 2.95 royally screws up stack alignments on variables */
+	#if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
+		#define X86ASM_SSE
+		#define X86ASM_SSE2
+	#endif
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
+		#define X86ASM_SSSE3
+	#endif
+	#if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
+		#define X86ASM_AVX
+	#endif
+#endif
+
+#if defined(CPU_X86_64) && defined(COMPILER_GCC)
+	#define X86_64ASM
+	#define X86_64ASM_SSE2
+	#if (COMPILER_GCC >= 40102)
+		#define X86_64ASM_SSSE3
+	#endif
+	#if (COMPILER_GCC >= 40400)
+		#define X86_64ASM_AVX
+	#endif
+#endif
+
+#if defined(COMPILER_MSVC)
+	#define X86_INTRINSIC
+	#if defined(CPU_X86_64) || defined(X86ASM_SSE)
+		#define X86_INTRINSIC_SSE
+	#endif
+	#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
+		#define X86_INTRINSIC_SSE2
+	#endif
+	#if (COMPILER_MSVC >= 1400)
+		#define X86_INTRINSIC_SSSE3
+	#endif
+#endif
+
+#if defined(COMPILER_MSVC) && defined(CPU_X86_64)
+	#define X86_64USE_INTRINSIC
+#endif
+
+#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
+	#define X86_INTRINSIC
+	#if defined(__SSE__)
+		#define X86_INTRINSIC_SSE
+	#endif
+	#if defined(__SSE2__)
+		#define X86_INTRINSIC_SSE2
+	#endif
+	#if defined(__SSSE3__)
+		#define X86_INTRINSIC_SSSE3
+	#endif
+	#if defined(__AVX__)
+		#define X86_INTRINSIC_AVX
+	#endif
+#endif
+
+/* only use simd on windows (or SSE2 on gcc)! */
+#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
+	#if defined(X86_INTRINSIC_SSE)
+		#define X86_INTRINSIC
+		#include <mmintrin.h>
+		#include <xmmintrin.h>
+		typedef __m64 qmm;
+		typedef __m128 xmm;
+		typedef __m128d xmmd;
+	#endif
+	#if defined(X86_INTRINSIC_SSE2)
+		#define X86_INTRINSIC_SSE2
+		#include <emmintrin.h>
+		typedef __m128i xmmi;
+	#endif
+	#if defined(X86_INTRINSIC_SSSE3)
+		#define X86_INTRINSIC_SSSE3
+		#include <tmmintrin.h>
+	#endif
+#endif
+
+
+#if defined(X86_INTRINSIC_SSE2)
+	typedef union packedelem8_t {
+		uint8_t u[16];
+		xmmi v;
+	} packedelem8;
+
+	typedef union packedelem32_t {
+		uint32_t u[4];
+		xmmi v;
+	} packedelem32;
+
+	typedef union packedelem64_t {
+		uint64_t u[2];
+		xmmi v;
+	} packedelem64;
+#else
+	typedef union packedelem8_t {
+		uint8_t u[16];
+		uint32_t dw[4];
+	} packedelem8;
+
+	typedef union packedelem32_t {
+		uint32_t u[4];
+		uint8_t b[16];
+	} packedelem32;
+
+	typedef union packedelem64_t {
+		uint64_t u[2];
+		uint8_t b[16];
+	} packedelem64;
+#endif
+
+#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3)
+	const packedelem8 MM16 ssse3_rotr16_64bit = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}};
+	const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
+	const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
+	const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}};
+#endif
+
+/*
+	x86 inline asm for gcc/msvc. usage:
+
+	asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
+	asm_naked_fn(name)
+		a1(..)
+		a2(.., ..)
+		a3(.., .., ..)
+		a1(ret)
+	asm_naked_fn_end(name)
+*/
+
+#if defined(X86ASM) || defined(X86_64ASM)
+
+#if defined(COMPILER_MSVC)
+	#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
+	#define a1(x) __asm {x}
+	#define a2(x, y) __asm {x, y}
+	#define a3(x, y, z) __asm {x, y, z}
+	#define a4(x, y, z, w) __asm {x, y, z, w}
+	#define al(x) __asm {label##x:}
+	#define aj(x, y, z) __asm {x label##y}
+	#define asm_align8 a1(ALIGN 8)
+	#define asm_align16 a1(ALIGN 16)
+
+	#define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn
+	#define asm_naked_fn(fn) {
+	#define asm_naked_fn_end(fn) }
+#elif defined(COMPILER_GCC)
+	#define GNU_AS1(x) #x ";\n"
+	#define GNU_AS2(x, y) #x ", " #y ";\n"
+	#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
+	#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
+	#define GNU_ASL(x) "\n" #x ":\n"
+	#define GNU_ASJ(x, y, z) #x " " #y #z ";"
+
+	#define a1(x) GNU_AS1(x)
+	#define a2(x, y) GNU_AS2(x, y)
+	#define a3(x, y, z) GNU_AS3(x, y, z)
+	#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
+	#define al(x) GNU_ASL(x)
+	#define aj(x, y, z) GNU_ASJ(x, y, z)
+	#define asm_align8 a1(.align 8)
+	#define asm_align16 a1(.align 16)
+
+	#define asm_naked_fn_proto(type, fn) extern type STDCALL fn
+	#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn)
+	#define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" );
+	#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
+	#define asm_gcc_parms() ".att_syntax prefix;"
+	#define asm_gcc_trashed() __asm__ __volatile__("" :::
+	#define asm_gcc_end() );
+#else
+	need x86 asm
+#endif
+
+#endif /* X86ASM || X86_64ASM */
+
+
+#if defined(CPU_X86) || defined(CPU_X86_64)
+
+typedef enum cpu_flags_x86_t {
+	cpu_mmx = 1 << 0,
+	cpu_sse = 1 << 1,
+	cpu_sse2 = 1 << 2,
+	cpu_sse3 = 1 << 3,
+	cpu_ssse3 = 1 << 4,
+	cpu_sse4_1 = 1 << 5,
+	cpu_sse4_2 = 1 << 6,
+	cpu_avx = 1 << 7
+} cpu_flags_x86;
+
+typedef enum cpu_vendors_x86_t {
+	cpu_nobody,
+	cpu_intel,
+	cpu_amd
+} cpu_vendors_x86;
+
+typedef struct x86_regs_t {
+	uint32_t eax, ebx, ecx, edx;
+} x86_regs;
+
+#if defined(X86ASM)
+asm_naked_fn_proto(int, has_cpuid)(void)
+asm_naked_fn(has_cpuid)
+	a1(pushfd)
+	a1(pop eax)
+	a2(mov ecx, eax)
+	a2(xor eax, 0x200000)
+	a1(push eax)
+	a1(popfd)
+	a1(pushfd)
+	a1(pop eax)
+	a2(xor eax, ecx)
+	a2(shr eax, 21)
+	a2(and eax, 1)
+	a1(push ecx)
+	a1(popfd)
+	a1(ret)
+asm_naked_fn_end(has_cpuid)
+#endif /* X86ASM */
+
+
+static void NOINLINE
+get_cpuid(x86_regs *regs, uint32_t flags) {
+#if defined(COMPILER_MSVC)
+	__cpuid((int *)regs, (int)flags);
+#else
+	#if defined(CPU_X86_64)
+		#define cpuid_bx rbx
+	#else
+		#define cpuid_bx ebx
+	#endif
+
+	asm_gcc()
+		a1(push cpuid_bx)
+		a1(cpuid)
+		a2(mov [%1 + 0], eax)
+		a2(mov [%1 + 4], ebx)
+		a2(mov [%1 + 8], ecx)
+		a2(mov [%1 + 12], edx)
+		a1(pop cpuid_bx)
+	asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
+	asm_gcc_end()
+#endif
+}
+
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+static uint64_t NOINLINE
+get_xgetbv(uint32_t flags) {
+#if defined(COMPILER_MSVC)
+	return _xgetbv(flags);
+#else
+	uint32_t lo, hi;
+	asm_gcc()
+		a1(xgetbv)
+	asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
+	asm_gcc_end()
+	return ((uint64_t)lo | ((uint64_t)hi << 32));
+#endif
+}
+#endif // AVX support
+
+#if defined(SCRYPT_TEST_SPEED)
+size_t cpu_detect_mask = (size_t)-1;
+#endif
+
+static size_t
+detect_cpu(void) {
+	union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
+	cpu_vendors_x86 vendor = cpu_nobody;
+	x86_regs regs;
+	uint32_t max_level;
+	size_t cpu_flags = 0;
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+	uint64_t xgetbv_flags;
+#endif
+
+#if defined(CPU_X86)
+	if (!has_cpuid())
+		return cpu_flags;
+#endif
+
+	get_cpuid(&regs, 0);
+	max_level = regs.eax;
+	vendor_string.i[0] = regs.ebx;
+	vendor_string.i[1] = regs.edx;
+	vendor_string.i[2] = regs.ecx;
+
+	if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
+		vendor = cpu_intel;
+	else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
+		vendor = cpu_amd;
+
+	if (max_level & 0x00000500) {
+		/* "Intel P5 pre-B0" */
+		cpu_flags |= cpu_mmx;
+		return cpu_flags;
+	}
+
+	if (max_level < 1)
+		return cpu_flags;
+
+	get_cpuid(&regs, 1);
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+	/* xsave/xrestore */
+	if (regs.ecx & (1 << 27)) {
+		xgetbv_flags = get_xgetbv(0);
+		if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;
+	}
+#endif
+	if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
+	if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1;
+	if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3;
+	if (regs.ecx & (1 )) cpu_flags |= cpu_sse3;
+	if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
+	if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
+	if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
+
+#if defined(SCRYPT_TEST_SPEED)
+	cpu_flags &= cpu_detect_mask;
+#endif
+
+	return cpu_flags;
+}
+
+#if defined(SCRYPT_TEST_SPEED)
+static const char *
+get_top_cpuflag_desc(size_t flag) {
+	if (flag & cpu_avx) return "AVX";
+	else if (flag & cpu_sse4_2) return "SSE4.2";
+	else if (flag & cpu_sse4_1) return "SSE4.1";
+	else if (flag & cpu_ssse3) return "SSSE3";
+	else if (flag & cpu_sse2) return "SSE2";
+	else if (flag & cpu_sse) return "SSE";
+	else if (flag & cpu_mmx) return "MMX";
+	else return "Basic";
+}
+#endif
+
+/* enable the highest system-wide option */
+#if defined(SCRYPT_CHOOSE_COMPILETIME)
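+	/*
+	   when the mix is pinned at compile time, the blocks below prune any
+	   SIMD variants the compiler itself cannot generate code for, so the
+	   fixed choice never references an implementation that is not emitted.
+
+	   at runtime the same decision is driven by detect_cpu() above; a
+	   condensed sketch of that selection (hypothetical helper shown here,
+	   the real version is scrypt_getROMix() in scrypt-jane-salsa.h):
+
+	       static scrypt_ROMixfn
+	       pick_romix(void) {
+	           size_t flags = detect_cpu();
+	       #if defined(SCRYPT_SALSA_AVX)
+	           if (flags & cpu_avx) return scrypt_ROMix_avx;
+	       #endif
+	       #if defined(SCRYPT_SALSA_SSE2)
+	           if (flags & cpu_sse2) return scrypt_ROMix_sse2;
+	       #endif
+	           return scrypt_ROMix_basic;
+	       }
+	*/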
+	#if !defined(__AVX__)
+		#undef X86_64ASM_AVX
+		#undef X86ASM_AVX
+		#undef X86_INTRINSIC_AVX
+	#endif
+	#if !defined(__SSSE3__)
+		#undef X86_64ASM_SSSE3
+		#undef X86ASM_SSSE3
+		#undef X86_INTRINSIC_SSSE3
+	#endif
+	#if !defined(__SSE2__)
+		#undef X86_64ASM_SSE2
+		#undef X86ASM_SSE2
+		#undef X86_INTRINSIC_SSE2
+	#endif
+#endif
+
+#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
\ No newline at end of file
diff --git a/scrypt-jane/scrypt-jane-portable.h b/scrypt-jane/scrypt-jane-portable.h
new file mode 100644
index 0000000..33c8c2c
--- /dev/null
+++ b/scrypt-jane/scrypt-jane-portable.h
@@ -0,0 +1,281 @@
+/* determine os */
+#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
+	#include <windows.h>
+	#include <wincrypt.h>
+	#define OS_WINDOWS
+#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
+	#include <sys/mman.h>
+	#include <sys/time.h>
+	#include <fcntl.h>
+
+	#define OS_SOLARIS
+#else
+	#include <sys/mman.h>
+	#include <sys/time.h>
+	#include <sys/param.h> /* need this to define BSD */
+	#include <unistd.h>
+	#include <fcntl.h>
+
+	#define OS_NIX
+	#if defined(__linux__)
+		#include <endian.h>
+		#define OS_LINUX
+	#elif defined(BSD)
+		#define OS_BSD
+
+		#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
+			#define OS_OSX
+		#elif defined(macintosh) || defined(Macintosh)
+			#define OS_MAC
+		#elif defined(__OpenBSD__)
+			#define OS_OPENBSD
+		#endif
+	#endif
+#endif
+
+
+/* determine compiler */
+#if defined(_MSC_VER)
+	#define COMPILER_MSVC _MSC_VER
+	#if ((COMPILER_MSVC > 1200) || defined(_mm_free))
+		#define COMPILER_MSVC6PP_AND_LATER
+	#endif
+	#if (COMPILER_MSVC >= 1500)
+		#define COMPILER_HAS_TMMINTRIN
+	#endif
+
+	#pragma warning(disable : 4127) /* conditional expression is constant */
+	#pragma warning(disable : 4100) /* unreferenced formal parameter */
+
+	#define _CRT_SECURE_NO_WARNINGS
+	#include <float.h>
+	#include <stdlib.h> /* _rotl */
+	#include <intrin.h>
+
+	typedef unsigned char uint8_t;
+	typedef unsigned short uint16_t;
+	typedef unsigned int uint32_t;
+	typedef signed int int32_t;
+	typedef unsigned __int64 uint64_t;
+	typedef signed __int64 int64_t;
+
+	#define ROTL32(a,b) _rotl(a,b)
+	#define ROTR32(a,b) _rotr(a,b)
+	#define ROTL64(a,b) _rotl64(a,b)
+	#define ROTR64(a,b) _rotr64(a,b)
+	#undef NOINLINE
+	#define NOINLINE __declspec(noinline)
+	#undef INLINE
+	#define INLINE __forceinline
+	#undef FASTCALL
+	#define FASTCALL __fastcall
+	#undef CDECL
+	#define CDECL __cdecl
+	#undef STDCALL
+	#define STDCALL __stdcall
+	#undef NAKED
+	#define NAKED __declspec(naked)
+	#define MM16 __declspec(align(16))
+#endif
+#if defined(__ICC)
+	#define COMPILER_INTEL
+#endif
+#if defined(__GNUC__)
+	#if (__GNUC__ >= 3)
+		#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
+	#else
+		#define COMPILER_GCC_PATCHLEVEL 0
+	#endif
+	#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
+	#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
+	#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
+	#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
+	#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b)))
+	#undef NOINLINE
+	#if (COMPILER_GCC >= 30000)
+		#define NOINLINE __attribute__((noinline))
+	#else
+		#define NOINLINE
+	#endif
+	#undef INLINE
+	#if (COMPILER_GCC >= 30000)
+		#define INLINE __attribute__((always_inline))
+	#else
+		#define INLINE inline
+	#endif
+	#undef FASTCALL
+	#if (COMPILER_GCC >= 30400)
+		#define FASTCALL __attribute__((fastcall))
+	#else
+		#define FASTCALL
+	#endif
+	#undef CDECL
+	#define CDECL __attribute__((cdecl))
+	#undef STDCALL
+	#define STDCALL __attribute__((stdcall))
+	#define MM16 __attribute__((aligned(16)))
+	#include <stdint.h>
+#endif
+#if defined(__MINGW32__) || defined(__MINGW64__)
+	#define COMPILER_MINGW
+#endif
+#if defined(__PATHCC__)
+	#define COMPILER_PATHCC
+#endif
+
+#define OPTIONAL_INLINE
+#if defined(OPTIONAL_INLINE)
+	#undef OPTIONAL_INLINE
+	#define OPTIONAL_INLINE INLINE
+#else
+	#define OPTIONAL_INLINE
+#endif
+
+#define CRYPTO_FN NOINLINE STDCALL
+
+/* determine cpu */
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
+	#define CPU_X86_64
+#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
+	#define CPU_X86 500
+#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
+	#define CPU_X86 400
+#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
+	#define CPU_X86 300
+#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
+	#define CPU_IA64
+#endif
+
+#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
+	#define CPU_SPARC
+	#if defined(__sparcv9)
+		#define CPU_SPARC64
+	#endif
+#endif
+
+#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
+	#define CPU_64BITS
+	#undef FASTCALL
+	#define FASTCALL
+	#undef CDECL
+	#define CDECL
+	#undef STDCALL
+	#define STDCALL
+#endif
+
+#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
+	#define CPU_PPC
+	#if defined(_ARCH_PWR7)
+		#define CPU_POWER7
+	#elif defined(__64BIT__)
+		#define CPU_PPC64
+	#else
+		#define CPU_PPC32
+	#endif
+#endif
+
+#if defined(__hppa__) || defined(__hppa)
+	#define CPU_HPPA
+#endif
+
+#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
+	#define CPU_ALPHA
+#endif
+
+/* endian */
+
+#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
+	 (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
+	 (defined(CPU_X86) || defined(CPU_X86_64)) || \
+	 (defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
+#define CPU_LE
+#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
+	   (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
+	   (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
+#define CPU_BE
+#else
+	/* unknown endian!
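-- neither CPU_LE nor CPU_BE gets defined; the romix tangle/untangle step then relies on the runtime endianness test inside scrypt_romix_convert_endian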
*/ +#endif + + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U8TO32_LE(p) \ + (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U32TO8_LE(p, v) \ + (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); + +#define U8TO64_BE(p) \ + (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) + +#define U8TO64_LE(p) \ + (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (uint32_t)((v) )); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); + +#define U32_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ + (v) = ((v) << 16) | ((v) >> 16); \ +} + +#define U64_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \ + (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ + (v) = ((v) << 32) | ((v) >> 32); \ +} + +static int +scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { + uint32_t differentbits = 0; + while (len--) + differentbits |= (*x++ ^ *y++); + return (1 & ((differentbits - 1) >> 8)); +} + +void +scrypt_ensure_zero(void *p, size_t len) { +#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) + __stosb((unsigned char *)p, 0, len); +#elif (defined(CPU_X86) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushl %%edi;\n" + "pushl %%ecx;\n" + "rep stosb;\n" + "popl %%ecx;\n" + "popl %%edi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushq %%rdi;\n" + "pushq %%rcx;\n" + "rep stosb;\n" + "popq %%rcx;\n" + "popq %%rdi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#else + volatile uint8_t *b = (volatile uint8_t *)p; + size_t i; + for (i = 0; i < len; i++) + b[i] = 0; +#endif +} + +#include "scrypt-jane-portable-x86.h" + diff --git a/scrypt-jane/scrypt-jane-romix-basic.h b/scrypt-jane/scrypt-jane-romix-basic.h new file mode 100644 index 0000000..ca1df02 --- /dev/null +++ b/scrypt-jane/scrypt-jane-romix-basic.h @@ -0,0 +1,67 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +/* function type returned by scrypt_getROMix, used with cpu detection */ +typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); +#endif + +/* romix pre/post nop function */ +static void STDCALL +scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { +} + +/* romix pre/post endian conversion function */ +static void STDCALL +scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { +#if !defined(CPU_LE) + static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; + size_t i; + if (endian_test.w == 0x100) { + nblocks *= SCRYPT_BLOCK_WORDS; + for (i = 0; i < nblocks; i++) { + SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); + } + } +#endif +} + +/* chunkmix test function */ +typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t 
*Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); + +static int +scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { + /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ + const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; + scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; + uint8_t final[16]; + size_t i; + + for (i = 0; i < words; i++) { + v = (scrypt_mix_word_t)i; + v = (v << 8) | v; + v = (v << 16) | v; + chunk[0][i] = v; + } + + prefn(chunk[0], blocks); + mixfn(chunk[1], chunk[0], NULL, r); + postfn(chunk[1], blocks); + + /* grab the last 16 bytes of the final block */ + for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { + SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); + } + + return scrypt_verify(expected, final, 16); +} + +/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ +static scrypt_mix_word_t * +scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { + return base + (i * len); +} + +/* returns a pointer to block i */ +static scrypt_mix_word_t * +scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { + return base + (i * SCRYPT_BLOCK_WORDS); +} diff --git a/scrypt-jane/scrypt-jane-romix-template.h b/scrypt-jane/scrypt-jane-romix-template.h new file mode 100644 index 0000000..2fd7674 --- /dev/null +++ b/scrypt-jane/scrypt-jane-romix-template.h @@ -0,0 +1,118 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) + +#if defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_ROMIX_FN +#define SCRYPT_ROMIX_FN scrypt_ROMix +#endif + +#undef SCRYPT_HAVE_ROMIX +#define SCRYPT_HAVE_ROMIX + +#if !defined(SCRYPT_CHUNKMIX_FN) + +#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic + +/* + Bout = ChunkMix(Bin) + + 2*r: number of blocks in the chunk +*/ +static void STDCALL +SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { + scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; + uint32_t i, j, blocksPerChunk = r * 2, half = 0; + + /* 1: X = B_{2r - 1} */ + block = scrypt_block(Bin, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] = block[i]; + + if (Bxor) { + block = scrypt_block(Bxor, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] ^= block[i]; + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + block = scrypt_block(Bin, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + + if (Bxor) { + block = scrypt_block(Bxor, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + } + SCRYPT_MIX_FN(X); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + block = scrypt_block(Bout, (i / 2) + half); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + block[j] = X[j]; + } +} +#endif + +/* + X = ROMix(X) + + X: chunk to mix + Y: scratch chunk + N: number of rounds + V[N]: array of chunks to randomly index in to + 2*r: number of blocks in a chunk +*/ + +static void NOINLINE FASTCALL +SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { + uint32_t i, j, chunkWords = 
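/* 2*r blocks of SCRYPT_BLOCK_WORDS words per chunk */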
SCRYPT_BLOCK_WORDS * r * 2; + scrypt_mix_word_t *block = V; + + SCRYPT_ROMIX_TANGLE_FN(X, r * 2); + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); + for (i = 0; i < N - 1; i++, block += chunkWords) { + /* 3: V_i = X */ + /* 4: X = H(X) */ + SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); + } + SCRYPT_CHUNKMIX_FN(X, block, NULL, r); + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j = Integerify(X) % N */ + j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); + + /* 7: j = Integerify(Y) % N */ + j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); + } + + /* 10: B' = X */ + /* implicit */ + + SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); +} + +#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ + + +#undef SCRYPT_CHUNKMIX_FN +#undef SCRYPT_ROMIX_FN +#undef SCRYPT_MIX_FN +#undef SCRYPT_ROMIX_TANGLE_FN +#undef SCRYPT_ROMIX_UNTANGLE_FN + diff --git a/scrypt-jane/scrypt-jane-romix.h b/scrypt-jane/scrypt-jane-romix.h new file mode 100644 index 0000000..faa655a --- /dev/null +++ b/scrypt-jane/scrypt-jane-romix.h @@ -0,0 +1,27 @@ +#if defined(SCRYPT_CHACHA) +#include "scrypt-jane-chacha.h" +#elif defined(SCRYPT_SALSA) +#include "scrypt-jane-salsa.h" +#elif defined(SCRYPT_SALSA64) +#include "scrypt-jane-salsa64.h" +#else + #define SCRYPT_MIX_BASE "ERROR" + typedef uint32_t scrypt_mix_word_t; + #define SCRYPT_WORDTO8_LE U32TO8_LE + #define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + #define SCRYPT_BLOCK_BYTES 64 + #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + #if !defined(SCRYPT_CHOOSE_COMPILETIME) + static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} + static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; } + #else + static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} + #endif + static int scrypt_test_mix() { return 0; } + #error must define a mix function! 
+#endif + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_MIX +#define SCRYPT_MIX SCRYPT_MIX_BASE +#endif diff --git a/scrypt-jane/scrypt-jane-salsa.h b/scrypt-jane/scrypt-jane-salsa.h new file mode 100644 index 0000000..0c1604b --- /dev/null +++ b/scrypt-jane/scrypt-jane-salsa.h @@ -0,0 +1,106 @@ +#define SCRYPT_MIX_BASE "Salsa20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_salsa-avx.h" +#include "scrypt-jane-mix_salsa-sse2.h" +#include "scrypt-jane-mix_salsa.h" + +#if defined(SCRYPT_SALSA_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_MIX_FN salsa_core_sse2 + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN salsa_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t flags = 0; + +#if defined(SCRYPT_SALSA_AVX) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_SALSA_SSE2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} diff --git a/scrypt-jane/scrypt-jane-test-vectors.h b/scrypt-jane/scrypt-jane-test-vectors.h new file mode 100644 index 0000000..a1e4c61 --- /dev/null +++ b/scrypt-jane/scrypt-jane-test-vectors.h @@ -0,0 +1,261 @@ +typedef struct scrypt_test_setting_t { + const char *pw, *salt; + uint8_t Nfactor, rfactor, pfactor; +} scrypt_test_setting; + +static const scrypt_test_setting post_settings[] = { + {"", "", 3, 0, 0}, + {"password", "NaCl", 9, 3, 4}, + {0} +}; + +#if 
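/* the settings above decode as N = 1 << (Nfactor + 1), r = 1 << rfactor, p = 1 << pfactor: {3,0,0} and {9,3,4} are the scrypt paper's (N=16,r=1,p=1) and (N=1024,r=8,p=16) vectors */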
defined(SCRYPT_SHA256) + #if defined(SCRYPT_SALSA) + /* sha256 + salsa20/8, the only 'official' test vectors! */ + static const uint8_t post_vectors[][64] = { + {0x77,0xd6,0x57,0x62,0x38,0x65,0x7b,0x20,0x3b,0x19,0xca,0x42,0xc1,0x8a,0x04,0x97, + 0xf1,0x6b,0x48,0x44,0xe3,0x07,0x4a,0xe8,0xdf,0xdf,0xfa,0x3f,0xed,0xe2,0x14,0x42, + 0xfc,0xd0,0x06,0x9d,0xed,0x09,0x48,0xf8,0x32,0x6a,0x75,0x3a,0x0f,0xc8,0x1f,0x17, + 0xe8,0xd3,0xe0,0xfb,0x2e,0x0d,0x36,0x28,0xcf,0x35,0xe2,0x0c,0x38,0xd1,0x89,0x06}, + {0xfd,0xba,0xbe,0x1c,0x9d,0x34,0x72,0x00,0x78,0x56,0xe7,0x19,0x0d,0x01,0xe9,0xfe, + 0x7c,0x6a,0xd7,0xcb,0xc8,0x23,0x78,0x30,0xe7,0x73,0x76,0x63,0x4b,0x37,0x31,0x62, + 0x2e,0xaf,0x30,0xd9,0x2e,0x22,0xa3,0x88,0x6f,0xf1,0x09,0x27,0x9d,0x98,0x30,0xda, + 0xc7,0x27,0xaf,0xb9,0x4a,0x83,0xee,0x6d,0x83,0x60,0xcb,0xdf,0xa2,0xcc,0x06,0x40} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xef,0x8f,0x44,0x8f,0xc3,0xef,0x78,0x13,0xb2,0x26,0xa7,0x2a,0x40,0xa1,0x98,0x7f, + 0xc8,0x7f,0x0d,0x5f,0x40,0x66,0xa2,0x05,0x07,0x4f,0xc7,0xac,0x3b,0x47,0x07,0x0c, + 0xf5,0x20,0x46,0x76,0x20,0x7b,0xee,0x51,0x6d,0x5f,0xfa,0x9c,0x27,0xac,0xa9,0x36, + 0x62,0xbd,0xde,0x0b,0xa3,0xc0,0x66,0x84,0xde,0x82,0xd0,0x1a,0xb4,0xd1,0xb5,0xfe}, + {0xf1,0x94,0xf7,0x5f,0x15,0x12,0x10,0x4d,0x6e,0xfb,0x04,0x8c,0x35,0xc4,0x51,0xb6, + 0x11,0x04,0xa7,0x9b,0xb0,0x46,0xaf,0x7b,0x47,0x39,0xf0,0xac,0xb2,0x8a,0xfa,0x45, + 0x09,0x86,0x8f,0x10,0x4b,0xc6,0xee,0x00,0x11,0x38,0x73,0x7a,0x6a,0xd8,0x25,0x67, + 0x85,0xa4,0x10,0x4e,0xa9,0x2f,0x15,0xfe,0xcf,0x63,0xe1,0xe8,0xcf,0xab,0xe8,0xbd} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xf4,0x87,0x29,0xf4,0xc3,0x31,0x8c,0xe8,0xdf,0xe5,0xd8,0x73,0xff,0xca,0x32,0xcf, + 0xd8,0xac,0xe7,0xf7,0x15,0xda,0x84,0x41,0x60,0x23,0x26,0x4a,0xc8,0x3e,0xee,0xa6, + 0xa5,0x6e,0x52,0xd6,0x64,0x55,0x16,0x31,0x3e,0x66,0x7b,0x65,0xd5,0xe2,0xc9,0x95, + 0x1b,0xf0,0x81,0x40,0xb7,0x2f,0xff,0xa6,0xe6,0x02,0xcc,0x63,0x08,0x4a,0x74,0x31}, + {0x7a,0xd8,0xad,0x02,0x9c,0xa5,0xf4,0x42,0x6a,0x29,0xd2,0xb5,0x53,0xf1,0x6d,0x1d, + 0x25,0xc8,0x70,0x48,0x80,0xb9,0xa3,0xf6,0x94,0xf8,0xfa,0xb8,0x52,0x42,0xcd,0x14, + 0x26,0x46,0x28,0x06,0xc7,0xf6,0x1f,0xa7,0x89,0x6d,0xc5,0xa0,0x36,0xcc,0xde,0xcb, + 0x73,0x0b,0xa4,0xe2,0xd3,0xd1,0x44,0x06,0x35,0x08,0xe0,0x35,0x5b,0xf8,0xd7,0xe7} + }; + #endif +#elif defined(SCRYPT_SHA512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0xae,0x54,0xe7,0x74,0xe4,0x51,0x6b,0x0f,0xe1,0xe7,0x28,0x03,0x17,0xe4,0x8c,0xfa, + 0x2f,0x66,0x55,0x7f,0xdc,0x3b,0x40,0xab,0x47,0x84,0xc9,0x63,0x36,0x07,0x9d,0xe5, + 0x86,0x43,0x95,0x89,0xb6,0xc0,0x6c,0x72,0x64,0x00,0xc1,0x2a,0xd7,0x69,0x21,0x92, + 0x8e,0xba,0xa4,0x59,0x9f,0x00,0x14,0x3a,0x7c,0x12,0x58,0x91,0x09,0xa0,0x32,0xfe}, + {0xc5,0xb3,0xd6,0xea,0x0a,0x4b,0x1e,0xcc,0x40,0x00,0xe5,0x98,0x5c,0xdc,0x06,0x06, + 0x78,0x34,0x92,0x16,0xcf,0xe4,0x9f,0x03,0x96,0x2d,0x41,0x35,0x00,0x9b,0xff,0x74, + 0x60,0x19,0x6e,0xe6,0xa6,0x46,0xf7,0x37,0xcb,0xfa,0xd0,0x9f,0x80,0x72,0x2e,0x85, + 0x13,0x3e,0x1a,0x91,0x90,0x53,0xa1,0x33,0x85,0x51,0xdc,0x62,0x1c,0x0e,0x4d,0x30} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xe2,0x05,0x7c,0x44,0xf9,0x55,0x9f,0x64,0xbe,0xd5,0x7f,0x85,0x69,0xc7,0x8c,0x7f, + 0x2b,0x91,0xd6,0x9a,0x6c,0xf8,0x57,0x55,0x61,0x25,0x3d,0xee,0xb8,0xd5,0x8c,0xdc, + 0x2d,0xd5,0x53,0x84,0x8c,0x06,0xaa,0x37,0x77,0xa6,0xf0,0xf1,0x35,0xfe,0xb5,0xcb, + 0x61,0xd7,0x2c,0x67,0xf3,0x7e,0x8a,0x1b,0x04,0xa3,0xa3,0x43,0xa2,0xb2,0x29,0xf2}, + 
{0x82,0xda,0x29,0xb2,0x08,0x27,0xfc,0x78,0x22,0xc4,0xb8,0x7e,0xbc,0x36,0xcf,0xcd, + 0x17,0x4b,0xa1,0x30,0x16,0x4a,0x25,0x70,0xc7,0xcb,0xe0,0x2b,0x56,0xd3,0x16,0x4e, + 0x85,0xb6,0x84,0xe7,0x9b,0x7f,0x8b,0xb5,0x94,0x33,0xcf,0x33,0x44,0x65,0xc8,0xa1, + 0x46,0xf9,0xf5,0xfc,0x74,0x29,0x7e,0xd5,0x46,0xec,0xbd,0x95,0xc1,0x80,0x24,0xe4} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xa6,0xcb,0x77,0x9a,0x64,0x1f,0x95,0x02,0x53,0xe7,0x5c,0x78,0xdb,0xa3,0x43,0xff, + 0xbe,0x10,0x4c,0x7b,0xe4,0xe1,0x91,0xcf,0x67,0x69,0x5a,0x2c,0x12,0xd6,0x99,0x49, + 0x92,0xfd,0x5a,0xaa,0x12,0x4c,0x2e,0xf6,0x95,0x46,0x8f,0x5e,0x77,0x62,0x16,0x29, + 0xdb,0xe7,0xab,0x02,0x2b,0x9c,0x35,0x03,0xf8,0xd4,0x04,0x7d,0x2d,0x73,0x85,0xf1}, + {0x54,0xb7,0xca,0xbb,0xaf,0x0f,0xb0,0x5f,0xb7,0x10,0x63,0x48,0xb3,0x15,0xd8,0xb5, + 0x62,0x64,0x89,0x6a,0x59,0xc6,0x0f,0x86,0x96,0x38,0xf0,0xcf,0xd4,0x62,0x90,0x61, + 0x7d,0xce,0xd6,0x13,0x85,0x67,0x4a,0xf5,0x32,0x03,0x74,0x30,0x0b,0x5a,0x2f,0x86, + 0x82,0x6e,0x0c,0x3e,0x40,0x7a,0xde,0xbe,0x42,0x6e,0x80,0x2b,0xaf,0xdb,0xcc,0x94} + }; + #endif +#elif defined(SCRYPT_BLAKE512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0x4a,0x48,0xb3,0xfa,0xdc,0xb0,0xb8,0xdb,0x54,0xee,0xf3,0x5c,0x27,0x65,0x6c,0x20, + 0xab,0x61,0x9a,0x5b,0xd5,0x1d,0xd9,0x95,0xab,0x88,0x0e,0x4d,0x1e,0x71,0x2f,0x11, + 0x43,0x2e,0xef,0x23,0xca,0x8a,0x49,0x3b,0x11,0x38,0xa5,0x28,0x61,0x2f,0xb7,0x89, + 0x5d,0xef,0x42,0x4c,0xc1,0x74,0xea,0x8a,0x56,0xbe,0x4a,0x82,0x76,0x15,0x1a,0x87}, + {0x96,0x24,0xbf,0x40,0xeb,0x03,0x8e,0xfe,0xc0,0xd5,0xa4,0x81,0x85,0x7b,0x09,0x88, + 0x52,0xb5,0xcb,0xc4,0x48,0xe1,0xb9,0x1d,0x3f,0x8b,0x3a,0xc6,0x38,0x32,0xc7,0x55, + 0x30,0x28,0x7a,0x42,0xa9,0x5d,0x54,0x33,0x62,0xf3,0xd9,0x3c,0x96,0x40,0xd1,0x80, + 0xe4,0x0e,0x7e,0xf0,0x64,0x53,0xfe,0x7b,0xd7,0x15,0xba,0xad,0x16,0x80,0x01,0xb5} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0x45,0x42,0x22,0x31,0x26,0x13,0x5f,0x94,0xa4,0x00,0x04,0x47,0xe8,0x50,0x6d,0xd6, + 0xdd,0xd5,0x08,0xd4,0x90,0x64,0xe0,0x59,0x70,0x46,0xff,0xfc,0x29,0xb3,0x6a,0xc9, + 0x4d,0x45,0x97,0x95,0xa8,0xf0,0x53,0xe7,0xee,0x4b,0x6b,0x5d,0x1e,0xa5,0xb2,0x58, + 0x4b,0x93,0xc9,0x89,0x4c,0xa8,0xab,0x03,0x74,0x38,0xbd,0x54,0x97,0x6b,0xab,0x4a}, + {0x4b,0x4a,0x63,0x96,0x73,0x34,0x9f,0x39,0x64,0x51,0x0e,0x2e,0x3b,0x07,0xd5,0x1c, + 0xd2,0xf7,0xce,0x60,0xab,0xac,0x89,0xa4,0x16,0x0c,0x58,0x82,0xb3,0xd3,0x25,0x5b, + 0xd5,0x62,0x32,0xf4,0x86,0x5d,0xb2,0x4b,0xbf,0x8e,0xc6,0xc0,0xac,0x40,0x48,0xb4, + 0x69,0x08,0xba,0x40,0x4b,0x07,0x2a,0x13,0x9c,0x98,0x3b,0x8b,0x20,0x0c,0xac,0x9e} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xcb,0x4b,0xc2,0xd1,0xf4,0x77,0x32,0x3c,0x42,0x9d,0xf7,0x7d,0x1f,0x22,0x64,0xa4, + 0xe2,0x88,0x30,0x2d,0x54,0x9d,0xb6,0x26,0x89,0x25,0x30,0xc3,0x3d,0xdb,0xba,0x99, + 0xe9,0x8e,0x1e,0x5e,0x57,0x66,0x75,0x7c,0x24,0xda,0x00,0x6f,0x79,0xf7,0x47,0xf5, + 0xea,0x40,0x70,0x37,0xd2,0x91,0xc7,0x4d,0xdf,0x46,0xb6,0x3e,0x95,0x7d,0xcb,0xc1}, + {0x25,0xc2,0xcb,0x7f,0xc8,0x50,0xb7,0x0b,0x11,0x9e,0x1d,0x10,0xb2,0xa8,0x35,0x23, + 0x91,0x39,0xfb,0x45,0xf2,0xbf,0xe4,0xd0,0x84,0xec,0x72,0x33,0x6d,0x09,0xed,0x41, + 0x9a,0x7e,0x4f,0x10,0x73,0x97,0x22,0x76,0x58,0x93,0x39,0x24,0xdf,0xd2,0xaa,0x2f, + 0x6b,0x2b,0x64,0x48,0xa5,0xb7,0xf5,0x56,0x77,0x02,0xa7,0x71,0x46,0xe5,0x0e,0x8d}, + }; + #endif +#elif defined(SCRYPT_BLAKE256) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + 
{0xf1,0xf1,0x91,0x1a,0x81,0xe6,0x9f,0xc1,0xce,0x43,0xab,0xb1,0x1a,0x02,0x1e,0x16, + 0x08,0xc6,0xf9,0x00,0x50,0x1b,0x6d,0xf1,0x31,0x06,0x95,0x48,0x5d,0xf7,0x6c,0x00, + 0xa2,0x4c,0xb1,0x0e,0x52,0x66,0x94,0x7e,0x84,0xfc,0xa5,0x34,0xfd,0xf0,0xe9,0x57, + 0x85,0x2d,0x8c,0x05,0x5c,0x0f,0x04,0xd4,0x8d,0x3e,0x13,0x52,0x3d,0x90,0x2d,0x2c}, + {0xd5,0x42,0xd2,0x7b,0x06,0xae,0x63,0x90,0x9e,0x30,0x00,0x0e,0xd8,0xa4,0x3a,0x0b, + 0xee,0x4a,0xef,0xb2,0xc4,0x95,0x0d,0x72,0x07,0x70,0xcc,0xa3,0xf9,0x1e,0xc2,0x75, + 0xcf,0xaf,0xe1,0x44,0x1c,0x8c,0xe2,0x3e,0x0c,0x81,0xf3,0x92,0xe1,0x13,0xe6,0x4f, + 0x2d,0x27,0xc3,0x87,0xe5,0xb6,0xf9,0xd7,0x02,0x04,0x37,0x64,0x78,0x36,0x6e,0xb3} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xad,0x1b,0x4b,0xca,0xe3,0x26,0x1a,0xfd,0xb7,0x77,0x8c,0xde,0x8d,0x26,0x14,0xe1, + 0x54,0x38,0x42,0xf3,0xb3,0x66,0x29,0xf9,0x90,0x04,0xf1,0x82,0x7c,0x5a,0x6f,0xa8, + 0x7d,0xd6,0x08,0x0d,0x8b,0x78,0x04,0xad,0x31,0xea,0xd4,0x87,0x2d,0xf7,0x74,0x9a, + 0xe5,0xce,0x97,0xef,0xa3,0xbb,0x90,0x46,0x7c,0xf4,0x51,0x38,0xc7,0x60,0x53,0x21}, + {0x39,0xbb,0x56,0x3d,0x0d,0x7b,0x74,0x82,0xfe,0x5a,0x78,0x3d,0x66,0xe8,0x3a,0xdf, + 0x51,0x6f,0x3e,0xf4,0x86,0x20,0x8d,0xe1,0x81,0x22,0x02,0xf7,0x0d,0xb5,0x1a,0x0f, + 0xfc,0x59,0xb6,0x60,0xc9,0xdb,0x38,0x0b,0x5b,0x95,0xa5,0x94,0xda,0x42,0x2d,0x90, + 0x47,0xeb,0x73,0x31,0x9f,0x20,0xf6,0x81,0xc2,0xef,0x33,0x77,0x51,0xd8,0x2c,0xe4} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0x9e,0xf2,0x60,0x7c,0xbd,0x7c,0x19,0x5c,0x79,0xc6,0x1b,0x7e,0xb0,0x65,0x1b,0xc3, + 0x70,0x0d,0x89,0xfc,0x72,0xb2,0x03,0x72,0x15,0xcb,0x8e,0x8c,0x49,0x50,0x4c,0x27, + 0x99,0xda,0x47,0x32,0x5e,0xb4,0xa2,0x07,0x83,0x51,0x6b,0x06,0x37,0x60,0x42,0xc4, + 0x59,0x49,0x99,0xdd,0xc0,0xd2,0x08,0x94,0x7f,0xe3,0x9e,0x4e,0x43,0x8e,0x5b,0xba}, + {0x86,0x6f,0x3b,0x11,0xb8,0xca,0x4b,0x6e,0xa7,0x6f,0xc2,0xc9,0x33,0xb7,0x8b,0x9f, + 0xa3,0xb9,0xf5,0xb5,0x62,0xa6,0x17,0x66,0xe4,0xc3,0x9d,0x9b,0xca,0x51,0xb0,0x2f, + 0xda,0x09,0xc1,0x77,0xed,0x8b,0x89,0xc2,0x69,0x5a,0x34,0x05,0x4a,0x1f,0x4d,0x76, + 0xcb,0xd5,0xa4,0x78,0xfa,0x1b,0xb9,0x5b,0xbc,0x3d,0xce,0x04,0x63,0x99,0xad,0x54} + }; + #endif +#elif defined(SCRYPT_SKEIN512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69, + 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87, + 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f, + 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e}, + {0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e, + 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b, + 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb, + 0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xd1,0x12,0x6d,0x64,0x10,0x0e,0x98,0x6c,0xbe,0x70,0x21,0xd9,0xc6,0x04,0x62,0xa4, + 0x29,0x13,0x9a,0x3c,0xf8,0xe9,0x1e,0x87,0x9f,0x88,0xf4,0x98,0x01,0x41,0x8e,0xce, + 0x60,0xf7,0xbe,0x17,0x0a,0xec,0xd6,0x30,0x80,0xcf,0x6b,0x1e,0xcf,0x95,0xa0,0x4d, + 0x37,0xed,0x3a,0x09,0xd1,0xeb,0x0c,0x80,0x82,0x22,0x8e,0xd3,0xb1,0x7f,0xd6,0xa8}, + {0x5c,0x5c,0x05,0xe2,0x75,0xa5,0xa4,0xec,0x81,0x97,0x9c,0x5b,0xd7,0x26,0xb3,0x16, + 0xb4,0x02,0x8c,0x56,0xe6,0x32,0x57,0x33,0x47,0x19,0x06,0x6c,0xde,0x68,0x41,0x37, + 
0x5b,0x7d,0xa7,0xb3,0x73,0xeb,0x82,0xca,0x0f,0x86,0x2e,0x6b,0x47,0xa2,0x70,0x39, + 0x35,0xfd,0x2d,0x2e,0x7b,0xc3,0x68,0xbb,0x52,0x42,0x19,0x3b,0x78,0x96,0xe7,0xc8} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60, + 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59, + 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9, + 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89}, + {0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5, + 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99, + 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23, + 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b} + }; + #endif +#elif defined(SCRYPT_KECCAK512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0xc2,0x7b,0xbe,0x1d,0xf1,0x99,0xd8,0xe7,0x1b,0xac,0xe0,0x9d,0xeb,0x5a,0xfe,0x21, + 0x71,0xff,0x41,0x51,0x4f,0xbe,0x41,0x01,0x15,0xe2,0xb7,0xb9,0x55,0x15,0x25,0xa1, + 0x40,0x4c,0x66,0x29,0x32,0xb7,0xc9,0x62,0x60,0x88,0xe0,0x99,0x39,0xae,0xce,0x25, + 0x3c,0x11,0x89,0xdd,0xc6,0x14,0xd7,0x3e,0xa3,0x6d,0x07,0x2e,0x56,0xa0,0xff,0x97}, + {0x3c,0x91,0x12,0x4a,0x37,0x7d,0xd6,0x96,0xd2,0x9b,0x5d,0xea,0xb8,0xb9,0x82,0x4e, + 0x4f,0x6b,0x60,0x4c,0x59,0x01,0xe5,0x73,0xfd,0xf6,0xb8,0x9a,0x5a,0xd3,0x7c,0x7a, + 0xd2,0x4f,0x8e,0x74,0xc1,0x90,0x88,0xa0,0x3f,0x55,0x75,0x79,0x10,0xd0,0x09,0x79, + 0x0f,0x6c,0x74,0x0c,0x05,0x08,0x3c,0x8c,0x94,0x7b,0x30,0x56,0xca,0xdf,0xdf,0x34} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0x77,0xcb,0x70,0xbf,0xae,0xd4,0x4c,0x5b,0xbc,0xd3,0xec,0x8a,0x82,0x43,0x8d,0xb3, + 0x7f,0x1f,0xfb,0x70,0x36,0x32,0x4d,0xa6,0xb7,0x13,0x37,0x77,0x30,0x0c,0x3c,0xfb, + 0x2c,0x20,0x8f,0x2a,0xf4,0x47,0x4d,0x69,0x8e,0xae,0x2d,0xad,0xba,0x35,0xe9,0x2f, + 0xe6,0x99,0x7a,0xf8,0xcf,0x70,0x78,0xbb,0x0c,0x72,0x64,0x95,0x8b,0x36,0x77,0x3d}, + {0xc6,0x43,0x17,0x16,0x87,0x09,0x5f,0x12,0xed,0x21,0xe2,0xb4,0xad,0x55,0xa1,0xa1, + 0x49,0x50,0x90,0x70,0xab,0x81,0x83,0x7a,0xcd,0xdf,0x23,0x52,0x19,0xc0,0xa2,0xd8, + 0x8e,0x98,0xeb,0xf0,0x37,0xab,0xad,0xfd,0x1c,0x04,0x97,0x18,0x42,0x85,0xf7,0x4b, + 0x18,0x2c,0x55,0xd3,0xa9,0xe6,0x89,0xfb,0x58,0x0a,0xb2,0x37,0xb9,0xf8,0xfb,0xc5} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xc7,0x34,0x95,0x02,0x5e,0x31,0x0d,0x1f,0x10,0x38,0x9c,0x3f,0x04,0x53,0xed,0x05, + 0x27,0x38,0xc1,0x3f,0x6a,0x0f,0xc5,0xa3,0x9b,0x73,0x8a,0x28,0x7e,0x5d,0x3c,0xdc, + 0x9d,0x5a,0x09,0xbf,0x8c,0x0a,0xad,0xe4,0x73,0x52,0xe3,0x6d,0xaa,0xd1,0x8b,0xbf, + 0xa3,0xb7,0xf0,0x58,0xad,0x22,0x24,0xc9,0xaa,0x96,0xb7,0x5d,0xfc,0x5f,0xb0,0xcf}, + {0x76,0x22,0xfd,0xe8,0xa2,0x79,0x8e,0x9d,0x43,0x8c,0x7a,0xba,0x78,0xb7,0x84,0xf1, + 0xc8,0xee,0x3b,0xae,0x31,0x89,0xbf,0x7e,0xd0,0x4b,0xc1,0x2d,0x58,0x5d,0x84,0x6b, + 0xec,0x86,0x56,0xe0,0x87,0x94,0x7f,0xbc,0xf9,0x48,0x92,0xef,0x54,0x7f,0x23,0x8d, + 0x4f,0x8b,0x0a,0x75,0xa7,0x39,0x0e,0x46,0x6e,0xee,0x58,0xc8,0xfa,0xea,0x90,0x53} + }; + #endif +#elif defined(SCRYPT_KECCAK256) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0x2e,0x96,0xd8,0x87,0x45,0xcd,0xd6,0xc8,0xf6,0xd2,0x87,0x33,0x50,0xc7,0x04,0xe5, + 0x3c,0x4b,0x48,0x44,0x57,0xc1,0x74,0x09,0x76,0x02,0xaa,0xd3,0x7b,0xf3,0xbf,0xed, + 
0x4b,0x72,0xd7,0x1b,0x49,0x6b,0xe0,0x44,0x83,0xee,0x8f,0xaf,0xa1,0xb5,0x33,0xa9, + 0x9e,0x86,0xab,0xe2,0x9f,0xcf,0x68,0x6e,0x7e,0xbd,0xf5,0x7a,0x83,0x4b,0x1c,0x10}, + {0x42,0x7e,0xf9,0x4b,0x72,0x61,0xda,0x2d,0xb3,0x27,0x0e,0xe1,0xd9,0xde,0x5f,0x3e, + 0x64,0x2f,0xd6,0xda,0x90,0x59,0xce,0xbf,0x02,0x5b,0x32,0xf7,0x6d,0x94,0x51,0x7b, + 0xb6,0xa6,0x0d,0x99,0x3e,0x7f,0x39,0xbe,0x1b,0x1d,0x6c,0x97,0x12,0xd8,0xb7,0xfd, + 0x5b,0xb5,0xf3,0x73,0x5a,0x89,0xb2,0xdd,0xcc,0x3d,0x74,0x2e,0x3d,0x9e,0x3c,0x22} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0x76,0x1d,0x5b,0x8f,0xa9,0xe1,0xa6,0x01,0xcb,0xc5,0x7a,0x5f,0x02,0x23,0xb6,0x82, + 0x57,0x79,0x60,0x2f,0x05,0x7f,0xb8,0x0a,0xcb,0x5e,0x54,0x11,0x49,0x2e,0xdd,0x85, + 0x83,0x30,0x67,0xb3,0x24,0x5c,0xce,0xfc,0x32,0xcf,0x12,0xc3,0xff,0xe0,0x79,0x36, + 0x74,0x17,0xa6,0x3e,0xcd,0xa0,0x7e,0xcb,0x37,0xeb,0xcb,0xb6,0xe1,0xb9,0xf5,0x15}, + {0xf5,0x66,0xa7,0x4c,0xe4,0xdc,0x18,0x56,0x2f,0x3e,0x86,0x4d,0x92,0xa5,0x5c,0x5a, + 0x8f,0xc3,0x6b,0x32,0xdb,0xe5,0x72,0x50,0x84,0xfc,0x6e,0x5d,0x15,0x77,0x3d,0xca, + 0xc5,0x2b,0x20,0x3c,0x78,0x37,0x80,0x78,0x23,0x56,0x91,0xa0,0xce,0xa4,0x06,0x5a, + 0x7f,0xe3,0xbf,0xab,0x51,0x57,0x32,0x2c,0x0a,0xf0,0xc5,0x6f,0xf4,0xcb,0xff,0x42} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xb0,0xb7,0x10,0xb5,0x1f,0x2b,0x7f,0xaf,0x9d,0x95,0x5f,0x4c,0x2d,0x98,0x7c,0xc1, + 0xbc,0x37,0x2f,0x50,0x8d,0xb2,0x9f,0xfd,0x48,0x0d,0xe0,0x44,0x19,0xdf,0x28,0x6c, + 0xab,0xbf,0x1e,0x17,0x26,0xcc,0x57,0x95,0x18,0x17,0x83,0x4c,0x12,0x48,0xd9,0xee, + 0x4b,0x00,0x29,0x06,0x31,0x01,0x6b,0x8c,0x26,0x39,0xbf,0xe4,0xe4,0xd4,0x6a,0x26}, + {0xa0,0x40,0xb2,0xf2,0x11,0xb6,0x5f,0x3d,0x4c,0x1e,0xef,0x59,0xd4,0x98,0xdb,0x14, + 0x01,0xff,0xe3,0x34,0xd7,0x19,0xcd,0xeb,0xde,0x52,0x1c,0xf4,0x86,0x43,0xc9,0xe2, + 0xfb,0xf9,0x4f,0x0a,0xbb,0x1f,0x5c,0x6a,0xdf,0xb9,0x28,0xfa,0xac,0xc4,0x48,0xed, + 0xcc,0xd2,0x2e,0x25,0x5f,0xf3,0x56,0x1d,0x2d,0x23,0x22,0xc1,0xbc,0xff,0x78,0x80} + }; + #endif +#else + static const uint8_t post_vectors[][64] = {{0}}; +#endif + diff --git a/scrypt.c b/scrypt.c new file mode 100644 index 0000000..24ad35a --- /dev/null +++ b/scrypt.c @@ -0,0 +1,686 @@ +/*- + * Copyright 2009 Colin Percival, 2011 ArtForz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "scrypt.h"
+#include <stdlib.h>
+
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
+#include "stdint.h"
+#else
+#include <stdint.h>
+#endif
+
+#include <string.h>
+
+static __inline uint32_t
+be32dec(const void *pp)
+{
+	const uint8_t *p = (uint8_t const *)pp;
+
+	return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
+	    ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
+}
+
+static __inline void
+be32enc(void *pp, uint32_t x)
+{
+	uint8_t * p = (uint8_t *)pp;
+
+	p[3] = x & 0xff;
+	p[2] = (x >> 8) & 0xff;
+	p[1] = (x >> 16) & 0xff;
+	p[0] = (x >> 24) & 0xff;
+}
+
+static __inline uint32_t
+le32dec(const void *pp)
+{
+	const uint8_t *p = (uint8_t const *)pp;
+
+	return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
+	    ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
+}
+
+static __inline void
+le32enc(void *pp, uint32_t x)
+{
+	uint8_t * p = (uint8_t *)pp;
+
+	p[0] = x & 0xff;
+	p[1] = (x >> 8) & 0xff;
+	p[2] = (x >> 16) & 0xff;
+	p[3] = (x >> 24) & 0xff;
+}
+
+
+typedef struct SHA256Context {
+	uint32_t state[8];
+	uint32_t count[2];
+	unsigned char buf[64];
+} SHA256_CTX;
+
+typedef struct HMAC_SHA256Context {
+	SHA256_CTX ictx;
+	SHA256_CTX octx;
+} HMAC_SHA256_CTX;
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form.  Assumes len is a multiple of 4.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 4; i++)
+		be32enc(dst + i * 4, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/4 vector of (uint32_t).  Assumes len is a multiple of 4.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 4; i++)
+		dst[i] = be32dec(src + i * 4);
+}
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define SHR(x, n) (x >> n)
+#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
+#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+	t0 = h + S1(e) + Ch(e, f, g) + k; \
+	t1 = S0(a) + Maj(a, b, c); \
+	d += t0; \
+	h = t0 + t1;
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, k) \
+	RND(S[(64 - i) % 8], S[(65 - i) % 8], \
+	    S[(66 - i) % 8], S[(67 - i) % 8], \
+	    S[(68 - i) % 8], S[(69 - i) % 8], \
+	    S[(70 - i) % 8], S[(71 - i) % 8], \
+	    W[i] + k)
+
+/*
+ * SHA256 block compression function.  The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
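+ * The message schedule W is expanded in place and the eight working
+ * variables rotate through the index arithmetic in RNDr rather than
+ * being reassigned every round.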
+ */ +static void +SHA256_Transform(uint32_t * state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + be32dec_vect(W, block, 64); + for (i = 16; i < 64; i++) + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + RNDr(S, W, 0, 0x428a2f98); + RNDr(S, W, 1, 0x71374491); + RNDr(S, W, 2, 0xb5c0fbcf); + RNDr(S, W, 3, 0xe9b5dba5); + RNDr(S, W, 4, 0x3956c25b); + RNDr(S, W, 5, 0x59f111f1); + RNDr(S, W, 6, 0x923f82a4); + RNDr(S, W, 7, 0xab1c5ed5); + RNDr(S, W, 8, 0xd807aa98); + RNDr(S, W, 9, 0x12835b01); + RNDr(S, W, 10, 0x243185be); + RNDr(S, W, 11, 0x550c7dc3); + RNDr(S, W, 12, 0x72be5d74); + RNDr(S, W, 13, 0x80deb1fe); + RNDr(S, W, 14, 0x9bdc06a7); + RNDr(S, W, 15, 0xc19bf174); + RNDr(S, W, 16, 0xe49b69c1); + RNDr(S, W, 17, 0xefbe4786); + RNDr(S, W, 18, 0x0fc19dc6); + RNDr(S, W, 19, 0x240ca1cc); + RNDr(S, W, 20, 0x2de92c6f); + RNDr(S, W, 21, 0x4a7484aa); + RNDr(S, W, 22, 0x5cb0a9dc); + RNDr(S, W, 23, 0x76f988da); + RNDr(S, W, 24, 0x983e5152); + RNDr(S, W, 25, 0xa831c66d); + RNDr(S, W, 26, 0xb00327c8); + RNDr(S, W, 27, 0xbf597fc7); + RNDr(S, W, 28, 0xc6e00bf3); + RNDr(S, W, 29, 0xd5a79147); + RNDr(S, W, 30, 0x06ca6351); + RNDr(S, W, 31, 0x14292967); + RNDr(S, W, 32, 0x27b70a85); + RNDr(S, W, 33, 0x2e1b2138); + RNDr(S, W, 34, 0x4d2c6dfc); + RNDr(S, W, 35, 0x53380d13); + RNDr(S, W, 36, 0x650a7354); + RNDr(S, W, 37, 0x766a0abb); + RNDr(S, W, 38, 0x81c2c92e); + RNDr(S, W, 39, 0x92722c85); + RNDr(S, W, 40, 0xa2bfe8a1); + RNDr(S, W, 41, 0xa81a664b); + RNDr(S, W, 42, 0xc24b8b70); + RNDr(S, W, 43, 0xc76c51a3); + RNDr(S, W, 44, 0xd192e819); + RNDr(S, W, 45, 0xd6990624); + RNDr(S, W, 46, 0xf40e3585); + RNDr(S, W, 47, 0x106aa070); + RNDr(S, W, 48, 0x19a4c116); + RNDr(S, W, 49, 0x1e376c08); + RNDr(S, W, 50, 0x2748774c); + RNDr(S, W, 51, 0x34b0bcb5); + RNDr(S, W, 52, 0x391c0cb3); + RNDr(S, W, 53, 0x4ed8aa4a); + RNDr(S, W, 54, 0x5b9cca4f); + RNDr(S, W, 55, 0x682e6ff3); + RNDr(S, W, 56, 0x748f82ee); + RNDr(S, W, 57, 0x78a5636f); + RNDr(S, W, 58, 0x84c87814); + RNDr(S, W, 59, 0x8cc70208); + RNDr(S, W, 60, 0x90befffa); + RNDr(S, W, 61, 0xa4506ceb); + RNDr(S, W, 62, 0xbef9a3f7); + RNDr(S, W, 63, 0xc67178f2); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; + + /* Clean the stack. */ + memset(W, 0, 256); + memset(S, 0, 32); + t0 = t1 = 0; +} + +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* SHA-256 initialization. Begins a SHA-256 operation. 
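The Init/Update/Final trio that follows provides the usual streaming interface over this compression function. A known-answer sketch for it, assuming the snippet is compiled inside this file (the SHA256_* functions are static); sha256_known_answer is a hypothetical name, and the expected digest is the standard FIPS 180-2 "abc" vector:

```c
#include <stdio.h>

/* Prints ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
   when compiled in the same translation unit as the static SHA256_*
   functions defined here. */
static void sha256_known_answer(void)
{
    SHA256_CTX ctx;
    unsigned char digest[32];
    int i;

    SHA256_Init(&ctx);
    SHA256_Update(&ctx, "ab", 2);   /* data may arrive in pieces */
    SHA256_Update(&ctx, "c", 1);
    SHA256_Final(digest, &ctx);

    for (i = 0; i < 32; i++)
        printf("%02x", digest[i]);
    printf("\n");
}
```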
*/ +static void +SHA256_Init(SHA256_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6A09E667; + ctx->state[1] = 0xBB67AE85; + ctx->state[2] = 0x3C6EF372; + ctx->state[3] = 0xA54FF53A; + ctx->state[4] = 0x510E527F; + ctx->state[5] = 0x9B05688C; + ctx->state[6] = 0x1F83D9AB; + ctx->state[7] = 0x5BE0CD19; +} + +/* Add bytes into the hash */ +static void +SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len) +{ + uint32_t bitlen[2]; + uint32_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count[1] >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen[1] = ((uint32_t)len) << 3; + bitlen[0] = (uint32_t)(len >> 29); + + /* Update number of bits */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + while (len >= 64) { + SHA256_Transform(ctx->state, src); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX * ctx) +{ + unsigned char len[8]; + uint32_t r, plen; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be32enc_vect(len, ctx->count, 8); + + /* Add 1--64 bytes so that the resulting length is 56 mod 64 */ + r = (ctx->count[1] >> 3) & 0x3f; + plen = (r < 56) ? (56 - r) : (120 - r); + SHA256_Update(ctx, PAD, (size_t)plen); + + /* Add the terminating bit-count */ + SHA256_Update(ctx, len, 8); +} + +/* + * SHA-256 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +static void +SHA256_Final(unsigned char digest[32], SHA256_CTX * ctx) +{ + + /* Add padding */ + SHA256_Pad(ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, 32); + + /* Clear the context state */ + memset((void *)ctx, 0, sizeof(*ctx)); +} + +/* Initialize an HMAC-SHA256 operation with the given key. */ +static void +HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) +{ + unsigned char pad[64]; + unsigned char khash[32]; + const unsigned char * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + if (Klen > 64) { + SHA256_Init(&ctx->ictx); + SHA256_Update(&ctx->ictx, K, Klen); + SHA256_Final(khash, &ctx->ictx); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + SHA256_Update(&ctx->ictx, pad, 64); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + SHA256_Update(&ctx->octx, pad, 64); + + /* Clean the stack. */ + memset(khash, 0, 32); +} + +/* Add bytes to the HMAC-SHA256 operation. */ +static void +HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len) +{ + + /* Feed data to the inner SHA256 operation. 
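HMAC_SHA256_Update is completed just below; together with Init and Final this is RFC 2104 HMAC with SHA-256 as the hash. A known-answer sketch against RFC 4231 test case 2, again assuming compilation inside this translation unit (hmac_known_answer is a hypothetical name):

```c
#include <stdio.h>
#include <string.h>

/* RFC 4231 test case 2: expect
   5bdcc146bf60754e6a042426089575c75a003f089d2739839dec58b964ec3843 */
static void hmac_known_answer(void)
{
    HMAC_SHA256_CTX ctx;
    unsigned char mac[32];
    const char *key = "Jefe";
    const char *msg = "what do ya want for nothing?";
    int i;

    HMAC_SHA256_Init(&ctx, key, strlen(key));
    HMAC_SHA256_Update(&ctx, msg, strlen(msg));
    HMAC_SHA256_Final(mac, &ctx);

    for (i = 0; i < 32; i++)
        printf("%02x", mac[i]);
    printf("\n");
}
```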
*/ + SHA256_Update(&ctx->ictx, in, len); +} + +/* Finish an HMAC-SHA256 operation. */ +static void +HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx) +{ + unsigned char ihash[32]; + + /* Finish the inner SHA256 operation. */ + SHA256_Final(ihash, &ctx->ictx); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Update(&ctx->octx, ihash, 32); + + /* Finish the outer SHA256 operation. */ + SHA256_Final(digest, &ctx->octx); + + /* Clean the stack. */ + memset(ihash, 0, 32); +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +static void +PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, + size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX PShctx, hctx; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Compute HMAC state after processing P and S. */ + HMAC_SHA256_Init(&PShctx, passwd, passwdlen); + HMAC_SHA256_Update(&PShctx, salt, saltlen); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); + HMAC_SHA256_Update(&hctx, ivec, 4); + HMAC_SHA256_Final(U, &hctx); + + /* T_i = U_1 ... */ + memcpy(T, U, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + HMAC_SHA256_Init(&hctx, passwd, passwdlen); + HMAC_SHA256_Update(&hctx, U, 32); + HMAC_SHA256_Final(U, &hctx); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean PShctx, since we never called _Final on it. */ + memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX)); +} + + +static void blkcpy(void *, void *, size_t); +static void blkxor(void *, void *, size_t); +static void salsa20_8(uint32_t[16]); +static void blockmix_salsa8(uint32_t *, uint32_t *, uint32_t *, size_t); +static uint64_t integerify(void *, size_t); +static void smix(uint8_t *, size_t, uint64_t, uint32_t *, uint32_t *); + +static void +blkcpy(void * dest, void * src, size_t len) +{ + size_t * D = dest; + size_t * S = src; + size_t L = len / sizeof(size_t); + size_t i; + + for (i = 0; i < L; i++) + D[i] = S[i]; +} + +static void +blkxor(void * dest, void * src, size_t len) +{ + size_t * D = dest; + size_t * S = src; + size_t L = len / sizeof(size_t); + size_t i; + + for (i = 0; i < L; i++) + D[i] ^= S[i]; +} + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ +static void +salsa20_8(uint32_t B[16]) +{ + uint32_t x[16]; + size_t i; + + blkcpy(x, B, 64); + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows. 
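The column pass above and the row pass that follows are each four applications of the same Salsa20 quarter-round. Factored into a helper, the pattern is easier to see; this is an equivalent sketch with a hypothetical quarterround name, not the code used here:

```c
#include <stdint.h>

#define ROTL32(v, c) (((v) << (c)) | ((v) >> (32 - (c))))

/* One Salsa20 quarter-round; the loop above inlines four of these per
   column pass and four per row pass. */
static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *b ^= ROTL32(*a + *d, 7);
    *c ^= ROTL32(*b + *a, 9);
    *d ^= ROTL32(*c + *b, 13);
    *a ^= ROTL32(*d + *c, 18);
}
```

With (a, b, c, d) = (x[0], x[4], x[8], x[12]) this reproduces the first column above; with (x[0], x[1], x[2], x[3]) it reproduces the first row below.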
*/ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } + for (i = 0; i < 16; i++) + B[i] += x[i]; +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. + */ +static void +blockmix_salsa8(uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 16], 64); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 16], 64); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 8], X, 64); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 16 + 16], 64); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 8 + r * 16], X, 64); + } +} + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint64_t +integerify(void * B, size_t r) +{ + uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64); + + return (((uint64_t)(X[1]) << 32) + X[0]); +} + +/** + * smix(B, r, N, V, XY): + * Compute B = SMix_r(B, N). The input B must be 128r bytes in length; + * the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. The arrays B, V, and XY must be aligned to a + * multiple of 64 bytes. 
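The buffer sizes in this comment add up to the fixed scratchpad documented further below: with N = 1024, r = 1, p = 1, the 63 bytes of alignment slack plus B (128rp), XY (256r + 64) and V (128rN) come to exactly the 131583 bytes reserved by scrypt_1024_1_1_256. A quick arithmetic check:

```c
#include <stdio.h>

int main(void)
{
    const unsigned N = 1024, r = 1, p = 1;
    /* 63 bytes of alignment slack + B + XY + V, per the comments here */
    unsigned long need = 63UL + (128UL * r * p) + (256UL * r + 64)
                       + (128UL * r * N);
    printf("%lu\n", need);   /* prints 131583 */
    return 0;
}
```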
+ */ +static void +smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY) +{ + uint32_t * X = XY; + uint32_t * Y = &XY[32 * r]; + uint32_t * Z = &XY[64 * r]; + uint64_t i; + uint64_t j; + size_t k; + + /* 1: X <-- B */ + for (k = 0; k < 32 * r; k++) + X[k] = le32dec(&B[4 * k]); + + /* 2: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * (32 * r)], X, 128 * r); + + /* 4: X <-- H(X) */ + blockmix_salsa8(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * (32 * r)], Y, 128 * r); + + /* 4: X <-- H(X) */ + blockmix_salsa8(Y, X, Z, r); + } + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * (32 * r)], 128 * r); + blockmix_salsa8(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * (32 * r)], 128 * r); + blockmix_salsa8(Y, X, Z, r); + } + + /* 10: B' <-- X */ + for (k = 0; k < 32 * r; k++) + le32enc(&B[4 * k], X[k]); +} + +/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output + scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes + */ +void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratchpad) +{ + uint8_t * B; + uint32_t * V; + uint32_t * XY; + uint32_t i; + + const uint32_t N = 1024; + const uint32_t r = 1; + const uint32_t p = 1; + + B = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + XY = (uint32_t *)(B + (128 * r * p)); + V = (uint32_t *)(B + (128 * r * p) + (256 * r + 64)); + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256((const uint8_t*)input, 80, (const uint8_t*)input, 80, 1, B, p * 128 * r); + + /* 2: for i = 0 to p - 1 do */ + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ + smix(&B[i * 128 * r], r, N, V, XY); + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256((const uint8_t*)input, 80, B, p * 128 * r, 1, (uint8_t*)output, 32); +} + +void scrypt_1024_1_1_256(const char* input, char* output) +{ + char scratchpad[131583]; + scrypt_1024_1_1_256_sp(input, output, scratchpad); +} diff --git a/scrypt.h b/scrypt.h new file mode 100644 index 0000000..ece0134 --- /dev/null +++ b/scrypt.h @@ -0,0 +1,8 @@ +#ifndef SCRYPT_H +#define SCRYPT_H + +void scrypt_1024_1_1_256(const char* input, char* output); +void scrypt_1024_1_1_256_sp(const char* input, char* output, char* scratchpad); +#define scrypt_scratchpad_size 131583; + +#endif \ No newline at end of file diff --git a/scryptjane.c b/scryptjane.c new file mode 100644 index 0000000..ca4f1aa --- /dev/null +++ b/scryptjane.c @@ -0,0 +1,182 @@ +/* + scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane + + Public Domain or MIT License, whichever is easier +*/ + +#include + +#include "scryptjane.h" +#include "scryptjane/scrypt-jane-portable.h" +#include "scryptjane/scrypt-jane-hash.h" +#include "scryptjane/scrypt-jane-romix.h" +#include "scryptjane/scrypt-jane-test-vectors.h" + + +#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */ +#if (SCRYPT_BLOCK_BYTES == 64) +#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 128) +#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 256) +#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks 
in a chunk * 256 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 512) +#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */ +#endif +#define scrypt_maxr scrypt_r_32kb /* 32kb */ +#define scrypt_maxp 25 /* (1 << 25) = ~33 million */ + +#include +#include + +static void +scrypt_fatal_error_default(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(1); +} + +static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; + +void +scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) { + scrypt_fatal_error = fn; +} + +static int +scrypt_power_on_self_test() { + const scrypt_test_setting *t; + uint8_t test_digest[64]; + uint32_t i; + int res = 7, scrypt_valid; + + if (!scrypt_test_mix()) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: mix function power-on-self-test failed"); +#endif + res &= ~1; + } + + if (!scrypt_test_hash()) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: hash function power-on-self-test failed"); +#endif + res &= ~2; + } + + for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { + t = post_settings + i; + scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); + scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); + } + + if (!scrypt_valid) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); +#endif + res &= ~4; + } + + return res; +} + +typedef struct scrypt_aligned_alloc_t { + uint8_t *mem, *ptr; +} scrypt_aligned_alloc; + +#if defined(SCRYPT_TEST_SPEED) +static uint8_t *mem_base = (uint8_t *)0; +static size_t mem_bump = 0; + +/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ +static scrypt_aligned_alloc +scrypt_alloc(uint64_t size) { + scrypt_aligned_alloc aa; + if (!mem_base) { + mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); + if (!mem_base) + scrypt_fatal_error("scrypt: out of memory"); + mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + } + aa.mem = mem_base + mem_bump; + aa.ptr = aa.mem; + mem_bump += (size_t)size; + return aa; +} + +static void +scrypt_free(scrypt_aligned_alloc *aa) { + mem_bump = 0; +} +#else +static scrypt_aligned_alloc +scrypt_alloc(uint64_t size) { + static const size_t max_alloc = (size_t)-1; + scrypt_aligned_alloc aa; + size += (SCRYPT_BLOCK_BYTES - 1); + if (size > max_alloc) + scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); + aa.mem = (uint8_t *)malloc((size_t)size); + aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + if (!aa.mem) + scrypt_fatal_error("scrypt: out of memory"); + return aa; +} + +static void +scrypt_free(scrypt_aligned_alloc *aa) { + free(aa->mem); +} +#endif + + +void +scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) { + scrypt_aligned_alloc YX, V; + uint8_t *X, *Y; + uint32_t N, r, p, chunk_bytes, i; + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) + scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); +#endif + +#if !defined(SCRYPT_TEST) + static int power_on_self_test = 0; + if (!power_on_self_test) { + power_on_self_test = 1; + if (!scrypt_power_on_self_test()) + scrypt_fatal_error("scrypt: power 
on self test failed"); + } +#endif + + if (Nfactor > scrypt_maxN) + scrypt_fatal_error("scrypt: N out of range"); + if (rfactor > scrypt_maxr) + scrypt_fatal_error("scrypt: r out of range"); + if (pfactor > scrypt_maxp) + scrypt_fatal_error("scrypt: p out of range"); + + N = (1 << (Nfactor + 1)); + r = (1 << rfactor); + p = (1 << pfactor); + + chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; + V = scrypt_alloc((uint64_t)N * chunk_bytes); + YX = scrypt_alloc((p + 1) * chunk_bytes); + + /* 1: X = PBKDF2(password, salt) */ + Y = YX.ptr; + X = Y + chunk_bytes; + scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); + + /* 2: X = ROMix(X) */ + for (i = 0; i < p; i++) + scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); + + /* 3: Out = PBKDF2(password, X) */ + scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); + + scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); + + scrypt_free(&V); + scrypt_free(&YX); +} diff --git a/scryptjane.h b/scryptjane.h new file mode 100644 index 0000000..00a28dd --- /dev/null +++ b/scryptjane.h @@ -0,0 +1,32 @@ +#ifndef SCRYPT_JANE_H +#define SCRYPT_JANE_H + + +#define SCRYPT_KECCAK512 +#define SCRYPT_CHACHA +#define SCRYPT_CHOOSE_COMPILETIME + +/* + Nfactor: Increases CPU & Memory Hardness + N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used + + rfactor: Increases Memory Hardness + r = (1 << rfactor): How large a chunk is + + pfactor: Increases CPU Hardness + p = (1 << pfactor): Number of times to mix the main chunk + + A block is the basic mixing unit (salsa/chacha block = 64 bytes) + A chunk is (2 * r) blocks + + ~Memory used = (N + 2) * ((2 * r) * block size) +*/ + +#include + +typedef void (*scrypt_fatal_errorfn)(const char *msg); +void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); + +void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes); + +#endif /* SCRYPT_JANE_H */ diff --git a/scryptn.c b/scryptn.c new file mode 100644 index 0000000..11623dc --- /dev/null +++ b/scryptn.c @@ -0,0 +1,687 @@ +/*- + * Copyright 2009 Colin Percival, 2011 ArtForz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
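Before the scryptn.c listing gets under way, a caller's view of the scrypt-jane entry point defined above: the three factors are exponents, not raw parameters. A minimal sketch; the resulting digest depends on the compile-time hash/mix selection (SCRYPT_KECCAK512 and SCRYPT_CHACHA in scryptjane.h), so no expected value is claimed here:

```c
#include <stdint.h>
#include <stdio.h>
#include "scryptjane.h"

int main(void)
{
    unsigned char digest[32];
    int i;

    /* Nfactor = 9 -> N = 1 << (9 + 1) = 1024
       rfactor = 0 -> r = 1
       pfactor = 0 -> p = 1 */
    scrypt((const unsigned char *)"password", 8,
           (const unsigned char *)"NaCl", 4,
           9, 0, 0, digest, sizeof(digest));

    for (i = 0; i < 32; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}
```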
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + + +#include +#include + +#include "scryptn.h" + +static __inline uint32_t +be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} + +static __inline void +be32enc(void *pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} + +static __inline uint32_t +le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} + +static __inline void +le32enc(void *pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} + + +typedef struct SHA256Context { + uint32_t state[8]; + uint32_t count[2]; + unsigned char buf[64]; +} SHA256_CTX; + +typedef struct HMAC_SHA256Context { + SHA256_CTX ictx; + SHA256_CTX octx; +} HMAC_SHA256_CTX; + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. + */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. 
*/ + be32dec_vect(W, block, 64); + for (i = 16; i < 64; i++) + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + RNDr(S, W, 0, 0x428a2f98); + RNDr(S, W, 1, 0x71374491); + RNDr(S, W, 2, 0xb5c0fbcf); + RNDr(S, W, 3, 0xe9b5dba5); + RNDr(S, W, 4, 0x3956c25b); + RNDr(S, W, 5, 0x59f111f1); + RNDr(S, W, 6, 0x923f82a4); + RNDr(S, W, 7, 0xab1c5ed5); + RNDr(S, W, 8, 0xd807aa98); + RNDr(S, W, 9, 0x12835b01); + RNDr(S, W, 10, 0x243185be); + RNDr(S, W, 11, 0x550c7dc3); + RNDr(S, W, 12, 0x72be5d74); + RNDr(S, W, 13, 0x80deb1fe); + RNDr(S, W, 14, 0x9bdc06a7); + RNDr(S, W, 15, 0xc19bf174); + RNDr(S, W, 16, 0xe49b69c1); + RNDr(S, W, 17, 0xefbe4786); + RNDr(S, W, 18, 0x0fc19dc6); + RNDr(S, W, 19, 0x240ca1cc); + RNDr(S, W, 20, 0x2de92c6f); + RNDr(S, W, 21, 0x4a7484aa); + RNDr(S, W, 22, 0x5cb0a9dc); + RNDr(S, W, 23, 0x76f988da); + RNDr(S, W, 24, 0x983e5152); + RNDr(S, W, 25, 0xa831c66d); + RNDr(S, W, 26, 0xb00327c8); + RNDr(S, W, 27, 0xbf597fc7); + RNDr(S, W, 28, 0xc6e00bf3); + RNDr(S, W, 29, 0xd5a79147); + RNDr(S, W, 30, 0x06ca6351); + RNDr(S, W, 31, 0x14292967); + RNDr(S, W, 32, 0x27b70a85); + RNDr(S, W, 33, 0x2e1b2138); + RNDr(S, W, 34, 0x4d2c6dfc); + RNDr(S, W, 35, 0x53380d13); + RNDr(S, W, 36, 0x650a7354); + RNDr(S, W, 37, 0x766a0abb); + RNDr(S, W, 38, 0x81c2c92e); + RNDr(S, W, 39, 0x92722c85); + RNDr(S, W, 40, 0xa2bfe8a1); + RNDr(S, W, 41, 0xa81a664b); + RNDr(S, W, 42, 0xc24b8b70); + RNDr(S, W, 43, 0xc76c51a3); + RNDr(S, W, 44, 0xd192e819); + RNDr(S, W, 45, 0xd6990624); + RNDr(S, W, 46, 0xf40e3585); + RNDr(S, W, 47, 0x106aa070); + RNDr(S, W, 48, 0x19a4c116); + RNDr(S, W, 49, 0x1e376c08); + RNDr(S, W, 50, 0x2748774c); + RNDr(S, W, 51, 0x34b0bcb5); + RNDr(S, W, 52, 0x391c0cb3); + RNDr(S, W, 53, 0x4ed8aa4a); + RNDr(S, W, 54, 0x5b9cca4f); + RNDr(S, W, 55, 0x682e6ff3); + RNDr(S, W, 56, 0x748f82ee); + RNDr(S, W, 57, 0x78a5636f); + RNDr(S, W, 58, 0x84c87814); + RNDr(S, W, 59, 0x8cc70208); + RNDr(S, W, 60, 0x90befffa); + RNDr(S, W, 61, 0xa4506ceb); + RNDr(S, W, 62, 0xbef9a3f7); + RNDr(S, W, 63, 0xc67178f2); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; + + /* Clean the stack. */ + memset(W, 0, 256); + memset(S, 0, 32); + t0 = t1 = 0; +} + +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* SHA-256 initialization. Begins a SHA-256 operation. 
*/ +static void +SHA256_Init(SHA256_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6A09E667; + ctx->state[1] = 0xBB67AE85; + ctx->state[2] = 0x3C6EF372; + ctx->state[3] = 0xA54FF53A; + ctx->state[4] = 0x510E527F; + ctx->state[5] = 0x9B05688C; + ctx->state[6] = 0x1F83D9AB; + ctx->state[7] = 0x5BE0CD19; +} + +/* Add bytes into the hash */ +static void +SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len) +{ + uint32_t bitlen[2]; + uint32_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count[1] >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen[1] = ((uint32_t)len) << 3; + bitlen[0] = (uint32_t)(len >> 29); + + /* Update number of bits */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + while (len >= 64) { + SHA256_Transform(ctx->state, src); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX * ctx) +{ + unsigned char len[8]; + uint32_t r, plen; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be32enc_vect(len, ctx->count, 8); + + /* Add 1--64 bytes so that the resulting length is 56 mod 64 */ + r = (ctx->count[1] >> 3) & 0x3f; + plen = (r < 56) ? (56 - r) : (120 - r); + SHA256_Update(ctx, PAD, (size_t)plen); + + /* Add the terminating bit-count */ + SHA256_Update(ctx, len, 8); +} + +/* + * SHA-256 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +static void +SHA256_Final(unsigned char digest[32], SHA256_CTX * ctx) +{ + + /* Add padding */ + SHA256_Pad(ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, 32); + + /* Clear the context state */ + memset((void *)ctx, 0, sizeof(*ctx)); +} + +/* Initialize an HMAC-SHA256 operation with the given key. */ +static void +HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) +{ + unsigned char pad[64]; + unsigned char khash[32]; + const unsigned char * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + if (Klen > 64) { + SHA256_Init(&ctx->ictx); + SHA256_Update(&ctx->ictx, K, Klen); + SHA256_Final(khash, &ctx->ictx); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + SHA256_Update(&ctx->ictx, pad, 64); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + SHA256_Update(&ctx->octx, pad, 64); + + /* Clean the stack. */ + memset(khash, 0, 32); +} + +/* Add bytes to the HMAC-SHA256 operation. */ +static void +HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len) +{ + + /* Feed data to the inner SHA256 operation. 
*/ + SHA256_Update(&ctx->ictx, in, len); +} + +/* Finish an HMAC-SHA256 operation. */ +static void +HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx) +{ + unsigned char ihash[32]; + + /* Finish the inner SHA256 operation. */ + SHA256_Final(ihash, &ctx->ictx); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Update(&ctx->octx, ihash, 32); + + /* Finish the outer SHA256 operation. */ + SHA256_Final(digest, &ctx->octx); + + /* Clean the stack. */ + memset(ihash, 0, 32); +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +static void +PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, + size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX PShctx, hctx; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Compute HMAC state after processing P and S. */ + HMAC_SHA256_Init(&PShctx, passwd, passwdlen); + HMAC_SHA256_Update(&PShctx, salt, saltlen); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); + HMAC_SHA256_Update(&hctx, ivec, 4); + HMAC_SHA256_Final(U, &hctx); + + /* T_i = U_1 ... */ + memcpy(T, U, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + HMAC_SHA256_Init(&hctx, passwd, passwdlen); + HMAC_SHA256_Update(&hctx, U, 32); + HMAC_SHA256_Final(U, &hctx); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean PShctx, since we never called _Final on it. */ + memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX)); +} + + +static void blkcpy(void *, void *, size_t); +static void blkxor(void *, void *, size_t); +static void salsa20_8(uint32_t[16]); +static void blockmix_salsa8(uint32_t *, uint32_t *, uint32_t *, size_t); +static uint64_t integerify(void *, size_t); +static void smix(uint8_t *, size_t, uint64_t, uint32_t *, uint32_t *); + +static void +blkcpy(void * dest, void * src, size_t len) +{ + size_t * D = dest; + size_t * S = src; + size_t L = len / sizeof(size_t); + size_t i; + + for (i = 0; i < L; i++) + D[i] = S[i]; +} + +static void +blkxor(void * dest, void * src, size_t len) +{ + size_t * D = dest; + size_t * S = src; + size_t L = len / sizeof(size_t); + size_t i; + + for (i = 0; i < L; i++) + D[i] ^= S[i]; +} + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ +static void +salsa20_8(uint32_t B[16]) +{ + uint32_t x[16]; + size_t i; + + blkcpy(x, B, 64); + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows. 
*/ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } + for (i = 0; i < 16; i++) + B[i] += x[i]; +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. + */ +static void +blockmix_salsa8(uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 16], 64); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 16], 64); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 8], X, 64); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 16 + 16], 64); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 8 + r * 16], X, 64); + } +} + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint64_t +integerify(void * B, size_t r) +{ + uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64); + + return (((uint64_t)(X[1]) << 32) + X[0]); +} + +/** + * smix(B, r, N, V, XY): + * Compute B = SMix_r(B, N). The input B must be 128r bytes in length; + * the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. The arrays B, V, and XY must be aligned to a + * multiple of 64 bytes. 
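smix and the two entry points defined just below parameterize N at run time, unlike the fixed N = 1024 variant in scrypt.c. A caller sketch; the 80-byte input length is hard-wired into scrypt_N_1_1_256_sp, and N must be a power of two greater than 1:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "scryptn.h"

int main(void)
{
    char header[80];   /* the input is always read as exactly 80 bytes */
    char hash[32];
    int i;

    memset(header, 0x01, sizeof(header));   /* placeholder block header */
    scrypt_N_1_1_256(header, hash, 2048);   /* N: power of two, > 1 */

    for (i = 0; i < 32; i++)
        printf("%02x", (unsigned char)hash[i]);
    printf("\n");
    return 0;
}
```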
+ */ +static void +smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY) +{ + uint32_t * X = XY; + uint32_t * Y = &XY[32 * r]; + uint32_t * Z = &XY[64 * r]; + uint64_t i; + uint64_t j; + size_t k; + + /* 1: X <-- B */ + for (k = 0; k < 32 * r; k++) + X[k] = le32dec(&B[4 * k]); + + /* 2: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * (32 * r)], X, 128 * r); + + /* 4: X <-- H(X) */ + blockmix_salsa8(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * (32 * r)], Y, 128 * r); + + /* 4: X <-- H(X) */ + blockmix_salsa8(Y, X, Z, r); + } + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * (32 * r)], 128 * r); + blockmix_salsa8(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * (32 * r)], 128 * r); + blockmix_salsa8(Y, X, Z, r); + } + + /* 10: B' <-- X */ + for (k = 0; k < 32 * r; k++) + le32enc(&B[4 * k], X[k]); +} + +/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output + scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes + */ +void scrypt_N_1_1_256_sp(const char* input, char* output, char* scratchpad, uint32_t N) +{ + uint8_t * B; + uint32_t * V; + uint32_t * XY; + uint32_t i; + + //const uint32_t N = 1024; + const uint32_t r = 1; + const uint32_t p = 1; + + B = (uint8_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + XY = (uint32_t *)(B + (128 * r * p)); + V = (uint32_t *)(B + (128 * r * p) + (256 * r + 64)); + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256((const uint8_t*)input, 80, (const uint8_t*)input, 80, 1, B, p * 128 * r); + + /* 2: for i = 0 to p - 1 do */ + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ + smix(&B[i * 128 * r], r, N, V, XY); + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256((const uint8_t*)input, 80, B, p * 128 * r, 1, (uint8_t*)output, 32); +} + +void scrypt_N_1_1_256(const char* input, char* output, uint32_t N) +{ + //char scratchpad[131583]; + char *scratchpad; + + // align on 4 byte boundary + scratchpad = (char*)malloc(128*N + 512); + scrypt_N_1_1_256_sp(input, output, scratchpad, N); + free(scratchpad); +} + diff --git a/scryptn.h b/scryptn.h new file mode 100644 index 0000000..ba461a2 --- /dev/null +++ b/scryptn.h @@ -0,0 +1,16 @@ +#ifndef SCRYPT_H +#define SCRYPT_H +#include +#ifdef __cplusplus +extern "C" { +#endif + +void scrypt_N_1_1_256(const char* input, char* output, uint32_t N); +void scrypt_N_1_1_256_sp(const char* input, char* output, char* scratchpad, uint32_t N); +//const int scrypt_scratchpad_size = 131583; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/aes_helper.c b/sha3/aes_helper.c new file mode 100644 index 0000000..75b7cc6 --- /dev/null +++ b/sha3/aes_helper.c @@ -0,0 +1,392 @@ +/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */ +/* + * AES tables. This file is not meant to be compiled by itself; it + * is included by some hash function implementations. It contains + * the precomputed tables and helper macros for evaluating an AES + * round, optionally with a final XOR with a subkey. + * + * By default, this file defines the tables and macros for little-endian + * processing (i.e. it is assumed that the input bytes have been read + * from memory and assembled with the little-endian convention). 
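Concretely, for the little-endian tables defined below, AES0[x] packs the MixColumns multiples of the substituted byte as (2·S[x], S[x], S[x], 3·S[x]) from low byte to high. A self-contained check of the first entry, AES0[0], using the S-box value S[0x00] = 0x63 (xtime is a local helper name):

```c
#include <stdint.h>
#include <stdio.h>

/* GF(2^8) doubling modulo the AES polynomial x^8 + x^4 + x^3 + x + 1. */
static uint8_t xtime(uint8_t b)
{
    return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1B : 0x00));
}

int main(void)
{
    uint8_t s = 0x63;                            /* SubBytes(0x00)      */
    uint32_t t = (uint32_t)xtime(s)              /* 2*s in the low byte */
               | ((uint32_t)s << 8)              /* 1*s                 */
               | ((uint32_t)s << 16)             /* 1*s                 */
               | ((uint32_t)(xtime(s) ^ s) << 24); /* 3*s, high byte    */

    printf("0x%08X\n", t);   /* prints 0xA56363C6, matching AES0[0] below */
    return 0;
}
```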
If + * the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value) + * when this file is included, then the tables and macros for big-endian + * processing are defined instead. The big-endian tables and macros have + * names distinct from the little-endian tables and macros, hence it is + * possible to have both simultaneously, by including this file twice + * (with and without the AES_BIG_ENDIAN macro). + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include "sph_types.h" +#ifdef __cplusplus +extern "C"{ +#endif +#if AES_BIG_ENDIAN + +#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) + +#define AES0 AES0_BE +#define AES1 AES1_BE +#define AES2 AES2_BE +#define AES3 AES3_BE + +#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \ + (Y0) = AES0[((X0) >> 24) & 0xFF] \ + ^ AES1[((X1) >> 16) & 0xFF] \ + ^ AES2[((X2) >> 8) & 0xFF] \ + ^ AES3[(X3) & 0xFF] ^ (K0); \ + (Y1) = AES0[((X1) >> 24) & 0xFF] \ + ^ AES1[((X2) >> 16) & 0xFF] \ + ^ AES2[((X3) >> 8) & 0xFF] \ + ^ AES3[(X0) & 0xFF] ^ (K1); \ + (Y2) = AES0[((X2) >> 24) & 0xFF] \ + ^ AES1[((X3) >> 16) & 0xFF] \ + ^ AES2[((X0) >> 8) & 0xFF] \ + ^ AES3[(X1) & 0xFF] ^ (K2); \ + (Y3) = AES0[((X3) >> 24) & 0xFF] \ + ^ AES1[((X0) >> 16) & 0xFF] \ + ^ AES2[((X1) >> 8) & 0xFF] \ + ^ AES3[(X2) & 0xFF] ^ (K3); \ + } while (0) + +#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) + +#else + +#define AESx(x) SPH_C32(x) +#define AES0 AES0_LE +#define AES1 AES1_LE +#define AES2 AES2_LE +#define AES3 AES3_LE + +#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \ + (Y0) = AES0[(X0) & 0xFF] \ + ^ AES1[((X1) >> 8) & 0xFF] \ + ^ AES2[((X2) >> 16) & 0xFF] \ + ^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \ + (Y1) = AES0[(X1) & 0xFF] \ + ^ AES1[((X2) >> 8) & 0xFF] \ + ^ AES2[((X3) >> 16) & 0xFF] \ + ^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \ + (Y2) = AES0[(X2) & 0xFF] \ + ^ AES1[((X3) >> 8) & 0xFF] \ + ^ AES2[((X0) >> 16) & 0xFF] \ + ^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \ + (Y3) = AES0[(X3) & 0xFF] \ + ^ AES1[((X0) >> 8) & 0xFF] \ + ^ 
AES2[((X1) >> 16) & 0xFF] \ + ^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \ + } while (0) + +#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) + +#endif + +/* + * The AES*[] tables allow us to perform a fast evaluation of an AES + * round; table AESi[] combines SubBytes for a byte at row i, and + * MixColumns for the column where that byte goes after ShiftRows. + */ + +static const sph_u32 AES0[256] = { + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), 
AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +static const sph_u32 AES1[256] = { + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), 
+ AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +static const sph_u32 AES2[256] = { + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), 
AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), 
AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +static const sph_u32 AES3[256] = { + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), 
AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + +#ifdef __cplusplus +} +#endif diff --git a/sha3/blake.c b/sha3/blake.c new file mode 100644 index 0000000..0650b9c --- /dev/null +++ b/sha3/blake.c @@ -0,0 +1,1120 @@ +/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ +/* + * BLAKE implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_blake.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_SMALL_FOOTPRINT_BLAKE 1
+#endif
+
+#if SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_COMPACT_BLAKE_32 1
+#endif
+
+#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
+#define SPH_COMPACT_BLAKE_64 1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 IV224[8] = {
+    SPH_C32(0xC1059ED8), SPH_C32(0x367CD507),
+    SPH_C32(0x3070DD17), SPH_C32(0xF70E5939),
+    SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
+    SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
+};
+
+static const sph_u32 IV256[8] = {
+    SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
+    SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
+    SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+    SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+#if SPH_64
+
+static const sph_u64 IV384[8] = {
+    SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
+    SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
+    SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
+    SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
+};
+
+static const sph_u64 IV512[8] = {
+    SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+    SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+    SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+    SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+#endif
+
+#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
+
+static const unsigned sigma[16][16] = {
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
+};
+
+/*
+   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+  14 10  4  8  9 15 13  6  1 12  0  2 11  7  5  3
+  11  8 12  0  5  2 15 13 10 14  3  6  7  1  9  4
+   7  9  3  1 13 12 11 14  2  6  5 10  4  0 15  8
+   9  0  5  7  2  4 10 15 14  1 11 12  6  8  3 13
+   2 12  6 10  0 11  8  3  4 13  7  5 15 14  1  9
+  12  5  1 15 14 13  4 10  0  7  6  3  9  2  8 11
+  13 11  7 14 12  1  3  9  5  0 15  4  8  6  2 10
+   6 15 14  9 11  3  0  8 12  2 13  7  1  4 10  5
+  10  2  8  4  7  6  1  5 15 11  9 14  3 12 13  0
+*/
+#endif
+
+#define Z00 0
+#define Z01 1
+#define Z02 2
+#define Z03 3
+#define Z04 4
+#define Z05 5
+#define Z06 6
+#define Z07 7
+#define Z08 8
+#define Z09 9
+#define Z0A A
+#define Z0B B
+#define Z0C C
+#define Z0D D
+#define Z0E E
+#define Z0F F
+
+#define Z10 E
+#define Z11 A
+#define Z12 4
+#define Z13 8
+#define Z14 9
+#define Z15 F
+#define Z16 D
+#define Z17 6
+#define Z18 1
+#define Z19 C
+#define Z1A 0
+#define Z1B 2
+#define Z1C B
+#define Z1D 7
+#define Z1E 5
+#define Z1F 3
+
+#define Z20 B
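
The Z-index tables, which continue below through Z9F, spell out BLAKE's message permutation sigma at the preprocessor level: Z<r><i> is the (hex) index of the message word consumed at round r, position i, so the Mx(r, i) macro defined further down resolves to a plain variable with no runtime table lookup; the sigma[][] array is only compiled in for the compact variants. A minimal standalone sketch of the same selection, using the first two sigma rows shown above (illustrative only; `msg_index` is a hypothetical helper, not part of this file):

```c
#include <stdio.h>

/* First two rows of BLAKE's sigma permutation, copied from the table above.
 * The full schedule has ten distinct rows that rounds reuse cyclically,
 * which is why rows 10..15 of sigma[][] repeat rows 0..5. */
static const unsigned sigma_rows[2][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};

/* Index of the message word fed into position i of round r. */
static unsigned msg_index(unsigned r, unsigned i)
{
    return sigma_rows[r % 2][i];    /* would be r % 10 with the full schedule */
}

int main(void)
{
    /* Round 1, position 0 reads M[14], matching "#define Z10 E" above. */
    printf("round 1, position 0 -> M[%u]\n", msg_index(1, 0));
    return 0;
}
```
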
+#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define Z87 8 +#define Z88 C +#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + +#define CSx(r, i) CSx_(Z ## r ## i) +#define CSx_(n) CSx__(n) +#define CSx__(n) CS ## n + +#define CS0 SPH_C32(0x243F6A88) +#define CS1 SPH_C32(0x85A308D3) +#define CS2 SPH_C32(0x13198A2E) +#define CS3 SPH_C32(0x03707344) +#define CS4 SPH_C32(0xA4093822) +#define CS5 SPH_C32(0x299F31D0) +#define CS6 SPH_C32(0x082EFA98) +#define CS7 SPH_C32(0xEC4E6C89) +#define CS8 SPH_C32(0x452821E6) +#define CS9 SPH_C32(0x38D01377) +#define CSA SPH_C32(0xBE5466CF) +#define CSB SPH_C32(0x34E90C6C) +#define CSC SPH_C32(0xC0AC29B7) +#define CSD SPH_C32(0xC97C50DD) +#define CSE SPH_C32(0x3F84D5B5) +#define CSF SPH_C32(0xB5470917) + +#if SPH_COMPACT_BLAKE_32 + +static const sph_u32 CS[16] = { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) +}; + +#endif + +#if SPH_64 + +#define CBx(r, i) CBx_(Z ## r ## i) +#define CBx_(n) CBx__(n) +#define CBx__(n) CB ## n + +#define CB0 SPH_C64(0x243F6A8885A308D3) +#define CB1 SPH_C64(0x13198A2E03707344) +#define CB2 SPH_C64(0xA4093822299F31D0) +#define CB3 SPH_C64(0x082EFA98EC4E6C89) +#define CB4 SPH_C64(0x452821E638D01377) +#define CB5 SPH_C64(0xBE5466CF34E90C6C) +#define CB6 SPH_C64(0xC0AC29B7C97C50DD) +#define CB7 SPH_C64(0x3F84D5B5B5470917) +#define CB8 SPH_C64(0x9216D5D98979FB1B) +#define CB9 
SPH_C64(0xD1310BA698DFB5AC) +#define CBA SPH_C64(0x2FFD72DBD01ADFB7) +#define CBB SPH_C64(0xB8E1AFED6A267E96) +#define CBC SPH_C64(0xBA7C9045F12C7F99) +#define CBD SPH_C64(0x24A19947B3916CF7) +#define CBE SPH_C64(0x0801F2E2858EFC16) +#define CBF SPH_C64(0x636920D871574E69) + +#if SPH_COMPACT_BLAKE_64 + +static const sph_u64 CB[16] = { + SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344), + SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89), + SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C), + SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917), + SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC), + SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96), + SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7), + SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69) +}; + +#endif + +#endif + +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#if SPH_COMPACT_BLAKE_32 + +#define ROUND_S(r) do { \ + GS(M[sigma[r][0x0]], M[sigma[r][0x1]], \ + CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \ + GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \ + CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \ + GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \ + CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \ + GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \ + CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \ + GS(M[sigma[r][0x8]], M[sigma[r][0x9]], \ + CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \ + GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \ + CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \ + GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \ + CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \ + GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \ + CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \ + } while (0) + +#else + +#define ROUND_S(r) do { \ + GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ + } while (0) + +#endif + +#if SPH_64 + +#define GB(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T64(a + b + (m0 ^ c1)); \ + d = SPH_ROTR64(d ^ a, 32); \ + c = SPH_T64(c + d); \ + b = SPH_ROTR64(b ^ c, 25); \ + a = SPH_T64(a + b + (m1 ^ c0)); \ + d = SPH_ROTR64(d ^ a, 16); \ + c = SPH_T64(c + d); \ + b = SPH_ROTR64(b ^ c, 11); \ + } while (0) + +#if SPH_COMPACT_BLAKE_64 + +#define ROUND_B(r) do { \ + GB(M[sigma[r][0x0]], M[sigma[r][0x1]], \ + CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \ + GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \ + CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \ + GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \ + CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \ + GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \ + CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \ + GB(M[sigma[r][0x8]], M[sigma[r][0x9]], \ + CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \ + GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \ + 
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \ + GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \ + CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \ + GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \ + CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \ + } while (0) + +#else + +#define ROUND_B(r) do { \ + GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ + GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ + GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ + GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \ + GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \ + GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ + GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ + GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ + } while (0) + +#endif + +#endif + +#define DECL_STATE32 \ + sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \ + sph_u32 S0, S1, S2, S3, T0, T1; + +#define READ_STATE32(state) do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = (state)->T1; \ + } while (0) + +#define WRITE_STATE32(state) do { \ + (state)->H[0] = H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + (state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ + } while (0) + +#if SPH_COMPACT_BLAKE_32 + +#define COMPRESS32 do { \ + sph_u32 M[16]; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M[0x0] = sph_dec32be_aligned(buf + 0); \ + M[0x1] = sph_dec32be_aligned(buf + 4); \ + M[0x2] = sph_dec32be_aligned(buf + 8); \ + M[0x3] = sph_dec32be_aligned(buf + 12); \ + M[0x4] = sph_dec32be_aligned(buf + 16); \ + M[0x5] = sph_dec32be_aligned(buf + 20); \ + M[0x6] = sph_dec32be_aligned(buf + 24); \ + M[0x7] = sph_dec32be_aligned(buf + 28); \ + M[0x8] = sph_dec32be_aligned(buf + 32); \ + M[0x9] = sph_dec32be_aligned(buf + 36); \ + M[0xA] = sph_dec32be_aligned(buf + 40); \ + M[0xB] = sph_dec32be_aligned(buf + 44); \ + M[0xC] = sph_dec32be_aligned(buf + 48); \ + M[0xD] = sph_dec32be_aligned(buf + 52); \ + M[0xE] = sph_dec32be_aligned(buf + 56); \ + M[0xF] = sph_dec32be_aligned(buf + 60); \ + for (r = 0; r < 14; r ++) \ + ROUND_S(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#else + +#define COMPRESS32 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD 
= T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + M8 = sph_dec32be_aligned(buf + 32); \ + M9 = sph_dec32be_aligned(buf + 36); \ + MA = sph_dec32be_aligned(buf + 40); \ + MB = sph_dec32be_aligned(buf + 44); \ + MC = sph_dec32be_aligned(buf + 48); \ + MD = sph_dec32be_aligned(buf + 52); \ + ME = sph_dec32be_aligned(buf + 56); \ + MF = sph_dec32be_aligned(buf + 60); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + ROUND_S(8); \ + ROUND_S(9); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#endif + +#if SPH_64 + +#define DECL_STATE64 \ + sph_u64 H0, H1, H2, H3, H4, H5, H6, H7; \ + sph_u64 S0, S1, S2, S3, T0, T1; + +#define READ_STATE64(state) do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = (state)->T1; \ + } while (0) + +#define WRITE_STATE64(state) do { \ + (state)->H[0] = H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + (state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ + } while (0) + +#if SPH_COMPACT_BLAKE_64 + +#define COMPRESS64 do { \ + sph_u64 M[16]; \ + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CB0; \ + V9 = S1 ^ CB1; \ + VA = S2 ^ CB2; \ + VB = S3 ^ CB3; \ + VC = T0 ^ CB4; \ + VD = T0 ^ CB5; \ + VE = T1 ^ CB6; \ + VF = T1 ^ CB7; \ + M[0x0] = sph_dec64be_aligned(buf + 0); \ + M[0x1] = sph_dec64be_aligned(buf + 8); \ + M[0x2] = sph_dec64be_aligned(buf + 16); \ + M[0x3] = sph_dec64be_aligned(buf + 24); \ + M[0x4] = sph_dec64be_aligned(buf + 32); \ + M[0x5] = sph_dec64be_aligned(buf + 40); \ + M[0x6] = sph_dec64be_aligned(buf + 48); \ + M[0x7] = sph_dec64be_aligned(buf + 56); \ + M[0x8] = sph_dec64be_aligned(buf + 64); \ + M[0x9] = sph_dec64be_aligned(buf + 72); \ + M[0xA] = sph_dec64be_aligned(buf + 80); \ + M[0xB] = sph_dec64be_aligned(buf + 88); \ + M[0xC] = sph_dec64be_aligned(buf + 96); \ + M[0xD] = sph_dec64be_aligned(buf + 104); \ + M[0xE] = sph_dec64be_aligned(buf + 112); \ + M[0xF] = sph_dec64be_aligned(buf + 120); \ + for (r = 0; r < 16; r ++) \ + ROUND_B(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#else + +#define COMPRESS64 do { \ + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u64 V0, V1, 
V2, V3, V4, V5, V6, V7; \ + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CB0; \ + V9 = S1 ^ CB1; \ + VA = S2 ^ CB2; \ + VB = S3 ^ CB3; \ + VC = T0 ^ CB4; \ + VD = T0 ^ CB5; \ + VE = T1 ^ CB6; \ + VF = T1 ^ CB7; \ + M0 = sph_dec64be_aligned(buf + 0); \ + M1 = sph_dec64be_aligned(buf + 8); \ + M2 = sph_dec64be_aligned(buf + 16); \ + M3 = sph_dec64be_aligned(buf + 24); \ + M4 = sph_dec64be_aligned(buf + 32); \ + M5 = sph_dec64be_aligned(buf + 40); \ + M6 = sph_dec64be_aligned(buf + 48); \ + M7 = sph_dec64be_aligned(buf + 56); \ + M8 = sph_dec64be_aligned(buf + 64); \ + M9 = sph_dec64be_aligned(buf + 72); \ + MA = sph_dec64be_aligned(buf + 80); \ + MB = sph_dec64be_aligned(buf + 88); \ + MC = sph_dec64be_aligned(buf + 96); \ + MD = sph_dec64be_aligned(buf + 104); \ + ME = sph_dec64be_aligned(buf + 112); \ + MF = sph_dec64be_aligned(buf + 120); \ + ROUND_B(0); \ + ROUND_B(1); \ + ROUND_B(2); \ + ROUND_B(3); \ + ROUND_B(4); \ + ROUND_B(5); \ + ROUND_B(6); \ + ROUND_B(7); \ + ROUND_B(8); \ + ROUND_B(9); \ + ROUND_B(0); \ + ROUND_B(1); \ + ROUND_B(2); \ + ROUND_B(3); \ + ROUND_B(4); \ + ROUND_B(5); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#endif + +#endif + +static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 }; + +static void +blake32_init(sph_blake_small_context *sc, + const sph_u32 *iv, const sph_u32 *salt) +{ + memcpy(sc->H, iv, 8 * sizeof(sph_u32)); + memcpy(sc->S, salt, 4 * sizeof(sph_u32)); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; +} + +static void +blake32(sph_blake_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE32 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE32(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T32(T0 + 512)) < 512) + T1 = SPH_T32(T1 + 1); + COMPRESS32; + ptr = 0; + } + } + WRITE_STATE32(sc); + sc->ptr = ptr; +} + +static void +blake32_close(sph_blake_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + union { + unsigned char buf[64]; + sph_u32 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u32 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C32(0xFFFFFE00) + bit_len; + sc->T1 = SPH_T32(sc->T1 - 1); + } else { + sc->T0 -= 512 - bit_len; + } + if (bit_len <= 446) { + memset(u.buf + ptr + 1, 0, 55 - ptr); + if (out_size_w32 == 8) + u.buf[55] |= 1; + sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32(sc, u.buf + ptr, 64 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 63 - ptr); + blake32(sc, u.buf + ptr, 64 - ptr); + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + memset(u.buf, 0, 56); + if (out_size_w32 == 8) + u.buf[55] = 1; + 
sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32(sc, u.buf, 64); + } + out = dst; + for (k = 0; k < out_size_w32; k ++) + sph_enc32be(out + (k << 2), sc->H[k]); +} + +#if SPH_64 + +static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; + +static void +blake64_init(sph_blake_big_context *sc, + const sph_u64 *iv, const sph_u64 *salt) +{ + memcpy(sc->H, iv, 8 * sizeof(sph_u64)); + memcpy(sc->S, salt, 4 * sizeof(sph_u64)); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; +} + +static void +blake64(sph_blake_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE64 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE64(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T64(T0 + 1024)) < 1024) + T1 = SPH_T64(T1 + 1); + COMPRESS64; + ptr = 0; + } + } + WRITE_STATE64(sc); + sc->ptr = ptr; +} + +static void +blake64_close(sph_blake_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w64) +{ + union { + unsigned char buf[128]; + sph_u64 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u64 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); + sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len; + sc->T1 = SPH_T64(sc->T1 - 1); + } else { + sc->T0 -= 1024 - bit_len; + } + if (bit_len <= 894) { + memset(u.buf + ptr + 1, 0, 111 - ptr); + if (out_size_w64 == 8) + u.buf[111] |= 1; + sph_enc64be_aligned(u.buf + 112, th); + sph_enc64be_aligned(u.buf + 120, tl); + blake64(sc, u.buf + ptr, 128 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 127 - ptr); + blake64(sc, u.buf + ptr, 128 - ptr); + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); + sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + memset(u.buf, 0, 112); + if (out_size_w64 == 8) + u.buf[111] = 1; + sph_enc64be_aligned(u.buf + 112, th); + sph_enc64be_aligned(u.buf + 120, tl); + blake64(sc, u.buf, 128); + } + out = dst; + for (k = 0; k < out_size_w64; k ++) + sph_enc64be(out + (k << 3), sc->H[k]); +} + +#endif + +/* see sph_blake.h */ +void +sph_blake224_init(void *cc) +{ + blake32_init(cc, IV224, salt_zero_small); +} + +/* see sph_blake.h */ +void +sph_blake224(void *cc, const void *data, size_t len) +{ + blake32(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake224_close(void *cc, void *dst) +{ + sph_blake224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake32_close(cc, ub, n, dst, 7); + sph_blake224_init(cc); +} + +/* see sph_blake.h */ +void +sph_blake256_init(void *cc) +{ + blake32_init(cc, IV256, salt_zero_small); +} + +/* see sph_blake.h */ +void +sph_blake256(void *cc, const void *data, size_t len) +{ + blake32(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake256_close(void *cc, void *dst) +{ + sph_blake256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + 
blake32_close(cc, ub, n, dst, 8); + sph_blake256_init(cc); +} + +#if SPH_64 + +/* see sph_blake.h */ +void +sph_blake384_init(void *cc) +{ + blake64_init(cc, IV384, salt_zero_big); +} + +/* see sph_blake.h */ +void +sph_blake384(void *cc, const void *data, size_t len) +{ + blake64(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake384_close(void *cc, void *dst) +{ + sph_blake384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake64_close(cc, ub, n, dst, 6); + sph_blake384_init(cc); +} + +/* see sph_blake.h */ +void +sph_blake512_init(void *cc) +{ + blake64_init(cc, IV512, salt_zero_big); +} + +/* see sph_blake.h */ +void +sph_blake512(void *cc, const void *data, size_t len) +{ + blake64(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake512_close(void *cc, void *dst) +{ + sph_blake512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake64_close(cc, ub, n, dst, 8); + sph_blake512_init(cc); +} + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/sha3/bmw.c b/sha3/bmw.c new file mode 100644 index 0000000..b89a881 --- /dev/null +++ b/sha3/bmw.c @@ -0,0 +1,965 @@ +/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * BMW implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "sph_bmw.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
+#define SPH_SMALL_FOOTPRINT_BMW 1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 IV224[] = {
+    SPH_C32(0x00010203), SPH_C32(0x04050607),
+    SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
+    SPH_C32(0x10111213), SPH_C32(0x14151617),
+    SPH_C32(0x18191A1B), SPH_C32(0x1C1D1E1F),
+    SPH_C32(0x20212223), SPH_C32(0x24252627),
+    SPH_C32(0x28292A2B), SPH_C32(0x2C2D2E2F),
+    SPH_C32(0x30313233), SPH_C32(0x34353637),
+    SPH_C32(0x38393A3B), SPH_C32(0x3C3D3E3F)
+};
+
+static const sph_u32 IV256[] = {
+    SPH_C32(0x40414243), SPH_C32(0x44454647),
+    SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
+    SPH_C32(0x50515253), SPH_C32(0x54555657),
+    SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
+    SPH_C32(0x60616263), SPH_C32(0x64656667),
+    SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
+    SPH_C32(0x70717273), SPH_C32(0x74757677),
+    SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
+};
+
+#if SPH_64
+
+static const sph_u64 IV384[] = {
+    SPH_C64(0x0001020304050607), SPH_C64(0x08090A0B0C0D0E0F),
+    SPH_C64(0x1011121314151617), SPH_C64(0x18191A1B1C1D1E1F),
+    SPH_C64(0x2021222324252627), SPH_C64(0x28292A2B2C2D2E2F),
+    SPH_C64(0x3031323334353637), SPH_C64(0x38393A3B3C3D3E3F),
+    SPH_C64(0x4041424344454647), SPH_C64(0x48494A4B4C4D4E4F),
+    SPH_C64(0x5051525354555657), SPH_C64(0x58595A5B5C5D5E5F),
+    SPH_C64(0x6061626364656667), SPH_C64(0x68696A6B6C6D6E6F),
+    SPH_C64(0x7071727374757677), SPH_C64(0x78797A7B7C7D7E7F)
+};
+
+static const sph_u64 IV512[] = {
+    SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
+    SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
+    SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
+    SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
+    SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
+    SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
+    SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
+    SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+};
+
+#endif
+
+#define XCAT(x, y)   XCAT_(x, y)
+#define XCAT_(x, y)  x ## y
+
+#define LPAR (
+
+#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
+#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+
+#define M16_16 0, 1, 3, 4, 7, 10, 11
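
The M16_j lists, which continue below through M16_31, unroll the operand selection of BMW's add_elt step: the message-word indices (j+0)&15, (j+3)&15 and (j+10)&15, each followed immediately by its rotation count (the index plus one), with the chaining-value index (j+7)&15 in fifth place. The small-footprint branch further down computes the same numbers at runtime via rol_off. A standalone check of that closed form (a sketch, not part of the file):

```c
#include <stdio.h>

int main(void)
{
    unsigned j;

    /* Recompute M16_16 .. M16_31 from the closed form.  Each printed line
     * should reproduce the corresponding #define verbatim, e.g.
     * M16_22: 6, 7, 9, 10, 13, 0, 1. */
    for (j = 16; j < 32; j++) {
        unsigned i0  = (j + 0) & 15;    /* message index, rotated by i0 + 1  */
        unsigned i3  = (j + 3) & 15;    /* message index, rotated by i3 + 1  */
        unsigned i7  = (j + 7) & 15;    /* chaining-value (H) index          */
        unsigned i10 = (j + 10) & 15;   /* message index, rotated by i10 + 1 */

        printf("M16_%u: %u, %u, %u, %u, %u, %u, %u\n",
            j, i0, i0 + 1, i3, i3 + 1, i7, i10, i10 + 1);
    }
    return 0;
}
```
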
+#define M16_17 1, 2, 4, 5, 8, 11, 12 +#define M16_18 2, 3, 5, 6, 9, 12, 13 +#define M16_19 3, 4, 6, 7, 10, 13, 14 +#define M16_20 4, 5, 7, 8, 11, 14, 15 +#define M16_21 5, 6, 8, 9, 12, 15, 16 +#define M16_22 6, 7, 9, 10, 13, 0, 1 +#define M16_23 7, 8, 10, 11, 14, 1, 2 +#define M16_24 8, 9, 11, 12, 15, 2, 3 +#define M16_25 9, 10, 12, 13, 0, 3, 4 +#define M16_26 10, 11, 13, 14, 1, 4, 5 +#define M16_27 11, 12, 14, 15, 2, 5, 6 +#define M16_28 12, 13, 15, 16, 3, 6, 7 +#define M16_29 13, 14, 0, 1, 4, 7, 8 +#define M16_30 14, 15, 1, 2, 5, 8, 9 +#define M16_31 15, 16, 2, 3, 6, 9, 10 + +#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \ + ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19)) +#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \ + ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23)) +#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \ + ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25)) +#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \ + ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29)) +#define ss4(x) (((x) >> 1) ^ (x)) +#define ss5(x) (((x) >> 2) ^ (x)) +#define rs1(x) SPH_ROTL32(x, 3) +#define rs2(x) SPH_ROTL32(x, 7) +#define rs3(x) SPH_ROTL32(x, 13) +#define rs4(x) SPH_ROTL32(x, 16) +#define rs5(x) SPH_ROTL32(x, 19) +#define rs6(x) SPH_ROTL32(x, 23) +#define rs7(x) SPH_ROTL32(x, 27) + +#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555)) + +#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ + (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \ + - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m)) + +#define expand1s_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \ + + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \ + + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \ + + ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \ + + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand1s(qf, mf, hf, i16) \ + expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand1s_(qf, mf, hf, i16, ix, iy) \ + expand1s_inner LPAR qf, mf, hf, i16, ix, iy) + +#define expand2s_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ + + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \ + + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ + + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ + + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand2s(qf, mf, hf, i16) \ + expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand2s_(qf, mf, hf, i16, ix, iy) \ + expand2s_inner LPAR qf, mf, hf, i16, ix, iy) + +#if SPH_64 + +#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \ + ^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37)) +#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \ + ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43)) +#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \ + ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53)) +#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \ + ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59)) +#define sb4(x) (((x) >> 1) ^ (x)) +#define sb5(x) (((x) >> 2) ^ (x)) +#define rb1(x) SPH_ROTL64(x, 5) +#define rb2(x) SPH_ROTL64(x, 11) +#define rb3(x) SPH_ROTL64(x, 27) +#define rb4(x) SPH_ROTL64(x, 32) +#define rb5(x) SPH_ROTL64(x, 37) +#define rb6(x) SPH_ROTL64(x, 43) +#define rb7(x) SPH_ROTL64(x, 53) + +#define Kb(j) SPH_T64((sph_u64)(j) * 
SPH_C64(0x0555555555555555)) + +#if SPH_SMALL_FOOTPRINT_BMW + +static const sph_u64 Kb_tab[] = { + Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23), + Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31) +}; + +#define rol_off(mf, j, off) \ + SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1) + +#define add_elt_b(mf, hf, j) \ + (SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \ + - rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15)) + +#define expand1b(qf, mf, hf, i) \ + SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \ + + sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \ + + sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \ + + sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \ + + sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \ + + sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \ + + sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \ + + sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \ + + add_elt_b(mf, hf, (i) - 16)) + +#define expand2b(qf, mf, hf, i) \ + SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \ + + qf((i) - 14) + rb2(qf((i) - 13)) \ + + qf((i) - 12) + rb3(qf((i) - 11)) \ + + qf((i) - 10) + rb4(qf((i) - 9)) \ + + qf((i) - 8) + rb5(qf((i) - 7)) \ + + qf((i) - 6) + rb6(qf((i) - 5)) \ + + qf((i) - 4) + rb7(qf((i) - 3)) \ + + sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \ + + add_elt_b(mf, hf, (i) - 16)) + +#else + +#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ + (SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \ + - SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m)) + +#define expand1b_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \ + + sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \ + + sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \ + + sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \ + + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand1b(qf, mf, hf, i16) \ + expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand1b_(qf, mf, hf, i16, ix, iy) \ + expand1b_inner LPAR qf, mf, hf, i16, ix, iy) + +#define expand2b_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \ + + qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \ + + qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \ + + qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \ + + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand2b(qf, mf, hf, i16) \ + expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand2b_(qf, mf, hf, i16, ix, iy) \ + expand2b_inner LPAR qf, mf, hf, i16, ix, iy) + +#endif + +#endif + +#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \ + tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \ + op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4))) + +#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) +#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) +#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) +#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) +#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) +#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) +#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) +#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) +#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) +#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 
14) +#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15) +#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) +#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) +#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) +#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) +#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) + +#if SPH_SMALL_FOOTPRINT_BMW + +#define MAKE_Qas do { \ + unsigned u; \ + sph_u32 Ws[16]; \ + Ws[ 0] = Ws0; \ + Ws[ 1] = Ws1; \ + Ws[ 2] = Ws2; \ + Ws[ 3] = Ws3; \ + Ws[ 4] = Ws4; \ + Ws[ 5] = Ws5; \ + Ws[ 6] = Ws6; \ + Ws[ 7] = Ws7; \ + Ws[ 8] = Ws8; \ + Ws[ 9] = Ws9; \ + Ws[10] = Ws10; \ + Ws[11] = Ws11; \ + Ws[12] = Ws12; \ + Ws[13] = Ws13; \ + Ws[14] = Ws14; \ + Ws[15] = Ws15; \ + for (u = 0; u < 15; u += 5) { \ + qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \ + qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \ + qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \ + qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \ + qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \ + } \ + qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \ + } while (0) + +#define MAKE_Qbs do { \ + qt[16] = expand1s(Qs, M, H, 16); \ + qt[17] = expand1s(Qs, M, H, 17); \ + qt[18] = expand2s(Qs, M, H, 18); \ + qt[19] = expand2s(Qs, M, H, 19); \ + qt[20] = expand2s(Qs, M, H, 20); \ + qt[21] = expand2s(Qs, M, H, 21); \ + qt[22] = expand2s(Qs, M, H, 22); \ + qt[23] = expand2s(Qs, M, H, 23); \ + qt[24] = expand2s(Qs, M, H, 24); \ + qt[25] = expand2s(Qs, M, H, 25); \ + qt[26] = expand2s(Qs, M, H, 26); \ + qt[27] = expand2s(Qs, M, H, 27); \ + qt[28] = expand2s(Qs, M, H, 28); \ + qt[29] = expand2s(Qs, M, H, 29); \ + qt[30] = expand2s(Qs, M, H, 30); \ + qt[31] = expand2s(Qs, M, H, 31); \ + } while (0) + +#else + +#define MAKE_Qas do { \ + qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \ + qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \ + qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \ + qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \ + qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \ + qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \ + qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \ + qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \ + qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \ + qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \ + qt[10] = SPH_T32(ss0(Ws10) + H(11)); \ + qt[11] = SPH_T32(ss1(Ws11) + H(12)); \ + qt[12] = SPH_T32(ss2(Ws12) + H(13)); \ + qt[13] = SPH_T32(ss3(Ws13) + H(14)); \ + qt[14] = SPH_T32(ss4(Ws14) + H(15)); \ + qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \ + } while (0) + +#define MAKE_Qbs do { \ + qt[16] = expand1s(Qs, M, H, 16); \ + qt[17] = expand1s(Qs, M, H, 17); \ + qt[18] = expand2s(Qs, M, H, 18); \ + qt[19] = expand2s(Qs, M, H, 19); \ + qt[20] = expand2s(Qs, M, H, 20); \ + qt[21] = expand2s(Qs, M, H, 21); \ + qt[22] = expand2s(Qs, M, H, 22); \ + qt[23] = expand2s(Qs, M, H, 23); \ + qt[24] = expand2s(Qs, M, H, 24); \ + qt[25] = expand2s(Qs, M, H, 25); \ + qt[26] = expand2s(Qs, M, H, 26); \ + qt[27] = expand2s(Qs, M, H, 27); \ + qt[28] = expand2s(Qs, M, H, 28); \ + qt[29] = expand2s(Qs, M, H, 29); \ + qt[30] = expand2s(Qs, M, H, 30); \ + qt[31] = expand2s(Qs, M, H, 31); \ + } while (0) + +#endif + +#define MAKE_Qs do { \ + MAKE_Qas; \ + MAKE_Qbs; \ + } while (0) + +#define Qs(j) (qt[j]) + +#if SPH_64 + +#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14) +#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15) +#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15) +#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13) +#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14) +#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15) +#define Wb6 
MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, +, 13) +#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14) +#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15) +#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14) +#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15) +#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9) +#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10) +#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11) +#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12) +#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13) + +#if SPH_SMALL_FOOTPRINT_BMW + +#define MAKE_Qab do { \ + unsigned u; \ + sph_u64 Wb[16]; \ + Wb[ 0] = Wb0; \ + Wb[ 1] = Wb1; \ + Wb[ 2] = Wb2; \ + Wb[ 3] = Wb3; \ + Wb[ 4] = Wb4; \ + Wb[ 5] = Wb5; \ + Wb[ 6] = Wb6; \ + Wb[ 7] = Wb7; \ + Wb[ 8] = Wb8; \ + Wb[ 9] = Wb9; \ + Wb[10] = Wb10; \ + Wb[11] = Wb11; \ + Wb[12] = Wb12; \ + Wb[13] = Wb13; \ + Wb[14] = Wb14; \ + Wb[15] = Wb15; \ + for (u = 0; u < 15; u += 5) { \ + qt[u + 0] = SPH_T64(sb0(Wb[u + 0]) + H(u + 1)); \ + qt[u + 1] = SPH_T64(sb1(Wb[u + 1]) + H(u + 2)); \ + qt[u + 2] = SPH_T64(sb2(Wb[u + 2]) + H(u + 3)); \ + qt[u + 3] = SPH_T64(sb3(Wb[u + 3]) + H(u + 4)); \ + qt[u + 4] = SPH_T64(sb4(Wb[u + 4]) + H(u + 5)); \ + } \ + qt[15] = SPH_T64(sb0(Wb[15]) + H(0)); \ + } while (0) + +#define MAKE_Qbb do { \ + unsigned u; \ + for (u = 16; u < 18; u ++) \ + qt[u] = expand1b(Qb, M, H, u); \ + for (u = 18; u < 32; u ++) \ + qt[u] = expand2b(Qb, M, H, u); \ + } while (0) + +#else + +#define MAKE_Qab do { \ + qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \ + qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \ + qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \ + qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \ + qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \ + qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \ + qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \ + qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \ + qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \ + qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \ + qt[10] = SPH_T64(sb0(Wb10) + H(11)); \ + qt[11] = SPH_T64(sb1(Wb11) + H(12)); \ + qt[12] = SPH_T64(sb2(Wb12) + H(13)); \ + qt[13] = SPH_T64(sb3(Wb13) + H(14)); \ + qt[14] = SPH_T64(sb4(Wb14) + H(15)); \ + qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \ + } while (0) + +#define MAKE_Qbb do { \ + qt[16] = expand1b(Qb, M, H, 16); \ + qt[17] = expand1b(Qb, M, H, 17); \ + qt[18] = expand2b(Qb, M, H, 18); \ + qt[19] = expand2b(Qb, M, H, 19); \ + qt[20] = expand2b(Qb, M, H, 20); \ + qt[21] = expand2b(Qb, M, H, 21); \ + qt[22] = expand2b(Qb, M, H, 22); \ + qt[23] = expand2b(Qb, M, H, 23); \ + qt[24] = expand2b(Qb, M, H, 24); \ + qt[25] = expand2b(Qb, M, H, 25); \ + qt[26] = expand2b(Qb, M, H, 26); \ + qt[27] = expand2b(Qb, M, H, 27); \ + qt[28] = expand2b(Qb, M, H, 28); \ + qt[29] = expand2b(Qb, M, H, 29); \ + qt[30] = expand2b(Qb, M, H, 30); \ + qt[31] = expand2b(Qb, M, H, 31); \ + } while (0) + +#endif + +#define MAKE_Qb do { \ + MAKE_Qab; \ + MAKE_Qbb; \ + } while (0) + +#define Qb(j) (qt[j]) + +#endif + +#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \ + type qt[32], xl, xh; \ + mkQ; \ + xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \ + ^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \ + xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \ + ^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \ + dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \ + + (xl ^ qf(24) ^ qf( 0))); \ + dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \ + + (xl ^ qf(25) ^ qf( 1))); \ + dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \ + + (xl ^ qf(26) ^ qf( 2))); \ + dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \ + + (xl ^ qf(27) ^ qf( 3))); 
\ + dhf( 4) = tt(((xh >> 3) ^ (qf(20) << 0) ^ mf( 4)) \ + + (xl ^ qf(28) ^ qf( 4))); \ + dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \ + + (xl ^ qf(29) ^ qf( 5))); \ + dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \ + + (xl ^ qf(30) ^ qf( 6))); \ + dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \ + + (xl ^ qf(31) ^ qf( 7))); \ + dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \ + + ((xl << 8) ^ qf(23) ^ qf( 8))); \ + dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \ + + ((xl >> 6) ^ qf(16) ^ qf( 9))); \ + dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \ + + ((xl << 6) ^ qf(17) ^ qf(10))); \ + dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \ + + ((xl << 4) ^ qf(18) ^ qf(11))); \ + dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \ + + ((xl >> 3) ^ qf(19) ^ qf(12))); \ + dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \ + + ((xl >> 4) ^ qf(20) ^ qf(13))); \ + dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \ + + ((xl >> 7) ^ qf(21) ^ qf(14))); \ + dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \ + + ((xl >> 2) ^ qf(22) ^ qf(15))); \ + } while (0) + +#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) + +#if SPH_64 + +#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH) + +#endif + +static void +compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16]) +{ +#if SPH_LITTLE_FAST +#define M(x) sph_dec32le_aligned(data + 4 * (x)) +#else + sph_u32 mv[16]; + + mv[ 0] = sph_dec32le_aligned(data + 0); + mv[ 1] = sph_dec32le_aligned(data + 4); + mv[ 2] = sph_dec32le_aligned(data + 8); + mv[ 3] = sph_dec32le_aligned(data + 12); + mv[ 4] = sph_dec32le_aligned(data + 16); + mv[ 5] = sph_dec32le_aligned(data + 20); + mv[ 6] = sph_dec32le_aligned(data + 24); + mv[ 7] = sph_dec32le_aligned(data + 28); + mv[ 8] = sph_dec32le_aligned(data + 32); + mv[ 9] = sph_dec32le_aligned(data + 36); + mv[10] = sph_dec32le_aligned(data + 40); + mv[11] = sph_dec32le_aligned(data + 44); + mv[12] = sph_dec32le_aligned(data + 48); + mv[13] = sph_dec32le_aligned(data + 52); + mv[14] = sph_dec32le_aligned(data + 56); + mv[15] = sph_dec32le_aligned(data + 60); +#define M(x) (mv[x]) +#endif +#define H(x) (h[x]) +#define dH(x) (dh[x]) + + FOLDs; + +#undef M +#undef H +#undef dH +} + +static const sph_u32 final_s[16] = { + SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2), + SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5), + SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8), + SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab), + SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae), + SPH_C32(0xaaaaaaaf) +}; + +static void +bmw32_init(sph_bmw_small_context *sc, const sph_u32 *iv) +{ + memcpy(sc->H, iv, sizeof sc->H); + sc->ptr = 0; +#if SPH_64 + sc->bit_count = 0; +#else + sc->bit_count_high = 0; + sc->bit_count_low = 0; +#endif +} + +static void +bmw32(sph_bmw_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + sph_u32 htmp[16]; + sph_u32 *h1, *h2; +#if !SPH_64 + sph_u32 tmp; +#endif + +#if SPH_64 + sc->bit_count += (sph_u64)len << 3; +#else + tmp = sc->bit_count_low; + sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3)); + if (sc->bit_count_low < tmp) + sc->bit_count_high ++; + sc->bit_count_high += len >> 29; +#endif + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = 
(const unsigned char *)data + clen; + len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u32 *ht; + + compress_small(buf, h1, h2); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +static void +bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr, u, v; + unsigned z; + sph_u32 h1[16], h2[16], *h; + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + h = sc->H; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + compress_small(buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); +#if SPH_64 + sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, + SPH_T64(sc->bit_count + n)); +#else + sph_enc32le_aligned(buf + (sizeof sc->buf) - 8, + sc->bit_count_low + n); + sph_enc32le_aligned(buf + (sizeof sc->buf) - 4, + SPH_T32(sc->bit_count_high)); +#endif + compress_small(buf, h, h2); + for (u = 0; u < 16; u ++) + sph_enc32le_aligned(buf + 4 * u, h2[u]); + compress_small(buf, final_s, h1); + out = dst; + for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) + sph_enc32le(out + 4 * u, h1[v]); +} + +#if SPH_64 + +static void +compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) +{ +#if SPH_LITTLE_FAST +#define M(x) sph_dec64le_aligned(data + 8 * (x)) +#else + sph_u64 mv[16]; + + mv[ 0] = sph_dec64le_aligned(data + 0); + mv[ 1] = sph_dec64le_aligned(data + 8); + mv[ 2] = sph_dec64le_aligned(data + 16); + mv[ 3] = sph_dec64le_aligned(data + 24); + mv[ 4] = sph_dec64le_aligned(data + 32); + mv[ 5] = sph_dec64le_aligned(data + 40); + mv[ 6] = sph_dec64le_aligned(data + 48); + mv[ 7] = sph_dec64le_aligned(data + 56); + mv[ 8] = sph_dec64le_aligned(data + 64); + mv[ 9] = sph_dec64le_aligned(data + 72); + mv[10] = sph_dec64le_aligned(data + 80); + mv[11] = sph_dec64le_aligned(data + 88); + mv[12] = sph_dec64le_aligned(data + 96); + mv[13] = sph_dec64le_aligned(data + 104); + mv[14] = sph_dec64le_aligned(data + 112); + mv[15] = sph_dec64le_aligned(data + 120); +#define M(x) (mv[x]) +#endif +#define H(x) (h[x]) +#define dH(x) (dh[x]) + + FOLDb; + +#undef M +#undef H +#undef dH +} + +static const sph_u64 final_b[16] = { + SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1), + SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3), + SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5), + SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7), + SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9), + SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab), + SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad), + SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf) +}; + +static void +bmw64_init(sph_bmw_big_context *sc, const sph_u64 *iv) +{ + memcpy(sc->H, iv, sizeof sc->H); + sc->ptr = 0; + sc->bit_count = 0; +} + +static void +bmw64(sph_bmw_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + sph_u64 htmp[16]; + sph_u64 *h1, *h2; + + sc->bit_count += (sph_u64)len << 3; + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u64 *ht; + + compress_big(buf, h1, h2); + ht = h1; + h1 = h2; + 
h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +static void +bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w64) +{ + unsigned char *buf, *out; + size_t ptr, u, v; + unsigned z; + sph_u64 h1[16], h2[16], *h; + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + h = sc->H; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + compress_big(buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); + sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, + SPH_T64(sc->bit_count + n)); + compress_big(buf, h, h2); + for (u = 0; u < 16; u ++) + sph_enc64le_aligned(buf + 8 * u, h2[u]); + compress_big(buf, final_b, h1); + out = dst; + for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) + sph_enc64le(out + 8 * u, h1[v]); +} + +#endif + +/* see sph_bmw.h */ +void +sph_bmw224_init(void *cc) +{ + bmw32_init(cc, IV224); +} + +/* see sph_bmw.h */ +void +sph_bmw224(void *cc, const void *data, size_t len) +{ + bmw32(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw224_close(void *cc, void *dst) +{ + sph_bmw224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw32_close(cc, ub, n, dst, 7); + sph_bmw224_init(cc); +} + +/* see sph_bmw.h */ +void +sph_bmw256_init(void *cc) +{ + bmw32_init(cc, IV256); +} + +/* see sph_bmw.h */ +void +sph_bmw256(void *cc, const void *data, size_t len) +{ + bmw32(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw256_close(void *cc, void *dst) +{ + sph_bmw256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw32_close(cc, ub, n, dst, 8); + sph_bmw256_init(cc); +} + +#if SPH_64 + +/* see sph_bmw.h */ +void +sph_bmw384_init(void *cc) +{ + bmw64_init(cc, IV384); +} + +/* see sph_bmw.h */ +void +sph_bmw384(void *cc, const void *data, size_t len) +{ + bmw64(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw384_close(void *cc, void *dst) +{ + sph_bmw384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_close(cc, ub, n, dst, 6); + sph_bmw384_init(cc); +} + +/* see sph_bmw.h */ +void +sph_bmw512_init(void *cc) +{ + bmw64_init(cc, IV512); +} + +/* see sph_bmw.h */ +void +sph_bmw512(void *cc, const void *data, size_t len) +{ + bmw64(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw512_close(void *cc, void *dst) +{ + sph_bmw512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_close(cc, ub, n, dst, 8); + sph_bmw512_init(cc); +} + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/sha3/cubehash.c b/sha3/cubehash.c new file mode 100644 index 0000000..9322fe1 --- /dev/null +++ b/sha3/cubehash.c @@ -0,0 +1,723 @@ +/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * CubeHash implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include +#include + +#include "sph_cubehash.h" +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH +#define SPH_SMALL_FOOTPRINT_CUBEHASH 1 +#endif + +/* + * Some tests were conducted on an Intel Core2 Q6600 (32-bit and 64-bit + * mode), a PowerPC G3, and a MIPS-compatible CPU (Broadcom BCM3302). + * It appears that the optimal settings are: + * -- full unroll, no state copy on the "big" systems (x86, PowerPC) + * -- unroll to 4 or 8, state copy on the "small" system (MIPS) + */ + +#if SPH_SMALL_FOOTPRINT_CUBEHASH + +#if !defined SPH_CUBEHASH_UNROLL +#define SPH_CUBEHASH_UNROLL 4 +#endif +#if !defined SPH_CUBEHASH_NOCOPY +#define SPH_CUBEHASH_NOCOPY 1 +#endif + +#else + +#if !defined SPH_CUBEHASH_UNROLL +#define SPH_CUBEHASH_UNROLL 0 +#endif +#if !defined SPH_CUBEHASH_NOCOPY +#define SPH_CUBEHASH_NOCOPY 0 +#endif + +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[] = { + SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22), + SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24), + SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3), + SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774), + SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66), + SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0), + SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314), + SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE), + SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF), + SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE), + SPH_C32(0xFD20151C), SPH_C32(0x00CB573E) +}; + +static const sph_u32 IV256[] = { + SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71), + SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63), + SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696), + SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C), + SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00), + SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5), + SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549), + SPH_C32(0x5FA25603), 
SPH_C32(0x65C892FD), SPH_C32(0x93CB6285), + SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD), + SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6), + SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A) +}; + +static const sph_u32 IV384[] = { + SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453), + SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88), + SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922), + SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052), + SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E), + SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D), + SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70), + SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4), + SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC), + SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01), + SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E) +}; + +static const sph_u32 IV512[] = { + SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B), + SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C), + SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787), + SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537), + SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33), + SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485), + SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159), + SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9), + SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456), + SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1), + SPH_C32(0x7795D246), SPH_C32(0xD43E3B44) +}; + +#define T32 SPH_T32 +#define ROTL32 SPH_ROTL32 + +#if SPH_CUBEHASH_NOCOPY + +#define DECL_STATE +#define READ_STATE(cc) +#define WRITE_STATE(cc) + +#define x0 ((sc)->state[ 0]) +#define x1 ((sc)->state[ 1]) +#define x2 ((sc)->state[ 2]) +#define x3 ((sc)->state[ 3]) +#define x4 ((sc)->state[ 4]) +#define x5 ((sc)->state[ 5]) +#define x6 ((sc)->state[ 6]) +#define x7 ((sc)->state[ 7]) +#define x8 ((sc)->state[ 8]) +#define x9 ((sc)->state[ 9]) +#define xa ((sc)->state[10]) +#define xb ((sc)->state[11]) +#define xc ((sc)->state[12]) +#define xd ((sc)->state[13]) +#define xe ((sc)->state[14]) +#define xf ((sc)->state[15]) +#define xg ((sc)->state[16]) +#define xh ((sc)->state[17]) +#define xi ((sc)->state[18]) +#define xj ((sc)->state[19]) +#define xk ((sc)->state[20]) +#define xl ((sc)->state[21]) +#define xm ((sc)->state[22]) +#define xn ((sc)->state[23]) +#define xo ((sc)->state[24]) +#define xp ((sc)->state[25]) +#define xq ((sc)->state[26]) +#define xr ((sc)->state[27]) +#define xs ((sc)->state[28]) +#define xt ((sc)->state[29]) +#define xu ((sc)->state[30]) +#define xv ((sc)->state[31]) + +#else + +#define DECL_STATE \ + sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \ + sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \ + sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \ + sph_u32 xo, xp, xq, xr, xs, xt, xu, xv; + +#define READ_STATE(cc) do { \ + x0 = (cc)->state[ 0]; \ + x1 = (cc)->state[ 1]; \ + x2 = (cc)->state[ 2]; \ + x3 = (cc)->state[ 3]; \ + x4 = (cc)->state[ 4]; \ + x5 = (cc)->state[ 5]; \ + x6 = (cc)->state[ 6]; \ + x7 = (cc)->state[ 7]; \ + x8 = (cc)->state[ 8]; \ + x9 = (cc)->state[ 9]; \ + xa = (cc)->state[10]; \ + xb = (cc)->state[11]; \ + xc = (cc)->state[12]; \ + xd = (cc)->state[13]; \ + xe = (cc)->state[14]; \ + xf = (cc)->state[15]; \ + xg = (cc)->state[16]; \ + xh = (cc)->state[17]; \ + xi = (cc)->state[18]; \ + xj = (cc)->state[19]; \ 
+ xk = (cc)->state[20]; \ + xl = (cc)->state[21]; \ + xm = (cc)->state[22]; \ + xn = (cc)->state[23]; \ + xo = (cc)->state[24]; \ + xp = (cc)->state[25]; \ + xq = (cc)->state[26]; \ + xr = (cc)->state[27]; \ + xs = (cc)->state[28]; \ + xt = (cc)->state[29]; \ + xu = (cc)->state[30]; \ + xv = (cc)->state[31]; \ + } while (0) + +#define WRITE_STATE(cc) do { \ + (cc)->state[ 0] = x0; \ + (cc)->state[ 1] = x1; \ + (cc)->state[ 2] = x2; \ + (cc)->state[ 3] = x3; \ + (cc)->state[ 4] = x4; \ + (cc)->state[ 5] = x5; \ + (cc)->state[ 6] = x6; \ + (cc)->state[ 7] = x7; \ + (cc)->state[ 8] = x8; \ + (cc)->state[ 9] = x9; \ + (cc)->state[10] = xa; \ + (cc)->state[11] = xb; \ + (cc)->state[12] = xc; \ + (cc)->state[13] = xd; \ + (cc)->state[14] = xe; \ + (cc)->state[15] = xf; \ + (cc)->state[16] = xg; \ + (cc)->state[17] = xh; \ + (cc)->state[18] = xi; \ + (cc)->state[19] = xj; \ + (cc)->state[20] = xk; \ + (cc)->state[21] = xl; \ + (cc)->state[22] = xm; \ + (cc)->state[23] = xn; \ + (cc)->state[24] = xo; \ + (cc)->state[25] = xp; \ + (cc)->state[26] = xq; \ + (cc)->state[27] = xr; \ + (cc)->state[28] = xs; \ + (cc)->state[29] = xt; \ + (cc)->state[30] = xu; \ + (cc)->state[31] = xv; \ + } while (0) + +#endif + +#define INPUT_BLOCK do { \ + x0 ^= sph_dec32le_aligned(buf + 0); \ + x1 ^= sph_dec32le_aligned(buf + 4); \ + x2 ^= sph_dec32le_aligned(buf + 8); \ + x3 ^= sph_dec32le_aligned(buf + 12); \ + x4 ^= sph_dec32le_aligned(buf + 16); \ + x5 ^= sph_dec32le_aligned(buf + 20); \ + x6 ^= sph_dec32le_aligned(buf + 24); \ + x7 ^= sph_dec32le_aligned(buf + 28); \ + } while (0) + +#define ROUND_EVEN do { \ + xg = T32(x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = T32(x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = T32(x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = T32(x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = T32(x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = T32(x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = T32(x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = T32(x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = T32(x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = T32(x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = T32(xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = T32(xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = T32(xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = T32(xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = T32(xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = T32(xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = T32(x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = T32(x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = T32(xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = T32(xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = T32(xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = T32(xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = T32(xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = T32(xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = T32(x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = T32(x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = T32(x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = T32(x3 + xp); \ + x3 = ROTL32(x3, 11); \ + xu = T32(x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = T32(x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = T32(x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = T32(x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ 
+ x2 ^= xs; \ + x3 ^= xt; \ + } while (0) + +#define ROUND_ODD do { \ + xj = T32(xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = T32(xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = T32(xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = T32(xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = T32(x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = T32(x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = T32(xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = T32(xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = T32(x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = T32(x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = T32(x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = T32(x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = T32(x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = T32(x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = T32(x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = T32(x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = T32(x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = T32(x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = T32(x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = T32(x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = T32(x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = T32(x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = T32(x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = T32(x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = T32(xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = T32(xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = T32(xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = T32(xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = T32(x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = T32(x9 + xs); \ + x9 = ROTL32(x9, 11); \ + xv = T32(xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = T32(xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; \ + } while (0) + +/* + * There is no need to unroll all 16 rounds. The word-swapping permutation + * is an involution, so we need to unroll an even number of rounds. On + * "big" systems, unrolling 4 rounds yields about 97% of the speed + * achieved with full unrolling; and it keeps the code more compact + * for small architectures. 
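+ *
+ * Concretely, ROUND_EVEN and ROUND_ODD perform the same sequence of
+ * additions, rotations and XORs; they differ only in which state words
+ * play which role, because the word swaps at the end of each round are
+ * folded into the variable names of the next round. After two rounds
+ * the words are back in their original slots, which is why the
+ * SIXTEEN_ROUNDS variants below always expand to whole
+ * ROUND_EVEN / ROUND_ODD pairs.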
+ */ + +#if SPH_CUBEHASH_UNROLL == 2 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#elif SPH_CUBEHASH_UNROLL == 4 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 4; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#elif SPH_CUBEHASH_UNROLL == 8 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 2; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#else + +#define SIXTEEN_ROUNDS do { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } while (0) + +#endif + +static void +cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv) +{ + memcpy(sc->state, iv, sizeof sc->state); + sc->ptr = 0; +} + +static void +cubehash_core(sph_cubehash_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INPUT_BLOCK; + SIXTEEN_ROUNDS; + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE(sc); + INPUT_BLOCK; + for (i = 0; i < 11; i ++) { + SIXTEEN_ROUNDS; + if (i == 0) + xv ^= SPH_C32(1); + } + WRITE_STATE(sc); + out = dst; + for (z = 0; z < out_size_w32; z ++) + sph_enc32le(out + (z << 2), sc->state[z]); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_init(void *cc) +{ + cubehash_init(cc, IV224); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_close(void *cc, void *dst) +{ + sph_cubehash224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 7); + sph_cubehash224_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_init(void *cc) +{ + cubehash_init(cc, IV256); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_close(void *cc, void *dst) +{ + sph_cubehash256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 8); + sph_cubehash256_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384_init(void *cc) +{ + cubehash_init(cc, IV384); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384(void *cc, const void 
*data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384_close(void *cc, void *dst) +{ + sph_cubehash384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 12); + sph_cubehash384_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash512_init(void *cc) +{ + cubehash_init(cc, IV512); +} + +/* see sph_cubehash.h */ +void +sph_cubehash512(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash512_close(void *cc, void *dst) +{ + sph_cubehash512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 16); + sph_cubehash512_init(cc); +} +#ifdef __cplusplus +} +#endif diff --git a/sha3/echo.c b/sha3/echo.c new file mode 100644 index 0000000..667e3f3 --- /dev/null +++ b/sha3/echo.c @@ -0,0 +1,1031 @@ +/* $Id: echo.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * ECHO implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include +#include + +#include "sph_echo.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_ECHO +#define SPH_SMALL_FOOTPRINT_ECHO 1 +#endif + +/* + * Some measures tend to show that the 64-bit implementation offers + * better performance only on a "64-bit architectures", those which have + * actual 64-bit registers. + */ +#if !defined SPH_ECHO_64 && SPH_64_TRUE +#define SPH_ECHO_64 1 +#endif + +/* + * We can use a 64-bit implementation only if a 64-bit type is available. 
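+ *
+ * When SPH_ECHO_64 ends up disabled, the code falls back to the 32-bit
+ * variant further down in this file, which keeps each 128-bit state
+ * word as four sph_u32 values (W[16][4]) instead of two sph_u64 values
+ * (W[16][2]).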
+ */ +#if !SPH_64 +#undef SPH_ECHO_64 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define T32 SPH_T32 +#define C32 SPH_C32 +#if SPH_64 +#define C64 SPH_C64 +#endif + +#define AES_BIG_ENDIAN 0 +#include "aes_helper.c" + +#if SPH_ECHO_64 + +#define DECL_STATE_SMALL \ + sph_u64 W[16][2]; + +#define DECL_STATE_BIG \ + sph_u64 W[16][2]; + +#define INPUT_BLOCK_SMALL(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vb, 8 * sizeof(sph_u64)); \ + for (u = 0; u < 12; u ++) { \ + W[u + 4][0] = sph_dec64le_aligned( \ + sc->buf + 16 * u); \ + W[u + 4][1] = sph_dec64le_aligned( \ + sc->buf + 16 * u + 8); \ + } \ + } while (0) + +#define INPUT_BLOCK_BIG(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vb, 16 * sizeof(sph_u64)); \ + for (u = 0; u < 8; u ++) { \ + W[u + 8][0] = sph_dec64le_aligned( \ + sc->buf + 16 * u); \ + W[u + 8][1] = sph_dec64le_aligned( \ + sc->buf + 16 * u + 8); \ + } \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +aes_2rounds_all(sph_u64 W[16][2], + sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3) +{ + int n; + sph_u32 K0 = *pK0; + sph_u32 K1 = *pK1; + sph_u32 K2 = *pK2; + sph_u32 K3 = *pK3; + + for (n = 0; n < 16; n ++) { + sph_u64 Wl = W[n][0]; + sph_u64 Wh = W[n][1]; + sph_u32 X0 = (sph_u32)Wl; + sph_u32 X1 = (sph_u32)(Wl >> 32); + sph_u32 X2 = (sph_u32)Wh; + sph_u32 X3 = (sph_u32)(Wh >> 32); + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + W[n][0] = (sph_u64)X0 | ((sph_u64)X1 << 32); + W[n][1] = (sph_u64)X2 | ((sph_u64)X3 << 32); + if ((K0 = T32(K0 + 1)) == 0) { + if ((K1 = T32(K1 + 1)) == 0) + if ((K2 = T32(K2 + 1)) == 0) + K3 = T32(K3 + 1); + } + } + *pK0 = K0; + *pK1 = K1; + *pK2 = K2; + *pK3 = K3; +} + +#define BIG_SUB_WORDS do { \ + aes_2rounds_all(W, &K0, &K1, &K2, &K3); \ + } while (0) + +#else + +#define AES_2ROUNDS(X) do { \ + sph_u32 X0 = (sph_u32)(X[0]); \ + sph_u32 X1 = (sph_u32)(X[0] >> 32); \ + sph_u32 X2 = (sph_u32)(X[1]); \ + sph_u32 X3 = (sph_u32)(X[1] >> 32); \ + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); \ + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); \ + X[0] = (sph_u64)X0 | ((sph_u64)X1 << 32); \ + X[1] = (sph_u64)X2 | ((sph_u64)X3 << 32); \ + if ((K0 = T32(K0 + 1)) == 0) { \ + if ((K1 = T32(K1 + 1)) == 0) \ + if ((K2 = T32(K2 + 1)) == 0) \ + K3 = T32(K3 + 1); \ + } \ + } while (0) + +#define BIG_SUB_WORDS do { \ + AES_2ROUNDS(W[ 0]); \ + AES_2ROUNDS(W[ 1]); \ + AES_2ROUNDS(W[ 2]); \ + AES_2ROUNDS(W[ 3]); \ + AES_2ROUNDS(W[ 4]); \ + AES_2ROUNDS(W[ 5]); \ + AES_2ROUNDS(W[ 6]); \ + AES_2ROUNDS(W[ 7]); \ + AES_2ROUNDS(W[ 8]); \ + AES_2ROUNDS(W[ 9]); \ + AES_2ROUNDS(W[10]); \ + AES_2ROUNDS(W[11]); \ + AES_2ROUNDS(W[12]); \ + AES_2ROUNDS(W[13]); \ + AES_2ROUNDS(W[14]); \ + AES_2ROUNDS(W[15]); \ + } while (0) + +#endif + +#define SHIFT_ROW1(a, b, c, d) do { \ + sph_u64 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[b][0]; \ + W[b][0] = W[c][0]; \ + W[c][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[b][1]; \ + W[b][1] = W[c][1]; \ + W[c][1] = W[d][1]; \ + W[d][1] = tmp; \ + } while (0) + +#define SHIFT_ROW2(a, b, c, d) do { \ + sph_u64 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[c][0]; \ + W[c][0] = tmp; \ + tmp = W[b][0]; \ + W[b][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[c][1]; \ + W[c][1] = tmp; \ + tmp = W[b][1]; \ + W[b][1] = W[d][1]; \ + W[d][1] = tmp; \ + } while (0) + +#define SHIFT_ROW3(a, b, c, d) SHIFT_ROW1(d, c, b, a) + 
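+/*
+ * Illustrative sketch, not part of the implementation: the MIX_COLUMN
+ * code below multiplies bytes by 2 in GF(2^8) (the AES "xtime"
+ * operation) on all eight bytes of a 64-bit word at once. The low
+ * seven bits of every byte are shifted left, and the AES reduction
+ * constant 0x1B (27) is XORed in exactly where a byte's top bit was
+ * set, since x^8 = x^4 + x^3 + x + 1 in the AES field. The helper
+ * name below is hypothetical and is kept out of compilation.
+ */
+#if 0
+static sph_u64
+gf256_double_bytes(sph_u64 x)
+{
+	/* each top bit, moved to bit 0 of its byte, selects one 0x1B */
+	return (((x & C64(0x8080808080808080)) >> 7) * 27U)
+		^ ((x & C64(0x7F7F7F7F7F7F7F7F)) << 1);
+}
+#endif
+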
+#define BIG_SHIFT_ROWS do { \ + SHIFT_ROW1(1, 5, 9, 13); \ + SHIFT_ROW2(2, 6, 10, 14); \ + SHIFT_ROW3(3, 7, 11, 15); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +mix_column(sph_u64 W[16][2], int ia, int ib, int ic, int id) +{ + int n; + + for (n = 0; n < 2; n ++) { + sph_u64 a = W[ia][n]; + sph_u64 b = W[ib][n]; + sph_u64 c = W[ic][n]; + sph_u64 d = W[id][n]; + sph_u64 ab = a ^ b; + sph_u64 bc = b ^ c; + sph_u64 cd = c ^ d; + sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U + ^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); + sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U + ^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); + sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U + ^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); + W[ia][n] = abx ^ bc ^ d; + W[ib][n] = bcx ^ a ^ cd; + W[ic][n] = cdx ^ ab ^ d; + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; + } +} + +#define MIX_COLUMN(a, b, c, d) mix_column(W, a, b, c, d) + +#else + +#define MIX_COLUMN1(ia, ib, ic, id, n) do { \ + sph_u64 a = W[ia][n]; \ + sph_u64 b = W[ib][n]; \ + sph_u64 c = W[ic][n]; \ + sph_u64 d = W[id][n]; \ + sph_u64 ab = a ^ b; \ + sph_u64 bc = b ^ c; \ + sph_u64 cd = c ^ d; \ + sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + W[ia][n] = abx ^ bc ^ d; \ + W[ib][n] = bcx ^ a ^ cd; \ + W[ic][n] = cdx ^ ab ^ d; \ + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \ + } while (0) + +#define MIX_COLUMN(a, b, c, d) do { \ + MIX_COLUMN1(a, b, c, d, 0); \ + MIX_COLUMN1(a, b, c, d, 1); \ + } while (0) + +#endif + +#define BIG_MIX_COLUMNS do { \ + MIX_COLUMN(0, 1, 2, 3); \ + MIX_COLUMN(4, 5, 6, 7); \ + MIX_COLUMN(8, 9, 10, 11); \ + MIX_COLUMN(12, 13, 14, 15); \ + } while (0) + +#define BIG_ROUND do { \ + BIG_SUB_WORDS; \ + BIG_SHIFT_ROWS; \ + BIG_MIX_COLUMNS; \ + } while (0) + +#define FINAL_SMALL do { \ + unsigned u; \ + sph_u64 *VV = &sc->u.Vb[0][0]; \ + sph_u64 *WW = &W[0][0]; \ + for (u = 0; u < 8; u ++) { \ + VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \ + ^ sph_dec64le_aligned(sc->buf + (u * 8) + 64) \ + ^ sph_dec64le_aligned(sc->buf + (u * 8) + 128) \ + ^ WW[u] ^ WW[u + 8] \ + ^ WW[u + 16] ^ WW[u + 24]; \ + } \ + } while (0) + +#define FINAL_BIG do { \ + unsigned u; \ + sph_u64 *VV = &sc->u.Vb[0][0]; \ + sph_u64 *WW = &W[0][0]; \ + for (u = 0; u < 16; u ++) { \ + VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \ + ^ WW[u] ^ WW[u + 16]; \ + } \ + } while (0) + +#define COMPRESS_SMALL(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_SMALL(sc); \ + for (u = 0; u < 8; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_SMALL; \ + } while (0) + +#define COMPRESS_BIG(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_BIG(sc); \ + for (u = 0; u < 10; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_BIG; \ + } while (0) + +#else + +#define DECL_STATE_SMALL \ + sph_u32 W[16][4]; + +#define DECL_STATE_BIG \ + sph_u32 W[16][4]; + +#define INPUT_BLOCK_SMALL(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vs, 16 * sizeof(sph_u32)); \ + for (u = 0; u < 12; u ++) { \ + W[u + 4][0] = sph_dec32le_aligned( \ + sc->buf + 16 * u); \ + W[u + 4][1] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 4); \ + W[u + 4][2] = 
sph_dec32le_aligned( \ + sc->buf + 16 * u + 8); \ + W[u + 4][3] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 12); \ + } \ + } while (0) + +#define INPUT_BLOCK_BIG(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vs, 32 * sizeof(sph_u32)); \ + for (u = 0; u < 8; u ++) { \ + W[u + 8][0] = sph_dec32le_aligned( \ + sc->buf + 16 * u); \ + W[u + 8][1] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 4); \ + W[u + 8][2] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 8); \ + W[u + 8][3] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 12); \ + } \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +aes_2rounds_all(sph_u32 W[16][4], + sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3) +{ + int n; + sph_u32 K0 = *pK0; + sph_u32 K1 = *pK1; + sph_u32 K2 = *pK2; + sph_u32 K3 = *pK3; + + for (n = 0; n < 16; n ++) { + sph_u32 *X = W[n]; + sph_u32 Y0, Y1, Y2, Y3; + AES_ROUND_LE(X[0], X[1], X[2], X[3], + K0, K1, K2, K3, Y0, Y1, Y2, Y3); + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); + if ((K0 = T32(K0 + 1)) == 0) { + if ((K1 = T32(K1 + 1)) == 0) + if ((K2 = T32(K2 + 1)) == 0) + K3 = T32(K3 + 1); + } + } + *pK0 = K0; + *pK1 = K1; + *pK2 = K2; + *pK3 = K3; +} + +#define BIG_SUB_WORDS do { \ + aes_2rounds_all(W, &K0, &K1, &K2, &K3); \ + } while (0) + +#else + +#define AES_2ROUNDS(X) do { \ + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X[0], X[1], X[2], X[3], \ + K0, K1, K2, K3, Y0, Y1, Y2, Y3); \ + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); \ + if ((K0 = T32(K0 + 1)) == 0) { \ + if ((K1 = T32(K1 + 1)) == 0) \ + if ((K2 = T32(K2 + 1)) == 0) \ + K3 = T32(K3 + 1); \ + } \ + } while (0) + +#define BIG_SUB_WORDS do { \ + AES_2ROUNDS(W[ 0]); \ + AES_2ROUNDS(W[ 1]); \ + AES_2ROUNDS(W[ 2]); \ + AES_2ROUNDS(W[ 3]); \ + AES_2ROUNDS(W[ 4]); \ + AES_2ROUNDS(W[ 5]); \ + AES_2ROUNDS(W[ 6]); \ + AES_2ROUNDS(W[ 7]); \ + AES_2ROUNDS(W[ 8]); \ + AES_2ROUNDS(W[ 9]); \ + AES_2ROUNDS(W[10]); \ + AES_2ROUNDS(W[11]); \ + AES_2ROUNDS(W[12]); \ + AES_2ROUNDS(W[13]); \ + AES_2ROUNDS(W[14]); \ + AES_2ROUNDS(W[15]); \ + } while (0) + +#endif + +#define SHIFT_ROW1(a, b, c, d) do { \ + sph_u32 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[b][0]; \ + W[b][0] = W[c][0]; \ + W[c][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[b][1]; \ + W[b][1] = W[c][1]; \ + W[c][1] = W[d][1]; \ + W[d][1] = tmp; \ + tmp = W[a][2]; \ + W[a][2] = W[b][2]; \ + W[b][2] = W[c][2]; \ + W[c][2] = W[d][2]; \ + W[d][2] = tmp; \ + tmp = W[a][3]; \ + W[a][3] = W[b][3]; \ + W[b][3] = W[c][3]; \ + W[c][3] = W[d][3]; \ + W[d][3] = tmp; \ + } while (0) + +#define SHIFT_ROW2(a, b, c, d) do { \ + sph_u32 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[c][0]; \ + W[c][0] = tmp; \ + tmp = W[b][0]; \ + W[b][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[c][1]; \ + W[c][1] = tmp; \ + tmp = W[b][1]; \ + W[b][1] = W[d][1]; \ + W[d][1] = tmp; \ + tmp = W[a][2]; \ + W[a][2] = W[c][2]; \ + W[c][2] = tmp; \ + tmp = W[b][2]; \ + W[b][2] = W[d][2]; \ + W[d][2] = tmp; \ + tmp = W[a][3]; \ + W[a][3] = W[c][3]; \ + W[c][3] = tmp; \ + tmp = W[b][3]; \ + W[b][3] = W[d][3]; \ + W[d][3] = tmp; \ + } while (0) + +#define SHIFT_ROW3(a, b, c, d) SHIFT_ROW1(d, c, b, a) + +#define BIG_SHIFT_ROWS do { \ + SHIFT_ROW1(1, 5, 9, 13); \ + SHIFT_ROW2(2, 6, 10, 14); \ + SHIFT_ROW3(3, 7, 11, 15); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +mix_column(sph_u32 W[16][4], int ia, int ib, int ic, int id) +{ + int n; + + for (n = 0; n < 4; n ++) { + sph_u32 a = W[ia][n]; + sph_u32 b = W[ib][n]; + sph_u32 c = W[ic][n]; + sph_u32 d = 
W[id][n]; + sph_u32 ab = a ^ b; + sph_u32 bc = b ^ c; + sph_u32 cd = c ^ d; + sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U + ^ ((ab & C32(0x7F7F7F7F)) << 1); + sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U + ^ ((bc & C32(0x7F7F7F7F)) << 1); + sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U + ^ ((cd & C32(0x7F7F7F7F)) << 1); + W[ia][n] = abx ^ bc ^ d; + W[ib][n] = bcx ^ a ^ cd; + W[ic][n] = cdx ^ ab ^ d; + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; + } +} + +#define MIX_COLUMN(a, b, c, d) mix_column(W, a, b, c, d) + +#else + +#define MIX_COLUMN1(ia, ib, ic, id, n) do { \ + sph_u32 a = W[ia][n]; \ + sph_u32 b = W[ib][n]; \ + sph_u32 c = W[ic][n]; \ + sph_u32 d = W[id][n]; \ + sph_u32 ab = a ^ b; \ + sph_u32 bc = b ^ c; \ + sph_u32 cd = c ^ d; \ + sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U \ + ^ ((ab & C32(0x7F7F7F7F)) << 1); \ + sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U \ + ^ ((bc & C32(0x7F7F7F7F)) << 1); \ + sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U \ + ^ ((cd & C32(0x7F7F7F7F)) << 1); \ + W[ia][n] = abx ^ bc ^ d; \ + W[ib][n] = bcx ^ a ^ cd; \ + W[ic][n] = cdx ^ ab ^ d; \ + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \ + } while (0) + +#define MIX_COLUMN(a, b, c, d) do { \ + MIX_COLUMN1(a, b, c, d, 0); \ + MIX_COLUMN1(a, b, c, d, 1); \ + MIX_COLUMN1(a, b, c, d, 2); \ + MIX_COLUMN1(a, b, c, d, 3); \ + } while (0) + +#endif + +#define BIG_MIX_COLUMNS do { \ + MIX_COLUMN(0, 1, 2, 3); \ + MIX_COLUMN(4, 5, 6, 7); \ + MIX_COLUMN(8, 9, 10, 11); \ + MIX_COLUMN(12, 13, 14, 15); \ + } while (0) + +#define BIG_ROUND do { \ + BIG_SUB_WORDS; \ + BIG_SHIFT_ROWS; \ + BIG_MIX_COLUMNS; \ + } while (0) + +#define FINAL_SMALL do { \ + unsigned u; \ + sph_u32 *VV = &sc->u.Vs[0][0]; \ + sph_u32 *WW = &W[0][0]; \ + for (u = 0; u < 16; u ++) { \ + VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \ + ^ sph_dec32le_aligned(sc->buf + (u * 4) + 64) \ + ^ sph_dec32le_aligned(sc->buf + (u * 4) + 128) \ + ^ WW[u] ^ WW[u + 16] \ + ^ WW[u + 32] ^ WW[u + 48]; \ + } \ + } while (0) + +#define FINAL_BIG do { \ + unsigned u; \ + sph_u32 *VV = &sc->u.Vs[0][0]; \ + sph_u32 *WW = &W[0][0]; \ + for (u = 0; u < 32; u ++) { \ + VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \ + ^ WW[u] ^ WW[u + 32]; \ + } \ + } while (0) + +#define COMPRESS_SMALL(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_SMALL(sc); \ + for (u = 0; u < 8; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_SMALL; \ + } while (0) + +#define COMPRESS_BIG(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_BIG(sc); \ + for (u = 0; u < 10; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_BIG; \ + } while (0) + +#endif + +#define INCR_COUNTER(sc, val) do { \ + sc->C0 = T32(sc->C0 + (sph_u32)(val)); \ + if (sc->C0 < (sph_u32)(val)) { \ + if ((sc->C1 = T32(sc->C1 + 1)) == 0) \ + if ((sc->C2 = T32(sc->C2 + 1)) == 0) \ + sc->C3 = T32(sc->C3 + 1); \ + } \ + } while (0) + +static void +echo_small_init(sph_echo_small_context *sc, unsigned out_len) +{ +#if SPH_ECHO_64 + sc->u.Vb[0][0] = (sph_u64)out_len; + sc->u.Vb[0][1] = 0; + sc->u.Vb[1][0] = (sph_u64)out_len; + sc->u.Vb[1][1] = 0; + sc->u.Vb[2][0] = (sph_u64)out_len; + sc->u.Vb[2][1] = 0; + sc->u.Vb[3][0] = (sph_u64)out_len; + sc->u.Vb[3][1] = 0; +#else + sc->u.Vs[0][0] = (sph_u32)out_len; + sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0; + sc->u.Vs[1][0] = (sph_u32)out_len; + sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0; + 
sc->u.Vs[2][0] = (sph_u32)out_len; + sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0; + sc->u.Vs[3][0] = (sph_u32)out_len; + sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0; +#endif + sc->ptr = 0; + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; +} + +static void +echo_big_init(sph_echo_big_context *sc, unsigned out_len) +{ +#if SPH_ECHO_64 + sc->u.Vb[0][0] = (sph_u64)out_len; + sc->u.Vb[0][1] = 0; + sc->u.Vb[1][0] = (sph_u64)out_len; + sc->u.Vb[1][1] = 0; + sc->u.Vb[2][0] = (sph_u64)out_len; + sc->u.Vb[2][1] = 0; + sc->u.Vb[3][0] = (sph_u64)out_len; + sc->u.Vb[3][1] = 0; + sc->u.Vb[4][0] = (sph_u64)out_len; + sc->u.Vb[4][1] = 0; + sc->u.Vb[5][0] = (sph_u64)out_len; + sc->u.Vb[5][1] = 0; + sc->u.Vb[6][0] = (sph_u64)out_len; + sc->u.Vb[6][1] = 0; + sc->u.Vb[7][0] = (sph_u64)out_len; + sc->u.Vb[7][1] = 0; +#else + sc->u.Vs[0][0] = (sph_u32)out_len; + sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0; + sc->u.Vs[1][0] = (sph_u32)out_len; + sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0; + sc->u.Vs[2][0] = (sph_u32)out_len; + sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0; + sc->u.Vs[3][0] = (sph_u32)out_len; + sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0; + sc->u.Vs[4][0] = (sph_u32)out_len; + sc->u.Vs[4][1] = sc->u.Vs[4][2] = sc->u.Vs[4][3] = 0; + sc->u.Vs[5][0] = (sph_u32)out_len; + sc->u.Vs[5][1] = sc->u.Vs[5][2] = sc->u.Vs[5][3] = 0; + sc->u.Vs[6][0] = (sph_u32)out_len; + sc->u.Vs[6][1] = sc->u.Vs[6][2] = sc->u.Vs[6][3] = 0; + sc->u.Vs[7][0] = (sph_u32)out_len; + sc->u.Vs[7][1] = sc->u.Vs[7][2] = sc->u.Vs[7][3] = 0; +#endif + sc->ptr = 0; + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; +} + +static void +echo_small_compress(sph_echo_small_context *sc) +{ + DECL_STATE_SMALL + + COMPRESS_SMALL(sc); +} + +static void +echo_big_compress(sph_echo_big_context *sc) +{ + DECL_STATE_BIG + + COMPRESS_BIG(sc); +} + +static void +echo_small_core(sph_echo_small_context *sc, + const unsigned char *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INCR_COUNTER(sc, 1536); + echo_small_compress(sc); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +echo_big_core(sph_echo_big_context *sc, + const unsigned char *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INCR_COUNTER(sc, 1024); + echo_big_compress(sc); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +echo_small_close(sph_echo_small_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf; + size_t ptr; + unsigned z; + unsigned elen; + union { + unsigned char tmp[32]; + sph_u32 dummy; +#if SPH_ECHO_64 + sph_u64 dummy2; +#endif + } u; +#if SPH_ECHO_64 + sph_u64 *VV; +#else + sph_u32 *VV; +#endif + unsigned k; + + buf = sc->buf; + ptr = sc->ptr; + elen = ((unsigned)ptr << 3) + n; + INCR_COUNTER(sc, elen); + sph_enc32le_aligned(u.tmp, sc->C0); + 
sph_enc32le_aligned(u.tmp + 4, sc->C1); + sph_enc32le_aligned(u.tmp + 8, sc->C2); + sph_enc32le_aligned(u.tmp + 12, sc->C3); + /* + * If elen is zero, then this block actually contains no message + * bit, only the first padding bit. + */ + if (elen == 0) { + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + } + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + if (ptr > ((sizeof sc->buf) - 18)) { + echo_small_compress(sc); + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + memset(buf, 0, sizeof sc->buf); + } + sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); + memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); + echo_small_compress(sc); +#if SPH_ECHO_64 + for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) + sph_enc64le_aligned(u.tmp + (k << 3), VV[k]); +#else + for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++) + sph_enc32le_aligned(u.tmp + (k << 2), VV[k]); +#endif + memcpy(dst, u.tmp, out_size_w32 << 2); + echo_small_init(sc, out_size_w32 << 5); +} + +static void +echo_big_close(sph_echo_big_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf; + size_t ptr; + unsigned z; + unsigned elen; + union { + unsigned char tmp[64]; + sph_u32 dummy; +#if SPH_ECHO_64 + sph_u64 dummy2; +#endif + } u; +#if SPH_ECHO_64 + sph_u64 *VV; +#else + sph_u32 *VV; +#endif + unsigned k; + + buf = sc->buf; + ptr = sc->ptr; + elen = ((unsigned)ptr << 3) + n; + INCR_COUNTER(sc, elen); + sph_enc32le_aligned(u.tmp, sc->C0); + sph_enc32le_aligned(u.tmp + 4, sc->C1); + sph_enc32le_aligned(u.tmp + 8, sc->C2); + sph_enc32le_aligned(u.tmp + 12, sc->C3); + /* + * If elen is zero, then this block actually contains no message + * bit, only the first padding bit. + */ + if (elen == 0) { + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + } + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + if (ptr > ((sizeof sc->buf) - 18)) { + echo_big_compress(sc); + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + memset(buf, 0, sizeof sc->buf); + } + sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); + memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); + echo_big_compress(sc); +#if SPH_ECHO_64 + for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) + sph_enc64le_aligned(u.tmp + (k << 3), VV[k]); +#else + for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++) + sph_enc32le_aligned(u.tmp + (k << 2), VV[k]); +#endif + memcpy(dst, u.tmp, out_size_w32 << 2); + echo_big_init(sc, out_size_w32 << 5); +} + +/* see sph_echo.h */ +void +sph_echo224_init(void *cc) +{ + echo_small_init(cc, 224); +} + +/* see sph_echo.h */ +void +sph_echo224(void *cc, const void *data, size_t len) +{ + echo_small_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo224_close(void *cc, void *dst) +{ + echo_small_close(cc, 0, 0, dst, 7); +} + +/* see sph_echo.h */ +void +sph_echo224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_small_close(cc, ub, n, dst, 7); +} + +/* see sph_echo.h */ +void +sph_echo256_init(void *cc) +{ + echo_small_init(cc, 256); +} + +/* see sph_echo.h */ +void +sph_echo256(void *cc, const void *data, size_t len) +{ + echo_small_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo256_close(void *cc, void *dst) +{ + echo_small_close(cc, 0, 0, dst, 8); +} + +/* see sph_echo.h */ +void +sph_echo256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_small_close(cc, ub, n, dst, 8); +} + +/* see 
sph_echo.h */ +void +sph_echo384_init(void *cc) +{ + echo_big_init(cc, 384); +} + +/* see sph_echo.h */ +void +sph_echo384(void *cc, const void *data, size_t len) +{ + echo_big_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo384_close(void *cc, void *dst) +{ + echo_big_close(cc, 0, 0, dst, 12); +} + +/* see sph_echo.h */ +void +sph_echo384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_big_close(cc, ub, n, dst, 12); +} + +/* see sph_echo.h */ +void +sph_echo512_init(void *cc) +{ + echo_big_init(cc, 512); +} + +/* see sph_echo.h */ +void +sph_echo512(void *cc, const void *data, size_t len) +{ + echo_big_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo512_close(void *cc, void *dst) +{ + echo_big_close(cc, 0, 0, dst, 16); +} + +/* see sph_echo.h */ +void +sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_big_close(cc, ub, n, dst, 16); +} +#ifdef __cplusplus +} +#endif diff --git a/sha3/groestl.c b/sha3/groestl.c new file mode 100644 index 0000000..928bc41 --- /dev/null +++ b/sha3/groestl.c @@ -0,0 +1,3123 @@ +/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ +/* + * Groestl implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_groestl.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_GROESTL +#define SPH_SMALL_FOOTPRINT_GROESTL 1 +#endif + +/* + * Apparently, the 32-bit-only version is not faster than the 64-bit + * version unless using the "small footprint" code on a 32-bit machine. + */ +#if !defined SPH_GROESTL_64 +#if SPH_SMALL_FOOTPRINT_GROESTL && !SPH_64_TRUE +#define SPH_GROESTL_64 0 +#else +#define SPH_GROESTL_64 1 +#endif +#endif + +#if !SPH_64 +#undef SPH_GROESTL_64 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +/* + * The internal representation may use either big-endian or + * little-endian. Using the platform default representation speeds up + * encoding and decoding between bytes and the matrix columns. 
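+ *
+ * On little-endian platforms (USE_LE below), the C32e()/C64e() macros
+ * byte-reverse each table constant at compile time, so the tables are
+ * laid out in memory in little-endian order and the dec/enc helpers
+ * reduce to plain little-endian accesses. For example,
+ * C64e(0x0123456789ABCDEF) evaluates to 0xEFCDAB8967452301.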
+ */ + +#undef USE_LE +#if SPH_GROESTL_LITTLE_ENDIAN +#define USE_LE 1 +#elif SPH_GROESTL_BIG_ENDIAN +#define USE_LE 0 +#elif SPH_LITTLE_ENDIAN +#define USE_LE 1 +#endif + +#if USE_LE + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le +#define B32_0(x) ((x) & 0xFF) +#define B32_1(x) (((x) >> 8) & 0xFF) +#define B32_2(x) (((x) >> 16) & 0xFF) +#define B32_3(x) ((x) >> 24) + +#define R32u(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) +#define R32d(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) + +#define PC32up(j, r) ((sph_u32)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) SPH_C32(0xFFFFFFFF) +#define QC32dn(j, r) (((sph_u32)(r) << 24) ^ SPH_T32(~((sph_u32)(j) << 24))) + +#if SPH_64 +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define R64 SPH_ROTL64 +#define PC64(j, r) ((sph_u64)((j) + (r))) +#define QC64(j, r) (((sph_u64)(r) << 56) ^ SPH_T64(~((sph_u64)(j) << 56))) +#endif + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#define B32_0(x) ((x) >> 24) +#define B32_1(x) (((x) >> 16) & 0xFF) +#define B32_2(x) (((x) >> 8) & 0xFF) +#define B32_3(x) ((x) & 0xFF) + +#define R32u(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) +#define R32d(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) + +#define PC32up(j, r) ((sph_u32)((j) + (r)) << 24) +#define PC32dn(j, r) 0 +#define QC32up(j, r) SPH_C32(0xFFFFFFFF) +#define QC32dn(j, r) ((sph_u32)(r) ^ SPH_T32(~(sph_u32)(j))) + +#if SPH_64 +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be +#define B64_0(x) ((x) >> 56) +#define B64_1(x) (((x) >> 48) & 0xFF) +#define B64_2(x) (((x) >> 40) & 0xFF) +#define B64_3(x) (((x) >> 32) & 0xFF) +#define B64_4(x) (((x) >> 24) & 0xFF) +#define B64_5(x) (((x) >> 16) & 0xFF) +#define B64_6(x) (((x) >> 8) & 0xFF) +#define B64_7(x) ((x) & 0xFF) +#define R64 SPH_ROTR64 +#define PC64(j, r) ((sph_u64)((j) + (r)) << 56) +#define QC64(j, r) ((sph_u64)(r) ^ SPH_T64(~(sph_u64)(j))) +#endif + +#endif + +#if SPH_GROESTL_64 + +static const sph_u64 T0[] = { + C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8), + C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), + C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), + C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), + C64e(0x6090f050f0c05060), C64e(0x0207050305040302), + C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), + C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), + C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), + C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), + C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa), + 
C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), + C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), + C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), + C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), + C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), + C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), + C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), + C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), + C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), + C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), + C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), + C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), + C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), + C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), + C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), + C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), + C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), + C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), + C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), + C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf), + C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), + C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), + C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), + C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), + C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), + C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), + C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), + C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), + C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), + C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), + C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), + C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), + C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), + C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), + C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), + C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), + C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498), + C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), + C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), + C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), + C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), + C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), + C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), + C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), + C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), + C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), + C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), + C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), + C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), + C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), + C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), + C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), + C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), + C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), + C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), + C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), + C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), + C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), + C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), + C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), + C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), + C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), + C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), + C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), + C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), + C64e(0x3bdde6abe676ab3b), 
C64e(0x0b959e839e16830b), + C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), + C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), + C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), + C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), + C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), + C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), + C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), + C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), + C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), + C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), + C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), + C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), + C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), + C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), + C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), + C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), + C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), + C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), + C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), + C64e(0x476720e9208ee947), C64e(0x1038281828201810), + C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), + C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), + C64e(0x38546c246c702438), C64e(0x575f08f108aef157), + C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), + C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), + C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), + C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), + C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), + C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), + C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), + C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), + C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), + C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), + C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), + C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), + C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), + C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), + C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), + C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), + C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), + C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), + C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), + C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), + C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), + C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), + C64e(0x09929b809b128009), C64e(0x1a2339173934171a), + C64e(0x651075da75cada65), C64e(0xd784533153b531d7), + C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), + C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), + C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), + C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), + C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) +}; + +#if !SPH_SMALL_FOOTPRINT_GROESTL + +static const sph_u64 T1[] = { + C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), + C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), + C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), + C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), + C64e(0x606090f050f0c050), C64e(0x0202070503050403), + C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), + C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), + C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), + C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), + C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), + C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), + C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), 
+ C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), + C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), + C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), + C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), + C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), + C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), + C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), + C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), + C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), + C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), + C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), + C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), + C64e(0x08081c140c14100c), C64e(0x959563f652f63152), + C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), + C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), + C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), + C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), + C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), + C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), + C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), + C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), + C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), + C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), + C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), + C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), + C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), + C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), + C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), + C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), + C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), + C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), + C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), + C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), + C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), + C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), + C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), + C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), + C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), + C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), + C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), + C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), + C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), + C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), + C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), + C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), + C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), + C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), + C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), + C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), + C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), + C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), + C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), + C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), + C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), + C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), + C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), + C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), + C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), + C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), + C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), + C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), + C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), + C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), + C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), + C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), + C64e(0x6b6b056ed36ed6d3), 
C64e(0x28286c443c44503c), + C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), + C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), + C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), + C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), + C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), + C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), + C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), + C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), + C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), + C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), + C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), + C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), + C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), + C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), + C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), + C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), + C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), + C64e(0x47476720e9208ee9), C64e(0x1010382818282018), + C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), + C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), + C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), + C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), + C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), + C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), + C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), + C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), + C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), + C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), + C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), + C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), + C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), + C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), + C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), + C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), + C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), + C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), + C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), + C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), + C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), + C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), + C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), + C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), + C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), + C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), + C64e(0x65651075da75cada), C64e(0xd7d784533153b531), + C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), + C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), + C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), + C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), + C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) +}; + +static const sph_u64 T2[] = { + C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), + C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), + C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), + C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), + C64e(0x50606090f050f0c0), C64e(0x0302020705030504), + C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), + C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), + C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), + C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), + C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), + C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), + C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), + C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), + C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), + C64e(0xbf2323f9dabfda46), 
C64e(0xf753535102f702a6), + C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), + C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), + C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), + C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), + C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), + C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), + C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), + C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), + C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), + C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), + C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), + C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), + C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), + C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), + C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), + C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), + C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), + C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), + C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), + C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), + C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), + C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), + C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), + C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), + C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), + C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), + C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), + C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), + C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), + C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), + C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), + C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), + C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), + C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), + C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), + C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), + C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), + C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), + C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), + C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), + C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), + C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), + C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), + C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), + C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), + C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), + C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), + C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), + C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), + C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), + C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), + C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), + C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), + C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), + C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), + C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), + C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), + C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), + C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), + C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), + C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), + C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), + C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), + C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), + C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), + 
C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), + C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), + C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), + C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), + C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), + C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), + C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), + C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), + C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), + C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), + C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), + C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), + C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), + C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), + C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), + C64e(0xe947476720e9208e), C64e(0x1810103828182820), + C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), + C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), + C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), + C64e(0xc773732152c752e6), C64e(0x51979764f351f335), + C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), + C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), + C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), + C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), + C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), + C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), + C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), + C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), + C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), + C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), + C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), + C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), + C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), + C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), + C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), + C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), + C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), + C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), + C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), + C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), + C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), + C64e(0x800909929b809b12), C64e(0x171a1a2339173934), + C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), + C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), + C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), + C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), + C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), + C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) +}; + +static const sph_u64 T3[] = { + C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), + C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), + C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), + C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), + C64e(0xc050606090f050f0), C64e(0x0403020207050305), + C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), + C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), + C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), + C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), + C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), + C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), + C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), + C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), + C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), + C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), + C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), + C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), + 
C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), + C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), + C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), + C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), + C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), + C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), + C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), + C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), + C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), + C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), + C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), + C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), + C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), + C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), + C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), + C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), + C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), + C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), + C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), + C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), + C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), + C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), + C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), + C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), + C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), + C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), + C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), + C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), + C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), + C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), + C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), + C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), + C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), + C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), + C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), + C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), + C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), + C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), + C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), + C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), + C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), + C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), + C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), + C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), + C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), + C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), + C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), + C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), + C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), + C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), + C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), + C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), + C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), + C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), + C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), + C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), + C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), + C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), + C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), + C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), + C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), + C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), + C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), + C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), + C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), + C64e(0x3fdb9292e476db76), 
C64e(0x180a0c0c121e0a1e), + C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), + C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), + C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), + C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), + C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), + C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), + C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), + C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), + C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), + C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), + C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), + C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), + C64e(0x8ee947476720e920), C64e(0x2018101038281828), + C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), + C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), + C64e(0x70243838546c246c), C64e(0xaef157575f08f108), + C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), + C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), + C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), + C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), + C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), + C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), + C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), + C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), + C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), + C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), + C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), + C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), + C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), + C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), + C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), + C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), + C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), + C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), + C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), + C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), + C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), + C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), + C64e(0x12800909929b809b), C64e(0x34171a1a23391739), + C64e(0xcada65651075da75), C64e(0xb531d7d784533153), + C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), + C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), + C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), + C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f), + C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) +}; + +#endif + +static const sph_u64 T4[] = { + C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), + C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), + C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), + C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), + C64e(0xf0c050606090f050), C64e(0x0504030202070503), + C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), + C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), + C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), + C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), + C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), + C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), + C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), + C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), + C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), + C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), + C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), + C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), + C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), + C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), + 
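+	/*
+	 * T0 and T4 are the only tables defined unconditionally: T1..T3
+	 * and T5..T7 sit inside #if !SPH_SMALL_FOOTPRINT_GROESTL guards,
+	 * so the small-footprint build keeps just these two and derives
+	 * the rest by rotation at lookup time.
+	 */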
C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), + C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), + C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), + C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), + C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), + C64e(0x14100c08081c140c), C64e(0xf63152959563f652), + C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), + C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), + C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), + C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), + C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), + C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), + C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), + C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), + C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), + C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), + C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), + C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), + C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), + C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), + C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), + C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), + C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), + C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), + C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), + C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), + C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), + C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), + C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), + C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), + C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), + C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), + C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), + C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), + C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), + C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), + C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), + C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), + C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), + C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), + C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), + C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), + C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), + C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), + C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), + C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), + C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), + C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), + C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), + C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), + C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), + C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), + C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), + C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), + C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), + C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), + C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), + C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), + C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), + C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), + C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), + C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), + C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), + C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), + C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), + C64e(0xe7255d9f9f78e75d), 
C64e(0xb2616ebdbd0fb26e), + C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), + C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), + C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), + C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), + C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), + C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), + C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), + C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), + C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), + C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), + C64e(0x208ee947476720e9), C64e(0x2820181010382818), + C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), + C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), + C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), + C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), + C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), + C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), + C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), + C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), + C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), + C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), + C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), + C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), + C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), + C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), + C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), + C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), + C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), + C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), + C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), + C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), + C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), + C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), + C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), + C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), + C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), + C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), + C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), + C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), + C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), + C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), + C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), + C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) +}; + +#if !SPH_SMALL_FOOTPRINT_GROESTL + +static const sph_u64 T5[] = { + C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), + C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), + C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), + C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), + C64e(0x50f0c050606090f0), C64e(0x0305040302020705), + C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), + C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), + C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), + C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), + C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), + C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), + C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), + C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), + C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), + C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), + C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), + C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), + C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), + C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), + C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), + C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), 
+ C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), + C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), + C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), + C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), + C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), + C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), + C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), + C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), + C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), + C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), + C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), + C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), + C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), + C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), + C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), + C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), + C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), + C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), + C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), + C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), + C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), + C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), + C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), + C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), + C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), + C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), + C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), + C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), + C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), + C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), + C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), + C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), + C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), + C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), + C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), + C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), + C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), + C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), + C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), + C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), + C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), + C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), + C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), + C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), + C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), + C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd), + C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), + C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), + C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), + C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), + C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), + C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), + C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), + C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), + C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), + C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), + C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), + C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), + C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), + C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), + C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), + C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), + C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), + C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), + C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), + C64e(0xa8e372a83939dae3), 
C64e(0xa4f762a43131c6f7), + C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), + C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), + C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), + C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), + C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), + C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), + C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), + C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), + C64e(0xe9208ee947476720), C64e(0x1828201810103828), + C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), + C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), + C64e(0x246c70243838546c), C64e(0xf108aef157575f08), + C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), + C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), + C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), + C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), + C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), + C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), + C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), + C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), + C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), + C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), + C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), + C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), + C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), + C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), + C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), + C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), + C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), + C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), + C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), + C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), + C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), + C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), + C64e(0x809b12800909929b), C64e(0x173934171a1a2339), + C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), + C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), + C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), + C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), + C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), + C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) +}; + +static const sph_u64 T6[] = { + C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), + C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), + C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), + C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), + C64e(0xf050f0c050606090), C64e(0x0503050403020207), + C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), + C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), + C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), + C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), + C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), + C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), + C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), + C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), + C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), + C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), + C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), + C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), + C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), + C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), + C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), + C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), + C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), + C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), + C64e(0xf553f5c453626297), 
C64e(0x413f41543f2a2a6b), + C64e(0x140c14100c08081c), C64e(0xf652f63152959563), + C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), + C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), + C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), + C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), + C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), + C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), + C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), + C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), + C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), + C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), + C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), + C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), + C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), + C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), + C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), + C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), + C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), + C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), + C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), + C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), + C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), + C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), + C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), + C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), + C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), + C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), + C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), + C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), + C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), + C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), + C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), + C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), + C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), + C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), + C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), + C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), + C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), + C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), + C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), + C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), + C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), + C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), + C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), + C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), + C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), + C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), + C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), + C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), + C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), + C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), + C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), + C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), + C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), + C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), + C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), + C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), + C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), + C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), + C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), + C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), + C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), + C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), + C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), + C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), + 
C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), + C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), + C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), + C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), + C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), + C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), + C64e(0x20e9208ee9474767), C64e(0x2818282018101038), + C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), + C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), + C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), + C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), + C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), + C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), + C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), + C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), + C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), + C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), + C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), + C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), + C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), + C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), + C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), + C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), + C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), + C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), + C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), + C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), + C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), + C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), + C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), + C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), + C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), + C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), + C64e(0x75da75cada656510), C64e(0x533153b531d7d784), + C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), + C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), + C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), + C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), + C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) +}; + +static const sph_u64 T7[] = { + C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), + C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), + C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), + C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), + C64e(0x90f050f0c0506060), C64e(0x0705030504030202), + C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), + C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), + C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), + C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), + C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), + C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), + C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), + C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), + C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), + C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), + C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), + C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), + C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), + C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), + C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), + C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), + C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), + C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), + C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), + C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), + C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), + 
C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), + C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), + C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), + C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), + C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), + C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), + C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), + C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), + C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), + C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), + C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), + C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), + C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), + C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), + C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), + C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), + C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), + C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), + C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), + C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), + C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), + C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), + C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), + C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), + C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), + C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), + C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), + C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), + C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), + C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), + C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), + C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), + C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), + C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), + C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), + C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), + C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), + C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), + C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), + C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), + C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), + C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), + C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), + C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), + C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), + C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), + C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), + C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), + C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), + C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), + C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), + C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), + C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), + C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), + C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), + C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), + C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), + C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), + C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), + C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), + C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), + C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), + C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), + C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), + C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), + C64e(0xf16dd26d23d29c9c), 
C64e(0x723be03b92e04949), + C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), + C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), + C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), + C64e(0x6720e9208ee94747), C64e(0x3828182820181010), + C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), + C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), + C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), + C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), + C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), + C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), + C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), + C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), + C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), + C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), + C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), + C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), + C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), + C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), + C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), + C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), + C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), + C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), + C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), + C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), + C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), + C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), + C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), + C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), + C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), + C64e(0x929b809b12800909), C64e(0x2339173934171a1a), + C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), + C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), + C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), + C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), + C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), + C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) +}; + +#endif + +#define DECL_STATE_SMALL \ + sph_u64 H[8]; + +#define READ_STATE_SMALL(sc) do { \ + memcpy(H, (sc)->state.wide, sizeof H); \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + memcpy((sc)->state.wide, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif + +#define ROUND_SMALL_P(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ + RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ + RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ + RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ + RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ + RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ + RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + 
a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= QC64(0x00, r); \ + a[1] ^= QC64(0x10, r); \ + a[2] ^= QC64(0x20, r); \ + a[3] ^= QC64(0x30, r); \ + a[4] ^= QC64(0x40, r); \ + a[5] ^= QC64(0x50, r); \ + a[6] ^= QC64(0x60, r); \ + a[7] ^= QC64(0x70, r); \ + RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ + RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ + RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ + RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ + RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ + RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ + RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ + RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +/* + * Apparently, unrolling more than that confuses GCC, resulting in + * lower performance, even though L1 cache would be no problem. + */ +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_SMALL do { \ + sph_u64 g[8], m[8]; \ + size_t u; \ + for (u = 0; u < 8; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_SMALL_P(g); \ + PERM_SMALL_Q(m); \ + for (u = 0; u < 8; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_SMALL do { \ + sph_u64 x[8]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_SMALL_P(x); \ + for (u = 0; u < 8; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#define DECL_STATE_BIG \ + sph_u64 H[16]; + +#define READ_STATE_BIG(sc) do { \ + memcpy(H, (sc)->state.wide, sizeof H); \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + memcpy((sc)->state.wide, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + +#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define ROUND_BIG_P(a, r) do { \ + sph_u64 t[16]; \ + size_t u; \ + a[0x0] ^= PC64(0x00, r); \ + a[0x1] ^= PC64(0x10, r); \ + a[0x2] ^= PC64(0x20, r); \ + a[0x3] ^= PC64(0x30, r); \ + a[0x4] ^= PC64(0x40, r); \ + a[0x5] ^= PC64(0x50, r); \ + a[0x6] ^= PC64(0x60, r); \ + a[0x7] ^= PC64(0x70, r); \ + a[0x8] ^= PC64(0x80, r); \ + a[0x9] ^= PC64(0x90, r); \ + a[0xA] ^= PC64(0xA0, r); \ + a[0xB] ^= PC64(0xB0, r); \ + a[0xC] ^= PC64(0xC0, r); \ + a[0xD] ^= PC64(0xD0, r); \ + a[0xE] ^= PC64(0xE0, r); \ + a[0xF] ^= PC64(0xF0, r); \ + for (u = 0; u < 16; u += 4) { \ + RBTT(u + 0, a, u + 0, (u + 1) & 0xF, \ + (u + 2) & 0xF, (u + 3) & 0xF, (u + 4) & 0xF, \ + (u + 5) 
& 0xF, (u + 6) & 0xF, (u + 11) & 0xF); \ + RBTT(u + 1, a, u + 1, (u + 2) & 0xF, \ + (u + 3) & 0xF, (u + 4) & 0xF, (u + 5) & 0xF, \ + (u + 6) & 0xF, (u + 7) & 0xF, (u + 12) & 0xF); \ + RBTT(u + 2, a, u + 2, (u + 3) & 0xF, \ + (u + 4) & 0xF, (u + 5) & 0xF, (u + 6) & 0xF, \ + (u + 7) & 0xF, (u + 8) & 0xF, (u + 13) & 0xF); \ + RBTT(u + 3, a, u + 3, (u + 4) & 0xF, \ + (u + 5) & 0xF, (u + 6) & 0xF, (u + 7) & 0xF, \ + (u + 8) & 0xF, (u + 9) & 0xF, (u + 14) & 0xF); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u64 t[16]; \ + size_t u; \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + for (u = 0; u < 16; u += 4) { \ + RBTT(u + 0, a, (u + 1) & 0xF, (u + 3) & 0xF, \ + (u + 5) & 0xF, (u + 11) & 0xF, (u + 0) & 0xF, \ + (u + 2) & 0xF, (u + 4) & 0xF, (u + 6) & 0xF); \ + RBTT(u + 1, a, (u + 2) & 0xF, (u + 4) & 0xF, \ + (u + 6) & 0xF, (u + 12) & 0xF, (u + 1) & 0xF, \ + (u + 3) & 0xF, (u + 5) & 0xF, (u + 7) & 0xF); \ + RBTT(u + 2, a, (u + 3) & 0xF, (u + 5) & 0xF, \ + (u + 7) & 0xF, (u + 13) & 0xF, (u + 2) & 0xF, \ + (u + 4) & 0xF, (u + 6) & 0xF, (u + 8) & 0xF); \ + RBTT(u + 3, a, (u + 4) & 0xF, (u + 6) & 0xF, \ + (u + 8) & 0xF, (u + 14) & 0xF, (u + 3) & 0xF, \ + (u + 5) & 0xF, (u + 7) & 0xF, (u + 9) & 0xF); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#else + +#define ROUND_BIG_P(a, r) do { \ + sph_u64 t[16]; \ + a[0x0] ^= PC64(0x00, r); \ + a[0x1] ^= PC64(0x10, r); \ + a[0x2] ^= PC64(0x20, r); \ + a[0x3] ^= PC64(0x30, r); \ + a[0x4] ^= PC64(0x40, r); \ + a[0x5] ^= PC64(0x50, r); \ + a[0x6] ^= PC64(0x60, r); \ + a[0x7] ^= PC64(0x70, r); \ + a[0x8] ^= PC64(0x80, r); \ + a[0x9] ^= PC64(0x90, r); \ + a[0xA] ^= PC64(0xA0, r); \ + a[0xB] ^= PC64(0xB0, r); \ + a[0xC] ^= PC64(0xC0, r); \ + a[0xD] ^= PC64(0xD0, r); \ + a[0xE] ^= PC64(0xE0, r); \ + a[0xF] ^= PC64(0xF0, r); \ + RBTT(0x0, a, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ + RBTT(0x1, a, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ + RBTT(0x2, a, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ + RBTT(0x3, a, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ + RBTT(0x4, a, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ + RBTT(0x5, a, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ + RBTT(0x6, a, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \ + RBTT(0x7, a, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \ + RBTT(0x8, a, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \ + RBTT(0x9, a, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \ + RBTT(0xA, a, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \ + RBTT(0xB, a, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \ + RBTT(0xC, a, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \ + RBTT(0xD, a, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \ + RBTT(0xE, a, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \ + RBTT(0xF, a, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \ + a[0x0] = t[0x0]; \ + a[0x1] = t[0x1]; \ + a[0x2] = t[0x2]; \ + a[0x3] = t[0x3]; \ + a[0x4] = t[0x4]; \ + a[0x5] = t[0x5]; \ + a[0x6] = t[0x6]; \ + a[0x7] = t[0x7]; \ + a[0x8] = t[0x8]; \ + a[0x9] = t[0x9]; \ + a[0xA] = t[0xA]; \ + a[0xB] = t[0xB]; \ + a[0xC] = t[0xC]; \ + a[0xD] = t[0xD]; \ + a[0xE] = t[0xE]; \ + a[0xF] = t[0xF]; \ + } while (0) + +#define ROUND_BIG_Q(a, 
r) do { \ + sph_u64 t[16]; \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + RBTT(0x0, a, 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \ + RBTT(0x1, a, 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \ + RBTT(0x2, a, 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \ + RBTT(0x3, a, 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \ + RBTT(0x4, a, 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \ + RBTT(0x5, a, 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \ + RBTT(0x6, a, 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \ + RBTT(0x7, a, 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \ + RBTT(0x8, a, 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \ + RBTT(0x9, a, 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \ + RBTT(0xA, a, 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \ + RBTT(0xB, a, 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \ + RBTT(0xC, a, 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \ + RBTT(0xD, a, 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \ + RBTT(0xE, a, 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \ + RBTT(0xF, a, 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \ + a[0x0] = t[0x0]; \ + a[0x1] = t[0x1]; \ + a[0x2] = t[0x2]; \ + a[0x3] = t[0x3]; \ + a[0x4] = t[0x4]; \ + a[0x5] = t[0x5]; \ + a[0x6] = t[0x6]; \ + a[0x7] = t[0x7]; \ + a[0x8] = t[0x8]; \ + a[0x9] = t[0x9]; \ + a[0xA] = t[0xA]; \ + a[0xB] = t[0xB]; \ + a[0xC] = t[0xC]; \ + a[0xD] = t[0xD]; \ + a[0xE] = t[0xE]; \ + a[0xF] = t[0xF]; \ + } while (0) + +#endif + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_P(a, r + 0); \ + ROUND_BIG_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_Q(a, r + 0); \ + ROUND_BIG_Q(a, r + 1); \ + } \ + } while (0) + +/* obsolete +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define COMPRESS_BIG do { \ + sph_u64 g[16], m[16], *ya; \ + const sph_u64 *yc; \ + size_t u; \ + int i; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + ya = g; \ + yc = CP; \ + for (i = 0; i < 2; i ++) { \ + PERM_BIG(ya, yc); \ + ya = m; \ + yc = CQ; \ + } \ + for (u = 0; u < 16; u ++) { \ + H[u] ^= g[u] ^ m[u]; \ + } \ + } while (0) + +#else +*/ + +#define COMPRESS_BIG do { \ + sph_u64 g[16], m[16]; \ + size_t u; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_BIG_P(g); \ + PERM_BIG_Q(m); \ + for (u = 0; u < 16; u ++) { \ + H[u] ^= g[u] ^ m[u]; \ + } \ + } while (0) + +/* obsolete +#endif +*/ + +#define FINAL_BIG do { \ + sph_u64 x[16]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_BIG_P(x); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#else + +static const sph_u32 T0up[] = { + C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), + C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), + C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), + C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), + C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), + C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), + C32e(0x416e2fec), 
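+	/*
+	 * This #else branch looks to be the 32-bit code path: each 64-bit
+	 * T0 entry is split into an upper half (T0up) and a lower half
+	 * (T0dn) -- e.g. T0up[0]/T0dn[0] are the two words of
+	 * C64e(0xc632f4a5f497a5c6) above -- so the rounds can be computed
+	 * with sph_u32 arithmetic only.
+	 */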
C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), + C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), + C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), + C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), + C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), + C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), + C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), + C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), + C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), + C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), + C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), + C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), + C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), + C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), + C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), + C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), + C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), + C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), + C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), + C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), + C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), + C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), + C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), + C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), + C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), + C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), + C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), + C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), + C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), + C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), + C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), + C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), + C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), + C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), + C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), + C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), + C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), + C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), + C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), + C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), + C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), + C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), + C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), + C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), + C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), + C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), + C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), + C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), + C32e(0xc23cfea3), 
C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), + C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), + C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), + C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), + C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), + C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), + C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), + C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), + C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), + C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) +}; + +static const sph_u32 T0dn[] = { + C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), + C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), + C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), + C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), + C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), + C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), + C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), + C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), + C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), + C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), + C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), + C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), + C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), + C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), + C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), + C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), + C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), + C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), + C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), + C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), + C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), + C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), + C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), + C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), + C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), + C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), + C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), + C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), + C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), + C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), + C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), + C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), + C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), + C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), + C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), + C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), + C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), + C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), 
C32e(0x9e16830b), + C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), + C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), + C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), + C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), + C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), + C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), + C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), + C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), + C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), + C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), + C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), + C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), + C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), + C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), + C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), + C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), + C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), + C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), + C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), + C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), + C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), + C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), + C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), + C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), + C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), + C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) +}; + +static const sph_u32 T1up[] = { + C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), + C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), + C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), + C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), + C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), + C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), + C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), + C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), + C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), + C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), + C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), + C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), + C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), + C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), + C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), + C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), + C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), + C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), + C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), + C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), + C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), + C32e(0x4040e0a0), 
C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), + C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), + C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), + C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), + C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), + C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), + C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), + C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), + C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), + C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), + C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), + C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), + C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), + C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), + C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), + C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), + C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), + C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), + C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), + C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), + C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), + C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), + C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), + C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), + C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), + C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), + C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), + C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), + C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), + C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), + C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), + C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), + C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), + C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), + C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), + C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), + C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), + C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), + C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), + C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), + C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), + C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), + C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) +}; + +static const sph_u32 T1dn[] = { + C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), + C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), + C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), + C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), + C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), 
C32e(0x8792ef87), + C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), + C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), + C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), + C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), + C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), + C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), + C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), + C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), + C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), + C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), + C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), + C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), + C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), + C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), + C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), + C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), + C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), + C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), + C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), + C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), + C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), + C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), + C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), + C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), + C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), + C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), + C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), + C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), + C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), + C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), + C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), + C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), + C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), + C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), + C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), + C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), + C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), + C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), + C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), + C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), + C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), + C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), + C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), + C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), + C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), + C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), + C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), + C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), 
C32e(0xaae583aa), + C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), + C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), + C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), + C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), + C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), + C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), + C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), + C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), + C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), + C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), + C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) +}; + +static const sph_u32 T2up[] = { + C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), + C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), + C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), + C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), + C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), + C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), + C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), + C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), + C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), + C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), + C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), + C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), + C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), + C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), + C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), + C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), + C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), + C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), + C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), + C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), + C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), + C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), + C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), + C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), + C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), + C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), + C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), + C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), + C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), + C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), + C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), + C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), + C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), + C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), + C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), + C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), + C32e(0xa0c0c03b), 
C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), + C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), + C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), + C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), + C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), + C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), + C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), + C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), + C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), + C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), + C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), + C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), + C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), + C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), + C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), + C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), + C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), + C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), + C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), + C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), + C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), + C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), + C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), + C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), + C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), + C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), + C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), + C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) +}; + +static const sph_u32 T2dn[] = { + C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), + C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), + C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), + C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), + C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), + C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), + C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), + C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), + C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), + C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), + C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), + C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), + C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), + C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), + C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), + C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), + C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), + C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), + C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), + C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), 
C32e(0xa297a226), + C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), + C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), + C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), + C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), + C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), + C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), + C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), + C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), + C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), + C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), + C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), + C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), + C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), + C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), + C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), + C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), + C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), + C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), + C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), + C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), + C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), + C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), + C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), + C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), + C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), + C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), + C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), + C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), + C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), + C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), + C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), + C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), + C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), + C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), + C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), + C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), + C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), + C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), + C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), + C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), + C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), + C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), + C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), + C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) +}; + +static const sph_u32 T3up[] = { + C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), + C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), + C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), + C32e(0xd519e7e7), 
C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), + C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), + C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), + C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), + C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), + C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), + C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), + C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), + C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), + C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), + C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), + C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), + C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), + C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), + C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), + C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), + C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), + C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), + C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), + C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), + C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), + C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), + C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), + C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), + C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), + C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), + C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), + C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), + C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), + C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), + C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), + C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), + C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), + C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), + C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), + C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), + C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), + C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), + C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), + C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), + C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), + C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), + C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), + C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), + C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), + C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), + C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), + C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), + C32e(0x37dd9696), 
C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), + C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), + C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), + C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), + C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), + C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), + C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), + C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), + C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), + C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), + C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), + C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), + C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) +}; + +static const sph_u32 T3dn[] = { + C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), + C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), + C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), + C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), + C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), + C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), + C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), + C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), + C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), + C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), + C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), + C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), + C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), + C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), + C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), + C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), + C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), + C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), + C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), + C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), + C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), + C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), + C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), + C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), + C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), + C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), + C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), + C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), + C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), + C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), + C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), + C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), + C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), + C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), + C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), 
C32e(0xb3c947c9), + C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), + C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), + C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), + C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), + C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), + C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), + C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), + C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), + C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), + C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), + C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), + C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), + C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), + C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), + C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), + C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), + C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), + C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), + C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), + C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), + C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), + C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), + C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), + C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), + C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), + C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), + C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), + C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), + C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) +}; + +#define DECL_STATE_SMALL \ + sph_u32 H[16]; + +#define READ_STATE_SMALL(sc) do { \ + memcpy(H, (sc)->state.narrow, sizeof H); \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + memcpy((sc)->state.narrow, H, sizeof H); \ + } while (0) + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ T2up[B32_2(a[b2])] \ + ^ T3up[B32_3(a[b3])] \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ T2dn[B32_2(a[b6])] \ + ^ T3dn[B32_3(a[b7])]; \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ T2dn[B32_2(a[b2])] \ + ^ T3dn[B32_3(a[b3])] \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ T2up[B32_2(a[b6])] \ + ^ T3up[B32_3(a[b7])]; \ + } while (0) + +#define ROUND_SMALL_P(a, r) do { \ + sph_u32 t[16]; \ + a[0x0] ^= PC32up(0x00, r); \ + a[0x1] ^= PC32dn(0x00, r); \ + a[0x2] ^= PC32up(0x10, r); \ + a[0x3] ^= PC32dn(0x10, r); \ + a[0x4] ^= PC32up(0x20, r); \ + a[0x5] ^= PC32dn(0x20, r); \ + a[0x6] ^= PC32up(0x30, r); \ + a[0x7] ^= PC32dn(0x30, r); \ + a[0x8] ^= PC32up(0x40, r); \ + a[0x9] ^= PC32dn(0x40, r); \ + a[0xA] ^= PC32up(0x50, r); \ + a[0xB] ^= PC32dn(0x50, r); \ + a[0xC] ^= PC32up(0x60, r); \ + a[0xD] ^= PC32dn(0x60, r); \ + a[0xE] ^= PC32up(0x70, r); \ + a[0xF] ^= PC32dn(0x70, r); \ + RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 
0x6, 0x9, 0xB, 0xD, 0xF); \ + RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); \ + RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); \ + RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); \ + RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); \ + RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); \ + RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); \ + RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + sph_u32 t[16]; \ + a[0x0] ^= QC32up(0x00, r); \ + a[0x1] ^= QC32dn(0x00, r); \ + a[0x2] ^= QC32up(0x10, r); \ + a[0x3] ^= QC32dn(0x10, r); \ + a[0x4] ^= QC32up(0x20, r); \ + a[0x5] ^= QC32dn(0x20, r); \ + a[0x6] ^= QC32up(0x30, r); \ + a[0x7] ^= QC32dn(0x30, r); \ + a[0x8] ^= QC32up(0x40, r); \ + a[0x9] ^= QC32dn(0x40, r); \ + a[0xA] ^= QC32up(0x50, r); \ + a[0xB] ^= QC32dn(0x50, r); \ + a[0xC] ^= QC32up(0x60, r); \ + a[0xD] ^= QC32dn(0x60, r); \ + a[0xE] ^= QC32up(0x70, r); \ + a[0xF] ^= QC32dn(0x70, r); \ + RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); \ + RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); \ + RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); \ + RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); \ + RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); \ + RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); \ + RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); \ + RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_SMALL do { \ + sph_u32 g[16], m[16]; \ + size_t u; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec32e_aligned(buf + (u << 2)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_SMALL_P(g); \ + PERM_SMALL_Q(m); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_SMALL do { \ + sph_u32 x[16]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_SMALL_P(x); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#define DECL_STATE_BIG \ + sph_u32 H[32]; + +#define READ_STATE_BIG(sc) do { \ + memcpy(H, (sc)->state.narrow, sizeof H); \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + memcpy((sc)->state.narrow, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + sph_u32 fu2 = T0up[B32_2(a[b2])]; \ + sph_u32 fd2 = T0dn[B32_2(a[b2])]; \ + sph_u32 fu3 = T1up[B32_3(a[b3])]; \ + sph_u32 fd3 = T1dn[B32_3(a[b3])]; \ + sph_u32 fu6 = T0up[B32_2(a[b6])]; \ + sph_u32 fd6 = T0dn[B32_2(a[b6])]; \ + sph_u32 fu7 = T1up[B32_3(a[b7])]; \ + sph_u32 fd7 = T1dn[B32_3(a[b7])]; \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ R32u(fu2, fd2) \ + ^ R32u(fu3, fd3) \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ R32d(fu6, fd6) \ + ^ R32d(fu7, fd7); \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ 
T1dn[B32_1(a[b1])] \ + ^ R32d(fu2, fd2) \ + ^ R32d(fu3, fd3) \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ R32u(fu6, fd6) \ + ^ R32u(fu7, fd7); \ + } while (0) + +#else + +#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ T2up[B32_2(a[b2])] \ + ^ T3up[B32_3(a[b3])] \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ T2dn[B32_2(a[b6])] \ + ^ T3dn[B32_3(a[b7])]; \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ T2dn[B32_2(a[b2])] \ + ^ T3dn[B32_3(a[b3])] \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ T2up[B32_2(a[b6])] \ + ^ T3up[B32_3(a[b7])]; \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define ROUND_BIG_P(a, r) do { \ + sph_u32 t[32]; \ + size_t u; \ + a[0x00] ^= PC32up(0x00, r); \ + a[0x01] ^= PC32dn(0x00, r); \ + a[0x02] ^= PC32up(0x10, r); \ + a[0x03] ^= PC32dn(0x10, r); \ + a[0x04] ^= PC32up(0x20, r); \ + a[0x05] ^= PC32dn(0x20, r); \ + a[0x06] ^= PC32up(0x30, r); \ + a[0x07] ^= PC32dn(0x30, r); \ + a[0x08] ^= PC32up(0x40, r); \ + a[0x09] ^= PC32dn(0x40, r); \ + a[0x0A] ^= PC32up(0x50, r); \ + a[0x0B] ^= PC32dn(0x50, r); \ + a[0x0C] ^= PC32up(0x60, r); \ + a[0x0D] ^= PC32dn(0x60, r); \ + a[0x0E] ^= PC32up(0x70, r); \ + a[0x0F] ^= PC32dn(0x70, r); \ + a[0x10] ^= PC32up(0x80, r); \ + a[0x11] ^= PC32dn(0x80, r); \ + a[0x12] ^= PC32up(0x90, r); \ + a[0x13] ^= PC32dn(0x90, r); \ + a[0x14] ^= PC32up(0xA0, r); \ + a[0x15] ^= PC32dn(0xA0, r); \ + a[0x16] ^= PC32up(0xB0, r); \ + a[0x17] ^= PC32dn(0xB0, r); \ + a[0x18] ^= PC32up(0xC0, r); \ + a[0x19] ^= PC32dn(0xC0, r); \ + a[0x1A] ^= PC32up(0xD0, r); \ + a[0x1B] ^= PC32dn(0xD0, r); \ + a[0x1C] ^= PC32up(0xE0, r); \ + a[0x1D] ^= PC32dn(0xE0, r); \ + a[0x1E] ^= PC32up(0xF0, r); \ + a[0x1F] ^= PC32dn(0xF0, r); \ + for (u = 0; u < 32; u += 8) { \ + RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ + u + 0x00, (u + 0x02) & 0x1F, \ + (u + 0x04) & 0x1F, (u + 0x06) & 0x1F, \ + (u + 0x09) & 0x1F, (u + 0x0B) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x17) & 0x1F); \ + RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ + u + 0x02, (u + 0x04) & 0x1F, \ + (u + 0x06) & 0x1F, (u + 0x08) & 0x1F, \ + (u + 0x0B) & 0x1F, (u + 0x0D) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x19) & 0x1F); \ + RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ + u + 0x04, (u + 0x06) & 0x1F, \ + (u + 0x08) & 0x1F, (u + 0x0A) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x0F) & 0x1F, \ + (u + 0x11) & 0x1F, (u + 0x1B) & 0x1F); \ + RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ + u + 0x06, (u + 0x08) & 0x1F, \ + (u + 0x0A) & 0x1F, (u + 0x0C) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x11) & 0x1F, \ + (u + 0x13) & 0x1F, (u + 0x1D) & 0x1F); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u32 t[32]; \ + size_t u; \ + a[0x00] ^= QC32up(0x00, r); \ + a[0x01] ^= QC32dn(0x00, r); \ + a[0x02] ^= QC32up(0x10, r); \ + a[0x03] ^= QC32dn(0x10, r); \ + a[0x04] ^= QC32up(0x20, r); \ + a[0x05] ^= QC32dn(0x20, r); \ + a[0x06] ^= QC32up(0x30, r); \ + a[0x07] ^= QC32dn(0x30, r); \ + a[0x08] ^= QC32up(0x40, r); \ + a[0x09] ^= QC32dn(0x40, r); \ + a[0x0A] ^= QC32up(0x50, r); \ + a[0x0B] ^= QC32dn(0x50, r); \ + a[0x0C] ^= QC32up(0x60, r); \ + a[0x0D] ^= QC32dn(0x60, r); \ + a[0x0E] ^= QC32up(0x70, r); \ + a[0x0F] ^= QC32dn(0x70, r); \ + a[0x10] ^= QC32up(0x80, r); \ + a[0x11] ^= QC32dn(0x80, r); \ + a[0x12] ^= QC32up(0x90, r); \ + a[0x13] ^= QC32dn(0x90, r); \ + a[0x14] ^= QC32up(0xA0, r); \ + a[0x15] ^= QC32dn(0xA0, r); \ + a[0x16] ^= QC32up(0xB0, r); \ + a[0x17] ^= QC32dn(0xB0, r); \ + a[0x18] ^= 
QC32up(0xC0, r); \ + a[0x19] ^= QC32dn(0xC0, r); \ + a[0x1A] ^= QC32up(0xD0, r); \ + a[0x1B] ^= QC32dn(0xD0, r); \ + a[0x1C] ^= QC32up(0xE0, r); \ + a[0x1D] ^= QC32dn(0xE0, r); \ + a[0x1E] ^= QC32up(0xF0, r); \ + a[0x1F] ^= QC32dn(0xF0, r); \ + for (u = 0; u < 32; u += 8) { \ + RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ + (u + 0x02) & 0x1F, (u + 0x06) & 0x1F, \ + (u + 0x0A) & 0x1F, (u + 0x16) & 0x1F, \ + (u + 0x01) & 0x1F, (u + 0x05) & 0x1F, \ + (u + 0x09) & 0x1F, (u + 0x0D) & 0x1F); \ + RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ + (u + 0x04) & 0x1F, (u + 0x08) & 0x1F, \ + (u + 0x0C) & 0x1F, (u + 0x18) & 0x1F, \ + (u + 0x03) & 0x1F, (u + 0x07) & 0x1F, \ + (u + 0x0B) & 0x1F, (u + 0x0F) & 0x1F); \ + RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ + (u + 0x06) & 0x1F, (u + 0x0A) & 0x1F, \ + (u + 0x0E) & 0x1F, (u + 0x1A) & 0x1F, \ + (u + 0x05) & 0x1F, (u + 0x09) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x11) & 0x1F); \ + RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ + (u + 0x08) & 0x1F, (u + 0x0C) & 0x1F, \ + (u + 0x10) & 0x1F, (u + 0x1C) & 0x1F, \ + (u + 0x07) & 0x1F, (u + 0x0B) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x13) & 0x1F); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#else + +#define ROUND_BIG_P(a, r) do { \ + sph_u32 t[32]; \ + a[0x00] ^= PC32up(0x00, r); \ + a[0x01] ^= PC32dn(0x00, r); \ + a[0x02] ^= PC32up(0x10, r); \ + a[0x03] ^= PC32dn(0x10, r); \ + a[0x04] ^= PC32up(0x20, r); \ + a[0x05] ^= PC32dn(0x20, r); \ + a[0x06] ^= PC32up(0x30, r); \ + a[0x07] ^= PC32dn(0x30, r); \ + a[0x08] ^= PC32up(0x40, r); \ + a[0x09] ^= PC32dn(0x40, r); \ + a[0x0A] ^= PC32up(0x50, r); \ + a[0x0B] ^= PC32dn(0x50, r); \ + a[0x0C] ^= PC32up(0x60, r); \ + a[0x0D] ^= PC32dn(0x60, r); \ + a[0x0E] ^= PC32up(0x70, r); \ + a[0x0F] ^= PC32dn(0x70, r); \ + a[0x10] ^= PC32up(0x80, r); \ + a[0x11] ^= PC32dn(0x80, r); \ + a[0x12] ^= PC32up(0x90, r); \ + a[0x13] ^= PC32dn(0x90, r); \ + a[0x14] ^= PC32up(0xA0, r); \ + a[0x15] ^= PC32dn(0xA0, r); \ + a[0x16] ^= PC32up(0xB0, r); \ + a[0x17] ^= PC32dn(0xB0, r); \ + a[0x18] ^= PC32up(0xC0, r); \ + a[0x19] ^= PC32dn(0xC0, r); \ + a[0x1A] ^= PC32up(0xD0, r); \ + a[0x1B] ^= PC32dn(0xD0, r); \ + a[0x1C] ^= PC32up(0xE0, r); \ + a[0x1D] ^= PC32dn(0xE0, r); \ + a[0x1E] ^= PC32up(0xF0, r); \ + a[0x1F] ^= PC32dn(0xF0, r); \ + RBTT(0x00, 0x01, a, \ + 0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x17); \ + RBTT(0x02, 0x03, a, \ + 0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x19); \ + RBTT(0x04, 0x05, a, \ + 0x04, 0x06, 0x08, 0x0A, 0x0D, 0x0F, 0x11, 0x1B); \ + RBTT(0x06, 0x07, a, \ + 0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x11, 0x13, 0x1D); \ + RBTT(0x08, 0x09, a, \ + 0x08, 0x0A, 0x0C, 0x0E, 0x11, 0x13, 0x15, 0x1F); \ + RBTT(0x0A, 0x0B, a, \ + 0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x15, 0x17, 0x01); \ + RBTT(0x0C, 0x0D, a, \ + 0x0C, 0x0E, 0x10, 0x12, 0x15, 0x17, 0x19, 0x03); \ + RBTT(0x0E, 0x0F, a, \ + 0x0E, 0x10, 0x12, 0x14, 0x17, 0x19, 0x1B, 0x05); \ + RBTT(0x10, 0x11, a, \ + 0x10, 0x12, 0x14, 0x16, 0x19, 0x1B, 0x1D, 0x07); \ + RBTT(0x12, 0x13, a, \ + 0x12, 0x14, 0x16, 0x18, 0x1B, 0x1D, 0x1F, 0x09); \ + RBTT(0x14, 0x15, a, \ + 0x14, 0x16, 0x18, 0x1A, 0x1D, 0x1F, 0x01, 0x0B); \ + RBTT(0x16, 0x17, a, \ + 0x16, 0x18, 0x1A, 0x1C, 0x1F, 0x01, 0x03, 0x0D); \ + RBTT(0x18, 0x19, a, \ + 0x18, 0x1A, 0x1C, 0x1E, 0x01, 0x03, 0x05, 0x0F); \ + RBTT(0x1A, 0x1B, a, \ + 0x1A, 0x1C, 0x1E, 0x00, 0x03, 0x05, 0x07, 0x11); \ + RBTT(0x1C, 0x1D, a, \ + 0x1C, 0x1E, 0x00, 0x02, 0x05, 0x07, 0x09, 0x13); \ + RBTT(0x1E, 0x1F, a, \ + 0x1E, 0x00, 0x02, 0x04, 0x07, 0x09, 0x0B, 0x15); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, 
r) do { \ + sph_u32 t[32]; \ + a[0x00] ^= QC32up(0x00, r); \ + a[0x01] ^= QC32dn(0x00, r); \ + a[0x02] ^= QC32up(0x10, r); \ + a[0x03] ^= QC32dn(0x10, r); \ + a[0x04] ^= QC32up(0x20, r); \ + a[0x05] ^= QC32dn(0x20, r); \ + a[0x06] ^= QC32up(0x30, r); \ + a[0x07] ^= QC32dn(0x30, r); \ + a[0x08] ^= QC32up(0x40, r); \ + a[0x09] ^= QC32dn(0x40, r); \ + a[0x0A] ^= QC32up(0x50, r); \ + a[0x0B] ^= QC32dn(0x50, r); \ + a[0x0C] ^= QC32up(0x60, r); \ + a[0x0D] ^= QC32dn(0x60, r); \ + a[0x0E] ^= QC32up(0x70, r); \ + a[0x0F] ^= QC32dn(0x70, r); \ + a[0x10] ^= QC32up(0x80, r); \ + a[0x11] ^= QC32dn(0x80, r); \ + a[0x12] ^= QC32up(0x90, r); \ + a[0x13] ^= QC32dn(0x90, r); \ + a[0x14] ^= QC32up(0xA0, r); \ + a[0x15] ^= QC32dn(0xA0, r); \ + a[0x16] ^= QC32up(0xB0, r); \ + a[0x17] ^= QC32dn(0xB0, r); \ + a[0x18] ^= QC32up(0xC0, r); \ + a[0x19] ^= QC32dn(0xC0, r); \ + a[0x1A] ^= QC32up(0xD0, r); \ + a[0x1B] ^= QC32dn(0xD0, r); \ + a[0x1C] ^= QC32up(0xE0, r); \ + a[0x1D] ^= QC32dn(0xE0, r); \ + a[0x1E] ^= QC32up(0xF0, r); \ + a[0x1F] ^= QC32dn(0xF0, r); \ + RBTT(0x00, 0x01, a, \ + 0x02, 0x06, 0x0A, 0x16, 0x01, 0x05, 0x09, 0x0D); \ + RBTT(0x02, 0x03, a, \ + 0x04, 0x08, 0x0C, 0x18, 0x03, 0x07, 0x0B, 0x0F); \ + RBTT(0x04, 0x05, a, \ + 0x06, 0x0A, 0x0E, 0x1A, 0x05, 0x09, 0x0D, 0x11); \ + RBTT(0x06, 0x07, a, \ + 0x08, 0x0C, 0x10, 0x1C, 0x07, 0x0B, 0x0F, 0x13); \ + RBTT(0x08, 0x09, a, \ + 0x0A, 0x0E, 0x12, 0x1E, 0x09, 0x0D, 0x11, 0x15); \ + RBTT(0x0A, 0x0B, a, \ + 0x0C, 0x10, 0x14, 0x00, 0x0B, 0x0F, 0x13, 0x17); \ + RBTT(0x0C, 0x0D, a, \ + 0x0E, 0x12, 0x16, 0x02, 0x0D, 0x11, 0x15, 0x19); \ + RBTT(0x0E, 0x0F, a, \ + 0x10, 0x14, 0x18, 0x04, 0x0F, 0x13, 0x17, 0x1B); \ + RBTT(0x10, 0x11, a, \ + 0x12, 0x16, 0x1A, 0x06, 0x11, 0x15, 0x19, 0x1D); \ + RBTT(0x12, 0x13, a, \ + 0x14, 0x18, 0x1C, 0x08, 0x13, 0x17, 0x1B, 0x1F); \ + RBTT(0x14, 0x15, a, \ + 0x16, 0x1A, 0x1E, 0x0A, 0x15, 0x19, 0x1D, 0x01); \ + RBTT(0x16, 0x17, a, \ + 0x18, 0x1C, 0x00, 0x0C, 0x17, 0x1B, 0x1F, 0x03); \ + RBTT(0x18, 0x19, a, \ + 0x1A, 0x1E, 0x02, 0x0E, 0x19, 0x1D, 0x01, 0x05); \ + RBTT(0x1A, 0x1B, a, \ + 0x1C, 0x00, 0x04, 0x10, 0x1B, 0x1F, 0x03, 0x07); \ + RBTT(0x1C, 0x1D, a, \ + 0x1E, 0x02, 0x06, 0x12, 0x1D, 0x01, 0x05, 0x09); \ + RBTT(0x1E, 0x1F, a, \ + 0x00, 0x04, 0x08, 0x14, 0x1F, 0x03, 0x07, 0x0B); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r ++) \ + ROUND_BIG_P(a, r); \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r ++) \ + ROUND_BIG_Q(a, r); \ + } while (0) + +#else + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_P(a, r + 0); \ + ROUND_BIG_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_Q(a, r + 0); \ + ROUND_BIG_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_BIG do { \ + sph_u32 g[32], m[32]; \ + size_t u; \ + for (u = 0; u < 32; u ++) { \ + m[u] = dec32e_aligned(buf + (u << 2)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_BIG_P(g); \ + PERM_BIG_Q(m); \ + for (u = 0; u < 32; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_BIG do { \ + sph_u32 x[32]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_BIG_P(x); \ + for (u = 0; u < 32; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#endif + +static void +groestl_small_init(sph_groestl_small_context *sc, unsigned out_size) +{ + size_t u; + + sc->ptr = 0; +#if SPH_GROESTL_64 + for (u = 0; u < 7; u ++) + 
	sc->state.wide[u] = 0;
+#if USE_LE
+	sc->state.wide[7] = ((sph_u64)(out_size & 0xFF) << 56)
+		| ((sph_u64)(out_size & 0xFF00) << 40);
+#else
+	sc->state.wide[7] = (sph_u64)out_size;
+#endif
+#else
+	for (u = 0; u < 15; u ++)
+		sc->state.narrow[u] = 0;
+#if USE_LE
+	sc->state.narrow[15] = ((sph_u32)(out_size & 0xFF) << 24)
+		| ((sph_u32)(out_size & 0xFF00) << 8);
+#else
+	sc->state.narrow[15] = (sph_u32)out_size;
+#endif
+#endif
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = 0;
+	sc->count_low = 0;
+#endif
+}
+
+static void
+groestl_small_core(sph_groestl_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE_SMALL
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE_SMALL(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			COMPRESS_SMALL;
+#if SPH_64
+			sc->count ++;
+#else
+			if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0)
+				sc->count_high = SPH_T32(sc->count_high + 1);
+#endif
+			ptr = 0;
+		}
+	}
+	WRITE_STATE_SMALL(sc);
+	sc->ptr = ptr;
+}
+
+static void
+groestl_small_close(sph_groestl_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	unsigned char pad[72];
+	size_t u, ptr, pad_len;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+	unsigned z;
+	DECL_STATE_SMALL
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n;
+	pad[0] = ((ub & -z) | z) & 0xFF;
+	if (ptr < 56) {
+		pad_len = 64 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 1);
+#else
+		count_low = SPH_T32(sc->count_low + 1);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low == 0)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	} else {
+		pad_len = 128 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 2);
+#else
+		count_low = SPH_T32(sc->count_low + 2);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low <= 1)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	}
+	memset(pad + 1, 0, pad_len - 9);
+#if SPH_64
+	sph_enc64be(pad + pad_len - 8, count);
+#else
+	sph_enc32be(pad + pad_len - 8, count_high);
+	sph_enc32be(pad + pad_len - 4, count_low);
+#endif
+	groestl_small_core(sc, pad, pad_len);
+	READ_STATE_SMALL(sc);
+	FINAL_SMALL;
+#if SPH_GROESTL_64
+	for (u = 0; u < 4; u ++)
+		enc64e(pad + (u << 3), H[u + 4]);
+#else
+	for (u = 0; u < 8; u ++)
+		enc32e(pad + (u << 2), H[u + 8]);
+#endif
+	memcpy(dst, pad + 32 - out_len, out_len);
+	groestl_small_init(sc, (unsigned)out_len << 3);
+}
+
+static void
+groestl_big_init(sph_groestl_big_context *sc, unsigned out_size)
+{
+	size_t u;
+
+	sc->ptr = 0;
+#if SPH_GROESTL_64
+	for (u = 0; u < 15; u ++)
+		sc->state.wide[u] = 0;
+#if USE_LE
+	sc->state.wide[15] = ((sph_u64)(out_size & 0xFF) << 56)
+		| ((sph_u64)(out_size & 0xFF00) << 40);
+#else
+	sc->state.wide[15] = (sph_u64)out_size;
+#endif
+#else
+	for (u = 0; u < 31; u ++)
+		sc->state.narrow[u] = 0;
+#if USE_LE
+	sc->state.narrow[31] = ((sph_u32)(out_size & 0xFF) << 24)
+		| ((sph_u32)(out_size & 0xFF00) << 8);
+#else
+	sc->state.narrow[31] = (sph_u32)out_size;
+#endif
+#endif
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = 0;
+	sc->count_low = 0;
+#endif
+}
+
+static void
+groestl_big_core(sph_groestl_big_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE_BIG
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE_BIG(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			COMPRESS_BIG;
+#if SPH_64
+			sc->count ++;
+#else
+			if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0)
+				sc->count_high = SPH_T32(sc->count_high + 1);
+#endif
+			ptr = 0;
+		}
+	}
+	WRITE_STATE_BIG(sc);
+	sc->ptr = ptr;
+}
+
+static void
+groestl_big_close(sph_groestl_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_len)
+{
+	unsigned char *buf;
+	unsigned char pad[136];
+	size_t ptr, pad_len, u;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+	unsigned z;
+	DECL_STATE_BIG
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n;
+	pad[0] = ((ub & -z) | z) & 0xFF;
+	if (ptr < 120) {
+		pad_len = 128 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 1);
+#else
+		count_low = SPH_T32(sc->count_low + 1);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low == 0)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	} else {
+		pad_len = 256 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 2);
+#else
+		count_low = SPH_T32(sc->count_low + 2);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low <= 1)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	}
+	memset(pad + 1, 0, pad_len - 9);
+#if SPH_64
+	sph_enc64be(pad + pad_len - 8, count);
+#else
+	sph_enc32be(pad + pad_len - 8, count_high);
+	sph_enc32be(pad + pad_len - 4, count_low);
+#endif
+	groestl_big_core(sc, pad, pad_len);
+	READ_STATE_BIG(sc);
+	FINAL_BIG;
+#if SPH_GROESTL_64
+	for (u = 0; u < 8; u ++)
+		enc64e(pad + (u << 3), H[u + 8]);
+#else
+	for (u = 0; u < 16; u ++)
+		enc32e(pad + (u << 2), H[u + 16]);
+#endif
+	memcpy(dst, pad + 64 - out_len, out_len);
+	groestl_big_init(sc, (unsigned)out_len << 3);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl224_init(void *cc)
+{
+	groestl_small_init(cc, 224);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl224(void *cc, const void *data, size_t len)
+{
+	groestl_small_core(cc, data, len);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl224_close(void *cc, void *dst)
+{
+	groestl_small_close(cc, 0, 0, dst, 28);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_small_close(cc, ub, n, dst, 28);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl256_init(void *cc)
+{
+	groestl_small_init(cc, 256);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl256(void *cc, const void *data, size_t len)
+{
+	groestl_small_core(cc, data, len);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl256_close(void *cc, void *dst)
+{
+	groestl_small_close(cc, 0, 0, dst, 32);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_small_close(cc, ub, n, dst, 32);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl384_init(void *cc)
+{
+	groestl_big_init(cc, 384);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl384(void *cc, const void *data, size_t len)
+{
+	groestl_big_core(cc, data, len);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl384_close(void *cc, void *dst)
+{
+	groestl_big_close(cc, 0, 0, dst, 48);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
unsigned n, void *dst) +{ + groestl_big_close(cc, ub, n, dst, 48); +} + +/* see sph_groestl.h */ +void +sph_groestl512_init(void *cc) +{ + groestl_big_init(cc, 512); +} + +/* see sph_groestl.h */ +void +sph_groestl512(void *cc, const void *data, size_t len) +{ + groestl_big_core(cc, data, len); +} + +/* see sph_groestl.h */ +void +sph_groestl512_close(void *cc, void *dst) +{ + groestl_big_close(cc, 0, 0, dst, 64); +} + +/* see sph_groestl.h */ +void +sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + groestl_big_close(cc, ub, n, dst, 64); +} + +#ifdef __cplusplus +} +#endif diff --git a/sha3/jh.c b/sha3/jh.c new file mode 100644 index 0000000..41487a5 --- /dev/null +++ b/sha3/jh.c @@ -0,0 +1,1116 @@ +/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */ +/* + * JH implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_jh.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH +#define SPH_SMALL_FOOTPRINT_JH 1 +#endif + +#if !defined SPH_JH_64 && SPH_64_TRUE +#define SPH_JH_64 1 +#endif + +#if !SPH_64 +#undef SPH_JH_64 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +/* + * The internal bitslice representation may use either big-endian or + * little-endian (true bitslice operations do not care about the bit + * ordering, and the bit-swapping linear operations in JH happen to + * be invariant through endianness-swapping). The constants must be + * defined according to the chosen endianness; we use some + * byte-swapping macros for that. 
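+ *
+ * Concretely: on a little-endian machine C64e() byte-swaps its
+ * argument, so C64e(0x0123456789abcdef) evaluates to
+ * 0xefcdab8967452301, while on a big-endian machine it is the
+ * identity; in both cases the constant, once stored as a native
+ * word, presents the same byte sequence to the bitsliced code.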
+ */ + +#if SPH_LITTLE_ENDIAN + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le + +#if SPH_64 +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le +#endif + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#if SPH_64 +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be +#endif + +#endif + +#define Sb(x0, x1, x2, x3, c) do { \ + x3 = ~x3; \ + x0 ^= (c) & ~x2; \ + tmp = (c) ^ (x0 & x1); \ + x0 ^= x2 & x3; \ + x3 ^= ~x1 & x2; \ + x1 ^= x0 & x2; \ + x2 ^= x0 & ~x3; \ + x0 ^= x1 | x3; \ + x3 ^= x1 & x2; \ + x1 ^= tmp & x0; \ + x2 ^= tmp; \ + } while (0) + +#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + x4 ^= x1; \ + x5 ^= x2; \ + x6 ^= x3 ^ x0; \ + x7 ^= x0; \ + x0 ^= x5; \ + x1 ^= x6; \ + x2 ^= x7 ^ x4; \ + x3 ^= x4; \ + } while (0) + +#if SPH_JH_64 + +static const sph_u64 C[] = { + C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557), + C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40), + C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a), + C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231), + C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410), + C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc), + C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0), + C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3), + C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce), + C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23), + C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8), + C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197), + C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95), + C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214), + C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80), + C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4), + C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989), + C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36), + C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7), + C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f), + C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727), + C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b), + C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e), + C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062), + C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984), + C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5), + C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2), + C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f), + C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465), + C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a), + C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1), + C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf), + C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48), + C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0), + C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134), + C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a), + C64e(0x88401d63a06cf615), 
C64e(0x47c1444b8752afff), + C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6), + C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae), + C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567), + C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a), + C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518), + C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446), + C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e), + C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee), + C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001), + C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779), + C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83), + C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a), + C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef), + C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d), + C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65), + C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a), + C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c), + C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d), + C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71), + C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc), + C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0), + C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c), + C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f), + C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751), + C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad), + C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56), + C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6), + C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a), + C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163), + C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826), + C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f), + C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30), + C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a), + C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3), + C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505), + C64e(0xb17681d913326cce), C64e(0x3c175284f805a262), + C64e(0xf42bcbb378471547), C64e(0xff46548223936a48), + C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e), + C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e), + C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd), + C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7), + C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be), + C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de), + C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9), + C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a), + C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b), + C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2) +}; + +#define Ceven_hi(r) (C[((r) << 2) + 0]) +#define Ceven_lo(r) (C[((r) << 2) + 1]) +#define Codd_hi(r) (C[((r) << 2) + 2]) +#define Codd_lo(r) (C[((r) << 2) + 3]) + +#define S(x0, x1, x2, x3, cb, r) do { \ + Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ + Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ + x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ + Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ + x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u64 t = (x ## h & (c)) << (n); \ + x ## h = ((x ## h >> (n)) & (c)) | t; \ + t = (x ## l & (c)) << (n); \ + x ## l = ((x ## l >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) +#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) +#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) 
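+/*
+ * The Wz macros implement the bit-group swap layers of JH's E8 round
+ * function: Wz(x, c, n) exchanges every n-bit group selected by the
+ * mask c with its neighbouring group, on both 64-bit halves of a
+ * bitsliced word. A minimal sketch of the same permutation on a
+ * single word (hypothetical helper, not part of this file's API):
+ *
+ *   static sph_u64
+ *   wz_word(sph_u64 x, sph_u64 c, unsigned n)
+ *   {
+ *       sph_u64 t = (x & c) << n;      // selected groups move up
+ *       return ((x >> n) & c) | t;     // their neighbours move down
+ *   }
+ *
+ * With c = 0x5555555555555555 and n = 1 (W0) this exchanges each even
+ * bit with the odd bit above it; W6 swaps the two 64-bit halves
+ * outright, which is why it is written as a plain exchange rather
+ * than through Wz.
+ */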
+#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16) +#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) +#define W6(x) do { \ + sph_u64 t = x ## h; \ + x ## h = x ## l; \ + x ## l = t; \ + } while (0) + +#define DECL_STATE \ + sph_u64 h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \ + sph_u64 h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \ + sph_u64 tmp; + +#define READ_STATE(state) do { \ + h0h = (state)->H.wide[ 0]; \ + h0l = (state)->H.wide[ 1]; \ + h1h = (state)->H.wide[ 2]; \ + h1l = (state)->H.wide[ 3]; \ + h2h = (state)->H.wide[ 4]; \ + h2l = (state)->H.wide[ 5]; \ + h3h = (state)->H.wide[ 6]; \ + h3l = (state)->H.wide[ 7]; \ + h4h = (state)->H.wide[ 8]; \ + h4l = (state)->H.wide[ 9]; \ + h5h = (state)->H.wide[10]; \ + h5l = (state)->H.wide[11]; \ + h6h = (state)->H.wide[12]; \ + h6l = (state)->H.wide[13]; \ + h7h = (state)->H.wide[14]; \ + h7l = (state)->H.wide[15]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->H.wide[ 0] = h0h; \ + (state)->H.wide[ 1] = h0l; \ + (state)->H.wide[ 2] = h1h; \ + (state)->H.wide[ 3] = h1l; \ + (state)->H.wide[ 4] = h2h; \ + (state)->H.wide[ 5] = h2l; \ + (state)->H.wide[ 6] = h3h; \ + (state)->H.wide[ 7] = h3l; \ + (state)->H.wide[ 8] = h4h; \ + (state)->H.wide[ 9] = h4l; \ + (state)->H.wide[10] = h5h; \ + (state)->H.wide[11] = h5l; \ + (state)->H.wide[12] = h6h; \ + (state)->H.wide[13] = h6l; \ + (state)->H.wide[14] = h7h; \ + (state)->H.wide[15] = h7l; \ + } while (0) + +#define INPUT_BUF1 \ + sph_u64 m0h = dec64e_aligned(buf + 0); \ + sph_u64 m0l = dec64e_aligned(buf + 8); \ + sph_u64 m1h = dec64e_aligned(buf + 16); \ + sph_u64 m1l = dec64e_aligned(buf + 24); \ + sph_u64 m2h = dec64e_aligned(buf + 32); \ + sph_u64 m2l = dec64e_aligned(buf + 40); \ + sph_u64 m3h = dec64e_aligned(buf + 48); \ + sph_u64 m3l = dec64e_aligned(buf + 56); \ + h0h ^= m0h; \ + h0l ^= m0l; \ + h1h ^= m1h; \ + h1l ^= m1l; \ + h2h ^= m2h; \ + h2l ^= m2l; \ + h3h ^= m3h; \ + h3l ^= m3l; + +#define INPUT_BUF2 \ + h4h ^= m0h; \ + h4l ^= m0l; \ + h5h ^= m1h; \ + h5l ^= m1l; \ + h6h ^= m2h; \ + h6l ^= m2l; \ + h7h ^= m3h; \ + h7l ^= m3l; + +static const sph_u64 IV224[] = { + C64e(0x2dfedd62f99a98ac), C64e(0xae7cacd619d634e7), + C64e(0xa4831005bc301216), C64e(0xb86038c6c9661494), + C64e(0x66d9899f2580706f), C64e(0xce9ea31b1d9b1adc), + C64e(0x11e8325f7b366e10), C64e(0xf994857f02fa06c1), + C64e(0x1b4f1b5cd8c840b3), C64e(0x97f6a17f6e738099), + C64e(0xdcdf93a5adeaa3d3), C64e(0xa431e8dec9539a68), + C64e(0x22b4a98aec86a1e4), C64e(0xd574ac959ce56cf0), + C64e(0x15960deab5ab2bbf), C64e(0x9611dcf0dd64ea6e) +}; + +static const sph_u64 IV256[] = { + C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1), + C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03), + C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477), + C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8), + C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262), + C64e(0x277695f776248f94), C64e(0x87d5b6574780296c), + C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f), + C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769) +}; + +static const sph_u64 IV384[] = { + C64e(0x481e3bc6d813398a), C64e(0x6d3b5e894ade879b), + C64e(0x63faea68d480ad2e), C64e(0x332ccb21480f8267), + C64e(0x98aec84d9082b928), C64e(0xd455ea3041114249), + C64e(0x36f555b2924847ec), C64e(0xc7250a93baf43ce1), + C64e(0x569b7f8a27db454c), C64e(0x9efcbd496397af0e), + C64e(0x589fc27d26aa80cd), C64e(0x80c08b8c9deb2eda), + C64e(0x8a7981e8f8d5373a), C64e(0xf43967adddd17a71), + C64e(0xa9b4d3bda475d394), C64e(0x976c3fba9842737f) +}; + +static const sph_u64 IV512[] = { + 
C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543), + C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361), + C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80), + C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7), + C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a), + C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199), + C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156), + C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b) +}; + +#else + +static const sph_u32 C[] = { + C32e(0x72d5dea2), C32e(0xdf15f867), C32e(0x7b84150a), + C32e(0xb7231557), C32e(0x81abd690), C32e(0x4d5a87f6), + C32e(0x4e9f4fc5), C32e(0xc3d12b40), C32e(0xea983ae0), + C32e(0x5c45fa9c), C32e(0x03c5d299), C32e(0x66b2999a), + C32e(0x660296b4), C32e(0xf2bb538a), C32e(0xb556141a), + C32e(0x88dba231), C32e(0x03a35a5c), C32e(0x9a190edb), + C32e(0x403fb20a), C32e(0x87c14410), C32e(0x1c051980), + C32e(0x849e951d), C32e(0x6f33ebad), C32e(0x5ee7cddc), + C32e(0x10ba1392), C32e(0x02bf6b41), C32e(0xdc786515), + C32e(0xf7bb27d0), C32e(0x0a2c8139), C32e(0x37aa7850), + C32e(0x3f1abfd2), C32e(0x410091d3), C32e(0x422d5a0d), + C32e(0xf6cc7e90), C32e(0xdd629f9c), C32e(0x92c097ce), + C32e(0x185ca70b), C32e(0xc72b44ac), C32e(0xd1df65d6), + C32e(0x63c6fc23), C32e(0x976e6c03), C32e(0x9ee0b81a), + C32e(0x2105457e), C32e(0x446ceca8), C32e(0xeef103bb), + C32e(0x5d8e61fa), C32e(0xfd9697b2), C32e(0x94838197), + C32e(0x4a8e8537), C32e(0xdb03302f), C32e(0x2a678d2d), + C32e(0xfb9f6a95), C32e(0x8afe7381), C32e(0xf8b8696c), + C32e(0x8ac77246), C32e(0xc07f4214), C32e(0xc5f4158f), + C32e(0xbdc75ec4), C32e(0x75446fa7), C32e(0x8f11bb80), + C32e(0x52de75b7), C32e(0xaee488bc), C32e(0x82b8001e), + C32e(0x98a6a3f4), C32e(0x8ef48f33), C32e(0xa9a36315), + C32e(0xaa5f5624), C32e(0xd5b7f989), C32e(0xb6f1ed20), + C32e(0x7c5ae0fd), C32e(0x36cae95a), C32e(0x06422c36), + C32e(0xce293543), C32e(0x4efe983d), C32e(0x533af974), + C32e(0x739a4ba7), C32e(0xd0f51f59), C32e(0x6f4e8186), + C32e(0x0e9dad81), C32e(0xafd85a9f), C32e(0xa7050667), + C32e(0xee34626a), C32e(0x8b0b28be), C32e(0x6eb91727), + C32e(0x47740726), C32e(0xc680103f), C32e(0xe0a07e6f), + C32e(0xc67e487b), C32e(0x0d550aa5), C32e(0x4af8a4c0), + C32e(0x91e3e79f), C32e(0x978ef19e), C32e(0x86767281), + C32e(0x50608dd4), C32e(0x7e9e5a41), C32e(0xf3e5b062), + C32e(0xfc9f1fec), C32e(0x4054207a), C32e(0xe3e41a00), + C32e(0xcef4c984), C32e(0x4fd794f5), C32e(0x9dfa95d8), + C32e(0x552e7e11), C32e(0x24c354a5), C32e(0x5bdf7228), + C32e(0xbdfe6e28), C32e(0x78f57fe2), C32e(0x0fa5c4b2), + C32e(0x05897cef), C32e(0xee49d32e), C32e(0x447e9385), + C32e(0xeb28597f), C32e(0x705f6937), C32e(0xb324314a), + C32e(0x5e8628f1), C32e(0x1dd6e465), C32e(0xc71b7704), + C32e(0x51b920e7), C32e(0x74fe43e8), C32e(0x23d4878a), + C32e(0x7d29e8a3), C32e(0x927694f2), C32e(0xddcb7a09), + C32e(0x9b30d9c1), C32e(0x1d1b30fb), C32e(0x5bdc1be0), + C32e(0xda24494f), C32e(0xf29c82bf), C32e(0xa4e7ba31), + C32e(0xb470bfff), C32e(0x0d324405), C32e(0xdef8bc48), + C32e(0x3baefc32), C32e(0x53bbd339), C32e(0x459fc3c1), + C32e(0xe0298ba0), C32e(0xe5c905fd), C32e(0xf7ae090f), + C32e(0x94703412), C32e(0x4290f134), C32e(0xa271b701), + C32e(0xe344ed95), C32e(0xe93b8e36), C32e(0x4f2f984a), + C32e(0x88401d63), C32e(0xa06cf615), C32e(0x47c1444b), + C32e(0x8752afff), C32e(0x7ebb4af1), C32e(0xe20ac630), + C32e(0x4670b6c5), C32e(0xcc6e8ce6), C32e(0xa4d5a456), + C32e(0xbd4fca00), C32e(0xda9d844b), C32e(0xc83e18ae), + C32e(0x7357ce45), C32e(0x3064d1ad), C32e(0xe8a6ce68), + C32e(0x145c2567), C32e(0xa3da8cf2), C32e(0xcb0ee116), + C32e(0x33e90658), C32e(0x9a94999a), C32e(0x1f60b220), 
+ C32e(0xc26f847b), C32e(0xd1ceac7f), C32e(0xa0d18518), + C32e(0x32595ba1), C32e(0x8ddd19d3), C32e(0x509a1cc0), + C32e(0xaaa5b446), C32e(0x9f3d6367), C32e(0xe4046bba), + C32e(0xf6ca19ab), C32e(0x0b56ee7e), C32e(0x1fb179ea), + C32e(0xa9282174), C32e(0xe9bdf735), C32e(0x3b3651ee), + C32e(0x1d57ac5a), C32e(0x7550d376), C32e(0x3a46c2fe), + C32e(0xa37d7001), C32e(0xf735c1af), C32e(0x98a4d842), + C32e(0x78edec20), C32e(0x9e6b6779), C32e(0x41836315), + C32e(0xea3adba8), C32e(0xfac33b4d), C32e(0x32832c83), + C32e(0xa7403b1f), C32e(0x1c2747f3), C32e(0x5940f034), + C32e(0xb72d769a), C32e(0xe73e4e6c), C32e(0xd2214ffd), + C32e(0xb8fd8d39), C32e(0xdc5759ef), C32e(0x8d9b0c49), + C32e(0x2b49ebda), C32e(0x5ba2d749), C32e(0x68f3700d), + C32e(0x7d3baed0), C32e(0x7a8d5584), C32e(0xf5a5e9f0), + C32e(0xe4f88e65), C32e(0xa0b8a2f4), C32e(0x36103b53), + C32e(0x0ca8079e), C32e(0x753eec5a), C32e(0x91689492), + C32e(0x56e8884f), C32e(0x5bb05c55), C32e(0xf8babc4c), + C32e(0xe3bb3b99), C32e(0xf387947b), C32e(0x75daf4d6), + C32e(0x726b1c5d), C32e(0x64aeac28), C32e(0xdc34b36d), + C32e(0x6c34a550), C32e(0xb828db71), C32e(0xf861e2f2), + C32e(0x108d512a), C32e(0xe3db6433), C32e(0x59dd75fc), + C32e(0x1cacbcf1), C32e(0x43ce3fa2), C32e(0x67bbd13c), + C32e(0x02e843b0), C32e(0x330a5bca), C32e(0x8829a175), + C32e(0x7f34194d), C32e(0xb416535c), C32e(0x923b94c3), + C32e(0x0e794d1e), C32e(0x797475d7), C32e(0xb6eeaf3f), + C32e(0xeaa8d4f7), C32e(0xbe1a3921), C32e(0x5cf47e09), + C32e(0x4c232751), C32e(0x26a32453), C32e(0xba323cd2), + C32e(0x44a3174a), C32e(0x6da6d5ad), C32e(0xb51d3ea6), + C32e(0xaff2c908), C32e(0x83593d98), C32e(0x916b3c56), + C32e(0x4cf87ca1), C32e(0x7286604d), C32e(0x46e23ecc), + C32e(0x086ec7f6), C32e(0x2f9833b3), C32e(0xb1bc765e), + C32e(0x2bd666a5), C32e(0xefc4e62a), C32e(0x06f4b6e8), + C32e(0xbec1d436), C32e(0x74ee8215), C32e(0xbcef2163), + C32e(0xfdc14e0d), C32e(0xf453c969), C32e(0xa77d5ac4), + C32e(0x06585826), C32e(0x7ec11416), C32e(0x06e0fa16), + C32e(0x7e90af3d), C32e(0x28639d3f), C32e(0xd2c9f2e3), + C32e(0x009bd20c), C32e(0x5faace30), C32e(0xb7d40c30), + C32e(0x742a5116), C32e(0xf2e03298), C32e(0x0deb30d8), + C32e(0xe3cef89a), C32e(0x4bc59e7b), C32e(0xb5f17992), + C32e(0xff51e66e), C32e(0x048668d3), C32e(0x9b234d57), + C32e(0xe6966731), C32e(0xcce6a6f3), C32e(0x170a7505), + C32e(0xb17681d9), C32e(0x13326cce), C32e(0x3c175284), + C32e(0xf805a262), C32e(0xf42bcbb3), C32e(0x78471547), + C32e(0xff465482), C32e(0x23936a48), C32e(0x38df5807), + C32e(0x4e5e6565), C32e(0xf2fc7c89), C32e(0xfc86508e), + C32e(0x31702e44), C32e(0xd00bca86), C32e(0xf04009a2), + C32e(0x3078474e), C32e(0x65a0ee39), C32e(0xd1f73883), + C32e(0xf75ee937), C32e(0xe42c3abd), C32e(0x2197b226), + C32e(0x0113f86f), C32e(0xa344edd1), C32e(0xef9fdee7), + C32e(0x8ba0df15), C32e(0x762592d9), C32e(0x3c85f7f6), + C32e(0x12dc42be), C32e(0xd8a7ec7c), C32e(0xab27b07e), + C32e(0x538d7dda), C32e(0xaa3ea8de), C32e(0xaa25ce93), + C32e(0xbd0269d8), C32e(0x5af643fd), C32e(0x1a7308f9), + C32e(0xc05fefda), C32e(0x174a19a5), C32e(0x974d6633), + C32e(0x4cfd216a), C32e(0x35b49831), C32e(0xdb411570), + C32e(0xea1e0fbb), C32e(0xedcd549b), C32e(0x9ad063a1), + C32e(0x51974072), C32e(0xf6759dbf), C32e(0x91476fe2) +}; + +#define Ceven_w3(r) (C[((r) << 3) + 0]) +#define Ceven_w2(r) (C[((r) << 3) + 1]) +#define Ceven_w1(r) (C[((r) << 3) + 2]) +#define Ceven_w0(r) (C[((r) << 3) + 3]) +#define Codd_w3(r) (C[((r) << 3) + 4]) +#define Codd_w2(r) (C[((r) << 3) + 5]) +#define Codd_w1(r) (C[((r) << 3) + 6]) +#define Codd_w0(r) (C[((r) << 3) + 7]) + +#define S(x0, x1, x2, x3, cb, 
r) do { \ + Sb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, cb ## w3(r)); \ + Sb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, cb ## w2(r)); \ + Sb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, cb ## w1(r)); \ + Sb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, cb ## w0(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, \ + x4 ## 3, x5 ## 3, x6 ## 3, x7 ## 3); \ + Lb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, \ + x4 ## 2, x5 ## 2, x6 ## 2, x7 ## 2); \ + Lb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, \ + x4 ## 1, x5 ## 1, x6 ## 1, x7 ## 1); \ + Lb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, \ + x4 ## 0, x5 ## 0, x6 ## 0, x7 ## 0); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u32 t = (x ## 3 & (c)) << (n); \ + x ## 3 = ((x ## 3 >> (n)) & (c)) | t; \ + t = (x ## 2 & (c)) << (n); \ + x ## 2 = ((x ## 2 >> (n)) & (c)) | t; \ + t = (x ## 1 & (c)) << (n); \ + x ## 1 = ((x ## 1 >> (n)) & (c)) | t; \ + t = (x ## 0 & (c)) << (n); \ + x ## 0 = ((x ## 0 >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C32(0x55555555), 1) +#define W1(x) Wz(x, SPH_C32(0x33333333), 2) +#define W2(x) Wz(x, SPH_C32(0x0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C32(0x00FF00FF), 8) +#define W4(x) Wz(x, SPH_C32(0x0000FFFF), 16) +#define W5(x) do { \ + sph_u32 t = x ## 3; \ + x ## 3 = x ## 2; \ + x ## 2 = t; \ + t = x ## 1; \ + x ## 1 = x ## 0; \ + x ## 0 = t; \ + } while (0) +#define W6(x) do { \ + sph_u32 t = x ## 3; \ + x ## 3 = x ## 1; \ + x ## 1 = t; \ + t = x ## 2; \ + x ## 2 = x ## 0; \ + x ## 0 = t; \ + } while (0) + +#define DECL_STATE \ + sph_u32 h03, h02, h01, h00, h13, h12, h11, h10; \ + sph_u32 h23, h22, h21, h20, h33, h32, h31, h30; \ + sph_u32 h43, h42, h41, h40, h53, h52, h51, h50; \ + sph_u32 h63, h62, h61, h60, h73, h72, h71, h70; \ + sph_u32 tmp; + +#define READ_STATE(state) do { \ + h03 = (state)->H.narrow[ 0]; \ + h02 = (state)->H.narrow[ 1]; \ + h01 = (state)->H.narrow[ 2]; \ + h00 = (state)->H.narrow[ 3]; \ + h13 = (state)->H.narrow[ 4]; \ + h12 = (state)->H.narrow[ 5]; \ + h11 = (state)->H.narrow[ 6]; \ + h10 = (state)->H.narrow[ 7]; \ + h23 = (state)->H.narrow[ 8]; \ + h22 = (state)->H.narrow[ 9]; \ + h21 = (state)->H.narrow[10]; \ + h20 = (state)->H.narrow[11]; \ + h33 = (state)->H.narrow[12]; \ + h32 = (state)->H.narrow[13]; \ + h31 = (state)->H.narrow[14]; \ + h30 = (state)->H.narrow[15]; \ + h43 = (state)->H.narrow[16]; \ + h42 = (state)->H.narrow[17]; \ + h41 = (state)->H.narrow[18]; \ + h40 = (state)->H.narrow[19]; \ + h53 = (state)->H.narrow[20]; \ + h52 = (state)->H.narrow[21]; \ + h51 = (state)->H.narrow[22]; \ + h50 = (state)->H.narrow[23]; \ + h63 = (state)->H.narrow[24]; \ + h62 = (state)->H.narrow[25]; \ + h61 = (state)->H.narrow[26]; \ + h60 = (state)->H.narrow[27]; \ + h73 = (state)->H.narrow[28]; \ + h72 = (state)->H.narrow[29]; \ + h71 = (state)->H.narrow[30]; \ + h70 = (state)->H.narrow[31]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->H.narrow[ 0] = h03; \ + (state)->H.narrow[ 1] = h02; \ + (state)->H.narrow[ 2] = h01; \ + (state)->H.narrow[ 3] = h00; \ + (state)->H.narrow[ 4] = h13; \ + (state)->H.narrow[ 5] = h12; \ + (state)->H.narrow[ 6] = h11; \ + (state)->H.narrow[ 7] = h10; \ + (state)->H.narrow[ 8] = h23; \ + (state)->H.narrow[ 9] = h22; \ + (state)->H.narrow[10] = h21; \ + (state)->H.narrow[11] = h20; \ + (state)->H.narrow[12] = h33; \ + (state)->H.narrow[13] = h32; \ + (state)->H.narrow[14] = h31; \ + (state)->H.narrow[15] = h30; \ + (state)->H.narrow[16] = h43; \ + (state)->H.narrow[17] = h42; \ + (state)->H.narrow[18] = h41; \ + 
(state)->H.narrow[19] = h40; \ + (state)->H.narrow[20] = h53; \ + (state)->H.narrow[21] = h52; \ + (state)->H.narrow[22] = h51; \ + (state)->H.narrow[23] = h50; \ + (state)->H.narrow[24] = h63; \ + (state)->H.narrow[25] = h62; \ + (state)->H.narrow[26] = h61; \ + (state)->H.narrow[27] = h60; \ + (state)->H.narrow[28] = h73; \ + (state)->H.narrow[29] = h72; \ + (state)->H.narrow[30] = h71; \ + (state)->H.narrow[31] = h70; \ + } while (0) + +#define INPUT_BUF1 \ + sph_u32 m03 = dec32e_aligned(buf + 0); \ + sph_u32 m02 = dec32e_aligned(buf + 4); \ + sph_u32 m01 = dec32e_aligned(buf + 8); \ + sph_u32 m00 = dec32e_aligned(buf + 12); \ + sph_u32 m13 = dec32e_aligned(buf + 16); \ + sph_u32 m12 = dec32e_aligned(buf + 20); \ + sph_u32 m11 = dec32e_aligned(buf + 24); \ + sph_u32 m10 = dec32e_aligned(buf + 28); \ + sph_u32 m23 = dec32e_aligned(buf + 32); \ + sph_u32 m22 = dec32e_aligned(buf + 36); \ + sph_u32 m21 = dec32e_aligned(buf + 40); \ + sph_u32 m20 = dec32e_aligned(buf + 44); \ + sph_u32 m33 = dec32e_aligned(buf + 48); \ + sph_u32 m32 = dec32e_aligned(buf + 52); \ + sph_u32 m31 = dec32e_aligned(buf + 56); \ + sph_u32 m30 = dec32e_aligned(buf + 60); \ + h03 ^= m03; \ + h02 ^= m02; \ + h01 ^= m01; \ + h00 ^= m00; \ + h13 ^= m13; \ + h12 ^= m12; \ + h11 ^= m11; \ + h10 ^= m10; \ + h23 ^= m23; \ + h22 ^= m22; \ + h21 ^= m21; \ + h20 ^= m20; \ + h33 ^= m33; \ + h32 ^= m32; \ + h31 ^= m31; \ + h30 ^= m30; + +#define INPUT_BUF2 \ + h43 ^= m03; \ + h42 ^= m02; \ + h41 ^= m01; \ + h40 ^= m00; \ + h53 ^= m13; \ + h52 ^= m12; \ + h51 ^= m11; \ + h50 ^= m10; \ + h63 ^= m23; \ + h62 ^= m22; \ + h61 ^= m21; \ + h60 ^= m20; \ + h73 ^= m33; \ + h72 ^= m32; \ + h71 ^= m31; \ + h70 ^= m30; + +static const sph_u32 IV224[] = { + C32e(0x2dfedd62), C32e(0xf99a98ac), C32e(0xae7cacd6), C32e(0x19d634e7), + C32e(0xa4831005), C32e(0xbc301216), C32e(0xb86038c6), C32e(0xc9661494), + C32e(0x66d9899f), C32e(0x2580706f), C32e(0xce9ea31b), C32e(0x1d9b1adc), + C32e(0x11e8325f), C32e(0x7b366e10), C32e(0xf994857f), C32e(0x02fa06c1), + C32e(0x1b4f1b5c), C32e(0xd8c840b3), C32e(0x97f6a17f), C32e(0x6e738099), + C32e(0xdcdf93a5), C32e(0xadeaa3d3), C32e(0xa431e8de), C32e(0xc9539a68), + C32e(0x22b4a98a), C32e(0xec86a1e4), C32e(0xd574ac95), C32e(0x9ce56cf0), + C32e(0x15960dea), C32e(0xb5ab2bbf), C32e(0x9611dcf0), C32e(0xdd64ea6e) +}; + +static const sph_u32 IV256[] = { + C32e(0xeb98a341), C32e(0x2c20d3eb), C32e(0x92cdbe7b), C32e(0x9cb245c1), + C32e(0x1c935191), C32e(0x60d4c7fa), C32e(0x260082d6), C32e(0x7e508a03), + C32e(0xa4239e26), C32e(0x7726b945), C32e(0xe0fb1a48), C32e(0xd41a9477), + C32e(0xcdb5ab26), C32e(0x026b177a), C32e(0x56f02442), C32e(0x0fff2fa8), + C32e(0x71a39689), C32e(0x7f2e4d75), C32e(0x1d144908), C32e(0xf77de262), + C32e(0x277695f7), C32e(0x76248f94), C32e(0x87d5b657), C32e(0x4780296c), + C32e(0x5c5e272d), C32e(0xac8e0d6c), C32e(0x518450c6), C32e(0x57057a0f), + C32e(0x7be4d367), C32e(0x702412ea), C32e(0x89e3ab13), C32e(0xd31cd769) +}; + +static const sph_u32 IV384[] = { + C32e(0x481e3bc6), C32e(0xd813398a), C32e(0x6d3b5e89), C32e(0x4ade879b), + C32e(0x63faea68), C32e(0xd480ad2e), C32e(0x332ccb21), C32e(0x480f8267), + C32e(0x98aec84d), C32e(0x9082b928), C32e(0xd455ea30), C32e(0x41114249), + C32e(0x36f555b2), C32e(0x924847ec), C32e(0xc7250a93), C32e(0xbaf43ce1), + C32e(0x569b7f8a), C32e(0x27db454c), C32e(0x9efcbd49), C32e(0x6397af0e), + C32e(0x589fc27d), C32e(0x26aa80cd), C32e(0x80c08b8c), C32e(0x9deb2eda), + C32e(0x8a7981e8), C32e(0xf8d5373a), C32e(0xf43967ad), C32e(0xddd17a71), + C32e(0xa9b4d3bd), C32e(0xa475d394), 
C32e(0x976c3fba), C32e(0x9842737f) +}; + +static const sph_u32 IV512[] = { + C32e(0x6fd14b96), C32e(0x3e00aa17), C32e(0x636a2e05), C32e(0x7a15d543), + C32e(0x8a225e8d), C32e(0x0c97ef0b), C32e(0xe9341259), C32e(0xf2b3c361), + C32e(0x891da0c1), C32e(0x536f801e), C32e(0x2aa9056b), C32e(0xea2b6d80), + C32e(0x588eccdb), C32e(0x2075baa6), C32e(0xa90f3a76), C32e(0xbaf83bf7), + C32e(0x0169e605), C32e(0x41e34a69), C32e(0x46b58a8e), C32e(0x2e6fe65a), + C32e(0x1047a7d0), C32e(0xc1843c24), C32e(0x3b6e71b1), C32e(0x2d5ac199), + C32e(0xcf57f6ec), C32e(0x9db1f856), C32e(0xa706887c), C32e(0x5716b156), + C32e(0xe3c2fcdf), C32e(0xe68517fb), C32e(0x545a4678), C32e(0xcc8cdd4b) +}; + +#endif + +#define SL(ro) SLu(r + ro, ro) + +#define SLu(r, ro) do { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + W ## ro(h1); \ + W ## ro(h3); \ + W ## ro(h5); \ + W ## ro(h7); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_JH + +#if SPH_JH_64 + +/* + * The "small footprint" 64-bit version just uses a partially unrolled + * loop. + */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#else + +#define E8 do { \ + unsigned r, g; \ + for (r = g = 0; r < 42; r ++) { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + switch (g) { \ + case 0: \ + W0(h1); \ + W0(h3); \ + W0(h5); \ + W0(h7); \ + break; \ + case 1: \ + W1(h1); \ + W1(h3); \ + W1(h5); \ + W1(h7); \ + break; \ + case 2: \ + W2(h1); \ + W2(h3); \ + W2(h5); \ + W2(h7); \ + break; \ + case 3: \ + W3(h1); \ + W3(h3); \ + W3(h5); \ + W3(h7); \ + break; \ + case 4: \ + W4(h1); \ + W4(h3); \ + W4(h5); \ + W4(h7); \ + break; \ + case 5: \ + W5(h1); \ + W5(h3); \ + W5(h5); \ + W5(h7); \ + break; \ + case 6: \ + W6(h1); \ + W6(h3); \ + W6(h5); \ + W6(h7); \ + break; \ + } \ + if (++ g == 7) \ + g = 0; \ + } \ + } while (0) + +#endif + +#else + +#if SPH_JH_64 + +/* + * On a "true 64-bit" architecture, we can unroll at will. + */ + +#define E8 do { \ + SLu( 0, 0); \ + SLu( 1, 1); \ + SLu( 2, 2); \ + SLu( 3, 3); \ + SLu( 4, 4); \ + SLu( 5, 5); \ + SLu( 6, 6); \ + SLu( 7, 0); \ + SLu( 8, 1); \ + SLu( 9, 2); \ + SLu(10, 3); \ + SLu(11, 4); \ + SLu(12, 5); \ + SLu(13, 6); \ + SLu(14, 0); \ + SLu(15, 1); \ + SLu(16, 2); \ + SLu(17, 3); \ + SLu(18, 4); \ + SLu(19, 5); \ + SLu(20, 6); \ + SLu(21, 0); \ + SLu(22, 1); \ + SLu(23, 2); \ + SLu(24, 3); \ + SLu(25, 4); \ + SLu(26, 5); \ + SLu(27, 6); \ + SLu(28, 0); \ + SLu(29, 1); \ + SLu(30, 2); \ + SLu(31, 3); \ + SLu(32, 4); \ + SLu(33, 5); \ + SLu(34, 6); \ + SLu(35, 0); \ + SLu(36, 1); \ + SLu(37, 2); \ + SLu(38, 3); \ + SLu(39, 4); \ + SLu(40, 5); \ + SLu(41, 6); \ + } while (0) + +#else + +/* + * We are not aiming at a small footprint, but we are still using a + * 32-bit implementation. Full loop unrolling would smash the L1 + * cache on some "big" architectures (32 kB L1 cache). 
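+ *
+ * Grouping the rounds in sevens is the natural unit here: E8 has 42
+ * rounds, and the bit-permutation applied after each round cycles
+ * through the seven variants W0..W6, so a 7-round loop body lets each
+ * SL(ro) select its permutation at compile time while keeping the
+ * generated code to a fraction of a full unroll.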
+ */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#endif + +#endif + +static void +jh_init(sph_jh_context *sc, const void *iv) +{ + sc->ptr = 0; +#if SPH_JH_64 + memcpy(sc->H.wide, iv, sizeof sc->H.wide); +#else + memcpy(sc->H.narrow, iv, sizeof sc->H.narrow); +#endif +#if SPH_64 + sc->block_count = 0; +#else + sc->block_count_high = 0; + sc->block_count_low = 0; +#endif +} + +static void +jh_core(sph_jh_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INPUT_BUF1; + E8; + INPUT_BUF2; +#if SPH_64 + sc->block_count ++; +#else + if ((sc->block_count_low = SPH_T32( + sc->block_count_low + 1)) == 0) + sc->block_count_high ++; +#endif + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +jh_close(sph_jh_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32, const void *iv) +{ + unsigned z; + unsigned char buf[128]; + size_t numz, u; +#if SPH_64 + sph_u64 l0, l1; +#else + sph_u32 l0, l1, l2, l3; +#endif + + z = 0x80 >> n; + buf[0] = ((ub & -z) | z) & 0xFF; + if (sc->ptr == 0 && n == 0) { + numz = 47; + } else { + numz = 111 - sc->ptr; + } + memset(buf + 1, 0, numz); +#if SPH_64 + l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3) + n; + l1 = SPH_T64(sc->block_count >> 55); + sph_enc64be(buf + numz + 1, l1); + sph_enc64be(buf + numz + 9, l0); +#else + l0 = SPH_T32(sc->block_count_low << 9) + (sc->ptr << 3) + n; + l1 = SPH_T32(sc->block_count_low >> 23) + + SPH_T32(sc->block_count_high << 9); + l2 = SPH_T32(sc->block_count_high >> 23); + l3 = 0; + sph_enc32be(buf + numz + 1, l3); + sph_enc32be(buf + numz + 5, l2); + sph_enc32be(buf + numz + 9, l1); + sph_enc32be(buf + numz + 13, l0); +#endif + jh_core(sc, buf, numz + 17); +#if SPH_JH_64 + for (u = 0; u < 8; u ++) + enc64e(buf + (u << 3), sc->H.wide[u + 8]); +#else + for (u = 0; u < 16; u ++) + enc32e(buf + (u << 2), sc->H.narrow[u + 16]); +#endif + memcpy(dst, buf + ((16 - out_size_w32) << 2), out_size_w32 << 2); + jh_init(sc, iv); +} + +/* see sph_jh.h */ +void +sph_jh224_init(void *cc) +{ + jh_init(cc, IV224); +} + +/* see sph_jh.h */ +void +sph_jh224(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh224_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 7, IV224); +} + +/* see sph_jh.h */ +void +sph_jh224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 7, IV224); +} + +/* see sph_jh.h */ +void +sph_jh256_init(void *cc) +{ + jh_init(cc, IV256); +} + +/* see sph_jh.h */ +void +sph_jh256(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh256_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 8, IV256); +} + +/* see sph_jh.h */ +void +sph_jh256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 8, IV256); +} + +/* see sph_jh.h */ +void +sph_jh384_init(void *cc) +{ + jh_init(cc, IV384); +} + +/* see sph_jh.h */ +void 
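+/*
+ * Usage sketch for these wrappers (illustrative only; the context
+ * types and prototypes are the ones declared in sph_jh.h, and
+ * data/len stand for the caller's input):
+ *
+ *   sph_jh256_context cc;
+ *   unsigned char digest[32];
+ *
+ *   sph_jh256_init(&cc);
+ *   sph_jh256(&cc, data, len);     // may be called repeatedly
+ *   sph_jh256_close(&cc, digest);  // finalize; context is re-initialized
+ */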
+sph_jh384(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh384_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 12, IV384); +} + +/* see sph_jh.h */ +void +sph_jh384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 12, IV384); +} + +/* see sph_jh.h */ +void +sph_jh512_init(void *cc) +{ + jh_init(cc, IV512); +} + +/* see sph_jh.h */ +void +sph_jh512(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh512_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 16, IV512); +} + +/* see sph_jh.h */ +void +sph_jh512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 16, IV512); +} + +#ifdef __cplusplus +} +#endif diff --git a/sha3/keccak.c b/sha3/keccak.c new file mode 100644 index 0000000..cff9f87 --- /dev/null +++ b/sha3/keccak.c @@ -0,0 +1,1824 @@ +/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */ +/* + * Keccak implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_keccak.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/* + * Parameters: + * + * SPH_KECCAK_64 use a 64-bit type + * SPH_KECCAK_UNROLL number of loops to unroll (0/undef for full unroll) + * SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only) + * SPH_KECCAK_NOCOPY do not copy the state into local variables + * + * If there is no usable 64-bit type, the code automatically switches + * back to the 32-bit implementation. + * + * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1 + * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core + * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302, + * 8 kB L1 code cache), seem to show that the following are optimal: + * + * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds, + * do not copy the state; unrolling 2, 6 or all rounds also provides + * near-optimal performance. + * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds, + * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds + * also provides near-optimal performance. 
+ * -- PowerPC: use the 64-bit implementation, unroll 8 rounds, + * copy the state. Unrolling 4 or 6 rounds is near-optimal. + * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds, + * copy the state. + * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy + * the state. Unrolling only 1 round is also near-optimal. + * + * Also, interleaving does not always yield actual improvements when + * using a 32-bit implementation; in particular when the architecture + * does not offer a native rotation opcode (interleaving replaces one + * 64-bit rotation with two 32-bit rotations, which is a gain only if + * there is a native 32-bit rotation opcode and not a native 64-bit + * rotation opcode; also, interleaving implies a small overhead when + * processing input words). + * + * To sum up: + * -- when possible, use the 64-bit code + * -- exception: on 32-bit x86, use 32-bit code + * -- when using 32-bit code, use interleaving + * -- copy the state, except on x86 + * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines + */ + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_SMALL_FOOTPRINT_KECCAK 1 +#endif + +/* + * By default, we select the 64-bit implementation if a 64-bit type + * is available, unless a 32-bit x86 is detected. + */ +#if !defined SPH_KECCAK_64 && SPH_64 \ + && !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC) +#define SPH_KECCAK_64 1 +#endif + +/* + * If using a 32-bit implementation, we prefer to interleave. + */ +#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE +#define SPH_KECCAK_INTERLEAVE 1 +#endif + +/* + * Unroll 8 rounds on big systems, 2 rounds on small systems. + */ +#ifndef SPH_KECCAK_UNROLL +#if SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_KECCAK_UNROLL 2 +#else +#define SPH_KECCAK_UNROLL 8 +#endif +#endif + +/* + * We do not want to copy the state to local variables on x86 (32-bit + * and 64-bit alike). 
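+ *
+ * All of these tuning macros honor externally supplied values (their
+ * definitions are wrapped in #ifndef / !defined guards), so a build
+ * can pin the choices explicitly instead of relying on
+ * autodetection; for example (hypothetical compiler invocation):
+ *
+ *   cc -O2 -DSPH_KECCAK_UNROLL=8 -DSPH_KECCAK_NOCOPY=1 -c keccak.c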
+ */ +#ifndef SPH_KECCAK_NOCOPY +#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC +#define SPH_KECCAK_NOCOPY 1 +#else +#define SPH_KECCAK_NOCOPY 0 +#endif +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_KECCAK_64 + +static const sph_u64 RC[] = { + SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), + SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), + SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), + SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), + SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), + SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), + SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), + SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), + SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), + SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) +}; + +#if SPH_KECCAK_NOCOPY + +#define a00 (kc->u.wide[ 0]) +#define a10 (kc->u.wide[ 1]) +#define a20 (kc->u.wide[ 2]) +#define a30 (kc->u.wide[ 3]) +#define a40 (kc->u.wide[ 4]) +#define a01 (kc->u.wide[ 5]) +#define a11 (kc->u.wide[ 6]) +#define a21 (kc->u.wide[ 7]) +#define a31 (kc->u.wide[ 8]) +#define a41 (kc->u.wide[ 9]) +#define a02 (kc->u.wide[10]) +#define a12 (kc->u.wide[11]) +#define a22 (kc->u.wide[12]) +#define a32 (kc->u.wide[13]) +#define a42 (kc->u.wide[14]) +#define a03 (kc->u.wide[15]) +#define a13 (kc->u.wide[16]) +#define a23 (kc->u.wide[17]) +#define a33 (kc->u.wide[18]) +#define a43 (kc->u.wide[19]) +#define a04 (kc->u.wide[20]) +#define a14 (kc->u.wide[21]) +#define a24 (kc->u.wide[22]) +#define a34 (kc->u.wide[23]) +#define a44 (kc->u.wide[24]) + +#define DECL_STATE +#define READ_STATE(sc) +#define WRITE_STATE(sc) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + kc->u.wide[j >> 3] ^= sph_dec64le_aligned(buf + j); \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u64 a00, a01, a02, a03, a04; \ + sph_u64 a10, a11, a12, a13, a14; \ + sph_u64 a20, a21, a22, a23, a24; \ + sph_u64 a30, a31, a32, a33, a34; \ + sph_u64 a40, a41, a42, a43, a44; + +#define READ_STATE(state) do { \ + a00 = (state)->u.wide[ 0]; \ + a10 = (state)->u.wide[ 1]; \ + a20 = (state)->u.wide[ 2]; \ + a30 = (state)->u.wide[ 3]; \ + a40 = (state)->u.wide[ 4]; \ + a01 = (state)->u.wide[ 5]; \ + a11 = (state)->u.wide[ 6]; \ + a21 = (state)->u.wide[ 7]; \ + a31 = (state)->u.wide[ 8]; \ + a41 = (state)->u.wide[ 9]; \ + a02 = (state)->u.wide[10]; \ + a12 = (state)->u.wide[11]; \ + a22 = (state)->u.wide[12]; \ + a32 = (state)->u.wide[13]; \ + a42 = (state)->u.wide[14]; \ + a03 = (state)->u.wide[15]; \ + a13 = (state)->u.wide[16]; \ + a23 = (state)->u.wide[17]; \ + a33 = (state)->u.wide[18]; \ + a43 = (state)->u.wide[19]; \ + a04 = (state)->u.wide[20]; \ + a14 = (state)->u.wide[21]; \ + a24 = (state)->u.wide[22]; \ + a34 = (state)->u.wide[23]; \ + a44 = (state)->u.wide[24]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.wide[ 0] = a00; \ + (state)->u.wide[ 1] = a10; \ + (state)->u.wide[ 2] = a20; \ + (state)->u.wide[ 3] = a30; \ + (state)->u.wide[ 4] = a40; \ + (state)->u.wide[ 5] = a01; \ + (state)->u.wide[ 6] = a11; \ + (state)->u.wide[ 7] = a21; \ + (state)->u.wide[ 8] = a31; \ + 
(state)->u.wide[ 9] = a41; \ + (state)->u.wide[10] = a02; \ + (state)->u.wide[11] = a12; \ + (state)->u.wide[12] = a22; \ + (state)->u.wide[13] = a32; \ + (state)->u.wide[14] = a42; \ + (state)->u.wide[15] = a03; \ + (state)->u.wide[16] = a13; \ + (state)->u.wide[17] = a23; \ + (state)->u.wide[18] = a33; \ + (state)->u.wide[19] = a43; \ + (state)->u.wide[20] = a04; \ + (state)->u.wide[21] = a14; \ + (state)->u.wide[22] = a24; \ + (state)->u.wide[23] = a34; \ + (state)->u.wide[24] = a44; \ + } while (0) + +#define INPUT_BUF144 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + if ((lim) == 72) \ + break; \ + a41 ^= 
sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + if ((lim) == 104) \ + break; \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + if ((lim) == 136) \ + break; \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x +#define MOV64(d, s) (d = s) +#define XOR64(d, a, b) (d = a ^ b) +#define AND64(d, a, b) (d = a & b) +#define OR64(d, a, b) (d = a | b) +#define NOT64(d, s) (d = SPH_T64(~s)) +#define ROL64(d, v, n) (d = SPH_ROTL64(v, n)) +#define XOR64_IOTA XOR64 + +#else + +static const struct { + sph_u32 high, low; +} RC[] = { +#if SPH_KECCAK_INTERLEAVE + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000089), SPH_C32(0x00000000) }, + { SPH_C32(0x8000008B), SPH_C32(0x00000000) }, + { SPH_C32(0x80008080), SPH_C32(0x00000000) }, + { SPH_C32(0x0000008B), SPH_C32(0x00000001) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000082), SPH_C32(0x00000001) }, + { SPH_C32(0x0000000B), SPH_C32(0x00000000) }, + { SPH_C32(0x0000000A), SPH_C32(0x00000000) }, + { SPH_C32(0x00008082), SPH_C32(0x00000001) }, + { SPH_C32(0x00008003), SPH_C32(0x00000000) }, + { SPH_C32(0x0000808B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000000B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000008A), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000000) }, + { SPH_C32(0x80000008), SPH_C32(0x00000000) }, + { SPH_C32(0x00000083), SPH_C32(0x00000000) }, + { SPH_C32(0x80008003), SPH_C32(0x00000000) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000088), SPH_C32(0x00000000) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008082), SPH_C32(0x00000000) } +#else + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000000), SPH_C32(0x00008082) }, + { SPH_C32(0x80000000), SPH_C32(0x0000808A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008000) }, + { SPH_C32(0x00000000), SPH_C32(0x0000808B) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008009) }, + { SPH_C32(0x00000000), SPH_C32(0x0000008A) }, + { SPH_C32(0x00000000), SPH_C32(0x00000088) }, + { SPH_C32(0x00000000), SPH_C32(0x80008009) }, + { SPH_C32(0x00000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x00000000), SPH_C32(0x8000808B) }, + { SPH_C32(0x80000000), SPH_C32(0x0000008B) }, + { SPH_C32(0x80000000), SPH_C32(0x00008089) }, + { SPH_C32(0x80000000), SPH_C32(0x00008003) }, + { SPH_C32(0x80000000), SPH_C32(0x00008002) }, + { SPH_C32(0x80000000), SPH_C32(0x00000080) }, + { SPH_C32(0x00000000), SPH_C32(0x0000800A) }, + { SPH_C32(0x80000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008080) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008008) } +#endif +}; + +#if SPH_KECCAK_INTERLEAVE + +#define INTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 4)) & 
SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + (xl) = l; (xh) = h; \ + } while (0) + +#define UNINTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + (xl) = l; (xh) = h; \ + } while (0) + +#else + +#define INTERLEAVE(l, h) +#define UNINTERLEAVE(l, h) + +#endif + +#if SPH_KECCAK_NOCOPY + +#define a00l (kc->u.narrow[2 * 0 + 0]) +#define a00h (kc->u.narrow[2 * 0 + 1]) +#define a10l (kc->u.narrow[2 * 1 + 0]) +#define a10h (kc->u.narrow[2 * 1 + 1]) +#define a20l (kc->u.narrow[2 * 2 + 0]) +#define a20h (kc->u.narrow[2 * 2 + 1]) +#define a30l (kc->u.narrow[2 * 3 + 0]) +#define a30h (kc->u.narrow[2 * 3 + 1]) +#define a40l (kc->u.narrow[2 * 4 + 0]) +#define a40h (kc->u.narrow[2 * 4 + 1]) +#define a01l (kc->u.narrow[2 * 5 + 0]) +#define a01h (kc->u.narrow[2 * 5 + 1]) +#define a11l (kc->u.narrow[2 * 6 + 0]) +#define a11h (kc->u.narrow[2 * 6 + 1]) +#define a21l (kc->u.narrow[2 * 7 + 0]) +#define a21h (kc->u.narrow[2 * 7 + 1]) +#define a31l (kc->u.narrow[2 * 8 + 0]) +#define a31h (kc->u.narrow[2 * 8 + 1]) +#define a41l (kc->u.narrow[2 * 9 + 0]) +#define a41h (kc->u.narrow[2 * 9 + 1]) +#define a02l (kc->u.narrow[2 * 10 + 0]) +#define a02h (kc->u.narrow[2 * 10 + 1]) +#define a12l (kc->u.narrow[2 * 11 + 0]) +#define a12h (kc->u.narrow[2 * 11 + 1]) +#define a22l (kc->u.narrow[2 * 12 + 0]) +#define a22h (kc->u.narrow[2 * 12 + 1]) +#define a32l (kc->u.narrow[2 * 13 + 0]) +#define a32h (kc->u.narrow[2 * 13 + 1]) +#define a42l (kc->u.narrow[2 * 14 + 0]) +#define a42h (kc->u.narrow[2 * 14 + 1]) +#define a03l (kc->u.narrow[2 * 15 + 0]) +#define a03h (kc->u.narrow[2 * 15 + 1]) +#define a13l (kc->u.narrow[2 * 16 + 0]) +#define a13h (kc->u.narrow[2 * 16 + 1]) +#define a23l (kc->u.narrow[2 * 17 + 0]) +#define a23h (kc->u.narrow[2 * 17 + 1]) +#define a33l (kc->u.narrow[2 * 18 + 0]) +#define a33h (kc->u.narrow[2 * 18 + 1]) +#define a43l (kc->u.narrow[2 * 19 + 0]) +#define a43h (kc->u.narrow[2 * 19 + 1]) +#define a04l (kc->u.narrow[2 * 20 + 0]) +#define a04h (kc->u.narrow[2 * 20 + 1]) +#define a14l (kc->u.narrow[2 * 21 + 0]) +#define a14h (kc->u.narrow[2 * 21 + 1]) +#define a24l (kc->u.narrow[2 * 22 + 0]) +#define a24h (kc->u.narrow[2 * 22 + 1]) +#define a34l (kc->u.narrow[2 * 23 + 0]) +#define a34h (kc->u.narrow[2 * 23 + 1]) +#define a44l (kc->u.narrow[2 * 24 + 0]) +#define a44h (kc->u.narrow[2 * 24 + 1]) + +#define DECL_STATE +#define READ_STATE(state) +#define WRITE_STATE(state) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + j + 0); \ + th = sph_dec32le_aligned(buf + j + 4); \ + INTERLEAVE(tl, th); \ + kc->u.narrow[(j >> 2) + 0] ^= tl; \ + 
kc->u.narrow[(j >> 2) + 1] ^= th; \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \ + sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \ + sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \ + sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \ + sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h; + +#define READ_STATE(state) do { \ + a00l = (state)->u.narrow[2 * 0 + 0]; \ + a00h = (state)->u.narrow[2 * 0 + 1]; \ + a10l = (state)->u.narrow[2 * 1 + 0]; \ + a10h = (state)->u.narrow[2 * 1 + 1]; \ + a20l = (state)->u.narrow[2 * 2 + 0]; \ + a20h = (state)->u.narrow[2 * 2 + 1]; \ + a30l = (state)->u.narrow[2 * 3 + 0]; \ + a30h = (state)->u.narrow[2 * 3 + 1]; \ + a40l = (state)->u.narrow[2 * 4 + 0]; \ + a40h = (state)->u.narrow[2 * 4 + 1]; \ + a01l = (state)->u.narrow[2 * 5 + 0]; \ + a01h = (state)->u.narrow[2 * 5 + 1]; \ + a11l = (state)->u.narrow[2 * 6 + 0]; \ + a11h = (state)->u.narrow[2 * 6 + 1]; \ + a21l = (state)->u.narrow[2 * 7 + 0]; \ + a21h = (state)->u.narrow[2 * 7 + 1]; \ + a31l = (state)->u.narrow[2 * 8 + 0]; \ + a31h = (state)->u.narrow[2 * 8 + 1]; \ + a41l = (state)->u.narrow[2 * 9 + 0]; \ + a41h = (state)->u.narrow[2 * 9 + 1]; \ + a02l = (state)->u.narrow[2 * 10 + 0]; \ + a02h = (state)->u.narrow[2 * 10 + 1]; \ + a12l = (state)->u.narrow[2 * 11 + 0]; \ + a12h = (state)->u.narrow[2 * 11 + 1]; \ + a22l = (state)->u.narrow[2 * 12 + 0]; \ + a22h = (state)->u.narrow[2 * 12 + 1]; \ + a32l = (state)->u.narrow[2 * 13 + 0]; \ + a32h = (state)->u.narrow[2 * 13 + 1]; \ + a42l = (state)->u.narrow[2 * 14 + 0]; \ + a42h = (state)->u.narrow[2 * 14 + 1]; \ + a03l = (state)->u.narrow[2 * 15 + 0]; \ + a03h = (state)->u.narrow[2 * 15 + 1]; \ + a13l = (state)->u.narrow[2 * 16 + 0]; \ + a13h = (state)->u.narrow[2 * 16 + 1]; \ + a23l = (state)->u.narrow[2 * 17 + 0]; \ + a23h = (state)->u.narrow[2 * 17 + 1]; \ + a33l = (state)->u.narrow[2 * 18 + 0]; \ + a33h = (state)->u.narrow[2 * 18 + 1]; \ + a43l = (state)->u.narrow[2 * 19 + 0]; \ + a43h = (state)->u.narrow[2 * 19 + 1]; \ + a04l = (state)->u.narrow[2 * 20 + 0]; \ + a04h = (state)->u.narrow[2 * 20 + 1]; \ + a14l = (state)->u.narrow[2 * 21 + 0]; \ + a14h = (state)->u.narrow[2 * 21 + 1]; \ + a24l = (state)->u.narrow[2 * 22 + 0]; \ + a24h = (state)->u.narrow[2 * 22 + 1]; \ + a34l = (state)->u.narrow[2 * 23 + 0]; \ + a34h = (state)->u.narrow[2 * 23 + 1]; \ + a44l = (state)->u.narrow[2 * 24 + 0]; \ + a44h = (state)->u.narrow[2 * 24 + 1]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.narrow[2 * 0 + 0] = a00l; \ + (state)->u.narrow[2 * 0 + 1] = a00h; \ + (state)->u.narrow[2 * 1 + 0] = a10l; \ + (state)->u.narrow[2 * 1 + 1] = a10h; \ + (state)->u.narrow[2 * 2 + 0] = a20l; \ + (state)->u.narrow[2 * 2 + 1] = a20h; \ + (state)->u.narrow[2 * 3 + 0] = a30l; \ + (state)->u.narrow[2 * 3 + 1] = a30h; \ + (state)->u.narrow[2 * 4 + 0] = a40l; \ + (state)->u.narrow[2 * 4 + 1] = a40h; \ + (state)->u.narrow[2 * 5 + 0] = a01l; \ + (state)->u.narrow[2 * 5 + 1] = a01h; \ + (state)->u.narrow[2 * 6 + 0] = a11l; \ + (state)->u.narrow[2 * 6 + 1] = a11h; \ + (state)->u.narrow[2 * 7 + 0] = a21l; \ + (state)->u.narrow[2 * 7 + 1] = a21h; \ + (state)->u.narrow[2 * 8 + 0] = a31l; \ + (state)->u.narrow[2 * 8 + 1] = a31h; \ + (state)->u.narrow[2 * 9 + 0] = a41l; \ + 
(state)->u.narrow[2 * 9 + 1] = a41h; \ + (state)->u.narrow[2 * 10 + 0] = a02l; \ + (state)->u.narrow[2 * 10 + 1] = a02h; \ + (state)->u.narrow[2 * 11 + 0] = a12l; \ + (state)->u.narrow[2 * 11 + 1] = a12h; \ + (state)->u.narrow[2 * 12 + 0] = a22l; \ + (state)->u.narrow[2 * 12 + 1] = a22h; \ + (state)->u.narrow[2 * 13 + 0] = a32l; \ + (state)->u.narrow[2 * 13 + 1] = a32h; \ + (state)->u.narrow[2 * 14 + 0] = a42l; \ + (state)->u.narrow[2 * 14 + 1] = a42h; \ + (state)->u.narrow[2 * 15 + 0] = a03l; \ + (state)->u.narrow[2 * 15 + 1] = a03h; \ + (state)->u.narrow[2 * 16 + 0] = a13l; \ + (state)->u.narrow[2 * 16 + 1] = a13h; \ + (state)->u.narrow[2 * 17 + 0] = a23l; \ + (state)->u.narrow[2 * 17 + 1] = a23h; \ + (state)->u.narrow[2 * 18 + 0] = a33l; \ + (state)->u.narrow[2 * 18 + 1] = a33h; \ + (state)->u.narrow[2 * 19 + 0] = a43l; \ + (state)->u.narrow[2 * 19 + 1] = a43h; \ + (state)->u.narrow[2 * 20 + 0] = a04l; \ + (state)->u.narrow[2 * 20 + 1] = a04h; \ + (state)->u.narrow[2 * 21 + 0] = a14l; \ + (state)->u.narrow[2 * 21 + 1] = a14h; \ + (state)->u.narrow[2 * 22 + 0] = a24l; \ + (state)->u.narrow[2 * 22 + 1] = a24h; \ + (state)->u.narrow[2 * 23 + 0] = a34l; \ + (state)->u.narrow[2 * 23 + 1] = a34h; \ + (state)->u.narrow[2 * 24 + 0] = a44l; \ + (state)->u.narrow[2 * 24 + 1] = a44h; \ + } while (0) + +#define READ64(d, off) do { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + (off)); \ + th = sph_dec32le_aligned(buf + (off) + 4); \ + INTERLEAVE(tl, th); \ + d ## l ^= tl; \ + d ## h ^= th; \ + } while (0) + +#define INPUT_BUF144 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + READ64(a23, 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + if ((lim) == 72) \ + break; \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + if ((lim) == 104) \ + break; \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + if ((lim) == 136) \ + break; \ + READ64(a23, 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x ## l, x ## h +#define MOV64(d, s) (d ## l = s ## l, d ## h = s ## h) +#define 
XOR64(d, a, b) (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h) +#define AND64(d, a, b) (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h) +#define OR64(d, a, b) (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h) +#define NOT64(d, s) (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h)) +#define ROL64(d, v, n) ROL64_ ## n(d, v) + +#if SPH_KECCAK_INTERLEAVE + +#define ROL64_odd1(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd63(d, v) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); \ + d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_even(d, v, n) do { \ + d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + } while (0) + +#define ROL64_0(d, v) +#define ROL64_1(d, v) ROL64_odd1(d, v) +#define ROL64_2(d, v) ROL64_even(d, v, 1) +#define ROL64_3(d, v) ROL64_odd( d, v, 2) +#define ROL64_4(d, v) ROL64_even(d, v, 2) +#define ROL64_5(d, v) ROL64_odd( d, v, 3) +#define ROL64_6(d, v) ROL64_even(d, v, 3) +#define ROL64_7(d, v) ROL64_odd( d, v, 4) +#define ROL64_8(d, v) ROL64_even(d, v, 4) +#define ROL64_9(d, v) ROL64_odd( d, v, 5) +#define ROL64_10(d, v) ROL64_even(d, v, 5) +#define ROL64_11(d, v) ROL64_odd( d, v, 6) +#define ROL64_12(d, v) ROL64_even(d, v, 6) +#define ROL64_13(d, v) ROL64_odd( d, v, 7) +#define ROL64_14(d, v) ROL64_even(d, v, 7) +#define ROL64_15(d, v) ROL64_odd( d, v, 8) +#define ROL64_16(d, v) ROL64_even(d, v, 8) +#define ROL64_17(d, v) ROL64_odd( d, v, 9) +#define ROL64_18(d, v) ROL64_even(d, v, 9) +#define ROL64_19(d, v) ROL64_odd( d, v, 10) +#define ROL64_20(d, v) ROL64_even(d, v, 10) +#define ROL64_21(d, v) ROL64_odd( d, v, 11) +#define ROL64_22(d, v) ROL64_even(d, v, 11) +#define ROL64_23(d, v) ROL64_odd( d, v, 12) +#define ROL64_24(d, v) ROL64_even(d, v, 12) +#define ROL64_25(d, v) ROL64_odd( d, v, 13) +#define ROL64_26(d, v) ROL64_even(d, v, 13) +#define ROL64_27(d, v) ROL64_odd( d, v, 14) +#define ROL64_28(d, v) ROL64_even(d, v, 14) +#define ROL64_29(d, v) ROL64_odd( d, v, 15) +#define ROL64_30(d, v) ROL64_even(d, v, 15) +#define ROL64_31(d, v) ROL64_odd( d, v, 16) +#define ROL64_32(d, v) ROL64_even(d, v, 16) +#define ROL64_33(d, v) ROL64_odd( d, v, 17) +#define ROL64_34(d, v) ROL64_even(d, v, 17) +#define ROL64_35(d, v) ROL64_odd( d, v, 18) +#define ROL64_36(d, v) ROL64_even(d, v, 18) +#define ROL64_37(d, v) ROL64_odd( d, v, 19) +#define ROL64_38(d, v) ROL64_even(d, v, 19) +#define ROL64_39(d, v) ROL64_odd( d, v, 20) +#define ROL64_40(d, v) ROL64_even(d, v, 20) +#define ROL64_41(d, v) ROL64_odd( d, v, 21) +#define ROL64_42(d, v) ROL64_even(d, v, 21) +#define ROL64_43(d, v) ROL64_odd( d, v, 22) +#define ROL64_44(d, v) ROL64_even(d, v, 22) +#define ROL64_45(d, v) ROL64_odd( d, v, 23) +#define ROL64_46(d, v) ROL64_even(d, v, 23) +#define ROL64_47(d, v) ROL64_odd( d, v, 24) +#define ROL64_48(d, v) ROL64_even(d, v, 24) +#define ROL64_49(d, v) ROL64_odd( d, v, 25) +#define ROL64_50(d, v) ROL64_even(d, v, 25) +#define ROL64_51(d, v) ROL64_odd( d, v, 26) +#define ROL64_52(d, v) ROL64_even(d, v, 26) +#define ROL64_53(d, v) ROL64_odd( d, v, 27) +#define ROL64_54(d, v) ROL64_even(d, v, 27) +#define ROL64_55(d, v) ROL64_odd( d, v, 28) +#define ROL64_56(d, v) 
ROL64_even(d, v, 28) +#define ROL64_57(d, v) ROL64_odd( d, v, 29) +#define ROL64_58(d, v) ROL64_even(d, v, 29) +#define ROL64_59(d, v) ROL64_odd( d, v, 30) +#define ROL64_60(d, v) ROL64_even(d, v, 30) +#define ROL64_61(d, v) ROL64_odd( d, v, 31) +#define ROL64_62(d, v) ROL64_even(d, v, 31) +#define ROL64_63(d, v) ROL64_odd63(d, v) + +#else + +#define ROL64_small(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \ + d ## l = tmp; \ + } while (0) + +#define ROL64_0(d, v) 0 +#define ROL64_1(d, v) ROL64_small(d, v, 1) +#define ROL64_2(d, v) ROL64_small(d, v, 2) +#define ROL64_3(d, v) ROL64_small(d, v, 3) +#define ROL64_4(d, v) ROL64_small(d, v, 4) +#define ROL64_5(d, v) ROL64_small(d, v, 5) +#define ROL64_6(d, v) ROL64_small(d, v, 6) +#define ROL64_7(d, v) ROL64_small(d, v, 7) +#define ROL64_8(d, v) ROL64_small(d, v, 8) +#define ROL64_9(d, v) ROL64_small(d, v, 9) +#define ROL64_10(d, v) ROL64_small(d, v, 10) +#define ROL64_11(d, v) ROL64_small(d, v, 11) +#define ROL64_12(d, v) ROL64_small(d, v, 12) +#define ROL64_13(d, v) ROL64_small(d, v, 13) +#define ROL64_14(d, v) ROL64_small(d, v, 14) +#define ROL64_15(d, v) ROL64_small(d, v, 15) +#define ROL64_16(d, v) ROL64_small(d, v, 16) +#define ROL64_17(d, v) ROL64_small(d, v, 17) +#define ROL64_18(d, v) ROL64_small(d, v, 18) +#define ROL64_19(d, v) ROL64_small(d, v, 19) +#define ROL64_20(d, v) ROL64_small(d, v, 20) +#define ROL64_21(d, v) ROL64_small(d, v, 21) +#define ROL64_22(d, v) ROL64_small(d, v, 22) +#define ROL64_23(d, v) ROL64_small(d, v, 23) +#define ROL64_24(d, v) ROL64_small(d, v, 24) +#define ROL64_25(d, v) ROL64_small(d, v, 25) +#define ROL64_26(d, v) ROL64_small(d, v, 26) +#define ROL64_27(d, v) ROL64_small(d, v, 27) +#define ROL64_28(d, v) ROL64_small(d, v, 28) +#define ROL64_29(d, v) ROL64_small(d, v, 29) +#define ROL64_30(d, v) ROL64_small(d, v, 30) +#define ROL64_31(d, v) ROL64_small(d, v, 31) + +#define ROL64_32(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_big(d, v, n) do { \ + sph_u32 trl, trh; \ + ROL64_small(tr, v, n); \ + d ## h = trl; \ + d ## l = trh; \ + } while (0) + +#define ROL64_33(d, v) ROL64_big(d, v, 1) +#define ROL64_34(d, v) ROL64_big(d, v, 2) +#define ROL64_35(d, v) ROL64_big(d, v, 3) +#define ROL64_36(d, v) ROL64_big(d, v, 4) +#define ROL64_37(d, v) ROL64_big(d, v, 5) +#define ROL64_38(d, v) ROL64_big(d, v, 6) +#define ROL64_39(d, v) ROL64_big(d, v, 7) +#define ROL64_40(d, v) ROL64_big(d, v, 8) +#define ROL64_41(d, v) ROL64_big(d, v, 9) +#define ROL64_42(d, v) ROL64_big(d, v, 10) +#define ROL64_43(d, v) ROL64_big(d, v, 11) +#define ROL64_44(d, v) ROL64_big(d, v, 12) +#define ROL64_45(d, v) ROL64_big(d, v, 13) +#define ROL64_46(d, v) ROL64_big(d, v, 14) +#define ROL64_47(d, v) ROL64_big(d, v, 15) +#define ROL64_48(d, v) ROL64_big(d, v, 16) +#define ROL64_49(d, v) ROL64_big(d, v, 17) +#define ROL64_50(d, v) ROL64_big(d, v, 18) +#define ROL64_51(d, v) ROL64_big(d, v, 19) +#define ROL64_52(d, v) ROL64_big(d, v, 20) +#define ROL64_53(d, v) ROL64_big(d, v, 21) +#define ROL64_54(d, v) ROL64_big(d, v, 22) +#define ROL64_55(d, v) ROL64_big(d, v, 23) +#define ROL64_56(d, v) ROL64_big(d, v, 24) +#define ROL64_57(d, v) ROL64_big(d, v, 25) +#define ROL64_58(d, v) ROL64_big(d, v, 26) +#define ROL64_59(d, v) ROL64_big(d, v, 27) +#define ROL64_60(d, v) ROL64_big(d, v, 28) +#define ROL64_61(d, v) ROL64_big(d, v, 29) +#define ROL64_62(d, v) ROL64_big(d, v, 30) 
+#define ROL64_63(d, v) ROL64_big(d, v, 31) + +#endif + +#define XOR64_IOTA(d, s, k) \ + (d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high) + +#endif + +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + DECL64(tt2); \ + DECL64(tt3); \ + XOR64(tt0, d0, d1); \ + XOR64(tt1, d2, d3); \ + XOR64(tt0, tt0, d4); \ + XOR64(tt0, tt0, tt1); \ + ROL64(tt0, tt0, 1); \ + XOR64(tt2, c0, c1); \ + XOR64(tt3, c2, c3); \ + XOR64(tt0, tt0, c4); \ + XOR64(tt2, tt2, tt3); \ + XOR64(t, tt0, tt2); \ + } while (0) + +#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(t0); \ + DECL64(t1); \ + DECL64(t2); \ + DECL64(t3); \ + DECL64(t4); \ + TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ + TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ + TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ + TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ + TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ + XOR64(b00, b00, t0); \ + XOR64(b01, b01, t0); \ + XOR64(b02, b02, t0); \ + XOR64(b03, b03, t0); \ + XOR64(b04, b04, t0); \ + XOR64(b10, b10, t1); \ + XOR64(b11, b11, t1); \ + XOR64(b12, b12, t1); \ + XOR64(b13, b13, t1); \ + XOR64(b14, b14, t1); \ + XOR64(b20, b20, t2); \ + XOR64(b21, b21, t2); \ + XOR64(b22, b22, t2); \ + XOR64(b23, b23, t2); \ + XOR64(b24, b24, t2); \ + XOR64(b30, b30, t3); \ + XOR64(b31, b31, t3); \ + XOR64(b32, b32, t3); \ + XOR64(b33, b33, t3); \ + XOR64(b34, b34, t3); \ + XOR64(b40, b40, t4); \ + XOR64(b41, b41, t4); \ + XOR64(b42, b42, t4); \ + XOR64(b43, b43, t4); \ + XOR64(b44, b44, t4); \ + } while (0) + +#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + /* ROL64(b00, b00, 0); */ \ + ROL64(b01, b01, 36); \ + ROL64(b02, b02, 3); \ + ROL64(b03, b03, 41); \ + ROL64(b04, b04, 18); \ + ROL64(b10, b10, 1); \ + ROL64(b11, b11, 44); \ + ROL64(b12, b12, 10); \ + ROL64(b13, b13, 45); \ + ROL64(b14, b14, 2); \ + ROL64(b20, b20, 62); \ + ROL64(b21, b21, 6); \ + ROL64(b22, b22, 43); \ + ROL64(b23, b23, 15); \ + ROL64(b24, b24, 61); \ + ROL64(b30, b30, 28); \ + ROL64(b31, b31, 55); \ + ROL64(b32, b32, 25); \ + ROL64(b33, b33, 21); \ + ROL64(b34, b34, 56); \ + ROL64(b40, b40, 27); \ + ROL64(b41, b41, 20); \ + ROL64(b42, b42, 39); \ + ROL64(b43, b43, 8); \ + ROL64(b44, b44, 14); \ + } while (0) + +/* + * The KHI macro integrates the "lane complement" optimization. On input, + * some words are complemented: + * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 + * On output, the following words are complemented: + * a04 a10 a20 a22 a23 a31 + * + * The (implicit) permutation and the theta expansion will bring back + * the input mask for the next round. 
+ */ + +#define KHI_XO(d, a, b, c) do { \ + DECL64(kt); \ + OR64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI_XA(d, a, b, c) do { \ + DECL64(kt); \ + AND64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(c0); \ + DECL64(c1); \ + DECL64(c2); \ + DECL64(c3); \ + DECL64(c4); \ + DECL64(bnn); \ + NOT64(bnn, b20); \ + KHI_XO(c0, b00, b10, b20); \ + KHI_XO(c1, b10, bnn, b30); \ + KHI_XA(c2, b20, b30, b40); \ + KHI_XO(c3, b30, b40, b00); \ + KHI_XA(c4, b40, b00, b10); \ + MOV64(b00, c0); \ + MOV64(b10, c1); \ + MOV64(b20, c2); \ + MOV64(b30, c3); \ + MOV64(b40, c4); \ + NOT64(bnn, b41); \ + KHI_XO(c0, b01, b11, b21); \ + KHI_XA(c1, b11, b21, b31); \ + KHI_XO(c2, b21, b31, bnn); \ + KHI_XO(c3, b31, b41, b01); \ + KHI_XA(c4, b41, b01, b11); \ + MOV64(b01, c0); \ + MOV64(b11, c1); \ + MOV64(b21, c2); \ + MOV64(b31, c3); \ + MOV64(b41, c4); \ + NOT64(bnn, b32); \ + KHI_XO(c0, b02, b12, b22); \ + KHI_XA(c1, b12, b22, b32); \ + KHI_XA(c2, b22, bnn, b42); \ + KHI_XO(c3, bnn, b42, b02); \ + KHI_XA(c4, b42, b02, b12); \ + MOV64(b02, c0); \ + MOV64(b12, c1); \ + MOV64(b22, c2); \ + MOV64(b32, c3); \ + MOV64(b42, c4); \ + NOT64(bnn, b33); \ + KHI_XA(c0, b03, b13, b23); \ + KHI_XO(c1, b13, b23, b33); \ + KHI_XO(c2, b23, bnn, b43); \ + KHI_XA(c3, bnn, b43, b03); \ + KHI_XO(c4, b43, b03, b13); \ + MOV64(b03, c0); \ + MOV64(b13, c1); \ + MOV64(b23, c2); \ + MOV64(b33, c3); \ + MOV64(b43, c4); \ + NOT64(bnn, b14); \ + KHI_XA(c0, b04, bnn, b24); \ + KHI_XO(c1, bnn, b24, b34); \ + KHI_XA(c2, b24, b34, b44); \ + KHI_XO(c3, b34, b44, b04); \ + KHI_XA(c4, b44, b04, b14); \ + MOV64(b04, c0); \ + MOV64(b14, c1); \ + MOV64(b24, c2); \ + MOV64(b34, c3); \ + MOV64(b44, c4); \ + } while (0) + +#define IOTA(r) XOR64_IOTA(a00, a00, r) + +#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ + a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 +#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ + a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 +#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ + a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 +#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ + a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 +#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ + a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 +#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ + a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 +#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ + a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 +#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ + a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 +#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ + a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 +#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ + a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 +#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ + a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 +#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ + a30, a22, a14, a04, 
a41, a33, a20, a12, a02, a44, a31, a23, a10 +#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ + a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 +#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ + a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 +#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ + a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24 +#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ + a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 +#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ + a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 +#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ + a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 +#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ + a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 +#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ + a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 +#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ + a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 +#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ + a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 +#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ + a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 +#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ + a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 + +#define P1_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a30); \ + MOV64(a30, a33); \ + MOV64(a33, a23); \ + MOV64(a23, a12); \ + MOV64(a12, a21); \ + MOV64(a21, a02); \ + MOV64(a02, a10); \ + MOV64(a10, a11); \ + MOV64(a11, a41); \ + MOV64(a41, a24); \ + MOV64(a24, a42); \ + MOV64(a42, a04); \ + MOV64(a04, a20); \ + MOV64(a20, a22); \ + MOV64(a22, a32); \ + MOV64(a32, a43); \ + MOV64(a43, a34); \ + MOV64(a34, a03); \ + MOV64(a03, a40); \ + MOV64(a40, a44); \ + MOV64(a44, a14); \ + MOV64(a14, a31); \ + MOV64(a31, a13); \ + MOV64(a13, t); \ + } while (0) + +#define P2_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a33); \ + MOV64(a33, a12); \ + MOV64(a12, a02); \ + MOV64(a02, a11); \ + MOV64(a11, a24); \ + MOV64(a24, a04); \ + MOV64(a04, a22); \ + MOV64(a22, a43); \ + MOV64(a43, a03); \ + MOV64(a03, a44); \ + MOV64(a44, a31); \ + MOV64(a31, t); \ + MOV64(t, a10); \ + MOV64(a10, a41); \ + MOV64(a41, a42); \ + MOV64(a42, a20); \ + MOV64(a20, a32); \ + MOV64(a32, a34); \ + MOV64(a34, a40); \ + MOV64(a40, a14); \ + MOV64(a14, a13); \ + MOV64(a13, a30); \ + MOV64(a30, a23); \ + MOV64(a23, a21); \ + MOV64(a21, t); \ + } while (0) + +#define P4_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a12); \ + MOV64(a12, a11); \ + MOV64(a11, a04); \ + MOV64(a04, a43); \ + MOV64(a43, a44); \ + MOV64(a44, t); \ + MOV64(t, a02); \ + MOV64(a02, a24); \ + MOV64(a24, a22); \ + MOV64(a22, a03); \ + MOV64(a03, a31); \ + MOV64(a31, a33); \ + MOV64(a33, t); \ + MOV64(t, a10); \ + MOV64(a10, a42); \ + MOV64(a42, a32); \ + MOV64(a32, a40); \ + MOV64(a40, a13); \ + MOV64(a13, a23); \ + MOV64(a23, t); \ + MOV64(t, a14); \ + MOV64(a14, a30); \ + MOV64(a30, a21); \ + MOV64(a21, a41); \ + MOV64(a41, a20); \ + MOV64(a20, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P6_TO_P0 
do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a02); \ + MOV64(a02, a04); \ + MOV64(a04, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a20); \ + MOV64(a20, a40); \ + MOV64(a40, a30); \ + MOV64(a30, t); \ + MOV64(t, a11); \ + MOV64(a11, a22); \ + MOV64(a22, a44); \ + MOV64(a44, a33); \ + MOV64(a33, t); \ + MOV64(t, a12); \ + MOV64(a12, a24); \ + MOV64(a24, a43); \ + MOV64(a43, a31); \ + MOV64(a31, t); \ + MOV64(t, a13); \ + MOV64(a13, a21); \ + MOV64(a21, a42); \ + MOV64(a42, a34); \ + MOV64(a34, t); \ + MOV64(t, a14); \ + MOV64(a14, a23); \ + MOV64(a23, a41); \ + MOV64(a41, a32); \ + MOV64(a32, t); \ + } while (0) + +#define P8_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a11); \ + MOV64(a11, a43); \ + MOV64(a43, t); \ + MOV64(t, a02); \ + MOV64(a02, a22); \ + MOV64(a22, a31); \ + MOV64(a31, t); \ + MOV64(t, a03); \ + MOV64(a03, a33); \ + MOV64(a33, a24); \ + MOV64(a24, t); \ + MOV64(t, a04); \ + MOV64(a04, a44); \ + MOV64(a44, a12); \ + MOV64(a12, t); \ + MOV64(t, a10); \ + MOV64(a10, a32); \ + MOV64(a32, a13); \ + MOV64(a13, t); \ + MOV64(t, a14); \ + MOV64(a14, a21); \ + MOV64(a21, a20); \ + MOV64(a20, t); \ + MOV64(t, a23); \ + MOV64(a23, a42); \ + MOV64(a42, a40); \ + MOV64(a40, t); \ + MOV64(t, a30); \ + MOV64(a30, a41); \ + MOV64(a41, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P12_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a04); \ + MOV64(a04, t); \ + MOV64(t, a02); \ + MOV64(a02, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a40); \ + MOV64(a40, t); \ + MOV64(t, a11); \ + MOV64(a11, a44); \ + MOV64(a44, t); \ + MOV64(t, a12); \ + MOV64(a12, a43); \ + MOV64(a43, t); \ + MOV64(t, a13); \ + MOV64(a13, a42); \ + MOV64(a42, t); \ + MOV64(t, a14); \ + MOV64(a14, a41); \ + MOV64(a41, t); \ + MOV64(t, a20); \ + MOV64(a20, a30); \ + MOV64(a30, t); \ + MOV64(t, a21); \ + MOV64(a21, a34); \ + MOV64(a34, t); \ + MOV64(t, a22); \ + MOV64(a22, a33); \ + MOV64(a33, t); \ + MOV64(t, a23); \ + MOV64(a23, a32); \ + MOV64(a32, t); \ + MOV64(t, a24); \ + MOV64(a24, a31); \ + MOV64(a31, t); \ + } while (0) + +#define LPAR ( +#define RPAR ) + +#define KF_ELT(r, s, k) do { \ + THETA LPAR P ## r RPAR; \ + RHO LPAR P ## r RPAR; \ + KHI LPAR P ## s RPAR; \ + IOTA(k); \ + } while (0) + +#define DO(x) x + +#define KECCAK_F_1600 DO(KECCAK_F_1600_) + +#if SPH_KECCAK_UNROLL == 1 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j ++) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + P1_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 2 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 2) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + P2_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 4 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 4) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + P4_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 6 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 6) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + P6_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 8 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 8) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j 
+ 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + P8_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 12 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 12) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + KF_ELT( 8, 9, RC[j + 8]); \ + KF_ELT( 9, 10, RC[j + 9]); \ + KF_ELT(10, 11, RC[j + 10]); \ + KF_ELT(11, 12, RC[j + 11]); \ + P12_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 0 + +#define KECCAK_F_1600_ do { \ + KF_ELT( 0, 1, RC[ 0]); \ + KF_ELT( 1, 2, RC[ 1]); \ + KF_ELT( 2, 3, RC[ 2]); \ + KF_ELT( 3, 4, RC[ 3]); \ + KF_ELT( 4, 5, RC[ 4]); \ + KF_ELT( 5, 6, RC[ 5]); \ + KF_ELT( 6, 7, RC[ 6]); \ + KF_ELT( 7, 8, RC[ 7]); \ + KF_ELT( 8, 9, RC[ 8]); \ + KF_ELT( 9, 10, RC[ 9]); \ + KF_ELT(10, 11, RC[10]); \ + KF_ELT(11, 12, RC[11]); \ + KF_ELT(12, 13, RC[12]); \ + KF_ELT(13, 14, RC[13]); \ + KF_ELT(14, 15, RC[14]); \ + KF_ELT(15, 16, RC[15]); \ + KF_ELT(16, 17, RC[16]); \ + KF_ELT(17, 18, RC[17]); \ + KF_ELT(18, 19, RC[18]); \ + KF_ELT(19, 20, RC[19]); \ + KF_ELT(20, 21, RC[20]); \ + KF_ELT(21, 22, RC[21]); \ + KF_ELT(22, 23, RC[22]); \ + KF_ELT(23, 0, RC[23]); \ + } while (0) + +#else + +#error Unimplemented unroll count for Keccak. + +#endif + +static void +keccak_init(sph_keccak_context *kc, unsigned out_size) +{ + int i; + +#if SPH_KECCAK_64 + for (i = 0; i < 25; i ++) + kc->u.wide[i] = 0; + /* + * Initialization for the "lane complement". + */ + kc->u.wide[ 1] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 2] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 8] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[12] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[17] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[20] = SPH_C64(0xFFFFFFFFFFFFFFFF); +#else + + for (i = 0; i < 50; i ++) + kc->u.narrow[i] = 0; + /* + * Initialization for the "lane complement". + * Note: since we set to all-one full 64-bit words, + * interleaving (if applicable) is a no-op. 
+ */ + kc->u.narrow[ 2] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 3] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 4] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 5] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[16] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[17] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[24] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[25] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[34] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[35] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[40] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[41] = SPH_C32(0xFFFFFFFF); +#endif + kc->ptr = 0; + kc->lim = 200 - (out_size >> 2); +} + +static void +keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = kc->buf; + ptr = kc->ptr; + + if (len < (lim - ptr)) { + memcpy(buf + ptr, data, len); + kc->ptr = ptr + len; + return; + } + + READ_STATE(kc); + while (len > 0) { + size_t clen; + + clen = (lim - ptr); + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == lim) { + INPUT_BUF(lim); + KECCAK_F_1600; + ptr = 0; + } + } + WRITE_STATE(kc); + kc->ptr = ptr; +} + +#if SPH_KECCAK_64 + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.wide[ 1] = ~kc->u.wide[ 1]; \ + kc->u.wide[ 2] = ~kc->u.wide[ 2]; \ + kc->u.wide[ 8] = ~kc->u.wide[ 8]; \ + kc->u.wide[12] = ~kc->u.wide[12]; \ + kc->u.wide[17] = ~kc->u.wide[17]; \ + kc->u.wide[20] = ~kc->u.wide[20]; \ + for (j = 0; j < d; j += 8) \ + sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#else + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \ + kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \ + kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \ + kc->u.narrow[ 5] = ~kc->u.narrow[ 5]; \ + kc->u.narrow[16] = ~kc->u.narrow[16]; \ + kc->u.narrow[17] = ~kc->u.narrow[17]; \ + kc->u.narrow[24] = ~kc->u.narrow[24]; \ + kc->u.narrow[25] = ~kc->u.narrow[25]; \ + kc->u.narrow[34] = ~kc->u.narrow[34]; \ + kc->u.narrow[35] = ~kc->u.narrow[35]; \ + kc->u.narrow[40] = ~kc->u.narrow[40]; \ + kc->u.narrow[41] = ~kc->u.narrow[41]; \ + /* un-interleave */ \ + for (j = 0; j < 50; j 
+= 2) \ + UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \ + for (j = 0; j < d; j += 4) \ + sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#endif + +DEFCLOSE(28, 144) +DEFCLOSE(32, 136) +DEFCLOSE(48, 104) +DEFCLOSE(64, 72) + +/* see sph_keccak.h */ +void +sph_keccak224_init(void *cc) +{ + keccak_init(cc, 224); +} + +/* see sph_keccak.h */ +void +sph_keccak224(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 144); +} + +/* see sph_keccak.h */ +void +sph_keccak224_close(void *cc, void *dst) +{ + sph_keccak224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close28(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak256_init(void *cc) +{ + keccak_init(cc, 256); +} + +/* see sph_keccak.h */ +void +sph_keccak256(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 136); +} + +/* see sph_keccak.h */ +void +sph_keccak256_close(void *cc, void *dst) +{ + sph_keccak256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close32(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak384_init(void *cc) +{ + keccak_init(cc, 384); +} + +/* see sph_keccak.h */ +void +sph_keccak384(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 104); +} + +/* see sph_keccak.h */ +void +sph_keccak384_close(void *cc, void *dst) +{ + sph_keccak384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close48(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak512_init(void *cc) +{ + keccak_init(cc, 512); +} + +/* see sph_keccak.h */ +void +sph_keccak512(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 72); +} + +/* see sph_keccak.h */ +void +sph_keccak512_close(void *cc, void *dst) +{ + sph_keccak512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close64(cc, ub, n, dst); +} + + +#ifdef __cplusplus +} +#endif diff --git a/sha3/luffa.c b/sha3/luffa.c new file mode 100644 index 0000000..a761bea --- /dev/null +++ b/sha3/luffa.c @@ -0,0 +1,1426 @@ +/* $Id: luffa.c 219 2010-06-08 17:24:41Z tp $ */ +/* + * Luffa implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin <thomas.pornin@cryptolog.com> + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_luffa.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_64_TRUE && !defined SPH_LUFFA_PARALLEL +#define SPH_LUFFA_PARALLEL 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 V_INIT[5][8] = { + { + SPH_C32(0x6d251e69), SPH_C32(0x44b051e0), + SPH_C32(0x4eaa6fb4), SPH_C32(0xdbf78465), + SPH_C32(0x6e292011), SPH_C32(0x90152df4), + SPH_C32(0xee058139), SPH_C32(0xdef610bb) + }, { + SPH_C32(0xc3b44b95), SPH_C32(0xd9d2f256), + SPH_C32(0x70eee9a0), SPH_C32(0xde099fa3), + SPH_C32(0x5d9b0557), SPH_C32(0x8fc944b3), + SPH_C32(0xcf1ccf0e), SPH_C32(0x746cd581) + }, { + SPH_C32(0xf7efc89d), SPH_C32(0x5dba5781), + SPH_C32(0x04016ce5), SPH_C32(0xad659c05), + SPH_C32(0x0306194f), SPH_C32(0x666d1836), + SPH_C32(0x24aa230a), SPH_C32(0x8b264ae7) + }, { + SPH_C32(0x858075d5), SPH_C32(0x36d79cce), + SPH_C32(0xe571f7d7), SPH_C32(0x204b1f67), + SPH_C32(0x35870c6a), SPH_C32(0x57e9e923), + SPH_C32(0x14bcb808), SPH_C32(0x7cde72ce) + }, { + SPH_C32(0x6c68e9be), SPH_C32(0x5ec41e22), + SPH_C32(0xc825b7c7), SPH_C32(0xaffb4363), + SPH_C32(0xf5df3999), SPH_C32(0x0fc688f1), + SPH_C32(0xb07224cc), SPH_C32(0x03e86cea) + } +}; + +static const sph_u32 RC00[8] = { + SPH_C32(0x303994a6), SPH_C32(0xc0e65299), + SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e), + SPH_C32(0x1e00108f), SPH_C32(0x7800423d), + SPH_C32(0x8f5b7882), SPH_C32(0x96e1db12) +}; + +static const sph_u32 RC04[8] = { + SPH_C32(0xe0337818), SPH_C32(0x441ba90d), + SPH_C32(0x7f34d442), SPH_C32(0x9389217f), + SPH_C32(0xe5a8bce6), SPH_C32(0x5274baf4), + SPH_C32(0x26889ba7), SPH_C32(0x9a226e9d) +}; + +static const sph_u32 RC10[8] = { + SPH_C32(0xb6de10ed), SPH_C32(0x70f47aae), + SPH_C32(0x0707a3d4), SPH_C32(0x1c1e8f51), + SPH_C32(0x707a3d45), SPH_C32(0xaeb28562), + SPH_C32(0xbaca1589), SPH_C32(0x40a46f3e) +}; + +static const sph_u32 RC14[8] = { + SPH_C32(0x01685f3d), SPH_C32(0x05a17cf4), + SPH_C32(0xbd09caca), SPH_C32(0xf4272b28), + SPH_C32(0x144ae5cc), SPH_C32(0xfaa7ae2b), + SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704) +}; + +#if SPH_LUFFA_PARALLEL + +static const sph_u64 RCW010[8] = { + SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299), + SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e), + SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d), + SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12) +}; + +static const sph_u64 RCW014[8] = { + SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d), + SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f), + SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4), + SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d) +}; + +#endif + +static const sph_u32 RC20[8] = { + SPH_C32(0xfc20d9d2), SPH_C32(0x34552e25), + SPH_C32(0x7ad8818f), SPH_C32(0x8438764a), + SPH_C32(0xbb6de032), SPH_C32(0xedb780c8), + SPH_C32(0xd9847356), SPH_C32(0xa2c78434) +}; + +static const sph_u32 RC24[8] = { + SPH_C32(0xe25e72c1), SPH_C32(0xe623bb72), + SPH_C32(0x5c58a4a4), SPH_C32(0x1e38e2e7), + SPH_C32(0x78e38b9d), SPH_C32(0x27586719), + SPH_C32(0x36eda57f), SPH_C32(0x703aace7) +}; + +static const sph_u32 RC30[8] = { + SPH_C32(0xb213afa5),
SPH_C32(0xc84ebe95), + SPH_C32(0x4e608a22), SPH_C32(0x56d858fe), + SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d), + SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208) +}; + +static const sph_u32 RC34[8] = { + SPH_C32(0xe028c9bf), SPH_C32(0x44756f91), + SPH_C32(0x7e8fce32), SPH_C32(0x956548be), + SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5), + SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355) +}; + +#if SPH_LUFFA_PARALLEL + +static const sph_u64 RCW230[8] = { + SPH_C64(0xb213afa5fc20d9d2), SPH_C64(0xc84ebe9534552e25), + SPH_C64(0x4e608a227ad8818f), SPH_C64(0x56d858fe8438764a), + SPH_C64(0x343b138fbb6de032), SPH_C64(0xd0ec4e3dedb780c8), + SPH_C64(0x2ceb4882d9847356), SPH_C64(0xb3ad2208a2c78434) +}; + + +static const sph_u64 RCW234[8] = { + SPH_C64(0xe028c9bfe25e72c1), SPH_C64(0x44756f91e623bb72), + SPH_C64(0x7e8fce325c58a4a4), SPH_C64(0x956548be1e38e2e7), + SPH_C64(0xfe191be278e38b9d), SPH_C64(0x3cb226e527586719), + SPH_C64(0x5944a28e36eda57f), SPH_C64(0xa1c4c355703aace7) +}; + +#endif + +static const sph_u32 RC40[8] = { + SPH_C32(0xf0d2e9e3), SPH_C32(0xac11d7fa), + SPH_C32(0x1bcb66f2), SPH_C32(0x6f2d9bc9), + SPH_C32(0x78602649), SPH_C32(0x8edae952), + SPH_C32(0x3b6ba548), SPH_C32(0xedae9520) +}; + +static const sph_u32 RC44[8] = { + SPH_C32(0x5090d577), SPH_C32(0x2d1925ab), + SPH_C32(0xb46496ac), SPH_C32(0xd1925ab0), + SPH_C32(0x29131ab6), SPH_C32(0x0fc053c3), + SPH_C32(0x3f014f0c), SPH_C32(0xfc053c31) +}; + +#define DECL_TMP8(w) \ + sph_u32 w ## 0, w ## 1, w ## 2, w ## 3, w ## 4, w ## 5, w ## 6, w ## 7; + +#define M2(d, s) do { \ + sph_u32 tmp = s ## 7; \ + d ## 7 = s ## 6; \ + d ## 6 = s ## 5; \ + d ## 5 = s ## 4; \ + d ## 4 = s ## 3 ^ tmp; \ + d ## 3 = s ## 2 ^ tmp; \ + d ## 2 = s ## 1; \ + d ## 1 = s ## 0 ^ tmp; \ + d ## 0 = tmp; \ + } while (0) + +#define XOR(d, s1, s2) do { \ + d ## 0 = s1 ## 0 ^ s2 ## 0; \ + d ## 1 = s1 ## 1 ^ s2 ## 1; \ + d ## 2 = s1 ## 2 ^ s2 ## 2; \ + d ## 3 = s1 ## 3 ^ s2 ## 3; \ + d ## 4 = s1 ## 4 ^ s2 ## 4; \ + d ## 5 = s1 ## 5 ^ s2 ## 5; \ + d ## 6 = s1 ## 6 ^ s2 ## 6; \ + d ## 7 = s1 ## 7 ^ s2 ## 7; \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define SUB_CRUMB_GEN(a0, a1, a2, a3, width) do { \ + sph_u ## width tmp; \ + tmp = (a0); \ + (a0) |= (a1); \ + (a2) ^= (a3); \ + (a1) = SPH_T ## width(~(a1)); \ + (a0) ^= (a3); \ + (a3) &= tmp; \ + (a1) ^= (a3); \ + (a3) ^= (a2); \ + (a2) &= (a0); \ + (a0) = SPH_T ## width(~(a0)); \ + (a2) ^= (a1); \ + (a1) |= (a3); \ + tmp ^= (a1); \ + (a3) ^= (a2); \ + (a2) &= (a1); \ + (a1) ^= (a0); \ + (a0) = tmp; \ + } while (0) + +#define SUB_CRUMB(a0, a1, a2, a3) SUB_CRUMB_GEN(a0, a1, a2, a3, 32) +#define SUB_CRUMBW(a0, a1, a2, a3) SUB_CRUMB_GEN(a0, a1, a2, a3, 64) + + +#if 0 + +#define ROL32W(x, n) SPH_T64( \ + (((x) << (n)) \ + & ~((SPH_C64(0xFFFFFFFF) >> (32 - (n))) << 32)) \ + | (((x) >> (32 - (n))) \ + & ~((SPH_C64(0xFFFFFFFF) >> (n)) << (n)))) + +#define MIX_WORDW(u, v) do { \ + (v) ^= (u); \ + (u) = ROL32W((u), 2) ^ (v); \ + (v) = ROL32W((v), 14) ^ (u); \ + (u) = ROL32W((u), 10) ^ (v); \ + (v) = ROL32W((v), 1); \ + } while (0) + +#endif + +#define MIX_WORDW(u, v) do { \ + sph_u32 ul, uh, vl, vh; \ + (v) ^= (u); \ + ul = SPH_T32((sph_u32)(u)); \ + uh = SPH_T32((sph_u32)((u) >> 32)); \ + vl = SPH_T32((sph_u32)(v)); \ + vh = SPH_T32((sph_u32)((v) >> 32)); \ + ul = SPH_ROTL32(ul, 2) ^ vl; \ + vl = SPH_ROTL32(vl, 14) ^ ul; \ + ul = SPH_ROTL32(ul, 10) ^ vl; \ + vl = SPH_ROTL32(vl, 1); \ + uh = SPH_ROTL32(uh, 2) ^ vh; \ + vh = SPH_ROTL32(vh, 14) ^ uh; \ + uh = SPH_ROTL32(uh, 10) ^ vh; \ + vh = SPH_ROTL32(vh, 1); \ + (u) = (sph_u64)ul | ((sph_u64)uh << 32); 
\ + (v) = (sph_u64)vl | ((sph_u64)vh << 32); \ + } while (0) + +#else + +#define SUB_CRUMB(a0, a1, a2, a3) do { \ + sph_u32 tmp; \ + tmp = (a0); \ + (a0) |= (a1); \ + (a2) ^= (a3); \ + (a1) = SPH_T32(~(a1)); \ + (a0) ^= (a3); \ + (a3) &= tmp; \ + (a1) ^= (a3); \ + (a3) ^= (a2); \ + (a2) &= (a0); \ + (a0) = SPH_T32(~(a0)); \ + (a2) ^= (a1); \ + (a1) |= (a3); \ + tmp ^= (a1); \ + (a3) ^= (a2); \ + (a2) &= (a1); \ + (a1) ^= (a0); \ + (a0) = tmp; \ + } while (0) + +#endif + +#define MIX_WORD(u, v) do { \ + (v) ^= (u); \ + (u) = SPH_ROTL32((u), 2) ^ (v); \ + (v) = SPH_ROTL32((v), 14) ^ (u); \ + (u) = SPH_ROTL32((u), 10) ^ (v); \ + (v) = SPH_ROTL32((v), 1); \ + } while (0) + +#define DECL_STATE3 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; + +#define READ_STATE3(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + } while (0) + +#define WRITE_STATE3(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + } while (0) + +#define MI3 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(a, a, V2); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V0, M, V0); \ + M2(M, M); \ + XOR(V1, a, V1); \ + XOR(V1, M, V1); \ + M2(M, M); \ + XOR(V2, a, V2); \ + XOR(V2, M, V2); \ + } while (0) + +#define TWEAK3 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P3 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK3; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 
32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + } while (0) + +#else + +#define P3 do { \ + int r; \ + TWEAK3; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + } while (0) + +#endif + +#define DECL_STATE4 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \ + sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; + +#define READ_STATE4(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + V30 = (state)->V[3][0]; \ + V31 = (state)->V[3][1]; \ + V32 = (state)->V[3][2]; \ + V33 = (state)->V[3][3]; \ + V34 = (state)->V[3][4]; \ + V35 = (state)->V[3][5]; \ + V36 = (state)->V[3][6]; \ + V37 = (state)->V[3][7]; \ + } while (0) + +#define WRITE_STATE4(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + 
(state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + (state)->V[3][0] = V30; \ + (state)->V[3][1] = V31; \ + (state)->V[3][2] = V32; \ + (state)->V[3][3] = V33; \ + (state)->V[3][4] = V34; \ + (state)->V[3][5] = V35; \ + (state)->V[3][6] = V36; \ + (state)->V[3][7] = V37; \ + } while (0) + +#define MI4 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + DECL_TMP8(b) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(b, V2, V3); \ + XOR(a, a, b); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V1, a, V1); \ + XOR(V2, a, V2); \ + XOR(V3, a, V3); \ + M2(b, V0); \ + XOR(b, b, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V1); \ + M2(V1, V1); \ + XOR(V1, V1, V0); \ + XOR(V0, b, M); \ + M2(M, M); \ + XOR(V1, V1, M); \ + M2(M, M); \ + XOR(V2, V2, M); \ + M2(M, M); \ + XOR(V3, V3, M); \ + } while (0) + +#define TWEAK4 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + V34 = SPH_ROTL32(V34, 3); \ + V35 = SPH_ROTL32(V35, 3); \ + V36 = SPH_ROTL32(V36, 3); \ + V37 = SPH_ROTL32(V37, 3); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P4 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK4; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \ + W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \ + W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \ + W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \ + W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \ + W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \ + W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \ + W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + 
MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW230[r]; \ + W4 ^= RCW234[r]; \ + } \ + V20 = SPH_T32((sph_u32)W0); \ + V30 = SPH_T32((sph_u32)(W0 >> 32)); \ + V21 = SPH_T32((sph_u32)W1); \ + V31 = SPH_T32((sph_u32)(W1 >> 32)); \ + V22 = SPH_T32((sph_u32)W2); \ + V32 = SPH_T32((sph_u32)(W2 >> 32)); \ + V23 = SPH_T32((sph_u32)W3); \ + V33 = SPH_T32((sph_u32)(W3 >> 32)); \ + V24 = SPH_T32((sph_u32)W4); \ + V34 = SPH_T32((sph_u32)(W4 >> 32)); \ + V25 = SPH_T32((sph_u32)W5); \ + V35 = SPH_T32((sph_u32)(W5 >> 32)); \ + V26 = SPH_T32((sph_u32)W6); \ + V36 = SPH_T32((sph_u32)(W6 >> 32)); \ + V27 = SPH_T32((sph_u32)W7); \ + V37 = SPH_T32((sph_u32)(W7 >> 32)); \ + } while (0) + +#else + +#define P4 do { \ + int r; \ + TWEAK4; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V30, V31, V32, V33); \ + SUB_CRUMB(V35, V36, V37, V34); \ + MIX_WORD(V30, V34); \ + MIX_WORD(V31, V35); \ + MIX_WORD(V32, V36); \ + MIX_WORD(V33, V37); \ + V30 ^= RC30[r]; \ + V34 ^= RC34[r]; \ + } \ + } while (0) + +#endif + +#define DECL_STATE5 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \ + sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; \ + sph_u32 V40, V41, V42, V43, V44, V45, V46, V47; + +#define READ_STATE5(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + V30 = (state)->V[3][0]; \ + V31 = (state)->V[3][1]; \ + V32 = (state)->V[3][2]; \ + V33 = (state)->V[3][3]; \ + V34 = (state)->V[3][4]; \ + V35 = (state)->V[3][5]; \ + V36 = (state)->V[3][6]; \ + V37 = (state)->V[3][7]; \ + V40 = (state)->V[4][0]; \ + V41 = (state)->V[4][1]; \ + V42 = (state)->V[4][2]; \ + V43 = (state)->V[4][3]; \ + V44 = (state)->V[4][4]; \ + V45 = (state)->V[4][5]; \ + V46 = (state)->V[4][6]; \ + V47 = (state)->V[4][7]; \ + } while (0) + +#define WRITE_STATE5(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ 
+ (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + (state)->V[3][0] = V30; \ + (state)->V[3][1] = V31; \ + (state)->V[3][2] = V32; \ + (state)->V[3][3] = V33; \ + (state)->V[3][4] = V34; \ + (state)->V[3][5] = V35; \ + (state)->V[3][6] = V36; \ + (state)->V[3][7] = V37; \ + (state)->V[4][0] = V40; \ + (state)->V[4][1] = V41; \ + (state)->V[4][2] = V42; \ + (state)->V[4][3] = V43; \ + (state)->V[4][4] = V44; \ + (state)->V[4][5] = V45; \ + (state)->V[4][6] = V46; \ + (state)->V[4][7] = V47; \ + } while (0) + +#define MI5 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + DECL_TMP8(b) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(b, V2, V3); \ + XOR(a, a, b); \ + XOR(a, a, V4); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V1, a, V1); \ + XOR(V2, a, V2); \ + XOR(V3, a, V3); \ + XOR(V4, a, V4); \ + M2(b, V0); \ + XOR(b, b, V1); \ + M2(V1, V1); \ + XOR(V1, V1, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V4); \ + M2(V4, V4); \ + XOR(V4, V4, V0); \ + M2(V0, b); \ + XOR(V0, V0, V4); \ + M2(V4, V4); \ + XOR(V4, V4, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V1); \ + M2(V1, V1); \ + XOR(V1, V1, b); \ + XOR(V0, V0, M); \ + M2(M, M); \ + XOR(V1, V1, M); \ + M2(M, M); \ + XOR(V2, V2, M); \ + M2(M, M); \ + XOR(V3, V3, M); \ + M2(M, M); \ + XOR(V4, V4, M); \ + } while (0) + +#define TWEAK5 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + V34 = SPH_ROTL32(V34, 3); \ + V35 = SPH_ROTL32(V35, 3); \ + V36 = SPH_ROTL32(V36, 3); \ + V37 = SPH_ROTL32(V37, 3); \ + V44 = SPH_ROTL32(V44, 4); \ + V45 = SPH_ROTL32(V45, 4); \ + V46 = SPH_ROTL32(V46, 4); \ + V47 = SPH_ROTL32(V47, 4); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P5 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK5; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = 
SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \ + W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \ + W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \ + W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \ + W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \ + W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \ + W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \ + W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW230[r]; \ + W4 ^= RCW234[r]; \ + } \ + V20 = SPH_T32((sph_u32)W0); \ + V30 = SPH_T32((sph_u32)(W0 >> 32)); \ + V21 = SPH_T32((sph_u32)W1); \ + V31 = SPH_T32((sph_u32)(W1 >> 32)); \ + V22 = SPH_T32((sph_u32)W2); \ + V32 = SPH_T32((sph_u32)(W2 >> 32)); \ + V23 = SPH_T32((sph_u32)W3); \ + V33 = SPH_T32((sph_u32)(W3 >> 32)); \ + V24 = SPH_T32((sph_u32)W4); \ + V34 = SPH_T32((sph_u32)(W4 >> 32)); \ + V25 = SPH_T32((sph_u32)W5); \ + V35 = SPH_T32((sph_u32)(W5 >> 32)); \ + V26 = SPH_T32((sph_u32)W6); \ + V36 = SPH_T32((sph_u32)(W6 >> 32)); \ + V27 = SPH_T32((sph_u32)W7); \ + V37 = SPH_T32((sph_u32)(W7 >> 32)); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V40, V41, V42, V43); \ + SUB_CRUMB(V45, V46, V47, V44); \ + MIX_WORD(V40, V44); \ + MIX_WORD(V41, V45); \ + MIX_WORD(V42, V46); \ + MIX_WORD(V43, V47); \ + V40 ^= RC40[r]; \ + V44 ^= RC44[r]; \ + } \ + } while (0) + +#else + +#define P5 do { \ + int r; \ + TWEAK5; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V30, V31, V32, V33); \ + SUB_CRUMB(V35, V36, V37, V34); \ + MIX_WORD(V30, V34); \ + MIX_WORD(V31, V35); \ + MIX_WORD(V32, V36); \ + MIX_WORD(V33, V37); \ + V30 ^= RC30[r]; \ + V34 ^= RC34[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V40, V41, V42, V43); \ + SUB_CRUMB(V45, V46, V47, V44); \ + MIX_WORD(V40, V44); \ + MIX_WORD(V41, V45); \ + MIX_WORD(V42, V46); \ + MIX_WORD(V43, V47); \ + V40 ^= RC40[r]; \ + V44 ^= RC44[r]; \ + } \ + } while (0) + +#endif + +static void +luffa3(sph_luffa224_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE3 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE3(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI3; + P3; + ptr = 0; + } + } + WRITE_STATE3(sc); + sc->ptr = ptr; +} + 
+static void +luffa3_close(sph_luffa224_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE3 + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE3(sc); + for (i = 0; i < 2; i ++) { + MI3; + P3; + memset(buf, 0, sizeof sc->buf); + } + out = dst; + sph_enc32be(out + 0, V00 ^ V10 ^ V20); + sph_enc32be(out + 4, V01 ^ V11 ^ V21); + sph_enc32be(out + 8, V02 ^ V12 ^ V22); + sph_enc32be(out + 12, V03 ^ V13 ^ V23); + sph_enc32be(out + 16, V04 ^ V14 ^ V24); + sph_enc32be(out + 20, V05 ^ V15 ^ V25); + sph_enc32be(out + 24, V06 ^ V16 ^ V26); + if (out_size_w32 > 7) + sph_enc32be(out + 28, V07 ^ V17 ^ V27); +} + +static void +luffa4(sph_luffa384_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE4 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE4(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI4; + P4; + ptr = 0; + } + } + WRITE_STATE4(sc); + sc->ptr = ptr; +} + +static void +luffa4_close(sph_luffa384_context *sc, unsigned ub, unsigned n, void *dst) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE4 + + buf = sc->buf; + ptr = sc->ptr; + out = dst; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE4(sc); + for (i = 0; i < 3; i ++) { + MI4; + P4; + switch (i) { + case 0: + memset(buf, 0, sizeof sc->buf); + break; + case 1: + sph_enc32be(out + 0, V00 ^ V10 ^ V20 ^ V30); + sph_enc32be(out + 4, V01 ^ V11 ^ V21 ^ V31); + sph_enc32be(out + 8, V02 ^ V12 ^ V22 ^ V32); + sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33); + sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34); + sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35); + sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36); + sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37); + break; + case 2: + sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30); + sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31); + sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32); + sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33); + break; + } + } +} + +static void +luffa5(sph_luffa512_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE5 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE5(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI5; + P5; + ptr = 0; + } + } + WRITE_STATE5(sc); + sc->ptr = ptr; +} + +static void +luffa5_close(sph_luffa512_context *sc, unsigned ub, unsigned n, void *dst) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE5 + + buf = sc->buf; + ptr = sc->ptr; + out = dst; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE5(sc); + for (i = 0; i < 3; i ++) { + MI5; + P5; + switch (i) { + case 0: + memset(buf, 0, 
sizeof sc->buf); + break; + case 1: + sph_enc32be(out + 0, V00 ^ V10 ^ V20 ^ V30 ^ V40); + sph_enc32be(out + 4, V01 ^ V11 ^ V21 ^ V31 ^ V41); + sph_enc32be(out + 8, V02 ^ V12 ^ V22 ^ V32 ^ V42); + sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33 ^ V43); + sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34 ^ V44); + sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35 ^ V45); + sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36 ^ V46); + sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37 ^ V47); + break; + case 2: + sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30 ^ V40); + sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31 ^ V41); + sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32 ^ V42); + sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33 ^ V43); + sph_enc32be(out + 48, V04 ^ V14 ^ V24 ^ V34 ^ V44); + sph_enc32be(out + 52, V05 ^ V15 ^ V25 ^ V35 ^ V45); + sph_enc32be(out + 56, V06 ^ V16 ^ V26 ^ V36 ^ V46); + sph_enc32be(out + 60, V07 ^ V17 ^ V27 ^ V37 ^ V47); + break; + } + } +} + +/* see sph_luffa.h */ +void +sph_luffa224_init(void *cc) +{ + sph_luffa224_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa224(void *cc, const void *data, size_t len) +{ + luffa3(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa224_close(void *cc, void *dst) +{ + sph_luffa224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa3_close(cc, ub, n, dst, 7); + sph_luffa224_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa256_init(void *cc) +{ + sph_luffa256_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa256(void *cc, const void *data, size_t len) +{ + luffa3(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa256_close(void *cc, void *dst) +{ + sph_luffa256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa3_close(cc, ub, n, dst, 8); + sph_luffa256_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa384_init(void *cc) +{ + sph_luffa384_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa384(void *cc, const void *data, size_t len) +{ + luffa4(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa384_close(void *cc, void *dst) +{ + sph_luffa384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa4_close(cc, ub, n, dst); + sph_luffa384_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa512_init(void *cc) +{ + sph_luffa512_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa512(void *cc, const void *data, size_t len) +{ + luffa5(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa512_close(void *cc, void *dst) +{ + sph_luffa512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa5_close(cc, ub, n, dst); + sph_luffa512_init(cc); +} + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sha3/shavite.c b/sha3/shavite.c new file mode 100644 index 0000000..85074f3 --- /dev/null +++ b/sha3/shavite.c @@ -0,0 +1,1764 @@ +/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHAvite-3 implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> + +#include "sph_shavite.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE +#define SPH_SMALL_FOOTPRINT_SHAVITE 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define C32 SPH_C32 + +/* + * As of round 2 of the SHA-3 competition, the published reference + * implementation and test vectors are wrong, because they use + * big-endian AES tables while the internal decoding uses little-endian. + * The code below follows the specification. To turn it into a code + * which follows the reference implementation (the one called "BugFix" + * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out + * the code below (from the '#define AES_BIG_ENDIAN...' to the definition + * of the AES_ROUND_NOKEY macro) and replace it with the version which + * is commented out afterwards.
+ */ + +#define AES_BIG_ENDIAN 0 +#include "aes_helper.c" + +static const sph_u32 IV224[] = { + C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371), + C32(0x62B2AEA8), C32(0x4B5801D8), C32(0x1B702860), C32(0x842F3017) +}; + +static const sph_u32 IV256[] = { + C32(0x49BB3E47), C32(0x2674860D), C32(0xA8B392AC), C32(0x021AC4E6), + C32(0x409283CF), C32(0x620E5D86), C32(0x6D929DCB), C32(0x96CC2A8B) +}; + +static const sph_u32 IV384[] = { + C32(0x83DF1545), C32(0xF9AAEC13), C32(0xF4803CB0), C32(0x11FE1F47), + C32(0xDA6CD269), C32(0x4F53FCD7), C32(0x950529A2), C32(0x97908147), + C32(0xB0A4D7AF), C32(0x2B9132BF), C32(0x226E607D), C32(0x3C0F8D7C), + C32(0x487B3F0F), C32(0x04363E22), C32(0x0155C99C), C32(0xEC2E20D3) +}; + +static const sph_u32 IV512[] = { + C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC), + C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC), + C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47), + C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + +/* + * This is the code needed to match the "reference implementation" as + * published on Nov 23rd, 2009, instead of the published specification. + * + +#define AES_BIG_ENDIAN 1 +#include "aes_helper.c" + +static const sph_u32 IV224[] = { + C32(0xC4C67795), C32(0xC0B1817F), C32(0xEAD88924), C32(0x1ABB1BB0), + C32(0xE0C29152), C32(0xBDE046BA), C32(0xAEEECF99), C32(0x58D509D8) +}; + +static const sph_u32 IV256[] = { + C32(0x3EECF551), C32(0xBF10819B), C32(0xE6DC8559), C32(0xF3E23FD5), + C32(0x431AEC73), C32(0x79E3F731), C32(0x98325F05), C32(0xA92A31F1) +}; + +static const sph_u32 IV384[] = { + C32(0x71F48510), C32(0xA903A8AC), C32(0xFE3216DD), C32(0x0B2D2AD4), + C32(0x6672900A), C32(0x41032819), C32(0x15A7D780), C32(0xB3CAB8D9), + C32(0x34EF4711), C32(0xDE019FE8), C32(0x4D674DC4), C32(0xE056D96B), + C32(0xA35C016B), C32(0xDD903BA7), C32(0x8C1B09B4), C32(0x2C3E9F25) +}; + +static const sph_u32 IV512[] = { + C32(0xD5652B63), C32(0x25F1E6EA), C32(0xB18F48FA), C32(0xA1EE3A47), + C32(0xC8B67B07), C32(0xBDCE48D3), C32(0xE3937B78), C32(0x05DB5186), + C32(0x613BE326), C32(0xA11FA303), C32(0x90C833D4), C32(0x79CEE316), + C32(0x1E1AF00F), C32(0x2829B165), C32(0x23B25F80), C32(0x21E11499) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_BE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + + */ + +#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \ + sph_u32 kt; \ + AES_ROUND_NOKEY(k1, k2, k3, k0); \ + kt = (k0); \ + (k0) = (k1); \ + (k1) = (k2); \ + (k2) = (k3); \ + (k3) = kt; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. 
+ */ +static void +c256(sph_shavite_small_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 rk[144]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 64); +#else + for (u = 0; u < 16; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 16; + for (r = 0; r < 4; r ++) { + for (s = 0; s < 2; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 15]; + x1 = rk[u - 14]; + x2 = rk[u - 13]; + x3 = rk[u - 16]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 16) { + rk[ 16] ^= sc->count0; + rk[ 17] ^= SPH_T32(~sc->count1); + } else if (u == 56) { + rk[ 57] ^= sc->count1; + rk[ 58] ^= SPH_T32(~sc->count0); + } + u += 4; + + x0 = rk[u - 15]; + x1 = rk[u - 14]; + x2 = rk[u - 13]; + x3 = rk[u - 16]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 84) { + rk[ 86] ^= sc->count1; + rk[ 87] ^= SPH_T32(~sc->count0); + } else if (u == 124) { + rk[124] ^= sc->count0; + rk[127] ^= SPH_T32(~sc->count1); + } + u += 4; + } + for (s = 0; s < 4; s ++) { + rk[u + 0] = rk[u - 16] ^ rk[u - 3]; + rk[u + 1] = rk[u - 15] ^ rk[u - 2]; + rk[u + 2] = rk[u - 14] ^ rk[u - 1]; + rk[u + 3] = rk[u - 13] ^ rk[u - 0]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + u = 0; + for (r = 0; r < 6; r ++) { + sph_u32 x0, x1, x2, x3; + + x0 = p4 ^ rk[u ++]; + x1 = p5 ^ rk[u ++]; + x2 = p6 ^ rk[u ++]; + x3 = p7 ^ rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + x0 = p0 ^ rk[u ++]; + x1 = p1 ^ rk[u ++]; + x2 = p2 ^ rk[u ++]; + x3 = p3 ^ rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. 
+ */ +static void +c256(sph_shavite_small_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 x0, x1, x2, x3; + sph_u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7; + sph_u32 rk8, rk9, rkA, rkB, rkC, rkD, rkE, rkF; + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + /* round 0 */ + rk0 = sph_dec32le_aligned((const unsigned char *)msg + 0); + x0 = p4 ^ rk0; + rk1 = sph_dec32le_aligned((const unsigned char *)msg + 4); + x1 = p5 ^ rk1; + rk2 = sph_dec32le_aligned((const unsigned char *)msg + 8); + x2 = p6 ^ rk2; + rk3 = sph_dec32le_aligned((const unsigned char *)msg + 12); + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 = sph_dec32le_aligned((const unsigned char *)msg + 16); + x0 ^= rk4; + rk5 = sph_dec32le_aligned((const unsigned char *)msg + 20); + x1 ^= rk5; + rk6 = sph_dec32le_aligned((const unsigned char *)msg + 24); + x2 ^= rk6; + rk7 = sph_dec32le_aligned((const unsigned char *)msg + 28); + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 = sph_dec32le_aligned((const unsigned char *)msg + 32); + x0 ^= rk8; + rk9 = sph_dec32le_aligned((const unsigned char *)msg + 36); + x1 ^= rk9; + rkA = sph_dec32le_aligned((const unsigned char *)msg + 40); + x2 ^= rkA; + rkB = sph_dec32le_aligned((const unsigned char *)msg + 44); + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 1 */ + rkC = sph_dec32le_aligned((const unsigned char *)msg + 48); + x0 = p0 ^ rkC; + rkD = sph_dec32le_aligned((const unsigned char *)msg + 52); + x1 = p1 ^ rkD; + rkE = sph_dec32le_aligned((const unsigned char *)msg + 56); + x2 = p2 ^ rkE; + rkF = sph_dec32le_aligned((const unsigned char *)msg + 60); + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC ^ sc->count0; + rk1 ^= rkD ^ SPH_T32(~sc->count1); + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 2 */ + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 = p4 ^ rk8; + x1 = p5 ^ rk9; + x2 = p6 ^ rkA; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3 */ + rk4 ^= rk1; + x0 = p0 ^ rk4; + rk5 ^= rk2; + x1 = p1 ^ rk5; + rk6 ^= rk3; + x2 = p2 ^ rk6; + rk7 ^= rk4; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 4 */ + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 = p4 ^ rk0; + x1 = p5 ^ rk1; + x2 = p6 ^ rk2; + x3 = 
p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5 ^ sc->count1; + rkA ^= rk6 ^ SPH_T32(~sc->count0); + rkB ^= rk7; + x0 ^= rk8; + x1 ^= rk9; + x2 ^= rkA; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 5 */ + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 = p0 ^ rkC; + x1 = p1 ^ rkD; + x2 = p2 ^ rkE; + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 ^= rk1; + x0 ^= rk4; + rk5 ^= rk2; + x1 ^= rk5; + rk6 ^= rk3; + x2 ^= rk6; + rk7 ^= rk4; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 6 */ + rk8 ^= rk5; + x0 = p4 ^ rk8; + rk9 ^= rk6; + x1 = p5 ^ rk9; + rkA ^= rk7; + x2 = p6 ^ rkA; + rkB ^= rk8; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 7 */ + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2 ^ sc->count1; + rk7 ^= rk3 ^ SPH_T32(~sc->count0); + x0 = p0 ^ rk4; + x1 = p1 ^ rk5; + x2 = p2 ^ rk6; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 ^= rk8; + x1 ^= rk9; + x2 ^= rkA; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 8 */ + rk0 ^= rkD; + x0 = p4 ^ rk0; + rk1 ^= rkE; + x1 = p5 ^ rk1; + rk2 ^= rkF; + x2 = p6 ^ rk2; + rk3 ^= rk0; + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 ^= rk1; + x0 ^= rk4; + rk5 ^= rk2; + x1 ^= rk5; + rk6 ^= rk3; + x2 ^= rk6; + rk7 ^= rk4; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 9 */ + rkC ^= rk9; + x0 = p0 ^ rkC; + rkD ^= rkA; + x1 = p1 ^ rkD; + rkE ^= rkB; + x2 = p2 ^ rkE; + rkF ^= rkC; + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 10 */ + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 = p4 ^ rk8; + x1 = p5 ^ rk9; + x2 = p6 ^ rkA; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + 
KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8 ^ sc->count0; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB ^ SPH_T32(~sc->count1); + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 11 */ + rk4 ^= rk1; + x0 = p0 ^ rk4; + rk5 ^= rk2; + x1 = p1 ^ rk5; + rk6 ^= rk3; + x2 = p2 ^ rk6; + rk7 ^= rk4; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; +} + +#endif + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 rk[448]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 128); +#else + for (u = 0; u < 32; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 32; + for (;;) { + for (s = 0; s < 4; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 32) { + rk[ 32] ^= sc->count0; + rk[ 33] ^= sc->count1; + rk[ 34] ^= sc->count2; + rk[ 35] ^= SPH_T32(~sc->count3); + } else if (u == 440) { + rk[440] ^= sc->count1; + rk[441] ^= sc->count0; + rk[442] ^= sc->count3; + rk[443] ^= SPH_T32(~sc->count2); + } + u += 4; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 164) { + rk[164] ^= sc->count3; + rk[165] ^= sc->count2; + rk[166] ^= sc->count1; + rk[167] ^= SPH_T32(~sc->count0); + } else if (u == 316) { + rk[316] ^= sc->count2; + rk[317] ^= sc->count3; + rk[318] ^= sc->count0; + rk[319] ^= SPH_T32(~sc->count1); + } + u += 4; + } + if (u == 448) + break; + for (s = 0; s < 8; s ++) { + rk[u + 0] = rk[u - 32] ^ rk[u - 7]; + rk[u + 1] = rk[u - 31] ^ rk[u - 6]; + rk[u + 2] = rk[u - 30] ^ rk[u - 5]; + rk[u + 3] = rk[u - 29] ^ rk[u - 4]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + u = 0; + for (r = 0; r < 14; r ++) { +#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \ + 
sph_u32 x0, x1, x2, x3; \ + x0 = r0 ^ rk[u ++]; \ + x1 = r1 ^ rk[u ++]; \ + x2 = r2 ^ rk[u ++]; \ + x3 = r3 ^ rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + l0 ^= x0; \ + l1 ^= x1; \ + l2 ^= x2; \ + l3 ^= x3; \ + } while (0) + +#define WROT(a, b, c, d) do { \ + sph_u32 t = d; \ + d = c; \ + c = b; \ + b = a; \ + a = t; \ + } while (0) + + C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7); + C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF); + + WROT(p0, p4, p8, pC); + WROT(p1, p5, p9, pD); + WROT(p2, p6, pA, pE); + WROT(p3, p7, pB, pF); + +#undef C512_ELT +#undef WROT + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; + sc->h[0x8] ^= p8; + sc->h[0x9] ^= p9; + sc->h[0xA] ^= pA; + sc->h[0xB] ^= pB; + sc->h[0xC] ^= pC; + sc->h[0xD] ^= pD; + sc->h[0xE] ^= pE; + sc->h[0xF] ^= pF; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 x0, x1, x2, x3; + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + int r; + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + /* round 0 */ + rk00 = sph_dec32le_aligned((const unsigned char *)msg + 0); + x0 = p4 ^ rk00; + rk01 = sph_dec32le_aligned((const unsigned char *)msg + 4); + x1 = p5 ^ rk01; + rk02 = sph_dec32le_aligned((const unsigned char *)msg + 8); + x2 = p6 ^ rk02; + rk03 = sph_dec32le_aligned((const unsigned char *)msg + 12); + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 = sph_dec32le_aligned((const unsigned char *)msg + 16); + x0 ^= rk04; + rk05 = sph_dec32le_aligned((const unsigned char *)msg + 20); + x1 ^= rk05; + rk06 = sph_dec32le_aligned((const unsigned char *)msg + 24); + x2 ^= rk06; + rk07 = sph_dec32le_aligned((const unsigned char *)msg + 28); + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 = sph_dec32le_aligned((const unsigned char *)msg + 32); + x0 ^= rk08; + rk09 = sph_dec32le_aligned((const unsigned char *)msg + 36); + x1 ^= rk09; + rk0A = sph_dec32le_aligned((const unsigned char *)msg + 40); + x2 ^= rk0A; + rk0B = sph_dec32le_aligned((const unsigned char *)msg + 44); + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C = sph_dec32le_aligned((const unsigned char *)msg + 48); + x0 ^= rk0C; + rk0D = sph_dec32le_aligned((const unsigned char *)msg + 52); + x1 ^= rk0D; + rk0E = sph_dec32le_aligned((const unsigned char *)msg + 56); + x2 ^= rk0E; + rk0F = sph_dec32le_aligned((const unsigned char *)msg + 60); + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 = sph_dec32le_aligned((const unsigned char 
*)msg + 64); + x0 = pC ^ rk10; + rk11 = sph_dec32le_aligned((const unsigned char *)msg + 68); + x1 = pD ^ rk11; + rk12 = sph_dec32le_aligned((const unsigned char *)msg + 72); + x2 = pE ^ rk12; + rk13 = sph_dec32le_aligned((const unsigned char *)msg + 76); + x3 = pF ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 = sph_dec32le_aligned((const unsigned char *)msg + 80); + x0 ^= rk14; + rk15 = sph_dec32le_aligned((const unsigned char *)msg + 84); + x1 ^= rk15; + rk16 = sph_dec32le_aligned((const unsigned char *)msg + 88); + x2 ^= rk16; + rk17 = sph_dec32le_aligned((const unsigned char *)msg + 92); + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 = sph_dec32le_aligned((const unsigned char *)msg + 96); + x0 ^= rk18; + rk19 = sph_dec32le_aligned((const unsigned char *)msg + 100); + x1 ^= rk19; + rk1A = sph_dec32le_aligned((const unsigned char *)msg + 104); + x2 ^= rk1A; + rk1B = sph_dec32le_aligned((const unsigned char *)msg + 108); + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C = sph_dec32le_aligned((const unsigned char *)msg + 112); + x0 ^= rk1C; + rk1D = sph_dec32le_aligned((const unsigned char *)msg + 116); + x1 ^= rk1D; + rk1E = sph_dec32le_aligned((const unsigned char *)msg + 120); + x2 ^= rk1E; + rk1F = sph_dec32le_aligned((const unsigned char *)msg + 124); + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + for (r = 0; r < 3; r ++) { + /* round 1, 5, 9 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + if (r == 0) { + rk00 ^= sc->count0; + rk01 ^= sc->count1; + rk02 ^= sc->count2; + rk03 ^= SPH_T32(~sc->count3); + } + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + if (r == 1) { + rk04 ^= sc->count3; + rk05 ^= sc->count2; + rk06 ^= sc->count1; + rk07 ^= SPH_T32(~sc->count0); + } + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + if (r == 2) { + rk1C ^= sc->count2; + rk1D ^= sc->count3; + rk1E ^= sc->count0; + rk1F ^= SPH_T32(~sc->count1); + } + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 2, 6, 10 */ + rk00 ^= rk19; + x0 = pC ^ 
rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + 
AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + } + /* round 13 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14 ^ sc->count1; + rk19 ^= rk15 ^ sc->count0; + rk1A ^= rk16 ^ sc->count3; + rk1B ^= rk17 ^ SPH_T32(~sc->count2); + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + sc->h[0x0] ^= p8; + sc->h[0x1] ^= p9; + sc->h[0x2] ^= pA; + sc->h[0x3] ^= pB; + sc->h[0x4] ^= pC; + sc->h[0x5] ^= pD; + sc->h[0x6] ^= pE; + sc->h[0x7] ^= pF; + sc->h[0x8] ^= p0; + sc->h[0x9] ^= p1; + sc->h[0xA] ^= p2; + sc->h[0xB] ^= p3; + sc->h[0xC] ^= p4; + sc->h[0xD] ^= p5; + sc->h[0xE] ^= p6; + sc->h[0xF] ^= p7; +} + +#endif + +static void +shavite_small_init(sph_shavite_small_context *sc, const sph_u32 *iv) +{ + memcpy(sc->h, iv, sizeof sc->h); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; +} + +static void +shavite_small_core(sph_shavite_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char 
*)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 512)) == 0) + sc->count1 = SPH_T32(sc->count1 + 1); + c256(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_small_close(sph_shavite_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += (ptr << 3) + n); + count1 = sc->count1; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 53); + sc->count0 = sc->count1 = 0; + } else if (ptr < 54) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 54 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 64 - ptr); + c256(sc, buf); + memset(buf, 0, 54); + sc->count0 = sc->count1 = 0; + } + sph_enc32le(buf + 54, count0); + sph_enc32le(buf + 58, count1); + buf[62] = out_size_w32 << 5; + buf[63] = out_size_w32 >> 3; + c256(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +static void +shavite_big_init(sph_shavite_big_context *sc, const sph_u32 *iv) +{ + memcpy(sc->h, iv, sizeof sc->h); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; + sc->count2 = 0; + sc->count3 = 0; +} + +static void +shavite_big_core(sph_shavite_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) { + sc->count1 = SPH_T32(sc->count1 + 1); + if (sc->count1 == 0) { + sc->count2 = SPH_T32(sc->count2 + 1); + if (sc->count2 == 0) { + sc->count3 = SPH_T32( + sc->count3 + 1); + } + } + } + c512(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_big_close(sph_shavite_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1, count2, count3; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += (ptr << 3) + n); + count1 = sc->count1; + count2 = sc->count2; + count3 = sc->count3; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 109); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } else if (ptr < 110) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 110 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 128 - ptr); + c512(sc, buf); + memset(buf, 0, 110); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } + sph_enc32le(buf + 110, count0); + sph_enc32le(buf + 114, count1); + sph_enc32le(buf + 118, count2); + sph_enc32le(buf + 122, count3); + buf[126] = out_size_w32 << 5; + buf[127] = out_size_w32 >> 3; + c512(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +/* see sph_shavite.h */ +void +sph_shavite224_init(void *cc) +{ + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite224(void *cc, const void *data, size_t len) +{ + shavite_small_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite224_close(void *cc, void *dst) +{ + shavite_small_close(cc, 0, 0, dst, 7); + shavite_small_init(cc, IV224); +} 
+ +/* see sph_shavite.h */ +void +sph_shavite224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_small_close(cc, ub, n, dst, 7); + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite256_init(void *cc) +{ + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite256(void *cc, const void *data, size_t len) +{ + shavite_small_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite256_close(void *cc, void *dst) +{ + shavite_small_close(cc, 0, 0, dst, 8); + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_small_close(cc, ub, n, dst, 8); + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite384_init(void *cc) +{ + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite384(void *cc, const void *data, size_t len) +{ + shavite_big_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite384_close(void *cc, void *dst) +{ + shavite_big_close(cc, 0, 0, dst, 12); + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_big_close(cc, ub, n, dst, 12); + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite512_init(void *cc) +{ + shavite_big_init(cc, IV512); +} + +/* see sph_shavite.h */ +void +sph_shavite512(void *cc, const void *data, size_t len) +{ + shavite_big_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite512_close(void *cc, void *dst) +{ + shavite_big_close(cc, 0, 0, dst, 16); + shavite_big_init(cc, IV512); +} + +/* see sph_shavite.h */ +void +sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_big_close(cc, ub, n, dst, 16); + shavite_big_init(cc, IV512); +} + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sha3/simd.c b/sha3/simd.c new file mode 100644 index 0000000..2c80626 --- /dev/null +++ b/sha3/simd.c @@ -0,0 +1,1799 @@ +/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SIMD implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_simd.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD +#define SPH_SMALL_FOOTPRINT_SIMD 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +typedef sph_u32 u32; +typedef sph_s32 s32; +#define C32 SPH_C32 +#define T32 SPH_T32 +#define ROL32 SPH_ROTL32 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +/* + * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive. + */ +static const s32 alpha_tab[] = { + 1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130, + 190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28, + 120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180, + 184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19, + 8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12, + 235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224, + 189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155, + 187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152, + 64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96, + 81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250, + 227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212, + 211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188, + 255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254, + 134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201, + 17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154, + 146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219, + 241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233, + 44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66, + 136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204, + 140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210, + 129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65, + 95, 40, 98, 163 +}; + +/* + * Ranges: + * REDS1: from -32768..98302 to -383..383 + * REDS2: from -2^31..2^31-1 to -32768..98302 + */ +#define REDS1(x) (((x) & 0xFF) - ((x) >> 8)) +#define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16)) + +/* + * If, upon entry, the values of q[] are all in the -N..N range (where + * N >= 98302) then the new values of q[] are in the -2N..2N range. + * + * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
+ */ +#define FFT_LOOP(rb, hk, as, id) do { \ + size_t u, v; \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + (hk)]; \ + q[(rb)] = m + n; \ + q[(rb) + (hk)] = m - n; \ + u = v = 0; \ + goto id; \ + for (; u < (hk); u += 4, v += 4 * (as)) { \ + s32 t; \ + m = q[(rb) + u + 0]; \ + n = q[(rb) + u + 0 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 0 * (as)]); \ + q[(rb) + u + 0] = m + t; \ + q[(rb) + u + 0 + (hk)] = m - t; \ + id: \ + m = q[(rb) + u + 1]; \ + n = q[(rb) + u + 1 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 1 * (as)]); \ + q[(rb) + u + 1] = m + t; \ + q[(rb) + u + 1 + (hk)] = m - t; \ + m = q[(rb) + u + 2]; \ + n = q[(rb) + u + 2 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 2 * (as)]); \ + q[(rb) + u + 2] = m + t; \ + q[(rb) + u + 2 + (hk)] = m - t; \ + m = q[(rb) + u + 3]; \ + n = q[(rb) + u + 3 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 3 * (as)]); \ + q[(rb) + u + 3] = m + t; \ + q[(rb) + u + 3 + (hk)] = m - t; \ + } \ + } while (0) + +/* + * Output ranges: + * d0: min= 0 max= 1020 + * d1: min= -67 max= 4587 + * d2: min=-4335 max= 4335 + * d3: min=-4147 max= 507 + * d4: min= -510 max= 510 + * d5: min= -252 max= 4402 + * d6: min=-4335 max= 4335 + * d7: min=-4332 max= 322 + */ +#define FFT8(xb, xs, d) do { \ + s32 x0 = x[(xb)]; \ + s32 x1 = x[(xb) + (xs)]; \ + s32 x2 = x[(xb) + 2 * (xs)]; \ + s32 x3 = x[(xb) + 3 * (xs)]; \ + s32 a0 = x0 + x2; \ + s32 a1 = x0 + (x2 << 4); \ + s32 a2 = x0 - x2; \ + s32 a3 = x0 - (x2 << 4); \ + s32 b0 = x1 + x3; \ + s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \ + s32 b2 = (x1 << 4) - (x3 << 4); \ + s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \ + d ## 0 = a0 + b0; \ + d ## 1 = a1 + b1; \ + d ## 2 = a2 + b2; \ + d ## 3 = a3 + b3; \ + d ## 4 = a0 - b0; \ + d ## 5 = a1 - b1; \ + d ## 6 = a2 - b2; \ + d ## 7 = a3 - b3; \ + } while (0) + +/* + * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced + * to some shifting. 
+ * + * Output: within -591471..591723 + */ +#define FFT16(xb, xs, rb) do { \ + s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \ + s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \ + FFT8(xb, (xs) << 1, d1_); \ + FFT8((xb) + (xs), (xs) << 1, d2_); \ + q[(rb) + 0] = d1_0 + d2_0; \ + q[(rb) + 1] = d1_1 + (d2_1 << 1); \ + q[(rb) + 2] = d1_2 + (d2_2 << 2); \ + q[(rb) + 3] = d1_3 + (d2_3 << 3); \ + q[(rb) + 4] = d1_4 + (d2_4 << 4); \ + q[(rb) + 5] = d1_5 + (d2_5 << 5); \ + q[(rb) + 6] = d1_6 + (d2_6 << 6); \ + q[(rb) + 7] = d1_7 + (d2_7 << 7); \ + q[(rb) + 8] = d1_0 - d2_0; \ + q[(rb) + 9] = d1_1 - (d2_1 << 1); \ + q[(rb) + 10] = d1_2 - (d2_2 << 2); \ + q[(rb) + 11] = d1_3 - (d2_3 << 3); \ + q[(rb) + 12] = d1_4 - (d2_4 << 4); \ + q[(rb) + 13] = d1_5 - (d2_5 << 5); \ + q[(rb) + 14] = d1_6 - (d2_6 << 6); \ + q[(rb) + 15] = d1_7 - (d2_7 << 7); \ + } while (0) + +/* + * Output range: |q| <= 1183446 + */ +#define FFT32(xb, xs, rb, id) do { \ + FFT16(xb, (xs) << 1, rb); \ + FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \ + FFT_LOOP(rb, 16, 8, id); \ + } while (0) + +/* + * Output range: |q| <= 2366892 + */ +#define FFT64(xb, xs, rb, id) do { \ + FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \ + FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \ + FFT_LOOP(rb, 32, 4, id); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SIMD + +static void +fft32(unsigned char *x, size_t xs, s32 *q) +{ + size_t xd; + + xd = xs << 1; + FFT16(0, xd, 0); + FFT16(xs, xd, 16); + FFT_LOOP(0, 16, 8, label_); +} + +#define FFT128(xb, xs, rb, id) do { \ + fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \ + fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \ + FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \ + fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \ + fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \ + FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \ + FFT_LOOP(rb, 64, 2, XCAT(id, a)); \ + } while (0) + +#else + +/* + * Output range: |q| <= 4733784 + */ +#define FFT128(xb, xs, rb, id) do { \ + FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \ + FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \ + FFT_LOOP(rb, 64, 2, id); \ + } while (0) + +#endif + +/* + * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression + * function which does not fit in the 32 kB L1 cache of a typical x86 + * Intel. We therefore add a function call layer at the FFT64 level. 
+ */ + +static void +fft64(unsigned char *x, size_t xs, s32 *q) +{ + size_t xd; + + xd = xs << 1; + FFT32(0, xd, 0, label_a); + FFT32(xs, xd, 32, label_b); + FFT_LOOP(0, 32, 4, label_); +} + +/* + * Output range: |q| <= 9467568 + */ +#define FFT256(xb, xs, rb, id) do { \ + fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \ + fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \ + FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \ + fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \ + fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \ + FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \ + FFT_LOOP(rb, 128, 1, XCAT(id, a)); \ + } while (0) + +/* + * alpha^(127*i) mod 257 + */ +static const unsigned short yoff_s_n[] = { + 1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29, + 15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178, + 225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100, + 34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215, + 253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141, + 197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59, + 128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114, + 121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168, + 16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207, + 240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21, + 2, 196, 190, 116, 60, 226, 46, 139 +}; + +/* + * alpha^(127*i) + alpha^(125*i) mod 257 + */ +static const unsigned short yoff_s_f[] = { + 2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3, + 49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65, + 96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113, + 17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143, + 189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6, + 77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95, + 160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53, + 181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150, + 0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109, + 210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30, + 10, 146, 117, 251, 180, 247, 236, 108 +}; + +/* + * beta^(255*i) mod 257 + */ +static const unsigned short yoff_b_n[] = { + 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172, + 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101, + 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10, + 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230, + 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150, + 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109, + 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194, + 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93, + 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83, + 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110, + 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217, + 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108, + 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171, + 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78, + 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252, + 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142, + 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182, + 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74, + 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160, + 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82, + 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87, + 46, 45, 139, 41 +}; + +/* + * beta^(255*i) + beta^(253*i) mod 257 + */ +static const unsigned short yoff_b_f[] = { + 2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20, + 111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89, + 
49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239, + 253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79, + 96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226, + 248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115, + 17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208, + 57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40, + 189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45, + 187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107, + 77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210, + 139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6, + 160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190, + 106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208, + 181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127, + 96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193, + 0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44, + 245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9, + 210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94, + 53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185, + 10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156, + 236, 192, 108, 86 +}; + +#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \ + + ((u32)((h) * (mm)) << 16)) + +#define W_SMALL(sb, o1, o2, mm) \ + (INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm) + +#define WS_0_0 W_SMALL( 4, 0, 1, 185) +#define WS_0_1 W_SMALL( 6, 0, 1, 185) +#define WS_0_2 W_SMALL( 0, 0, 1, 185) +#define WS_0_3 W_SMALL( 2, 0, 1, 185) +#define WS_0_4 W_SMALL( 7, 0, 1, 185) +#define WS_0_5 W_SMALL( 5, 0, 1, 185) +#define WS_0_6 W_SMALL( 3, 0, 1, 185) +#define WS_0_7 W_SMALL( 1, 0, 1, 185) +#define WS_1_0 W_SMALL(15, 0, 1, 185) +#define WS_1_1 W_SMALL(11, 0, 1, 185) +#define WS_1_2 W_SMALL(12, 0, 1, 185) +#define WS_1_3 W_SMALL( 8, 0, 1, 185) +#define WS_1_4 W_SMALL( 9, 0, 1, 185) +#define WS_1_5 W_SMALL(13, 0, 1, 185) +#define WS_1_6 W_SMALL(10, 0, 1, 185) +#define WS_1_7 W_SMALL(14, 0, 1, 185) +#define WS_2_0 W_SMALL(17, -128, -64, 233) +#define WS_2_1 W_SMALL(18, -128, -64, 233) +#define WS_2_2 W_SMALL(23, -128, -64, 233) +#define WS_2_3 W_SMALL(20, -128, -64, 233) +#define WS_2_4 W_SMALL(22, -128, -64, 233) +#define WS_2_5 W_SMALL(21, -128, -64, 233) +#define WS_2_6 W_SMALL(16, -128, -64, 233) +#define WS_2_7 W_SMALL(19, -128, -64, 233) +#define WS_3_0 W_SMALL(30, -191, -127, 233) +#define WS_3_1 W_SMALL(24, -191, -127, 233) +#define WS_3_2 W_SMALL(25, -191, -127, 233) +#define WS_3_3 W_SMALL(31, -191, -127, 233) +#define WS_3_4 W_SMALL(27, -191, -127, 233) +#define WS_3_5 W_SMALL(29, -191, -127, 233) +#define WS_3_6 W_SMALL(28, -191, -127, 233) +#define WS_3_7 W_SMALL(26, -191, -127, 233) + +#define W_BIG(sb, o1, o2, mm) \ + (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm) + +#define WB_0_0 W_BIG( 4, 0, 1, 185) +#define WB_0_1 W_BIG( 6, 0, 1, 185) +#define WB_0_2 W_BIG( 0, 0, 1, 185) +#define WB_0_3 
W_BIG( 2, 0, 1, 185) +#define WB_0_4 W_BIG( 7, 0, 1, 185) +#define WB_0_5 W_BIG( 5, 0, 1, 185) +#define WB_0_6 W_BIG( 3, 0, 1, 185) +#define WB_0_7 W_BIG( 1, 0, 1, 185) +#define WB_1_0 W_BIG(15, 0, 1, 185) +#define WB_1_1 W_BIG(11, 0, 1, 185) +#define WB_1_2 W_BIG(12, 0, 1, 185) +#define WB_1_3 W_BIG( 8, 0, 1, 185) +#define WB_1_4 W_BIG( 9, 0, 1, 185) +#define WB_1_5 W_BIG(13, 0, 1, 185) +#define WB_1_6 W_BIG(10, 0, 1, 185) +#define WB_1_7 W_BIG(14, 0, 1, 185) +#define WB_2_0 W_BIG(17, -256, -128, 233) +#define WB_2_1 W_BIG(18, -256, -128, 233) +#define WB_2_2 W_BIG(23, -256, -128, 233) +#define WB_2_3 W_BIG(20, -256, -128, 233) +#define WB_2_4 W_BIG(22, -256, -128, 233) +#define WB_2_5 W_BIG(21, -256, -128, 233) +#define WB_2_6 W_BIG(16, -256, -128, 233) +#define WB_2_7 W_BIG(19, -256, -128, 233) +#define WB_3_0 W_BIG(30, -383, -255, 233) +#define WB_3_1 W_BIG(24, -383, -255, 233) +#define WB_3_2 W_BIG(25, -383, -255, 233) +#define WB_3_3 W_BIG(31, -383, -255, 233) +#define WB_3_4 W_BIG(27, -383, -255, 233) +#define WB_3_5 W_BIG(29, -383, -255, 233) +#define WB_3_6 W_BIG(28, -383, -255, 233) +#define WB_3_7 W_BIG(26, -383, -255, 233) + +#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) + +#define PP4_0_0 1 +#define PP4_0_1 0 +#define PP4_0_2 3 +#define PP4_0_3 2 +#define PP4_1_0 2 +#define PP4_1_1 3 +#define PP4_1_2 0 +#define PP4_1_3 1 +#define PP4_2_0 3 +#define PP4_2_1 2 +#define PP4_2_2 1 +#define PP4_2_3 0 + +#define PP8_0_0 1 +#define PP8_0_1 0 +#define PP8_0_2 3 +#define PP8_0_3 2 +#define PP8_0_4 5 +#define PP8_0_5 4 +#define PP8_0_6 7 +#define PP8_0_7 6 + +#define PP8_1_0 6 +#define PP8_1_1 7 +#define PP8_1_2 4 +#define PP8_1_3 5 +#define PP8_1_4 2 +#define PP8_1_5 3 +#define PP8_1_6 0 +#define PP8_1_7 1 + +#define PP8_2_0 2 +#define PP8_2_1 3 +#define PP8_2_2 0 +#define PP8_2_3 1 +#define PP8_2_4 6 +#define PP8_2_5 7 +#define PP8_2_6 4 +#define PP8_2_7 5 + +#define PP8_3_0 3 +#define PP8_3_1 2 +#define PP8_3_2 1 +#define PP8_3_3 0 +#define PP8_3_4 7 +#define PP8_3_5 6 +#define PP8_3_6 5 +#define PP8_3_7 4 + +#define PP8_4_0 5 +#define PP8_4_1 4 +#define PP8_4_2 7 +#define PP8_4_3 6 +#define PP8_4_4 1 +#define PP8_4_5 0 +#define PP8_4_6 3 +#define PP8_4_7 2 + +#define PP8_5_0 7 +#define PP8_5_1 6 +#define PP8_5_2 5 +#define PP8_5_3 4 +#define PP8_5_4 3 +#define PP8_5_5 2 +#define PP8_5_6 1 +#define PP8_5_7 0 + +#define PP8_6_0 4 +#define PP8_6_1 5 +#define PP8_6_2 6 +#define PP8_6_3 7 +#define PP8_6_4 0 +#define PP8_6_5 1 +#define PP8_6_6 2 +#define PP8_6_7 3 + +#if SPH_SIMD_NOCOPY + +#define DECL_STATE_SMALL +#define READ_STATE_SMALL(sc) +#define WRITE_STATE_SMALL(sc) +#define DECL_STATE_BIG +#define READ_STATE_BIG(sc) +#define WRITE_STATE_BIG(sc) + +#else + +#define DECL_STATE_SMALL \ + u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3; + +#define READ_STATE_SMALL(sc) do { \ + A0 = (sc)->state[ 0]; \ + A1 = (sc)->state[ 1]; \ + A2 = (sc)->state[ 2]; \ + A3 = (sc)->state[ 3]; \ + B0 = (sc)->state[ 4]; \ + B1 = (sc)->state[ 5]; \ + B2 = (sc)->state[ 6]; \ + B3 = (sc)->state[ 7]; \ + C0 = (sc)->state[ 8]; \ + C1 = (sc)->state[ 9]; \ + C2 = (sc)->state[10]; \ + C3 = (sc)->state[11]; \ + D0 = (sc)->state[12]; \ + D1 = (sc)->state[13]; \ + D2 = (sc)->state[14]; \ + D3 = (sc)->state[15]; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + (sc)->state[ 0] = A0; \ + (sc)->state[ 1] = A1; \ + (sc)->state[ 2] = A2; \ + (sc)->state[ 3] = A3; \ + (sc)->state[ 4] = B0; \ + (sc)->state[ 5] = B1; \ + (sc)->state[ 6] 
= B2; \ + (sc)->state[ 7] = B3; \ + (sc)->state[ 8] = C0; \ + (sc)->state[ 9] = C1; \ + (sc)->state[10] = C2; \ + (sc)->state[11] = C3; \ + (sc)->state[12] = D0; \ + (sc)->state[13] = D1; \ + (sc)->state[14] = D2; \ + (sc)->state[15] = D3; \ + } while (0) + +#define DECL_STATE_BIG \ + u32 A0, A1, A2, A3, A4, A5, A6, A7; \ + u32 B0, B1, B2, B3, B4, B5, B6, B7; \ + u32 C0, C1, C2, C3, C4, C5, C6, C7; \ + u32 D0, D1, D2, D3, D4, D5, D6, D7; + +#define READ_STATE_BIG(sc) do { \ + A0 = (sc)->state[ 0]; \ + A1 = (sc)->state[ 1]; \ + A2 = (sc)->state[ 2]; \ + A3 = (sc)->state[ 3]; \ + A4 = (sc)->state[ 4]; \ + A5 = (sc)->state[ 5]; \ + A6 = (sc)->state[ 6]; \ + A7 = (sc)->state[ 7]; \ + B0 = (sc)->state[ 8]; \ + B1 = (sc)->state[ 9]; \ + B2 = (sc)->state[10]; \ + B3 = (sc)->state[11]; \ + B4 = (sc)->state[12]; \ + B5 = (sc)->state[13]; \ + B6 = (sc)->state[14]; \ + B7 = (sc)->state[15]; \ + C0 = (sc)->state[16]; \ + C1 = (sc)->state[17]; \ + C2 = (sc)->state[18]; \ + C3 = (sc)->state[19]; \ + C4 = (sc)->state[20]; \ + C5 = (sc)->state[21]; \ + C6 = (sc)->state[22]; \ + C7 = (sc)->state[23]; \ + D0 = (sc)->state[24]; \ + D1 = (sc)->state[25]; \ + D2 = (sc)->state[26]; \ + D3 = (sc)->state[27]; \ + D4 = (sc)->state[28]; \ + D5 = (sc)->state[29]; \ + D6 = (sc)->state[30]; \ + D7 = (sc)->state[31]; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->state[ 0] = A0; \ + (sc)->state[ 1] = A1; \ + (sc)->state[ 2] = A2; \ + (sc)->state[ 3] = A3; \ + (sc)->state[ 4] = A4; \ + (sc)->state[ 5] = A5; \ + (sc)->state[ 6] = A6; \ + (sc)->state[ 7] = A7; \ + (sc)->state[ 8] = B0; \ + (sc)->state[ 9] = B1; \ + (sc)->state[10] = B2; \ + (sc)->state[11] = B3; \ + (sc)->state[12] = B4; \ + (sc)->state[13] = B5; \ + (sc)->state[14] = B6; \ + (sc)->state[15] = B7; \ + (sc)->state[16] = C0; \ + (sc)->state[17] = C1; \ + (sc)->state[18] = C2; \ + (sc)->state[19] = C3; \ + (sc)->state[20] = C4; \ + (sc)->state[21] = C5; \ + (sc)->state[22] = C6; \ + (sc)->state[23] = C7; \ + (sc)->state[24] = D0; \ + (sc)->state[25] = D1; \ + (sc)->state[26] = D2; \ + (sc)->state[27] = D3; \ + (sc)->state[28] = D4; \ + (sc)->state[29] = D5; \ + (sc)->state[30] = D6; \ + (sc)->state[31] = D7; \ + } while (0) + +#endif + +#define STEP_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA ## n; \ + } while (0) + +#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + STEP_ELT(0, w0, fun, s, pp4b); \ + STEP_ELT(1, w1, fun, s, pp4b); \ + STEP_ELT(2, w2, fun, s, pp4b); \ + STEP_ELT(3, w3, fun, s, pp4b); \ + } while (0) + +#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + u32 tA4 = ROL32(A4, r); \ + u32 tA5 = ROL32(A5, r); \ + u32 tA6 = ROL32(A6, r); \ + u32 tA7 = ROL32(A7, r); \ + STEP_ELT(0, w0, fun, s, pp8b); \ + STEP_ELT(1, w1, fun, s, pp8b); \ + STEP_ELT(2, w2, fun, s, pp8b); \ + STEP_ELT(3, w3, fun, s, pp8b); \ + STEP_ELT(4, w4, fun, s, pp8b); \ + STEP_ELT(5, w5, fun, s, pp8b); \ + STEP_ELT(6, w6, fun, s, pp8b); \ + STEP_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +#define M3_0_0 0_ +#define M3_1_0 1_ +#define M3_2_0 2_ +#define M3_3_0 0_ +#define M3_4_0 1_ +#define M3_5_0 2_ +#define M3_6_0 0_ +#define M3_7_0 1_ + +#define 
M3_0_1 1_ +#define M3_1_1 2_ +#define M3_2_1 0_ +#define M3_3_1 1_ +#define M3_4_1 2_ +#define M3_5_1 0_ +#define M3_6_1 1_ +#define M3_7_1 2_ + +#define M3_0_2 2_ +#define M3_1_2 0_ +#define M3_2_2 1_ +#define M3_3_2 2_ +#define M3_4_2 0_ +#define M3_5_2 1_ +#define M3_6_2 2_ +#define M3_7_2 0_ + +#define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b) + +#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \ + STEP_SMALL_(WS_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \ + } while (0) + +#define M7_0_0 0_ +#define M7_1_0 1_ +#define M7_2_0 2_ +#define M7_3_0 3_ +#define M7_4_0 4_ +#define M7_5_0 5_ +#define M7_6_0 6_ +#define M7_7_0 0_ + +#define M7_0_1 1_ +#define M7_1_1 2_ +#define M7_2_1 3_ +#define M7_3_1 4_ +#define M7_4_1 5_ +#define M7_5_1 6_ +#define M7_6_1 0_ +#define M7_7_1 1_ + +#define M7_0_2 2_ +#define M7_1_2 3_ +#define M7_2_2 4_ +#define M7_3_2 5_ +#define M7_4_2 6_ +#define M7_5_2 0_ +#define M7_6_2 1_ +#define M7_7_2 2_ + +#define M7_0_3 3_ +#define M7_1_3 4_ +#define M7_2_3 5_ +#define M7_3_3 6_ +#define M7_4_3 0_ +#define M7_5_3 1_ +#define M7_6_3 2_ +#define M7_7_3 3_ + +#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b) + +#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \ + STEP_BIG_(WB_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SIMD + +#define A0 state[ 0] +#define A1 state[ 1] +#define A2 state[ 2] +#define A3 state[ 3] +#define B0 state[ 4] +#define B1 state[ 5] +#define B2 state[ 6] +#define B3 state[ 7] +#define C0 state[ 8] +#define C1 state[ 9] +#define C2 state[10] +#define C3 state[11] +#define D0 state[12] +#define D1 state[13] +#define D2 state[14] +#define D3 state[15] + +#define STEP2_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA[n]; \ + } while (0) + +#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \ + u32 tA[4]; \ + tA[0] = ROL32(A0, r); \ + tA[1] = ROL32(A1, r); \ + tA[2] = ROL32(A2, r); \ + tA[3] = ROL32(A3, r); \ + STEP2_ELT(0, w0, fun, s, pp4b); \ + STEP2_ELT(1, w1, fun, s, pp4b); \ + STEP2_ELT(2, w2, fun, s, pp4b); \ + STEP2_ELT(3, w3, fun, s, pp4b); \ + } while (0) + +static void +one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) +{ + static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 }; + + STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 
3], IF, p0, p1, pp4k[isp + 0]); + STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]); + STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]); + STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]); + STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]); + STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]); + STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]); + STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]); +} + +static void +compress_small(sph_simd_small_context *sc, int last) +{ + unsigned char *x; + s32 q[128]; + int i; + u32 w[32]; + u32 state[16]; + size_t u; + + static const size_t wsp[32] = { + 4 << 3, 6 << 3, 0 << 3, 2 << 3, + 7 << 3, 5 << 3, 3 << 3, 1 << 3, + 15 << 3, 11 << 3, 12 << 3, 8 << 3, + 9 << 3, 13 << 3, 10 << 3, 14 << 3, + 17 << 3, 18 << 3, 23 << 3, 20 << 3, + 22 << 3, 21 << 3, 16 << 3, 19 << 3, + 30 << 3, 24 << 3, 25 << 3, 31 << 3, + 27 << 3, 29 << 3, 28 << 3, 26 << 3 + }; + + x = sc->buf; + FFT128(0, 1, 0, ll); + if (last) { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + + for (i = 0; i < 16; i += 4) { + state[i + 0] = sc->state[i + 0] + ^ sph_dec32le_aligned(x + 4 * (i + 0)); + state[i + 1] = sc->state[i + 1] + ^ sph_dec32le_aligned(x + 4 * (i + 1)); + state[i + 2] = sc->state[i + 2] + ^ sph_dec32le_aligned(x + 4 * (i + 2)); + state[i + 3] = sc->state[i + 3] + ^ sph_dec32le_aligned(x + 4 * (i + 3)); + } + +#define WSREAD(sb, o1, o2, mm) do { \ + for (u = 0; u < 32; u += 4) { \ + size_t v = wsp[(u >> 2) + (sb)]; \ + w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ + q[v + 2 * 0 + (o2)], mm); \ + w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ + q[v + 2 * 1 + (o2)], mm); \ + w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ + q[v + 2 * 2 + (o2)], mm); \ + w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ + q[v + 2 * 3 + (o2)], mm); \ + } \ + } while (0) + + WSREAD( 0, 0, 1, 185); + one_round_small(state, w, 0, 3, 23, 17, 27); + WSREAD( 8, 0, 1, 185); + one_round_small(state, w, 2, 28, 19, 22, 7); + WSREAD(16, -128, -64, 233); + one_round_small(state, w, 1, 29, 9, 15, 5); + WSREAD(24, -191, -127, 233); + one_round_small(state, w, 0, 4, 13, 10, 25); + +#undef WSREAD + + STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 25, 4, PP4_2_); + + memcpy(sc->state, state, sizeof state); +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef D0 +#undef D1 +#undef D2 +#undef D3 + +#else + +#if SPH_SIMD_NOCOPY +#define A0 (sc->state[ 0]) +#define A1 (sc->state[ 1]) +#define A2 (sc->state[ 2]) +#define A3 (sc->state[ 3]) +#define B0 (sc->state[ 4]) +#define B1 (sc->state[ 5]) +#define B2 (sc->state[ 6]) +#define B3 (sc->state[ 7]) +#define C0 (sc->state[ 8]) +#define C1 (sc->state[ 9]) +#define C2 (sc->state[10]) +#define C3 (sc->state[11]) +#define D0 (sc->state[12]) +#define D1 (sc->state[13]) +#define D2 
(sc->state[14]) +#define D3 (sc->state[15]) +#endif + +static void +compress_small(sph_simd_small_context *sc, int last) +{ + unsigned char *x; + s32 q[128]; + int i; + DECL_STATE_SMALL +#if SPH_SIMD_NOCOPY + sph_u32 saved[16]; +#endif + +#if SPH_SIMD_NOCOPY + memcpy(saved, sc->state, sizeof saved); +#endif + x = sc->buf; + FFT128(0, 1, 0, ll); + if (last) { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + READ_STATE_SMALL(sc); + A0 ^= sph_dec32le_aligned(x + 0); + A1 ^= sph_dec32le_aligned(x + 4); + A2 ^= sph_dec32le_aligned(x + 8); + A3 ^= sph_dec32le_aligned(x + 12); + B0 ^= sph_dec32le_aligned(x + 16); + B1 ^= sph_dec32le_aligned(x + 20); + B2 ^= sph_dec32le_aligned(x + 24); + B3 ^= sph_dec32le_aligned(x + 28); + C0 ^= sph_dec32le_aligned(x + 32); + C1 ^= sph_dec32le_aligned(x + 36); + C2 ^= sph_dec32le_aligned(x + 40); + C3 ^= sph_dec32le_aligned(x + 44); + D0 ^= sph_dec32le_aligned(x + 48); + D1 ^= sph_dec32le_aligned(x + 52); + D2 ^= sph_dec32le_aligned(x + 56); + D3 ^= sph_dec32le_aligned(x + 60); + ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27); + ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7); + ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5); + ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25); +#if SPH_SIMD_NOCOPY + STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(saved[12], saved[13], saved[14], saved[15], + IF, 25, 4, PP4_2_); +#else + STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 25, 4, PP4_2_); + WRITE_STATE_SMALL(sc); +#endif +} + +#if SPH_SIMD_NOCOPY +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#endif + +#endif + +#if SPH_SMALL_FOOTPRINT_SIMD + +#define A0 state[ 0] +#define A1 state[ 1] +#define A2 state[ 2] +#define A3 state[ 3] +#define A4 state[ 4] +#define A5 state[ 5] +#define A6 state[ 6] +#define A7 state[ 7] +#define B0 state[ 8] +#define B1 state[ 9] +#define B2 state[10] +#define B3 state[11] +#define B4 state[12] +#define B5 state[13] +#define B6 state[14] +#define B7 state[15] +#define C0 state[16] +#define C1 state[17] +#define C2 state[18] +#define C3 state[19] +#define C4 state[20] +#define C5 state[21] +#define C6 state[22] +#define C7 state[23] +#define D0 state[24] +#define D1 state[25] +#define D2 state[26] +#define D3 state[27] +#define D4 state[28] +#define D5 state[29] +#define D6 state[30] +#define D7 state[31] + +/* + * Not needed -- already defined for SIMD-224 / SIMD-256 + * +#define STEP2_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA[n]; \ + } while (0) + */ + +#define STEP2_BIG(w0, w1, w2, w3, w4, w5, 
w6, w7, fun, r, s, pp8b) do { \ + u32 tA[8]; \ + tA[0] = ROL32(A0, r); \ + tA[1] = ROL32(A1, r); \ + tA[2] = ROL32(A2, r); \ + tA[3] = ROL32(A3, r); \ + tA[4] = ROL32(A4, r); \ + tA[5] = ROL32(A5, r); \ + tA[6] = ROL32(A6, r); \ + tA[7] = ROL32(A7, r); \ + STEP2_ELT(0, w0, fun, s, pp8b); \ + STEP2_ELT(1, w1, fun, s, pp8b); \ + STEP2_ELT(2, w2, fun, s, pp8b); \ + STEP2_ELT(3, w3, fun, s, pp8b); \ + STEP2_ELT(4, w4, fun, s, pp8b); \ + STEP2_ELT(5, w5, fun, s, pp8b); \ + STEP2_ELT(6, w6, fun, s, pp8b); \ + STEP2_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +static void +one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) +{ + static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 }; + + STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7], + IF, p0, p1, pp8k[isp + 0]); + STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15], + IF, p1, p2, pp8k[isp + 1]); + STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23], + IF, p2, p3, pp8k[isp + 2]); + STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31], + IF, p3, p0, pp8k[isp + 3]); + STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39], + MAJ, p0, p1, pp8k[isp + 4]); + STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47], + MAJ, p1, p2, pp8k[isp + 5]); + STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55], + MAJ, p2, p3, pp8k[isp + 6]); + STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63], + MAJ, p3, p0, pp8k[isp + 7]); +} + +static void +compress_big(sph_simd_big_context *sc, int last) +{ + unsigned char *x; + s32 q[256]; + int i; + u32 w[64]; + u32 state[32]; + size_t u; + + static const size_t wbp[32] = { + 4 << 4, 6 << 4, 0 << 4, 2 << 4, + 7 << 4, 5 << 4, 3 << 4, 1 << 4, + 15 << 4, 11 << 4, 12 << 4, 8 << 4, + 9 << 4, 13 << 4, 10 << 4, 14 << 4, + 17 << 4, 18 << 4, 23 << 4, 20 << 4, + 22 << 4, 21 << 4, 16 << 4, 19 << 4, + 30 << 4, 24 << 4, 25 << 4, 31 << 4, + 27 << 4, 29 << 4, 28 << 4, 26 << 4 + }; + + x = sc->buf; + FFT256(0, 1, 0, ll); + if (last) { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } + } + + for (i = 0; i < 32; i += 8) { + state[i + 0] = sc->state[i + 0] + ^ sph_dec32le_aligned(x + 4 * (i + 0)); + state[i + 1] = sc->state[i + 1] + ^ sph_dec32le_aligned(x + 4 * (i + 1)); + state[i + 2] = sc->state[i + 2] + ^ sph_dec32le_aligned(x + 4 * (i + 2)); + state[i + 3] = sc->state[i + 3] + ^ sph_dec32le_aligned(x + 4 * (i + 3)); + state[i + 4] = sc->state[i + 4] + ^ sph_dec32le_aligned(x + 4 * (i + 4)); + state[i + 5] = sc->state[i + 5] + ^ sph_dec32le_aligned(x + 4 * (i + 5)); + state[i + 6] = sc->state[i + 6] + ^ sph_dec32le_aligned(x + 4 * (i + 6)); + state[i + 7] = sc->state[i + 7] + ^ sph_dec32le_aligned(x + 4 * (i + 7)); + } + +#define WBREAD(sb, o1, o2, mm) do { \ + for (u = 0; u < 64; u += 8) { \ + size_t v = wbp[(u >> 3) + (sb)]; \ + w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ + q[v + 2 * 0 + (o2)], mm); \ + w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ + q[v + 2 * 1 + (o2)], mm); \ + w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ + q[v + 2 * 2 + (o2)], mm); \ + w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ + q[v + 2 * 3 + (o2)], mm); \ + w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \ + q[v + 2 * 4 + (o2)], mm); \ + w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \ + q[v + 2 * 5 + (o2)], mm); \ + w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \ + q[v + 2 * 6 + (o2)], mm); \ + w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \ + q[v + 2 * 7 + (o2)], mm); \ + } \ + } while (0) + + WBREAD( 0, 0, 1, 185); + one_round_big(state, w, 0, 3, 23, 17, 27); + WBREAD( 8, 0, 1, 185); + one_round_big(state, w, 1, 28, 19, 22, 7); + WBREAD(16, -256, -128, 233); + one_round_big(state, w, 2, 29, 9, 15, 5); + WBREAD(24, -383, -255, 233); + one_round_big(state, w, 3, 4, 13, 10, 25); + +#undef WBREAD + + STEP_BIG( + sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + sc->state[16], sc->state[17], sc->state[18], sc->state[19], + sc->state[20], sc->state[21], sc->state[22], sc->state[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + sc->state[24], sc->state[25], sc->state[26], sc->state[27], + sc->state[28], sc->state[29], sc->state[30], sc->state[31], + IF, 25, 4, PP8_0_); + + memcpy(sc->state, state, sizeof state); +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#undef D4 +#undef D5 +#undef D6 +#undef D7 + +#else + +#if SPH_SIMD_NOCOPY +#define A0 (sc->state[ 0]) +#define A1 (sc->state[ 1]) +#define A2 (sc->state[ 2]) +#define A3 (sc->state[ 3]) +#define A4 (sc->state[ 4]) +#define A5 (sc->state[ 5]) +#define A6 (sc->state[ 6]) +#define A7 (sc->state[ 7]) +#define B0 (sc->state[ 8]) +#define B1 (sc->state[ 9]) +#define B2 (sc->state[10]) +#define B3 (sc->state[11]) +#define B4 (sc->state[12]) +#define B5 (sc->state[13]) +#define B6 (sc->state[14]) +#define B7 (sc->state[15]) +#define C0 (sc->state[16]) +#define C1 (sc->state[17]) +#define C2 (sc->state[18]) +#define C3 (sc->state[19]) +#define C4 (sc->state[20]) +#define C5 (sc->state[21]) +#define C6 (sc->state[22]) +#define C7 (sc->state[23]) +#define D0 (sc->state[24]) +#define D1 (sc->state[25]) +#define D2 (sc->state[26]) +#define D3 (sc->state[27]) +#define D4 
(sc->state[28]) +#define D5 (sc->state[29]) +#define D6 (sc->state[30]) +#define D7 (sc->state[31]) +#endif + +static void +compress_big(sph_simd_big_context *sc, int last) +{ + unsigned char *x; + s32 q[256]; + int i; + DECL_STATE_BIG +#if SPH_SIMD_NOCOPY + sph_u32 saved[32]; +#endif + +#if SPH_SIMD_NOCOPY + memcpy(saved, sc->state, sizeof saved); +#endif + + x = sc->buf; + FFT256(0, 1, 0, ll); + if (last) { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + READ_STATE_BIG(sc); + A0 ^= sph_dec32le_aligned(x + 0); + A1 ^= sph_dec32le_aligned(x + 4); + A2 ^= sph_dec32le_aligned(x + 8); + A3 ^= sph_dec32le_aligned(x + 12); + A4 ^= sph_dec32le_aligned(x + 16); + A5 ^= sph_dec32le_aligned(x + 20); + A6 ^= sph_dec32le_aligned(x + 24); + A7 ^= sph_dec32le_aligned(x + 28); + B0 ^= sph_dec32le_aligned(x + 32); + B1 ^= sph_dec32le_aligned(x + 36); + B2 ^= sph_dec32le_aligned(x + 40); + B3 ^= sph_dec32le_aligned(x + 44); + B4 ^= sph_dec32le_aligned(x + 48); + B5 ^= sph_dec32le_aligned(x + 52); + B6 ^= sph_dec32le_aligned(x + 56); + B7 ^= sph_dec32le_aligned(x + 60); + C0 ^= sph_dec32le_aligned(x + 64); + C1 ^= sph_dec32le_aligned(x + 68); + C2 ^= sph_dec32le_aligned(x + 72); + C3 ^= sph_dec32le_aligned(x + 76); + C4 ^= sph_dec32le_aligned(x + 80); + C5 ^= sph_dec32le_aligned(x + 84); + C6 ^= sph_dec32le_aligned(x + 88); + C7 ^= sph_dec32le_aligned(x + 92); + D0 ^= sph_dec32le_aligned(x + 96); + D1 ^= sph_dec32le_aligned(x + 100); + D2 ^= sph_dec32le_aligned(x + 104); + D3 ^= sph_dec32le_aligned(x + 108); + D4 ^= sph_dec32le_aligned(x + 112); + D5 ^= sph_dec32le_aligned(x + 116); + D6 ^= sph_dec32le_aligned(x + 120); + D7 ^= sph_dec32le_aligned(x + 124); + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); +#if SPH_SIMD_NOCOPY + STEP_BIG( + saved[ 0], saved[ 1], saved[ 2], saved[ 3], + saved[ 4], saved[ 5], saved[ 6], saved[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + saved[ 8], saved[ 9], saved[10], saved[11], + saved[12], saved[13], saved[14], saved[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + saved[16], saved[17], saved[18], saved[19], + saved[20], saved[21], saved[22], saved[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + saved[24], saved[25], saved[26], saved[27], + saved[28], saved[29], saved[30], saved[31], + IF, 25, 4, PP8_0_); +#else + STEP_BIG( + sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + sc->state[16], sc->state[17], sc->state[18], sc->state[19], + sc->state[20], sc->state[21], sc->state[22], sc->state[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + sc->state[24], sc->state[25], sc->state[26], sc->state[27], + sc->state[28], sc->state[29], sc->state[30], sc->state[31], + IF, 25, 4, PP8_0_); + WRITE_STATE_BIG(sc); +#endif +} + +#if SPH_SIMD_NOCOPY +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef C0 +#undef C1 
+#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#undef D4 +#undef D5 +#undef D6 +#undef D7 +#endif + +#endif + +static const u32 IV224[] = { + C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53), + C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96), + C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6), + C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8) +}; + +static const u32 IV256[] = { + C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9), + C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3), + C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9), + C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1) +}; + +static const u32 IV384[] = { + C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B), + C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1), + C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A), + C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8), + C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2), + C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462), + C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5), + C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71) +}; + +static const u32 IV512[] = { + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22) +}; + +static void +init_small(void *cc, const u32 *iv) +{ + sph_simd_small_context *sc; + + sc = cc; + memcpy(sc->state, iv, sizeof sc->state); + sc->count_low = sc->count_high = 0; + sc->ptr = 0; +} + +static void +init_big(void *cc, const u32 *iv) +{ + sph_simd_big_context *sc; + + sc = cc; + memcpy(sc->state, iv, sizeof sc->state); + sc->count_low = sc->count_high = 0; + sc->ptr = 0; +} + +static void +update_small(void *cc, const void *data, size_t len) +{ + sph_simd_small_context *sc; + + sc = cc; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - sc->ptr; + if (clen > len) + clen = len; + memcpy(sc->buf + sc->ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + if ((sc->ptr += clen) == sizeof sc->buf) { + compress_small(sc, 0); + sc->ptr = 0; + sc->count_low = T32(sc->count_low + 1); + if (sc->count_low == 0) + sc->count_high ++; + } + } +} + +static void +update_big(void *cc, const void *data, size_t len) +{ + sph_simd_big_context *sc; + + sc = cc; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - sc->ptr; + if (clen > len) + clen = len; + memcpy(sc->buf + sc->ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + if ((sc->ptr += clen) == sizeof sc->buf) { + compress_big(sc, 0); + sc->ptr = 0; + sc->count_low = T32(sc->count_low + 1); + if (sc->count_low == 0) + sc->count_high ++; + } + } +} + +static void +encode_count_small(unsigned char *dst, + u32 low, u32 high, size_t ptr, unsigned n) +{ + low = T32(low << 9); + high = T32(high << 9) + (low >> 23); 
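+	/*
+	 * (count_low, count_high) counts the 64-byte blocks compressed so
+	 * far; the shift by 9 turns that block count into a bit count
+	 * (64 * 8 = 2^9).  The bytes still buffered (ptr) and the trailing
+	 * partial-byte bits (n) are added on the next line to complete
+	 * the total message length in bits.
+	 */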
+ low += (ptr << 3) + n; + sph_enc32le(dst, low); + sph_enc32le(dst + 4, high); +} + +static void +encode_count_big(unsigned char *dst, + u32 low, u32 high, size_t ptr, unsigned n) +{ + low = T32(low << 10); + high = T32(high << 10) + (low >> 22); + low += (ptr << 3) + n; + sph_enc32le(dst, low); + sph_enc32le(dst + 4, high); +} + +static void +finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) +{ + sph_simd_small_context *sc; + unsigned char *d; + size_t u; + + sc = cc; + if (sc->ptr > 0 || n > 0) { + memset(sc->buf + sc->ptr, 0, + (sizeof sc->buf) - sc->ptr); + sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); + compress_small(sc, 0); + } + memset(sc->buf, 0, sizeof sc->buf); + encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); + compress_small(sc, 1); + d = dst; + for (d = dst, u = 0; u < dst_len; u ++) + sph_enc32le(d + (u << 2), sc->state[u]); +} + +static void +finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) +{ + sph_simd_big_context *sc; + unsigned char *d; + size_t u; + + sc = cc; + if (sc->ptr > 0 || n > 0) { + memset(sc->buf + sc->ptr, 0, + (sizeof sc->buf) - sc->ptr); + sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); + compress_big(sc, 0); + } + memset(sc->buf, 0, sizeof sc->buf); + encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); + compress_big(sc, 1); + d = dst; + for (d = dst, u = 0; u < dst_len; u ++) + sph_enc32le(d + (u << 2), sc->state[u]); +} + +void +sph_simd224_init(void *cc) +{ + init_small(cc, IV224); +} + +void +sph_simd224(void *cc, const void *data, size_t len) +{ + update_small(cc, data, len); +} + +void +sph_simd224_close(void *cc, void *dst) +{ + sph_simd224_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_small(cc, ub, n, dst, 7); + sph_simd224_init(cc); +} + +void +sph_simd256_init(void *cc) +{ + init_small(cc, IV256); +} + +void +sph_simd256(void *cc, const void *data, size_t len) +{ + update_small(cc, data, len); +} + +void +sph_simd256_close(void *cc, void *dst) +{ + sph_simd256_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_small(cc, ub, n, dst, 8); + sph_simd256_init(cc); +} + +void +sph_simd384_init(void *cc) +{ + init_big(cc, IV384); +} + +void +sph_simd384(void *cc, const void *data, size_t len) +{ + update_big(cc, data, len); +} + +void +sph_simd384_close(void *cc, void *dst) +{ + sph_simd384_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_big(cc, ub, n, dst, 12); + sph_simd384_init(cc); +} + +void +sph_simd512_init(void *cc) +{ + init_big(cc, IV512); +} + +void +sph_simd512(void *cc, const void *data, size_t len) +{ + update_big(cc, data, len); +} + +void +sph_simd512_close(void *cc, void *dst) +{ + sph_simd512_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + finalize_big(cc, ub, n, dst, 16); + sph_simd512_init(cc); +} +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sha3/skein.c b/sha3/skein.c new file mode 100644 index 0000000..7e47e35 --- /dev/null +++ b/sha3/skein.c @@ -0,0 +1,1254 @@ +/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */ +/* + * Skein implementation. 
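That concludes the SIMD code; as a usage note before the Skein implementation, all four digest sizes above follow the same streaming pattern: init, any number of update calls, then close, which also re-initializes the context (as sph_simd512_addbits_and_close() shows). A minimal sketch follows; the header name and the sph_simd512_context typedef follow sphlib conventions and are assumptions here, since sph_simd.h is not part of this excerpt.

```c
#include <stdio.h>
#include "sph_simd.h"   /* assumed to declare sph_simd512_context etc. */

int main(void)
{
	unsigned char out[64];   /* SIMD-512: 16 little-endian 32-bit words */
	sph_simd512_context cc;

	sph_simd512_init(&cc);
	sph_simd512(&cc, "abc", 3);    /* may be called repeatedly to stream */
	sph_simd512_close(&cc, out);   /* finalizes, then re-inits the context */

	for (int i = 0; i < 64; i ++)
		printf("%02x", out[i]);
	printf("\n");
	return 0;
}
```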
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_skein.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN +#define SPH_SMALL_FOOTPRINT_SKEIN 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_64 + +#if 0 +/* obsolete */ +/* + * M5_ ## s ## _ ## i evaluates to s+i mod 5 (0 <= s <= 18, 0 <= i <= 3). + */ + +#define M5_0_0 0 +#define M5_0_1 1 +#define M5_0_2 2 +#define M5_0_3 3 + +#define M5_1_0 1 +#define M5_1_1 2 +#define M5_1_2 3 +#define M5_1_3 4 + +#define M5_2_0 2 +#define M5_2_1 3 +#define M5_2_2 4 +#define M5_2_3 0 + +#define M5_3_0 3 +#define M5_3_1 4 +#define M5_3_2 0 +#define M5_3_3 1 + +#define M5_4_0 4 +#define M5_4_1 0 +#define M5_4_2 1 +#define M5_4_3 2 + +#define M5_5_0 0 +#define M5_5_1 1 +#define M5_5_2 2 +#define M5_5_3 3 + +#define M5_6_0 1 +#define M5_6_1 2 +#define M5_6_2 3 +#define M5_6_3 4 + +#define M5_7_0 2 +#define M5_7_1 3 +#define M5_7_2 4 +#define M5_7_3 0 + +#define M5_8_0 3 +#define M5_8_1 4 +#define M5_8_2 0 +#define M5_8_3 1 + +#define M5_9_0 4 +#define M5_9_1 0 +#define M5_9_2 1 +#define M5_9_3 2 + +#define M5_10_0 0 +#define M5_10_1 1 +#define M5_10_2 2 +#define M5_10_3 3 + +#define M5_11_0 1 +#define M5_11_1 2 +#define M5_11_2 3 +#define M5_11_3 4 + +#define M5_12_0 2 +#define M5_12_1 3 +#define M5_12_2 4 +#define M5_12_3 0 + +#define M5_13_0 3 +#define M5_13_1 4 +#define M5_13_2 0 +#define M5_13_3 1 + +#define M5_14_0 4 +#define M5_14_1 0 +#define M5_14_2 1 +#define M5_14_3 2 + +#define M5_15_0 0 +#define M5_15_1 1 +#define M5_15_2 2 +#define M5_15_3 3 + +#define M5_16_0 1 +#define M5_16_1 2 +#define M5_16_2 3 +#define M5_16_3 4 + +#define M5_17_0 2 +#define M5_17_1 3 +#define M5_17_2 4 +#define M5_17_3 0 + +#define M5_18_0 3 +#define M5_18_1 4 +#define M5_18_2 0 +#define M5_18_3 1 +#endif + +/* + * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). 
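+ *
+ * (The C preprocessor cannot evaluate "(s + i) % 9" numerically, so
+ * every value is spelled out below; SKBI() then token-pastes the
+ * selected value onto a key word name, e.g. SKBI(h, 3, 7) becomes h1
+ * since 3 + 7 = 10 = 1 mod 9.)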
+ */ + +#define M9_0_0 0 +#define M9_0_1 1 +#define M9_0_2 2 +#define M9_0_3 3 +#define M9_0_4 4 +#define M9_0_5 5 +#define M9_0_6 6 +#define M9_0_7 7 + +#define M9_1_0 1 +#define M9_1_1 2 +#define M9_1_2 3 +#define M9_1_3 4 +#define M9_1_4 5 +#define M9_1_5 6 +#define M9_1_6 7 +#define M9_1_7 8 + +#define M9_2_0 2 +#define M9_2_1 3 +#define M9_2_2 4 +#define M9_2_3 5 +#define M9_2_4 6 +#define M9_2_5 7 +#define M9_2_6 8 +#define M9_2_7 0 + +#define M9_3_0 3 +#define M9_3_1 4 +#define M9_3_2 5 +#define M9_3_3 6 +#define M9_3_4 7 +#define M9_3_5 8 +#define M9_3_6 0 +#define M9_3_7 1 + +#define M9_4_0 4 +#define M9_4_1 5 +#define M9_4_2 6 +#define M9_4_3 7 +#define M9_4_4 8 +#define M9_4_5 0 +#define M9_4_6 1 +#define M9_4_7 2 + +#define M9_5_0 5 +#define M9_5_1 6 +#define M9_5_2 7 +#define M9_5_3 8 +#define M9_5_4 0 +#define M9_5_5 1 +#define M9_5_6 2 +#define M9_5_7 3 + +#define M9_6_0 6 +#define M9_6_1 7 +#define M9_6_2 8 +#define M9_6_3 0 +#define M9_6_4 1 +#define M9_6_5 2 +#define M9_6_6 3 +#define M9_6_7 4 + +#define M9_7_0 7 +#define M9_7_1 8 +#define M9_7_2 0 +#define M9_7_3 1 +#define M9_7_4 2 +#define M9_7_5 3 +#define M9_7_6 4 +#define M9_7_7 5 + +#define M9_8_0 8 +#define M9_8_1 0 +#define M9_8_2 1 +#define M9_8_3 2 +#define M9_8_4 3 +#define M9_8_5 4 +#define M9_8_6 5 +#define M9_8_7 6 + +#define M9_9_0 0 +#define M9_9_1 1 +#define M9_9_2 2 +#define M9_9_3 3 +#define M9_9_4 4 +#define M9_9_5 5 +#define M9_9_6 6 +#define M9_9_7 7 + +#define M9_10_0 1 +#define M9_10_1 2 +#define M9_10_2 3 +#define M9_10_3 4 +#define M9_10_4 5 +#define M9_10_5 6 +#define M9_10_6 7 +#define M9_10_7 8 + +#define M9_11_0 2 +#define M9_11_1 3 +#define M9_11_2 4 +#define M9_11_3 5 +#define M9_11_4 6 +#define M9_11_5 7 +#define M9_11_6 8 +#define M9_11_7 0 + +#define M9_12_0 3 +#define M9_12_1 4 +#define M9_12_2 5 +#define M9_12_3 6 +#define M9_12_4 7 +#define M9_12_5 8 +#define M9_12_6 0 +#define M9_12_7 1 + +#define M9_13_0 4 +#define M9_13_1 5 +#define M9_13_2 6 +#define M9_13_3 7 +#define M9_13_4 8 +#define M9_13_5 0 +#define M9_13_6 1 +#define M9_13_7 2 + +#define M9_14_0 5 +#define M9_14_1 6 +#define M9_14_2 7 +#define M9_14_3 8 +#define M9_14_4 0 +#define M9_14_5 1 +#define M9_14_6 2 +#define M9_14_7 3 + +#define M9_15_0 6 +#define M9_15_1 7 +#define M9_15_2 8 +#define M9_15_3 0 +#define M9_15_4 1 +#define M9_15_5 2 +#define M9_15_6 3 +#define M9_15_7 4 + +#define M9_16_0 7 +#define M9_16_1 8 +#define M9_16_2 0 +#define M9_16_3 1 +#define M9_16_4 2 +#define M9_16_5 3 +#define M9_16_6 4 +#define M9_16_7 5 + +#define M9_17_0 8 +#define M9_17_1 0 +#define M9_17_2 1 +#define M9_17_3 2 +#define M9_17_4 3 +#define M9_17_5 4 +#define M9_17_6 5 +#define M9_17_7 6 + +#define M9_18_0 0 +#define M9_18_1 1 +#define M9_18_2 2 +#define M9_18_3 3 +#define M9_18_4 4 +#define M9_18_5 5 +#define M9_18_6 6 +#define M9_18_7 7 + +/* + * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). 
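+ *
+ * (Same tabulation trick as M9_ above, but modulo 3: SKBT() uses
+ * these values to select which of the three tweak words t0, t1, t2
+ * enters each subkey injection.)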
+ */ + +#define M3_0_0 0 +#define M3_0_1 1 +#define M3_1_0 1 +#define M3_1_1 2 +#define M3_2_0 2 +#define M3_2_1 0 +#define M3_3_0 0 +#define M3_3_1 1 +#define M3_4_0 1 +#define M3_4_1 2 +#define M3_5_0 2 +#define M3_5_1 0 +#define M3_6_0 0 +#define M3_6_1 1 +#define M3_7_0 1 +#define M3_7_1 2 +#define M3_8_0 2 +#define M3_8_1 0 +#define M3_9_0 0 +#define M3_9_1 1 +#define M3_10_0 1 +#define M3_10_1 2 +#define M3_11_0 2 +#define M3_11_1 0 +#define M3_12_0 0 +#define M3_12_1 1 +#define M3_13_0 1 +#define M3_13_1 2 +#define M3_14_0 2 +#define M3_14_1 0 +#define M3_15_0 0 +#define M3_15_1 1 +#define M3_16_0 1 +#define M3_16_1 2 +#define M3_17_0 2 +#define M3_17_1 0 +#define M3_18_0 0 +#define M3_18_1 1 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#if 0 +/* obsolete */ +#define SKSI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M5_, s), _), i)) +#define SKST(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) +#endif + +#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) +#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) + +#if 0 +/* obsolete */ +#define TFSMALL_KINIT(k0, k1, k2, k3, k4, t0, t1, t2) do { \ + k4 = (k0 ^ k1) ^ (k2 ^ k3) ^ SPH_C64(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } while (0) +#endif + +#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) do { \ + k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ + ^ SPH_C64(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_ADDKEY(w0, w1, w2, w3, k, t, s) do { \ + w0 = SPH_T64(w0 + SKSI(k, s, 0)); \ + w1 = SPH_T64(w1 + SKSI(k, s, 1) + SKST(t, s, 0)); \ + w2 = SPH_T64(w2 + SKSI(k, s, 2) + SKST(t, s, 1)); \ + w3 = SPH_T64(w3 + SKSI(k, s, 3) + (sph_u64)s); \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define TFBIG_ADDKEY(s, tt0, tt1) do { \ + p0 = SPH_T64(p0 + h[s + 0]); \ + p1 = SPH_T64(p1 + h[s + 1]); \ + p2 = SPH_T64(p2 + h[s + 2]); \ + p3 = SPH_T64(p3 + h[s + 3]); \ + p4 = SPH_T64(p4 + h[s + 4]); \ + p5 = SPH_T64(p5 + h[s + 5] + tt0); \ + p6 = SPH_T64(p6 + h[s + 6] + tt1); \ + p7 = SPH_T64(p7 + h[s + 7] + (sph_u64)s); \ + } while (0) + +#else + +#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) do { \ + w0 = SPH_T64(w0 + SKBI(k, s, 0)); \ + w1 = SPH_T64(w1 + SKBI(k, s, 1)); \ + w2 = SPH_T64(w2 + SKBI(k, s, 2)); \ + w3 = SPH_T64(w3 + SKBI(k, s, 3)); \ + w4 = SPH_T64(w4 + SKBI(k, s, 4)); \ + w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define TFSMALL_MIX(x0, x1, rc) do { \ + x0 = SPH_T64(x0 + x1); \ + x1 = SPH_ROTL64(x1, rc) ^ x0; \ + } while (0) +#endif + +#define TFBIG_MIX(x0, x1, rc) do { \ + x0 = SPH_T64(x0 + x1); \ + x1 = SPH_ROTL64(x1, rc) ^ x0; \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_MIX4(w0, w1, w2, w3, rc0, rc1) do { \ + TFSMALL_MIX(w0, w1, rc0); \ + TFSMALL_MIX(w2, w3, rc1); \ + } while (0) +#endif + +#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ + TFBIG_MIX(w0, w1, rc0); \ + TFBIG_MIX(w2, w3, rc1); \ + TFBIG_MIX(w4, w5, rc2); \ + TFBIG_MIX(w6, w7, rc3); \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_4e(s) do { \ + TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \ + TFSMALL_MIX4(p0, p1, p2, p3, 14, 16); \ + TFSMALL_MIX4(p0, p3, p2, p1, 52, 57); \ + TFSMALL_MIX4(p0, p1, p2, p3, 23, 40); \ + TFSMALL_MIX4(p0, p3, p2, p1, 5, 37); \ + } while (0) + +#define TFSMALL_4o(s) do { \ + TFSMALL_ADDKEY(p0, p1, 
p2, p3, h, t, s); \ + TFSMALL_MIX4(p0, p1, p2, p3, 25, 33); \ + TFSMALL_MIX4(p0, p3, p2, p1, 46, 12); \ + TFSMALL_MIX4(p0, p1, p2, p3, 58, 22); \ + TFSMALL_MIX4(p0, p3, p2, p1, 32, 32); \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define TFBIG_4e(s) do { \ + TFBIG_ADDKEY(s, t0, t1); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_4o(s) do { \ + TFBIG_ADDKEY(s, t1, t2); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#else + +#define TFBIG_4e(s) do { \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_4o(s) do { \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define UBI_SMALL(etype, extra) do { \ + sph_u64 h4, t0, t1, t2; \ + sph_u64 m0 = sph_dec64le(buf + 0); \ + sph_u64 m1 = sph_dec64le(buf + 8); \ + sph_u64 m2 = sph_dec64le(buf + 16); \ + sph_u64 m3 = sph_dec64le(buf + 24); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + t0 = SPH_T64(bcount << 5) + (sph_u64)(extra); \ + t1 = (bcount >> 59) + ((sph_u64)(etype) << 55); \ + TFSMALL_KINIT(h0, h1, h2, h3, h4, t0, t1, t2); \ + TFSMALL_4e(0); \ + TFSMALL_4o(1); \ + TFSMALL_4e(2); \ + TFSMALL_4o(3); \ + TFSMALL_4e(4); \ + TFSMALL_4o(5); \ + TFSMALL_4e(6); \ + TFSMALL_4o(7); \ + TFSMALL_4e(8); \ + TFSMALL_4o(9); \ + TFSMALL_4e(10); \ + TFSMALL_4o(11); \ + TFSMALL_4e(12); \ + TFSMALL_4o(13); \ + TFSMALL_4e(14); \ + TFSMALL_4o(15); \ + TFSMALL_4e(16); \ + TFSMALL_4o(17); \ + TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, 18); \ + h0 = m0 ^ p0; \ + h1 = m1 ^ p1; \ + h2 = m2 ^ p2; \ + h3 = m3 ^ p3; \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define UBI_BIG(etype, extra) do { \ + sph_u64 t0, t1, t2; \ + unsigned u; \ + sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ + sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ + sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ + sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ + sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ + sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ + sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ + sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + sph_u64 p4 = m4; \ + sph_u64 p5 = m5; \ + sph_u64 p6 = m6; \ + sph_u64 p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT(h[0], h[1], h[2], h[3], h[4], h[5], \ + h[6], h[7], h[8], t0, t1, t2); \ + for (u = 0; u <= 15; u += 3) { \ + h[u + 9] = h[u + 0]; \ + h[u + 10] = h[u + 1]; \ + 
h[u + 11] = h[u + 2]; \ + } \ + for (u = 0; u < 9; u ++) { \ + sph_u64 s = u << 1; \ + sph_u64 tmp; \ + TFBIG_4e(s); \ + TFBIG_4o(s + 1); \ + tmp = t2; \ + t2 = t1; \ + t1 = t0; \ + t0 = tmp; \ + } \ + TFBIG_ADDKEY(18, t0, t1); \ + h[0] = m0 ^ p0; \ + h[1] = m1 ^ p1; \ + h[2] = m2 ^ p2; \ + h[3] = m3 ^ p3; \ + h[4] = m4 ^ p4; \ + h[5] = m5 ^ p5; \ + h[6] = m6 ^ p6; \ + h[7] = m7 ^ p7; \ + } while (0) + +#else + +#define UBI_BIG(etype, extra) do { \ + sph_u64 h8, t0, t1, t2; \ + sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ + sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ + sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ + sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ + sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ + sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ + sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ + sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + sph_u64 p4 = m4; \ + sph_u64 p5 = m5; \ + sph_u64 p6 = m6; \ + sph_u64 p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ + TFBIG_4e(0); \ + TFBIG_4o(1); \ + TFBIG_4e(2); \ + TFBIG_4o(3); \ + TFBIG_4e(4); \ + TFBIG_4o(5); \ + TFBIG_4e(6); \ + TFBIG_4o(7); \ + TFBIG_4e(8); \ + TFBIG_4o(9); \ + TFBIG_4e(10); \ + TFBIG_4o(11); \ + TFBIG_4e(12); \ + TFBIG_4o(13); \ + TFBIG_4e(14); \ + TFBIG_4o(15); \ + TFBIG_4e(16); \ + TFBIG_4o(17); \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \ + h0 = m0 ^ p0; \ + h1 = m1 ^ p1; \ + h2 = m2 ^ p2; \ + h3 = m3 ^ p3; \ + h4 = m4 ^ p4; \ + h5 = m5 ^ p5; \ + h6 = m6 ^ p6; \ + h7 = m7 ^ p7; \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define DECL_STATE_SMALL \ + sph_u64 h0, h1, h2, h3; \ + sph_u64 bcount; + +#define READ_STATE_SMALL(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + sc->bcount = bcount; \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define DECL_STATE_BIG \ + sph_u64 h[27]; \ + sph_u64 bcount; + +#define READ_STATE_BIG(sc) do { \ + h[0] = (sc)->h0; \ + h[1] = (sc)->h1; \ + h[2] = (sc)->h2; \ + h[3] = (sc)->h3; \ + h[4] = (sc)->h4; \ + h[5] = (sc)->h5; \ + h[6] = (sc)->h6; \ + h[7] = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h[0]; \ + (sc)->h1 = h[1]; \ + (sc)->h2 = h[2]; \ + (sc)->h3 = h[3]; \ + (sc)->h4 = h[4]; \ + (sc)->h5 = h[5]; \ + (sc)->h6 = h[6]; \ + (sc)->h7 = h[7]; \ + sc->bcount = bcount; \ + } while (0) + +#else + +#define DECL_STATE_BIG \ + sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; \ + sph_u64 bcount; + +#define READ_STATE_BIG(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + h4 = (sc)->h4; \ + h5 = (sc)->h5; \ + h6 = (sc)->h6; \ + h7 = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + (sc)->h4 = h4; \ + (sc)->h5 = h5; \ + (sc)->h6 = h6; \ + (sc)->h7 = h7; \ + sc->bcount = bcount; \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +static void +skein_small_init(sph_skein_small_context *sc, const sph_u64 *iv) +{ + sc->h0 = iv[0]; + sc->h1 = iv[1]; + sc->h2 = iv[2]; + sc->h3 = iv[3]; + sc->bcount = 0; + sc->ptr = 0; +} +#endif + +static void 
+skein_big_init(sph_skein_big_context *sc, const sph_u64 *iv) +{ + sc->h0 = iv[0]; + sc->h1 = iv[1]; + sc->h2 = iv[2]; + sc->h3 = iv[3]; + sc->h4 = iv[4]; + sc->h5 = iv[5]; + sc->h6 = iv[6]; + sc->h7 = iv[7]; + sc->bcount = 0; + sc->ptr = 0; +} + +#if 0 +/* obsolete */ +static void +skein_small_core(sph_skein_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr, clen; + unsigned first; + DECL_STATE_SMALL + + buf = sc->buf; + ptr = sc->ptr; + clen = (sizeof sc->buf) - ptr; + if (len <= clen) { + memcpy(buf + ptr, data, len); + sc->ptr = ptr + len; + return; + } + if (clen != 0) { + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + } + +#if SPH_SMALL_FOOTPRINT_SKEIN + + READ_STATE_SMALL(sc); + first = (bcount == 0) << 7; + for (;;) { + bcount ++; + UBI_SMALL(96 + first, 0); + if (len <= sizeof sc->buf) + break; + first = 0; + memcpy(buf, data, sizeof sc->buf); + data = (const unsigned char *)data + sizeof sc->buf; + len -= sizeof sc->buf; + } + WRITE_STATE_SMALL(sc); + sc->ptr = len; + memcpy(buf, data, len); + +#else + + /* + * Unrolling the loop yields a slight performance boost, while + * keeping the code size around 24 kB on 32-bit x86. + */ + READ_STATE_SMALL(sc); + first = (bcount == 0) << 7; + for (;;) { + bcount ++; + UBI_SMALL(96 + first, 0); + if (len <= sizeof sc->buf) + break; + buf = (unsigned char *)data; + bcount ++; + UBI_SMALL(96, 0); + if (len <= 2 * sizeof sc->buf) { + data = buf + sizeof sc->buf; + len -= sizeof sc->buf; + break; + } + buf += sizeof sc->buf; + data = buf + sizeof sc->buf; + first = 0; + len -= 2 * sizeof sc->buf; + } + WRITE_STATE_SMALL(sc); + sc->ptr = len; + memcpy(sc->buf, data, len); + +#endif +} +#endif + +static void +skein_big_core(sph_skein_big_context *sc, const void *data, size_t len) +{ + /* + * The Skein "final bit" in the tweak is troublesome here, + * because if the input has a length which is a multiple of the + * block size (512 bits) then that bit must be set for the + * final block, which is full of message bits (padding in + * Skein can be reduced to no extra bit at all). However, this + * function cannot know whether it processes the last chunks of + * the message or not. Hence we may keep a full block of buffered + * data (64 bytes).
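The invariant this comment describes is worth restating concretely: a full buffer is never compressed until at least one more input byte is known to follow, so the last 1 to 64 bytes always remain available for the close step. A minimal sketch of that rule, with illustrative names rather than this file's actual variables:

```c
#include <stddef.h>
#include <string.h>

/* Toy version of the absorb loop; compress_block() stands in for a
 * UBI_BIG invocation.  The point: a full buffer is NOT compressed
 * until more input arrives, so 1..64 bytes always remain buffered
 * for the finalization step, which sets the "final" tweak bit. */
typedef struct {
    unsigned char buf[64];
    size_t ptr;                 /* bytes currently buffered: 0..64 */
} toy_ctx;

static void compress_block(toy_ctx *c) { (void)c; /* stand-in */ }

static void toy_absorb(toy_ctx *c, const unsigned char *data, size_t len)
{
    if (len <= sizeof c->buf - c->ptr) {
        memcpy(c->buf + c->ptr, data, len);
        c->ptr += len;          /* may legitimately reach 64 */
        return;
    }
    do {
        if (c->ptr == sizeof c->buf) {  /* flush only when more follows */
            compress_block(c);
            c->ptr = 0;
        }
        size_t clen = sizeof c->buf - c->ptr;
        if (clen > len)
            clen = len;
        memcpy(c->buf + c->ptr, data, clen);
        c->ptr += clen;
        data += clen;
        len -= clen;
    } while (len > 0);
}
```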
+ */ + unsigned char *buf; + size_t ptr; + unsigned first; + DECL_STATE_BIG + + buf = sc->buf; + ptr = sc->ptr; + if (len <= (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE_BIG(sc); + first = (bcount == 0) << 7; + do { + size_t clen; + + if (ptr == sizeof sc->buf) { + bcount ++; + UBI_BIG(96 + first, 0); + first = 0; + ptr = 0; + } + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + } while (len > 0); + WRITE_STATE_BIG(sc); + sc->ptr = ptr; +} + +#if 0 +/* obsolete */ +static void +skein_small_close(sph_skein_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_len) +{ + unsigned char *buf; + size_t ptr; + unsigned et; + int i; + DECL_STATE_SMALL + + if (n != 0) { + unsigned z; + unsigned char x; + + z = 0x80 >> n; + x = ((ub & -z) | z) & 0xFF; + skein_small_core(sc, &x, 1); + } + + buf = sc->buf; + ptr = sc->ptr; + READ_STATE_SMALL(sc); + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + et = 352 + ((bcount == 0) << 7) + (n != 0); + for (i = 0; i < 2; i ++) { + UBI_SMALL(et, ptr); + if (i == 0) { + memset(buf, 0, sizeof sc->buf); + bcount = 0; + et = 510; + ptr = 8; + } + } + + sph_enc64le_aligned(buf + 0, h0); + sph_enc64le_aligned(buf + 8, h1); + sph_enc64le_aligned(buf + 16, h2); + sph_enc64le_aligned(buf + 24, h3); + memcpy(dst, buf, out_len); +} +#endif + +static void +skein_big_close(sph_skein_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_len) +{ + unsigned char *buf; + size_t ptr; + unsigned et; + int i; +#if SPH_SMALL_FOOTPRINT_SKEIN + size_t u; +#endif + DECL_STATE_BIG + + /* + * Add bit padding if necessary. + */ + if (n != 0) { + unsigned z; + unsigned char x; + + z = 0x80 >> n; + x = ((ub & -z) | z) & 0xFF; + skein_big_core(sc, &x, 1); + } + + buf = sc->buf; + ptr = sc->ptr; + + /* + * At that point, if ptr == 0, then the message was empty; + * otherwise, there is between 1 and 64 bytes (inclusive) which + * are yet to be processed. Either way, we complete the buffer + * to a full block with zeros (the Skein specification mandates + * that an empty message is padded so that there is at least + * one block to process). + * + * Once this block has been processed, we do it again, with + * a block full of zeros, for the output (that block contains + * the encoding of "0", over 8 bytes, then padded with zeros). + */ + READ_STATE_BIG(sc); + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + et = 352 + ((bcount == 0) << 7) + (n != 0); + for (i = 0; i < 2; i ++) { + UBI_BIG(et, ptr); + if (i == 0) { + memset(buf, 0, sizeof sc->buf); + bcount = 0; + et = 510; + ptr = 8; + } + } + +#if SPH_SMALL_FOOTPRINT_SKEIN + + /* + * We use a temporary buffer because we must support the case + * where output size is not a multiple of 64 (namely, a 224-bit + * output). 
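The etype values used above (96 for message blocks, 352 for the final message block, 510 for the output block) pack the Skein tweak's flag and type fields into one integer. A hedged decoding, reconstructed from these constants and the Skein specification's tweak layout:

```c
/* Assumed packing: etype = (final << 8) | (first << 7) | (type << 1) | bitpad,
 * with Skein type codes MSG = 48 and OUT = 63.  The `(n != 0)` term added
 * to 352 in skein_big_close() is the bit-padding flag. */
enum {
    ETYPE_MSG       = 48 << 1,                          /*  96 */
    ETYPE_MSG_FIRST = (1 << 7) | (48 << 1),             /* 224 = 96 + first */
    ETYPE_MSG_FINAL = (1 << 8) | (48 << 1),             /* 352 */
    ETYPE_OUT       = (1 << 8) | (1 << 7) | (63 << 1)   /* 510 */
};
```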
+ */ + for (u = 0; u < out_len; u += 8) + sph_enc64le_aligned(buf + u, h[u >> 3]); + memcpy(dst, buf, out_len); + +#else + + sph_enc64le_aligned(buf + 0, h0); + sph_enc64le_aligned(buf + 8, h1); + sph_enc64le_aligned(buf + 16, h2); + sph_enc64le_aligned(buf + 24, h3); + sph_enc64le_aligned(buf + 32, h4); + sph_enc64le_aligned(buf + 40, h5); + sph_enc64le_aligned(buf + 48, h6); + sph_enc64le_aligned(buf + 56, h7); + memcpy(dst, buf, out_len); + +#endif +} + +#if 0 +/* obsolete */ +static const sph_u64 IV224[] = { + SPH_C64(0xC6098A8C9AE5EA0B), SPH_C64(0x876D568608C5191C), + SPH_C64(0x99CB88D7D7F53884), SPH_C64(0x384BDDB1AEDDB5DE) +}; + +static const sph_u64 IV256[] = { + SPH_C64(0xFC9DA860D048B449), SPH_C64(0x2FCA66479FA7D833), + SPH_C64(0xB33BC3896656840F), SPH_C64(0x6A54E920FDE8DA69) +}; +#endif + +static const sph_u64 IV224[] = { + SPH_C64(0xCCD0616248677224), SPH_C64(0xCBA65CF3A92339EF), + SPH_C64(0x8CCD69D652FF4B64), SPH_C64(0x398AED7B3AB890B4), + SPH_C64(0x0F59D1B1457D2BD0), SPH_C64(0x6776FE6575D4EB3D), + SPH_C64(0x99FBC70E997413E9), SPH_C64(0x9E2CFCCFE1C41EF7) +}; + +static const sph_u64 IV256[] = { + SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), + SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), + SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), + SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) +}; + +static const sph_u64 IV384[] = { + SPH_C64(0xA3F6C6BF3A75EF5F), SPH_C64(0xB0FEF9CCFD84FAA4), + SPH_C64(0x9D77DD663D770CFE), SPH_C64(0xD798CBF3B468FDDA), + SPH_C64(0x1BC4A6668A0E4465), SPH_C64(0x7ED7D434E5807407), + SPH_C64(0x548FC1ACD4EC44D6), SPH_C64(0x266E17546AA18FF8) +}; + +static const sph_u64 IV512[] = { + SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), + SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), + SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), + SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) +}; + +#if 0 +/* obsolete */ +/* see sph_skein.h */ +void +sph_skein224_init(void *cc) +{ + skein_small_init(cc, IV224); +} + +/* see sph_skein.h */ +void +sph_skein224(void *cc, const void *data, size_t len) +{ + skein_small_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein224_close(void *cc, void *dst) +{ + sph_skein224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_small_close(cc, ub, n, dst, 28); + sph_skein224_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein256_init(void *cc) +{ + skein_small_init(cc, IV256); +} + +/* see sph_skein.h */ +void +sph_skein256(void *cc, const void *data, size_t len) +{ + skein_small_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein256_close(void *cc, void *dst) +{ + sph_skein256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_small_close(cc, ub, n, dst, 32); + sph_skein256_init(cc); +} +#endif + +/* see sph_skein.h */ +void +sph_skein224_init(void *cc) +{ + skein_big_init(cc, IV224); +} + +/* see sph_skein.h */ +void +sph_skein224(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein224_close(void *cc, void *dst) +{ + sph_skein224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 28); + sph_skein224_init(cc); +} + +/* see 
sph_skein.h */ +void +sph_skein256_init(void *cc) +{ + skein_big_init(cc, IV256); +} + +/* see sph_skein.h */ +void +sph_skein256(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein256_close(void *cc, void *dst) +{ + sph_skein256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 32); + sph_skein256_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein384_init(void *cc) +{ + skein_big_init(cc, IV384); +} + +/* see sph_skein.h */ +void +sph_skein384(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein384_close(void *cc, void *dst) +{ + sph_skein384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 48); + sph_skein384_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein512_init(void *cc) +{ + skein_big_init(cc, IV512); +} + +/* see sph_skein.h */ +void +sph_skein512(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein512_close(void *cc, void *dst) +{ + sph_skein512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 64); + sph_skein512_init(cc); +} + +#endif + + +#ifdef __cplusplus +} +#endif diff --git a/sha3/sph_blake.h b/sha3/sph_blake.h new file mode 100644 index 0000000..d8d7943 --- /dev/null +++ b/sha3/sph_blake.h @@ -0,0 +1,327 @@ +/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */ +/** + * BLAKE interface. BLAKE is a family of functions which differ by their + * output size; this implementation defines BLAKE for output sizes 224, + * 256, 384 and 512 bits. This implementation conforms to the "third + * round" specification. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_blake.h + * @author Thomas Pornin + */ + +#ifndef SPH_BLAKE_H__ +#define SPH_BLAKE_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for BLAKE-224. + */ +#define SPH_SIZE_blake224 224 + +/** + * Output size (in bits) for BLAKE-256. + */ +#define SPH_SIZE_blake256 256 + +#if SPH_64 + +/** + * Output size (in bits) for BLAKE-384. + */ +#define SPH_SIZE_blake384 384 + +/** + * Output size (in bits) for BLAKE-512. + */ +#define SPH_SIZE_blake512 512 + +#endif + +/** + * This structure is a context for BLAKE-224 and BLAKE-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BLAKE computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BLAKE + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 H[8]; + sph_u32 S[4]; + sph_u32 T0, T1; +#endif +} sph_blake_small_context; + +/** + * This structure is a context for BLAKE-224 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_small_context sph_blake224_context; + +/** + * This structure is a context for BLAKE-256 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_small_context sph_blake256_context; + +#if SPH_64 + +/** + * This structure is a context for BLAKE-384 and BLAKE-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BLAKE computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BLAKE + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u64 H[8]; + sph_u64 S[4]; + sph_u64 T0, T1; +#endif +} sph_blake_big_context; + +/** + * This structure is a context for BLAKE-384 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_big_context sph_blake384_context; + +/** + * This structure is a context for BLAKE-512 computations. It is + * identical to the common sph_blake_small_context. + */ +typedef sph_blake_big_context sph_blake512_context; + +#endif + +/** + * Initialize a BLAKE-224 context. This process performs no memory allocation. + * + * @param cc the BLAKE-224 context (pointer to a + * sph_blake224_context) + */ +void sph_blake224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake224(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. 
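The three entry points documented above (init, data, close) form the whole hashing flow. A minimal usage sketch against this header, assuming it is linked with the matching BLAKE implementation file:

```c
#include <stddef.h>
#include <stdio.h>
#include "sph_blake.h"

int main(void)
{
    unsigned char digest[32];         /* SPH_SIZE_blake256 / 8 bytes */
    sph_blake256_context cc;

    sph_blake256_init(&cc);           /* no memory allocation */
    sph_blake256(&cc, "abc", 3);      /* feed 3 data bytes */
    sph_blake256_close(&cc, digest);  /* finalize; cc is auto-reinitialized */

    for (size_t i = 0; i < sizeof digest; i++)
        printf("%02x", digest[i]);
    putchar('\n');
    return 0;
}
```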
+ * + * @param cc the BLAKE-224 context + * @param dst the destination buffer + */ +void sph_blake224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BLAKE-256 context. This process performs no memory allocation. + * + * @param cc the BLAKE-256 context (pointer to a + * sph_blake256_context) + */ +void sph_blake256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake256(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-256 context + * @param dst the destination buffer + */ +void sph_blake256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#if SPH_64 + +/** + * Initialize a BLAKE-384 context. This process performs no memory allocation. + * + * @param cc the BLAKE-384 context (pointer to a + * sph_blake384_context) + */ +void sph_blake384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake384(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-384 context + * @param dst the destination buffer + */ +void sph_blake384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). 
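The ub/n convention in the addbits variants above is easiest to see with a worked case. The sketch below appends the three extra bits 1, 0, 1 in stream order; the helper name and bit values are illustrative only:

```c
#include "sph_blake.h"

/* Per the documented rule, n extra bits occupy positions 7 down to
 * 8-n of `ub`, most significant first: the first extra bit goes to
 * bit 7, the second to bit 6, and so on. */
void finish_with_bits_101(sph_blake256_context *cc, unsigned char digest[32])
{
    unsigned ub = (1u << 7) | (0u << 6) | (1u << 5);  /* 0xA0 */
    sph_blake256_addbits_and_close(cc, ub, 3, digest);
}
```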
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BLAKE-512 context. This process performs no memory allocation. + * + * @param cc the BLAKE-512 context (pointer to a + * sph_blake512_context) + */ +void sph_blake512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BLAKE-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_blake512(void *cc, const void *data, size_t len); + +/** + * Terminate the current BLAKE-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the BLAKE-512 context + * @param dst the destination buffer + */ +void sph_blake512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BLAKE-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_blake512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_bmw.h b/sha3/sph_bmw.h new file mode 100644 index 0000000..d386b0c --- /dev/null +++ b/sha3/sph_bmw.h @@ -0,0 +1,328 @@ +/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * BMW interface. BMW (aka "Blue Midnight Wish") is a family of + * functions which differ by their output size; this implementation + * defines BMW for output sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
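The context structures above hold no external state, which is why the comments say a running computation can be cloned with a simple memcpy(). A sketch of forking one partially fed hash into two digests (illustrative names):

```c
#include <stddef.h>
#include <string.h>
#include "sph_blake.h"

/* Hash prefix||"A" and prefix||"B" while feeding the shared prefix
 * only once, using the documented memcpy-cloning property. */
void fork_hash(const void *prefix, size_t plen,
               unsigned char d_a[32], unsigned char d_b[32])
{
    sph_blake256_context base, tmp;

    sph_blake256_init(&base);
    sph_blake256(&base, prefix, plen);

    memcpy(&tmp, &base, sizeof base);   /* clone mid-computation */
    sph_blake256(&tmp, "A", 1);
    sph_blake256_close(&tmp, d_a);

    sph_blake256(&base, "B", 1);
    sph_blake256_close(&base, d_b);
}
```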
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_bmw.h + * @author Thomas Pornin + */ + +#ifndef SPH_BMW_H__ +#define SPH_BMW_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for BMW-224. + */ +#define SPH_SIZE_bmw224 224 + +/** + * Output size (in bits) for BMW-256. + */ +#define SPH_SIZE_bmw256 256 + +#if SPH_64 + +/** + * Output size (in bits) for BMW-384. + */ +#define SPH_SIZE_bmw384 384 + +/** + * Output size (in bits) for BMW-512. + */ +#define SPH_SIZE_bmw512 512 + +#endif + +/** + * This structure is a context for BMW-224 and BMW-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BMW computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BMW + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 H[16]; +#if SPH_64 + sph_u64 bit_count; +#else + sph_u32 bit_count_high, bit_count_low; +#endif +#endif +} sph_bmw_small_context; + +/** + * This structure is a context for BMW-224 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_small_context sph_bmw224_context; + +/** + * This structure is a context for BMW-256 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_small_context sph_bmw256_context; + +#if SPH_64 + +/** + * This structure is a context for BMW-384 and BMW-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BMW computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BMW + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u64 H[16]; + sph_u64 bit_count; +#endif +} sph_bmw_big_context; + +/** + * This structure is a context for BMW-384 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_big_context sph_bmw384_context; + +/** + * This structure is a context for BMW-512 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_big_context sph_bmw512_context; + +#endif + +/** + * Initialize a BMW-224 context. This process performs no memory allocation. + * + * @param cc the BMW-224 context (pointer to a + * sph_bmw224_context) + */ +void sph_bmw224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw224(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-224 computation and output the result into + * the provided buffer. 
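The #if SPH_64 branch in the BMW context above trades one 64-bit bit counter for a high/low pair of 32-bit words on platforms without a native 64-bit type. A hedged sketch of how such a split counter is advanced (illustrative code, not this file's actual update logic):

```c
#include <stdint.h>

/* Add nbytes worth of bits to a 64-bit count held as two 32-bit
 * halves; assumes nbytes < 2^29 so the shift cannot overflow. */
static void bitcount_add(uint32_t *hi, uint32_t *lo, uint32_t nbytes)
{
    uint32_t nbits = nbytes << 3;
    uint32_t old = *lo;

    *lo += nbits;        /* wraps modulo 2^32 */
    if (*lo < old)       /* wraparound means a carry into the high word */
        (*hi)++;
}
```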
The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-224 context + * @param dst the destination buffer + */ +void sph_bmw224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BMW-256 context. This process performs no memory allocation. + * + * @param cc the BMW-256 context (pointer to a + * sph_bmw256_context) + */ +void sph_bmw256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw256(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-256 context + * @param dst the destination buffer + */ +void sph_bmw256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#if SPH_64 + +/** + * Initialize a BMW-384 context. This process performs no memory allocation. + * + * @param cc the BMW-384 context (pointer to a + * sph_bmw384_context) + */ +void sph_bmw384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw384(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-384 context + * @param dst the destination buffer + */ +void sph_bmw384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). 
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BMW-512 context. This process performs no memory allocation. + * + * @param cc the BMW-512 context (pointer to a + * sph_bmw512_context) + */ +void sph_bmw512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw512(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-512 context + * @param dst the destination buffer + */ +void sph_bmw512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_cubehash.h b/sha3/sph_cubehash.h new file mode 100644 index 0000000..487a194 --- /dev/null +++ b/sha3/sph_cubehash.h @@ -0,0 +1,292 @@ +/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */ +/** + * CubeHash interface. CubeHash is a family of functions which differ by + * their output size; this implementation defines CubeHash for output + * sizes 224, 256, 384 and 512 bits, with the "standard parameters" + * (CubeHash16/32 with the CubeHash specification notations). + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_cubehash.h + * @author Thomas Pornin + */ + +#ifndef SPH_CUBEHASH_H__ +#define SPH_CUBEHASH_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for CubeHash-224. + */ +#define SPH_SIZE_cubehash224 224 + +/** + * Output size (in bits) for CubeHash-256. + */ +#define SPH_SIZE_cubehash256 256 + +/** + * Output size (in bits) for CubeHash-384. + */ +#define SPH_SIZE_cubehash384 384 + +/** + * Output size (in bits) for CubeHash-512. + */ +#define SPH_SIZE_cubehash512 512 + +/** + * This structure is a context for CubeHash computations: it contains the + * intermediate values and some data from the last entered block. Once + * a CubeHash computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running CubeHash computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[32]; +#endif +} sph_cubehash_context; + +/** + * Type for a CubeHash-224 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash224_context; + +/** + * Type for a CubeHash-256 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash256_context; + +/** + * Type for a CubeHash-384 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash384_context; + +/** + * Type for a CubeHash-512 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash512_context; + +/** + * Initialize a CubeHash-224 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-224 context (pointer to a + * sph_cubehash224_context) + */ +void sph_cubehash224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash224(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-224 context + * @param dst the destination buffer + */ +void sph_cubehash224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
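Unlike the BLAKE and BMW headers, CubeHash needs no small/big split: all four typedefs above alias the same sph_cubehash_context. A sketch reusing one struct for two output sizes (function name illustrative):

```c
#include <stddef.h>
#include "sph_cubehash.h"

void two_sizes(const void *data, size_t len,
               unsigned char d224[28], unsigned char d512[64])
{
    sph_cubehash_context cc;          /* one layout fits every variant */

    sph_cubehash224_init(&cc);
    sph_cubehash224(&cc, data, len);
    sph_cubehash224_close(&cc, d224);

    sph_cubehash512_init(&cc);        /* reuse the same struct */
    sph_cubehash512(&cc, data, len);
    sph_cubehash512_close(&cc, d512);
}
```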
+ * + * @param cc the CubeHash-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-256 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-256 context (pointer to a + * sph_cubehash256_context) + */ +void sph_cubehash256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash256(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-256 context + * @param dst the destination buffer + */ +void sph_cubehash256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-384 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-384 context (pointer to a + * sph_cubehash384_context) + */ +void sph_cubehash384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash384(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-384 context + * @param dst the destination buffer + */ +void sph_cubehash384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-512 context. This process performs no memory + * allocation. 
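Because every close() call re-initializes the context, back-to-back computations need only one explicit init. A short sketch of that documented behavior (illustrative function name):

```c
#include <stddef.h>
#include "sph_cubehash.h"

void two_digests(const void *m1, size_t n1, const void *m2, size_t n2,
                 unsigned char d1[32], unsigned char d2[32])
{
    sph_cubehash256_context cc;

    sph_cubehash256_init(&cc);
    sph_cubehash256(&cc, m1, n1);
    sph_cubehash256_close(&cc, d1);   /* cc is reset here */

    sph_cubehash256(&cc, m2, n2);     /* no second init required */
    sph_cubehash256_close(&cc, d2);
}
```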
+ * + * @param cc the CubeHash-512 context (pointer to a + * sph_cubehash512_context) + */ +void sph_cubehash512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash512(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-512 context + * @param dst the destination buffer + */ +void sph_cubehash512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_echo.h b/sha3/sph_echo.h new file mode 100644 index 0000000..1ae1e3d --- /dev/null +++ b/sha3/sph_echo.h @@ -0,0 +1,320 @@ +/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * ECHO interface. ECHO is a family of functions which differ by + * their output size; this implementation defines ECHO for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_echo.h + * @author Thomas Pornin + */ + +#ifndef SPH_ECHO_H__ +#define SPH_ECHO_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for ECHO-224. + */ +#define SPH_SIZE_echo224 224 + +/** + * Output size (in bits) for ECHO-256. 
+ */ +#define SPH_SIZE_echo256 256 + +/** + * Output size (in bits) for ECHO-384. + */ +#define SPH_SIZE_echo384 384 + +/** + * Output size (in bits) for ECHO-512. + */ +#define SPH_SIZE_echo512 512 + +/** + * This structure is a context for ECHO computations: it contains the + * intermediate values and some data from the last entered block. Once + * an ECHO computation has been performed, the context can be reused for + * another computation. This specific structure is used for ECHO-224 + * and ECHO-256. + * + * The contents of this structure are private. A running ECHO computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[192]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[4][4]; +#if SPH_64 + sph_u64 Vb[4][2]; +#endif + } u; + sph_u32 C0, C1, C2, C3; +#endif +} sph_echo_small_context; + +/** + * This structure is a context for ECHO computations: it contains the + * intermediate values and some data from the last entered block. Once + * an ECHO computation has been performed, the context can be reused for + * another computation. This specific structure is used for ECHO-384 + * and ECHO-512. + * + * The contents of this structure are private. A running ECHO computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { + sph_u32 Vs[8][4]; +#if SPH_64 + sph_u64 Vb[8][2]; +#endif + } u; + sph_u32 C0, C1, C2, C3; +#endif +} sph_echo_big_context; + +/** + * Type for a ECHO-224 context (identical to the common "small" context). + */ +typedef sph_echo_small_context sph_echo224_context; + +/** + * Type for a ECHO-256 context (identical to the common "small" context). + */ +typedef sph_echo_small_context sph_echo256_context; + +/** + * Type for a ECHO-384 context (identical to the common "big" context). + */ +typedef sph_echo_big_context sph_echo384_context; + +/** + * Type for a ECHO-512 context (identical to the common "big" context). + */ +typedef sph_echo_big_context sph_echo512_context; + +/** + * Initialize an ECHO-224 context. This process performs no memory allocation. + * + * @param cc the ECHO-224 context (pointer to a + * sph_echo224_context) + */ +void sph_echo224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo224(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-224 context + * @param dst the destination buffer + */ +void sph_echo224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
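The Vs/Vb union in the ECHO contexts above lets a 32-bit build and a 64-bit build address the same state bytes as different lane widths. A minimal illustration of the aliasing idea (standalone types, not this header's):

```c
#include <stdint.h>

/* One 512-bit state, two views, mirroring Vs[4][4] / Vb[4][2] above:
 * a 32-bit implementation updates vs, a 64-bit one updates vb, and
 * both cover the same 64 bytes of storage. */
typedef union {
    uint32_t vs[4][4];
    uint64_t vb[4][2];
} echo_small_state;
```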
+ * + * @param cc the ECHO-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-256 context. This process performs no memory allocation. + * + * @param cc the ECHO-256 context (pointer to a + * sph_echo256_context) + */ +void sph_echo256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo256(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-256 context + * @param dst the destination buffer + */ +void sph_echo256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-384 context. This process performs no memory allocation. + * + * @param cc the ECHO-384 context (pointer to a + * sph_echo384_context) + */ +void sph_echo384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo384(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-384 context + * @param dst the destination buffer + */ +void sph_echo384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an ECHO-512 context. This process performs no memory allocation. + * + * @param cc the ECHO-512 context (pointer to a + * sph_echo512_context) + */ +void sph_echo512_init(void *cc); + +/** + * Process some data bytes. 
It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the ECHO-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_echo512(void *cc, const void *data, size_t len); + +/** + * Terminate the current ECHO-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the ECHO-512 context + * @param dst the destination buffer + */ +void sph_echo512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the ECHO-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_echo512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_groestl.h b/sha3/sph_groestl.h new file mode 100644 index 0000000..495f05e --- /dev/null +++ b/sha3/sph_groestl.h @@ -0,0 +1,329 @@ +/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Groestl interface. This code implements Groestl with the recommended + * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_groestl.h + * @author Thomas Pornin + */ + +#ifndef SPH_GROESTL_H__ +#define SPH_GROESTL_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Groestl-224. + */ +#define SPH_SIZE_groestl224 224 + +/** + * Output size (in bits) for Groestl-256. + */ +#define SPH_SIZE_groestl256 256 + +/** + * Output size (in bits) for Groestl-384. + */ +#define SPH_SIZE_groestl384 384 + +/** + * Output size (in bits) for Groestl-512. 
+ */ +#define SPH_SIZE_groestl512 512 + +/** + * This structure is a context for Groestl-224 and Groestl-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a Groestl computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running Groestl + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[8]; +#endif + sph_u32 narrow[16]; + } state; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_groestl_small_context; + +/** + * This structure is a context for Groestl-224 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_small_context sph_groestl224_context; + +/** + * This structure is a context for Groestl-256 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_small_context sph_groestl256_context; + +/** + * This structure is a context for Groestl-384 and Groestl-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a Groestl computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running Groestl + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[16]; +#endif + sph_u32 narrow[32]; + } state; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_groestl_big_context; + +/** + * This structure is a context for Groestl-384 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_big_context sph_groestl384_context; + +/** + * This structure is a context for Groestl-512 computations. It is + * identical to the common sph_groestl_small_context. + */ +typedef sph_groestl_big_context sph_groestl512_context; + +/** + * Initialize a Groestl-224 context. This process performs no memory allocation. + * + * @param cc the Groestl-224 context (pointer to a + * sph_groestl224_context) + */ +void sph_groestl224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-224 context + * @param dst the destination buffer + */ +void sph_groestl224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). 
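All of these sph_* headers expose the same init/process/close surface, which makes digests straightforward to chain, with one function's output becoming the next one's input. A hedged sketch chaining two of the 512-bit primitives declared in these headers; the particular order is only an example:

```c
#include <stddef.h>
#include "sph_blake.h"
#include "sph_groestl.h"

void chain2(const void *data, size_t len, unsigned char out[64])
{
    unsigned char mid[64];
    sph_blake512_context cb;
    sph_groestl512_context cg;

    sph_blake512_init(&cb);
    sph_blake512(&cb, data, len);
    sph_blake512_close(&cb, mid);       /* 64-byte intermediate digest */

    sph_groestl512_init(&cg);
    sph_groestl512(&cg, mid, sizeof mid);
    sph_groestl512_close(&cg, out);
}
```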
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-256 context. This process performs no memory allocation. + * + * @param cc the Groestl-256 context (pointer to a + * sph_groestl256_context) + */ +void sph_groestl256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-256 context + * @param dst the destination buffer + */ +void sph_groestl256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-384 context. This process performs no memory allocation. + * + * @param cc the Groestl-384 context (pointer to a + * sph_groestl384_context) + */ +void sph_groestl384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-384 context + * @param dst the destination buffer + */ +void sph_groestl384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
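+ * For example, calling this with ub = 0x80 and n = 1 appends a single
+ * "1" bit before the final padding.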
+ * + * @param cc the Groestl-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Groestl-512 context. This process performs no memory allocation. + * + * @param cc the Groestl-512 context (pointer to a + * sph_groestl512_context) + */ +void sph_groestl512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Groestl-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_groestl512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Groestl-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Groestl-512 context + * @param dst the destination buffer + */ +void sph_groestl512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Groestl-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_groestl512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_jh.h b/sha3/sph_jh.h new file mode 100644 index 0000000..82fae58 --- /dev/null +++ b/sha3/sph_jh.h @@ -0,0 +1,298 @@ +/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * JH interface. JH is a family of functions which differ by + * their output size; this implementation defines JH for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_jh.h + * @author Thomas Pornin + */ + +#ifndef SPH_JH_H__ +#define SPH_JH_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for JH-224. + */ +#define SPH_SIZE_jh224 224 + +/** + * Output size (in bits) for JH-256. + */ +#define SPH_SIZE_jh256 256 + +/** + * Output size (in bits) for JH-384. + */ +#define SPH_SIZE_jh384 384 + +/** + * Output size (in bits) for JH-512. + */ +#define SPH_SIZE_jh512 512 + +/** + * This structure is a context for JH computations: it contains the + * intermediate values and some data from the last entered block. Once + * a JH computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running JH computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + union { +#if SPH_64 + sph_u64 wide[16]; +#endif + sph_u32 narrow[32]; + } H; +#if SPH_64 + sph_u64 block_count; +#else + sph_u32 block_count_high, block_count_low; +#endif +#endif +} sph_jh_context; + +/** + * Type for a JH-224 context (identical to the common context). + */ +typedef sph_jh_context sph_jh224_context; + +/** + * Type for a JH-256 context (identical to the common context). + */ +typedef sph_jh_context sph_jh256_context; + +/** + * Type for a JH-384 context (identical to the common context). + */ +typedef sph_jh_context sph_jh384_context; + +/** + * Type for a JH-512 context (identical to the common context). + */ +typedef sph_jh_context sph_jh512_context; + +/** + * Initialize a JH-224 context. This process performs no memory allocation. + * + * @param cc the JH-224 context (pointer to a + * sph_jh224_context) + */ +void sph_jh224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh224(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-224 context + * @param dst the destination buffer + */ +void sph_jh224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-256 context. This process performs no memory allocation. + * + * @param cc the JH-256 context (pointer to a + * sph_jh256_context) + */ +void sph_jh256_init(void *cc); + +/** + * Process some data bytes. 
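+ * Data may be supplied in as many successive calls as desired.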
It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh256(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-256 context + * @param dst the destination buffer + */ +void sph_jh256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-384 context. This process performs no memory allocation. + * + * @param cc the JH-384 context (pointer to a + * sph_jh384_context) + */ +void sph_jh384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh384(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the JH-384 context + * @param dst the destination buffer + */ +void sph_jh384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a JH-512 context. This process performs no memory allocation. + * + * @param cc the JH-512 context (pointer to a + * sph_jh512_context) + */ +void sph_jh512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the JH-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_jh512(void *cc, const void *data, size_t len); + +/** + * Terminate the current JH-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. 
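+ * ("Reinitialized" means the context is at once ready for a new
+ * computation, exactly as after a call to sph_jh512_init().)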
+ * + * @param cc the JH-512 context + * @param dst the destination buffer + */ +void sph_jh512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the JH-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_jh512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_keccak.h b/sha3/sph_keccak.h new file mode 100644 index 0000000..bdafdb8 --- /dev/null +++ b/sha3/sph_keccak.h @@ -0,0 +1,293 @@ +/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Keccak interface. This is the interface for Keccak with the + * recommended parameters for SHA-3, with output lengths 224, 256, + * 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_keccak.h + * @author Thomas Pornin + */ + +#ifndef SPH_KECCAK_H__ +#define SPH_KECCAK_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Keccak-224. + */ +#define SPH_SIZE_keccak224 224 + +/** + * Output size (in bits) for Keccak-256. + */ +#define SPH_SIZE_keccak256 256 + +/** + * Output size (in bits) for Keccak-384. + */ +#define SPH_SIZE_keccak384 384 + +/** + * Output size (in bits) for Keccak-512. + */ +#define SPH_SIZE_keccak512 512 + +/** + * This structure is a context for Keccak computations: it contains the + * intermediate values and some data from the last entered block. Once a + * Keccak computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running Keccak computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). 
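+ *
+ * A minimal usage sketch (one-shot hashing; the msg and msg_len
+ * names are illustrative, not part of this API):
+ *
+ * @code
+ * sph_keccak256_context ctx;
+ * unsigned char digest[32];
+ *
+ * sph_keccak256_init(&ctx);
+ * sph_keccak256(&ctx, msg, msg_len);
+ * sph_keccak256_close(&ctx, digest);
+ * @endcode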
+ */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[144]; /* first field, for alignment */ + size_t ptr, lim; + union { +#if SPH_64 + sph_u64 wide[25]; +#endif + sph_u32 narrow[50]; + } u; +#endif +} sph_keccak_context; + +/** + * Type for a Keccak-224 context (identical to the common context). + */ +typedef sph_keccak_context sph_keccak224_context; + +/** + * Type for a Keccak-256 context (identical to the common context). + */ +typedef sph_keccak_context sph_keccak256_context; + +/** + * Type for a Keccak-384 context (identical to the common context). + */ +typedef sph_keccak_context sph_keccak384_context; + +/** + * Type for a Keccak-512 context (identical to the common context). + */ +typedef sph_keccak_context sph_keccak512_context; + +/** + * Initialize a Keccak-224 context. This process performs no memory allocation. + * + * @param cc the Keccak-224 context (pointer to a + * sph_keccak224_context) + */ +void sph_keccak224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-224 context + * @param dst the destination buffer + */ +void sph_keccak224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-256 context. This process performs no memory allocation. + * + * @param cc the Keccak-256 context (pointer to a + * sph_keccak256_context) + */ +void sph_keccak256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-256 context + * @param dst the destination buffer + */ +void sph_keccak256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). 
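+ * Use this variant when the message length is not a whole number of
+ * bytes.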
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-384 context. This process performs no memory allocation. + * + * @param cc the Keccak-384 context (pointer to a + * sph_keccak384_context) + */ +void sph_keccak384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-384 context + * @param dst the destination buffer + */ +void sph_keccak384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-512 context. This process performs no memory allocation. + * + * @param cc the Keccak-512 context (pointer to a + * sph_keccak512_context) + */ +void sph_keccak512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_keccak512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-512 context + * @param dst the destination buffer + */ +void sph_keccak512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
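+ * With n = 0 this behaves exactly like sph_keccak512_close().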
+ * + * @param cc the Keccak-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_keccak512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_luffa.h b/sha3/sph_luffa.h new file mode 100644 index 0000000..a32fd7b --- /dev/null +++ b/sha3/sph_luffa.h @@ -0,0 +1,296 @@ +/* $Id: sph_luffa.h 154 2010-04-26 17:00:24Z tp $ */ +/** + * Luffa interface. Luffa is a family of functions which differ by + * their output size; this implementation defines Luffa for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_luffa.h + * @author Thomas Pornin + */ + +#ifndef SPH_LUFFA_H__ +#define SPH_LUFFA_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Luffa-224. + */ +#define SPH_SIZE_luffa224 224 + +/** + * Output size (in bits) for Luffa-256. + */ +#define SPH_SIZE_luffa256 256 + +/** + * Output size (in bits) for Luffa-384. + */ +#define SPH_SIZE_luffa384 384 + +/** + * Output size (in bits) for Luffa-512. + */ +#define SPH_SIZE_luffa512 512 + +/** + * This structure is a context for Luffa-224 computations: it contains + * the intermediate values and some data from the last entered block. + * Once a Luffa computation has been performed, the context can be + * reused for another computation. + * + * The contents of this structure are private. A running Luffa + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[3][8]; +#endif +} sph_luffa224_context; + +/** + * This structure is a context for Luffa-256 computations. It is + * identical to sph_luffa224_context. + */ +typedef sph_luffa224_context sph_luffa256_context; + +/** + * This structure is a context for Luffa-384 computations. + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[4][8]; +#endif +} sph_luffa384_context; + +/** + * This structure is a context for Luffa-512 computations. 
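+ * It is wider than the other Luffa contexts because Luffa-512 keeps
+ * five 256-bit state chains (the V[5][8] field below).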
+ */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[5][8]; +#endif +} sph_luffa512_context; + +/** + * Initialize a Luffa-224 context. This process performs no memory allocation. + * + * @param cc the Luffa-224 context (pointer to a + * sph_luffa224_context) + */ +void sph_luffa224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-224 context + * @param dst the destination buffer + */ +void sph_luffa224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-256 context. This process performs no memory allocation. + * + * @param cc the Luffa-256 context (pointer to a + * sph_luffa256_context) + */ +void sph_luffa256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-256 context + * @param dst the destination buffer + */ +void sph_luffa256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-384 context. This process performs no memory allocation. + * + * @param cc the Luffa-384 context (pointer to a + * sph_luffa384_context) + */ +void sph_luffa384_init(void *cc); + +/** + * Process some data bytes. 
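+ * Partial blocks are buffered internally, so inputs need not be
+ * block-aligned.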
It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-384 context + * @param dst the destination buffer + */ +void sph_luffa384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-512 context. This process performs no memory allocation. + * + * @param cc the Luffa-512 context (pointer to a + * sph_luffa512_context) + */ +void sph_luffa512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-512 context + * @param dst the destination buffer + */ +void sph_luffa512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_shavite.h b/sha3/sph_shavite.h new file mode 100644 index 0000000..0957e42 --- /dev/null +++ b/sha3/sph_shavite.h @@ -0,0 +1,314 @@ +/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */ +/** + * SHAvite-3 interface. This code implements SHAvite-3 with the + * recommended parameters for SHA-3, with outputs of 224, 256, 384 and + * 512 bits. In the following, we call the function "SHAvite" (without + * the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit + * output". 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_shavite.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHAVITE_H__ +#define SPH_SHAVITE_H__ + +#include +#include "sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/** + * Output size (in bits) for SHAvite-224. + */ +#define SPH_SIZE_shavite224 224 + +/** + * Output size (in bits) for SHAvite-256. + */ +#define SPH_SIZE_shavite256 256 + +/** + * Output size (in bits) for SHAvite-384. + */ +#define SPH_SIZE_shavite384 384 + +/** + * Output size (in bits) for SHAvite-512. + */ +#define SPH_SIZE_shavite512 512 + +/** + * This structure is a context for SHAvite-224 and SHAvite-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 h[8]; + sph_u32 count0, count1; +#endif +} sph_shavite_small_context; + +/** + * This structure is a context for SHAvite-224 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite224_context; + +/** + * This structure is a context for SHAvite-256 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite256_context; + +/** + * This structure is a context for SHAvite-384 and SHAvite-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). 
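+ *
+ * Sketch of incremental hashing (the chunk names and lengths are
+ * illustrative):
+ *
+ * @code
+ * sph_shavite512_context ctx;
+ * unsigned char out[64];
+ *
+ * sph_shavite512_init(&ctx);
+ * sph_shavite512(&ctx, part1, part1_len);
+ * sph_shavite512(&ctx, part2, part2_len);
+ * sph_shavite512_close(&ctx, out);
+ * @endcode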
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+    unsigned char buf[128];   /* first field, for alignment */
+    size_t ptr;
+    sph_u32 h[16];
+    sph_u32 count0, count1, count2, count3;
+#endif
+} sph_shavite_big_context;
+
+/**
+ * This structure is a context for SHAvite-384 computations. It is
+ * identical to the common sph_shavite_big_context.
+ */
+typedef sph_shavite_big_context sph_shavite384_context;
+
+/**
+ * This structure is a context for SHAvite-512 computations. It is
+ * identical to the common sph_shavite_big_context.
+ */
+typedef sph_shavite_big_context sph_shavite512_context;
+
+/**
+ * Initialize a SHAvite-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-224 context (pointer to a
+ *             sph_shavite224_context)
+ */
+void sph_shavite224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-224 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the SHAvite-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_shavite224_addbits_and_close(
+    void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a SHAvite-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the SHAvite-256 context (pointer to a
+ *             sph_shavite256_context)
+ */
+void sph_shavite256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the SHAvite-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_shavite256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current SHAvite-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the SHAvite-256 context
+ * @param dst   the destination buffer
+ */
+void sph_shavite256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ * + * @param cc the SHAvite-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-384 context. This process performs no memory allocation. + * + * @param cc the SHAvite-384 context (pointer to a + * sph_shavite384_context) + */ +void sph_shavite384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-384 context + * @param dst the destination buffer + */ +void sph_shavite384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-512 context. This process performs no memory allocation. + * + * @param cc the SHAvite-512 context (pointer to a + * sph_shavite512_context) + */ +void sph_shavite512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite512(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-512 context + * @param dst the destination buffer + */ +void sph_shavite512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
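+ * Passing n = 0 makes this equivalent to sph_shavite512_close().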
+ * + * @param cc the SHAvite-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_simd.h b/sha3/sph_simd.h new file mode 100644 index 0000000..92ee1e7 --- /dev/null +++ b/sha3/sph_simd.h @@ -0,0 +1,309 @@ +/* $Id: sph_simd.h 154 2010-04-26 17:00:24Z tp $ */ +/** + * SIMD interface. SIMD is a family of functions which differ by + * their output size; this implementation defines SIMD for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_simd.h + * @author Thomas Pornin + */ + +#ifndef SPH_SIMD_H__ +#define SPH_SIMD_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for SIMD-224. + */ +#define SPH_SIZE_simd224 224 + +/** + * Output size (in bits) for SIMD-256. + */ +#define SPH_SIZE_simd256 256 + +/** + * Output size (in bits) for SIMD-384. + */ +#define SPH_SIZE_simd384 384 + +/** + * Output size (in bits) for SIMD-512. + */ +#define SPH_SIZE_simd512 512 + +/** + * This structure is a context for SIMD computations: it contains the + * intermediate values and some data from the last entered block. Once + * an SIMD computation has been performed, the context can be reused for + * another computation. This specific structure is used for SIMD-224 + * and SIMD-256. + * + * The contents of this structure are private. A running SIMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[16]; + sph_u32 count_low, count_high; +#endif +} sph_simd_small_context; + +/** + * This structure is a context for SIMD computations: it contains the + * intermediate values and some data from the last entered block. Once + * an SIMD computation has been performed, the context can be reused for + * another computation. This specific structure is used for SIMD-384 + * and SIMD-512. + * + * The contents of this structure are private. 
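+ * Application code should not inspect or modify its fields directly.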
A running SIMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[32]; + sph_u32 count_low, count_high; +#endif +} sph_simd_big_context; + +/** + * Type for a SIMD-224 context (identical to the common "small" context). + */ +typedef sph_simd_small_context sph_simd224_context; + +/** + * Type for a SIMD-256 context (identical to the common "small" context). + */ +typedef sph_simd_small_context sph_simd256_context; + +/** + * Type for a SIMD-384 context (identical to the common "big" context). + */ +typedef sph_simd_big_context sph_simd384_context; + +/** + * Type for a SIMD-512 context (identical to the common "big" context). + */ +typedef sph_simd_big_context sph_simd512_context; + +/** + * Initialize an SIMD-224 context. This process performs no memory allocation. + * + * @param cc the SIMD-224 context (pointer to a + * sph_simd224_context) + */ +void sph_simd224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-224 context + * @param dst the destination buffer + */ +void sph_simd224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-256 context. This process performs no memory allocation. + * + * @param cc the SIMD-256 context (pointer to a + * sph_simd256_context) + */ +void sph_simd256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-256 context + * @param dst the destination buffer + */ +void sph_simd256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). 
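+ * The extra-bit convention is the same for every function in this
+ * library: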
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-384 context. This process performs no memory allocation. + * + * @param cc the SIMD-384 context (pointer to a + * sph_simd384_context) + */ +void sph_simd384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-384 context + * @param dst the destination buffer + */ +void sph_simd384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SIMD-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-512 context. This process performs no memory allocation. + * + * @param cc the SIMD-512 context (pointer to a + * sph_simd512_context) + */ +void sph_simd512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SIMD-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_simd512(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the SIMD-512 context + * @param dst the destination buffer + */ +void sph_simd512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
+ * + * @param cc the SIMD-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_simd512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_skein.h b/sha3/sph_skein.h new file mode 100644 index 0000000..bddbc86 --- /dev/null +++ b/sha3/sph_skein.h @@ -0,0 +1,298 @@ +/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */ +/** + * Skein interface. The Skein specification defines three main + * functions, called Skein-256, Skein-512 and Skein-1024, which can be + * further parameterized with an output length. For the SHA-3 + * competition, Skein-512 is used for output sizes of 224, 256, 384 and + * 512 bits; this is what this code implements. Thus, we hereafter call + * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein + * specification defines as Skein-512-224, Skein-512-256, Skein-512-384 + * and Skein-512-512, respectively. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_skein.h + * @author Thomas Pornin + */ + +#ifndef SPH_SKEIN_H__ +#define SPH_SKEIN_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +#if SPH_64 + +/** + * Output size (in bits) for Skein-224. + */ +#define SPH_SIZE_skein224 224 + +/** + * Output size (in bits) for Skein-256. + */ +#define SPH_SIZE_skein256 256 + +/** + * Output size (in bits) for Skein-384. + */ +#define SPH_SIZE_skein384 384 + +/** + * Output size (in bits) for Skein-512. + */ +#define SPH_SIZE_skein512 512 + +/** + * This structure is a context for Skein computations (with a 384- or + * 512-bit output): it contains the intermediate values and some data + * from the last entered block. Once a Skein computation has been + * performed, the context can be reused for another computation. + * + * The contents of this structure are private. A running Skein computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). 
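+ *
+ * Cloning is useful when many messages share a common prefix: hash the
+ * prefix once, then copy the context for each suffix (a sketch; the
+ * prefix and suffix names are illustrative):
+ *
+ * @code
+ * sph_skein512_context base, cur;
+ * unsigned char out[64];
+ *
+ * sph_skein512_init(&base);
+ * sph_skein512(&base, prefix, prefix_len);
+ * cur = base;                        // clone by plain structure copy
+ * sph_skein512(&cur, suffix, suffix_len);
+ * sph_skein512_close(&cur, out);     // 64-byte digest; cur is reset
+ * @endcode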
+ */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; + sph_u64 bcount; +#endif +} sph_skein_big_context; + +/** + * Type for a Skein-224 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein224_context; + +/** + * Type for a Skein-256 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein256_context; + +/** + * Type for a Skein-384 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein384_context; + +/** + * Type for a Skein-512 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein512_context; + +/** + * Initialize a Skein-224 context. This process performs no memory allocation. + * + * @param cc the Skein-224 context (pointer to a + * sph_skein224_context) + */ +void sph_skein224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-224 context + * @param dst the destination buffer + */ +void sph_skein224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Skein-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-256 context. This process performs no memory allocation. + * + * @param cc the Skein-256 context (pointer to a + * sph_skein256_context) + */ +void sph_skein256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-256 context + * @param dst the destination buffer + */ +void sph_skein256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level).
The context is automatically reinitialized. + * + * @param cc the Skein-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-384 context. This process performs no memory allocation. + * + * @param cc the Skein-384 context (pointer to a + * sph_skein384_context) + */ +void sph_skein384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-384 context + * @param dst the destination buffer + */ +void sph_skein384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Skein-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-512 context. This process performs no memory allocation. + * + * @param cc the Skein-512 context (pointer to a + * sph_skein512_context) + */ +void sph_skein512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Skein-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_skein512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Skein-512 context + * @param dst the destination buffer + */ +void sph_skein512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized.
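To make the ub packing concrete: the extra bits occupy the most significant positions of ub. A minimal sketch hashing a 19-bit message with Skein-512, with illustrative byte values of our choosing:

```c
#include "sph_skein.h"

static void skein512_19bit_message(unsigned char dst[64])
{
    /* the first 16 bits of the message, as two full bytes */
    static const unsigned char head[2] = { 0x5A, 0x3C };
    sph_skein512_context cc;

    sph_skein512_init(&cc);
    sph_skein512(&cc, head, sizeof head);
    /* final bits 1, 1, 0 go into bits 7, 6, 5 of ub:
     * 1*128 + 1*64 + 0*32 = 0xC0, with n = 3 */
    sph_skein512_addbits_and_close(&cc, 0xC0, 3, dst);
}
```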
+ * + * @param cc the Skein-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_skein512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_types.h b/sha3/sph_types.h new file mode 100644 index 0000000..7295b0b --- /dev/null +++ b/sha3/sph_types.h @@ -0,0 +1,1976 @@ +/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ +/** + * Basic type definitions. + * + * This header file defines the generic integer types that will be used + * for the implementation of hash functions; it also contains helper + * functions which encode and decode multi-byte integer values, using + * either little-endian or big-endian conventions. + * + * This file contains a compile-time test on the size of a byte + * (the unsigned char C type). If bytes are not octets, + * i.e. if they do not have a size of exactly 8 bits, then compilation + * is aborted. Architectures where bytes are not octets are relatively + * rare, even in the embedded devices market. We forbid non-octet bytes + * because there is no clear convention on how octet streams are encoded + * on such systems. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_types.h + * @author Thomas Pornin + */ + +#ifndef SPH_TYPES_H__ +#define SPH_TYPES_H__ + +#include <limits.h> + +/* + * All our I/O functions are defined over octet streams. We do not know + * how to handle input data if bytes are not octets. + */ +#if CHAR_BIT != 8 +#error This code requires 8-bit bytes +#endif + +/* ============= BEGIN documentation block for Doxygen ============ */ + +#ifdef DOXYGEN_IGNORE + +/** @mainpage sphlib C code documentation + * + * @section overview Overview + * + * sphlib is a library which contains implementations of + * various cryptographic hash functions. These pages have been generated + * with doxygen and + * document the API for the C implementations. + * + * The API is described in appropriate header files, which are available + * in the "Files" section. Each hash function family has its own header, + * whose name begins with "sph_" and contains the family + * name.
For instance, the API for the RIPEMD hash functions is available + * in the header file sph_ripemd.h. + * + * @section principles API structure and conventions + * + * @subsection io Input/output conventions + * + * In all generality, hash functions operate over strings of bits. + * Individual bits are rarely encountered in C programming or actual + * communication protocols; most protocols converge on the ubiquitous + * "octet" which is a group of eight bits. Data is thus expressed as a + * stream of octets. The C programming language contains the notion of a + * "byte", which is a data unit managed under the type "unsigned + * char". The C standard prescribes that a byte should hold at + * least eight bits, and possibly more. Most modern architectures, even + * in the embedded world, feature eight-bit bytes, i.e. map bytes to + * octets. + * + * Nevertheless, for some of the implemented hash functions, an extra + * API has been added, which allows the input of arbitrary sequences of + * bits: when the computation is about to be closed, 1 to 7 extra bits + * can be added. The functions for which this API is implemented include + * the SHA-2 functions and all SHA-3 candidates. + * + * sphlib defines hash functions which may hash octet streams, + * i.e. streams of bits where the number of bits is a multiple of eight. + * The data input functions in the sphlib API expect data + * as anonymous pointers ("const void *") with a length + * (of type "size_t") which gives the input data chunk length + * in bytes. A byte is assumed to be an octet; the sph_types.h + * header contains a compile-time test which prevents compilation on + * architectures where this property is not met. + * + * The hash function output is also converted into bytes. All currently + * implemented hash functions have an output width which is a multiple of + * eight, and this is likely to remain true for new designs. + * + * Most hash functions internally convert input data into 32-bit or 64-bit + * words, using either little-endian or big-endian conversion. The hash + * output also often consists of such words, which are encoded into output + * bytes with a similar endianness convention. Some hash functions have + * been only loosely specified on that subject; when necessary, + * sphlib has been tested against published "reference" + * implementations in order to use the same conventions. + * + * @subsection shortname Function short name + * + * Each implemented hash function has a "short name" which is used + * internally to derive the identifiers for the functions and context + * structures which the function uses. For instance, MD5 has the short + * name "md5". Short names are listed in the next section, + * for the implemented hash functions. In subsequent sections, the + * short name will be assumed to be "XXX": replace with the + * actual hash function name to get the C identifier. + * + * Note: some functions within the same family share the same core + * elements, such as update function or context structure. Correspondingly, + * some of the defined types or functions may actually be macros which + * transparently evaluate to another type or function name. + * + * @subsection context Context structure + * + * Each implemented hash function has its own context structure, available + * under the type name "sph_XXX_context" for the hash function + * with short name "XXX". This structure holds all needed + * state for a running hash computation.
+ * + * The contents of these structures are meant to be opaque, and private + * to the implementation. However, these contents are specified in the + * header files so that application code which uses sphlib + * may access the size of those structures. + * + * The caller is responsible for allocating the context structure, + * whether by dynamic allocation (malloc() or equivalent), + * static allocation (a global permanent variable), as an automatic + * variable ("on the stack"), or by any other means which ensures proper + * structure alignment. sphlib code performs no dynamic + * allocation by itself. + * + * The context must be initialized before use, using the + * sph_XXX_init() function. This function sets the context + * state to proper initial values for hashing. + * + * Since all state data is contained within the context structure, + * sphlib is thread-safe and reentrant: several hash + * computations may be performed in parallel, provided that they do not + * operate on the same context. Moreover, a running computation can be + * cloned by copying the context (with a simple memcpy()): + * the context and its clone are then independent and may be updated + * with new data and/or closed without interfering with each other. + * Similarly, a context structure can be moved in memory at will: + * context structures contain no pointer, in particular no pointer to + * themselves. + * + * @subsection dataio Data input + * + * Hashed data is input with the sph_XXX() function, which + * takes as parameters a pointer to the context, a pointer to the data + * to hash, and the number of data bytes to hash. The context is updated + * with the new data. + * + * Data can be input in one or several calls, with arbitrary input lengths. + * However, it is best, performance-wise, to input data by relatively big + * chunks (say a few kilobytes), because this allows sphlib to + * optimize things and avoid internal copying. + * + * When all data has been input, the context can be closed with + * sph_XXX_close(). The hash output is computed and written + * into the provided buffer. The caller must take care to provide a + * buffer of appropriate length; e.g., when using SHA-1, the output is + * a 20-byte value, therefore the output buffer must be at least 20 bytes + * long. + * + * For some hash functions, the sph_XXX_addbits_and_close() + * function can be used instead of sph_XXX_close(). This + * function can take a few extra bits to be added at + * the end of the input message. This allows hashing messages with a + * bit length which is not a multiple of 8. The extra bits are provided + * as an unsigned integer value, and a bit count. The bit count must be + * between 0 and 7, inclusive. The extra bits are provided as bits 7 to + * 0 (bits of numerical value 128, 64, 32... downto 0), in that order. + * For instance, to add three bits of value 1, 1 and 0, the unsigned + * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count + * will be 3. + * + * The SPH_SIZE_XXX macro is defined for each hash function; + * it evaluates to the function output size, expressed in bits. For instance, + * SPH_SIZE_sha1 evaluates to 160. + * + * When closed, the context is automatically reinitialized and can be + * immediately used for another computation. It is not necessary to call + * sph_XXX_init() after a close. Note that + * sph_XXX_init() can still be called to "reset" a context, + * i.e. forget previously input data, and get back to the initial state.
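Putting these conventions together, a brief sketch of a streaming computation that then reuses the auto-reinitialized context for a second message (the helper name and sample data are ours; buffer sizes are derived from the SPH_SIZE_XXX macro):

```c
#include "sph_skein.h"

static void two_digests(unsigned char h1[SPH_SIZE_skein256 / 8],
    unsigned char h2[SPH_SIZE_skein256 / 8])
{
    sph_skein256_context cc;

    sph_skein256_init(&cc);
    /* data may be fed in chunks of arbitrary size */
    sph_skein256(&cc, "hello ", 6);
    sph_skein256(&cc, "world", 5);
    sph_skein256_close(&cc, h1);  /* writes 32 bytes, reinitializes cc */

    /* no second sph_skein256_init() is needed after a close */
    sph_skein256(&cc, "again", 5);
    sph_skein256_close(&cc, h2);
}
```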
+ * + * @subsection alignment Data alignment + * + * "Alignment" is a property of data, which is said to be "properly + * aligned" when its placement in memory is such that the data can + * be optimally read by full words. This depends on the type of access; + * basically, some hash functions will read data by 32-bit or 64-bit + * words. sphlib does not mandate such alignment for input + * data, but using aligned data can substantially improve performance. + * + * As a rule, it is best to input data by chunks whose length (in bytes) + * is a multiple of eight, and which begin at "generally aligned" + * addresses, such as the base address returned by a call to + * malloc(). + * + * @section functions Implemented functions + * + * We give here the list of implemented functions. They are grouped by + * family; to each family corresponds a specific header file. Each + * individual function has its associated "short name". Please refer to + * the documentation for that header file to get details on the hash + * function denomination and provenance. + * + * Note: the functions marked with a '(64)' in the list below are + * available only if the C compiler provides an integer type of length + * 64 bits or more. Such a type is mandatory in the latest C standard + * (ISO 9899:1999, aka "C99") and is present in several older compilers + * as well, so chances are that such a type is available. + * + * - HAVAL family: file sph_haval.h + * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 + * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 + * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 + * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 + * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 + * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 + * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 + * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 + * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 + * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 + * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 + * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 + * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 + * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 + * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 + * - MD2: file sph_md2.h, short name: md2 + * - MD4: file sph_md4.h, short name: md4 + * - MD5: file sph_md5.h, short name: md5 + * - PANAMA: file sph_panama.h, short name: panama + * - RadioGatun family: file sph_radiogatun.h + * - RadioGatun[32]: short name: radiogatun32 + * - RadioGatun[64]: short name: radiogatun64 (64) + * - RIPEMD family: file sph_ripemd.h + * - RIPEMD: short name: ripemd + * - RIPEMD-128: short name: ripemd128 + * - RIPEMD-160: short name: ripemd160 + * - SHA-0: file sph_sha0.h, short name: sha0 + * - SHA-1: file sph_sha1.h, short name: sha1 + * - SHA-2 family: file sph_sha2.h + * - SHA-224: short name: sha224 + * - SHA-256: short name: sha256 + * - SHA-384: short name: sha384 (64) + * - SHA-512: short name: sha512 (64) + * - Tiger family: file sph_tiger.h + * - Tiger: short name: tiger (64) + * - Tiger2: short name: tiger2 (64) + * - WHIRLPOOL family: file sph_whirlpool.h + * - WHIRLPOOL-0: short name: whirlpool0 (64) + * - WHIRLPOOL-1: short name: whirlpool1 (64) + * - WHIRLPOOL: short name: whirlpool (64) + * + * The fourteen second-round SHA-3 candidates are also implemented; + * when applicable, the
implementations follow the "final" specifications + * as published for the third round of the SHA-3 competition (BLAKE, + * Groestl, JH, Keccak and Skein have been tweaked for third round). + * + * - BLAKE family: file sph_blake.h + * - BLAKE-224: short name: blake224 + * - BLAKE-256: short name: blake256 + * - BLAKE-384: short name: blake384 + * - BLAKE-512: short name: blake512 + * - BMW (Blue Midnight Wish) family: file sph_bmw.h + * - BMW-224: short name: bmw224 + * - BMW-256: short name: bmw256 + * - BMW-384: short name: bmw384 (64) + * - BMW-512: short name: bmw512 (64) + * - CubeHash family: file sph_cubehash.h (specified as + * CubeHash16/32 in the CubeHash specification) + * - CubeHash-224: short name: cubehash224 + * - CubeHash-256: short name: cubehash256 + * - CubeHash-384: short name: cubehash384 + * - CubeHash-512: short name: cubehash512 + * - ECHO family: file sph_echo.h + * - ECHO-224: short name: echo224 + * - ECHO-256: short name: echo256 + * - ECHO-384: short name: echo384 + * - ECHO-512: short name: echo512 + * - Fugue family: file sph_fugue.h + * - Fugue-224: short name: fugue224 + * - Fugue-256: short name: fugue256 + * - Fugue-384: short name: fugue384 + * - Fugue-512: short name: fugue512 + * - Groestl family: file sph_groestl.h + * - Groestl-224: short name: groestl224 + * - Groestl-256: short name: groestl256 + * - Groestl-384: short name: groestl384 + * - Groestl-512: short name: groestl512 + * - Hamsi family: file sph_hamsi.h + * - Hamsi-224: short name: hamsi224 + * - Hamsi-256: short name: hamsi256 + * - Hamsi-384: short name: hamsi384 + * - Hamsi-512: short name: hamsi512 + * - JH family: file sph_jh.h + * - JH-224: short name: jh224 + * - JH-256: short name: jh256 + * - JH-384: short name: jh384 + * - JH-512: short name: jh512 + * - Keccak family: file sph_keccak.h + * - Keccak-224: short name: keccak224 + * - Keccak-256: short name: keccak256 + * - Keccak-384: short name: keccak384 + * - Keccak-512: short name: keccak512 + * - Luffa family: file sph_luffa.h + * - Luffa-224: short name: luffa224 + * - Luffa-256: short name: luffa256 + * - Luffa-384: short name: luffa384 + * - Luffa-512: short name: luffa512 + * - Shabal family: file sph_shabal.h + * - Shabal-192: short name: shabal192 + * - Shabal-224: short name: shabal224 + * - Shabal-256: short name: shabal256 + * - Shabal-384: short name: shabal384 + * - Shabal-512: short name: shabal512 + * - SHAvite-3 family: file sph_shavite.h + * - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"): + * short name: shabal224 + * - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"): + * short name: shabal256 + * - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"): + * short name: shabal384 + * - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"): + * short name: shabal512 + * - SIMD family: file sph_simd.h + * - SIMD-224: short name: simd224 + * - SIMD-256: short name: simd256 + * - SIMD-384: short name: simd384 + * - SIMD-512: short name: simd512 + * - Skein family: file sph_skein.h + * - Skein-224 (nominally specified as Skein-512-224): short name: + * skein224 (64) + * - Skein-256 (nominally specified as Skein-512-256): short name: + * skein256 (64) + * - Skein-384 (nominally specified as Skein-512-384): short name: + * skein384 (64) + * - Skein-512 (nominally specified as Skein-512-512): short name: + * skein512 (64) + * + * For the second-round SHA-3 candidates, the functions are as specified + * for round 2, i.e. with the "tweaks" that some candidates added + * between round 1 and round 2. 
Also, some of the submitted packages for + * round 2 contained errors, in the specification, reference code, or + * both. sphlib implements the corrected versions. + */ + +/** @hideinitializer + * Unsigned integer type whose length is at least 32 bits; on most + * architectures, it will have a width of exactly 32 bits. Unsigned C + * types implement arithmetic modulo a power of 2; use the + * SPH_T32() macro to ensure that the value is truncated + * to exactly 32 bits. Unless otherwise specified, all macros and + * functions which accept sph_u32 values assume that these + * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures + * where sph_u32 is larger than that. + */ +typedef __arch_dependant__ sph_u32; + +/** @hideinitializer + * Signed integer type corresponding to sph_u32; it has + * width 32 bits or more. + */ +typedef __arch_dependant__ sph_s32; + +/** @hideinitializer + * Unsigned integer type whose length is at least 64 bits; on most + * architectures which feature such a type, it will have a width of + * exactly 64 bits. C99-compliant platforms will have this type; it + * is also defined when the GNU compiler (gcc) is used, and on + * platforms where unsigned long is large enough. If this + * type is not available, then some hash functions which depend on + * a 64-bit type will not be available (most notably SHA-384, SHA-512, + * Tiger and WHIRLPOOL). + */ +typedef __arch_dependant__ sph_u64; + +/** @hideinitializer + * Signed integer type corresponding to sph_u64; it has + * width 64 bits or more. + */ +typedef __arch_dependant__ sph_s64; + +/** + * This macro expands the token x into a suitable + * constant expression of type sph_u32. Depending on + * how this type is defined, a suffix such as UL may + * be appended to the argument. + * + * @param x the token to expand into a suitable constant expression + */ +#define SPH_C32(x) + +/** + * Truncate a 32-bit value to exactly 32 bits. On most systems, this is + * a no-op, recognized as such by the compiler. + * + * @param x the value to truncate (of type sph_u32) + */ +#define SPH_T32(x) + +/** + * Rotate a 32-bit value by a number of bits to the left. The rotate + * count must reside between 1 and 31. This macro assumes that its + * first argument fits in 32 bits (no extra bit allowed on machines where + * sph_u32 is wider); both arguments may be evaluated + * several times. + * + * @param x the value to rotate (of type sph_u32) + * @param n the rotation count (between 1 and 31, inclusive) + */ +#define SPH_ROTL32(x, n) + +/** + * Rotate a 32-bit value by a number of bits to the right. The rotate + * count must reside between 1 and 31. This macro assumes that its + * first argument fits in 32 bits (no extra bit allowed on machines where + * sph_u32 is wider); both arguments may be evaluated + * several times. + * + * @param x the value to rotate (of type sph_u32) + * @param n the rotation count (between 1 and 31, inclusive) + */ +#define SPH_ROTR32(x, n) + +/** + * This macro is defined on systems for which a 64-bit type has been + * detected, and is used for sph_u64. + */ +#define SPH_64 + +/** + * This macro is defined on systems where the "native" integer size is + * 64 bits (64-bit values fit in one register). + */ +#define SPH_64_TRUE + +/** + * This macro expands the token x into a suitable + * constant expression of type sph_u64. Depending on + * how this type is defined, a suffix such as ULL may + * be appended to the argument.
This macro is defined only if a + * 64-bit type was detected and used for sph_u64. + * + * @param x the token to expand into a suitable constant expression + */ +#define SPH_C64(x) + +/** + * Truncate a 64-bit value to exactly 64 bits. On most systems, this is + * a no-op, recognized as such by the compiler. This macro is defined only + * if a 64-bit type was detected and used for sph_u64. + * + * @param x the value to truncate (of type sph_u64) + */ +#define SPH_T64(x) + +/** + * Rotate a 64-bit value by a number of bits to the left. The rotate + * count must reside between 1 and 63. This macro assumes that its + * first argument fits in 64 bits (no extra bit allowed on machines where + * sph_u64 is wider); both arguments may be evaluated + * several times. This macro is defined only if a 64-bit type was detected + * and used for sph_u64. + * + * @param x the value to rotate (of type sph_u64) + * @param n the rotation count (between 1 and 63, inclusive) + */ +#define SPH_ROTL64(x, n) + +/** + * Rotate a 64-bit value by a number of bits to the right. The rotate + * count must reside between 1 and 63. This macro assumes that its + * first argument fits in 64 bits (no extra bit allowed on machines where + * sph_u64 is wider); both arguments may be evaluated + * several times. This macro is defined only if a 64-bit type was detected + * and used for sph_u64. + * + * @param x the value to rotate (of type sph_u64) + * @param n the rotation count (between 1 and 63, inclusive) + */ +#define SPH_ROTR64(x, n) + +/** + * This macro evaluates to inline or an equivalent construction, + * if available on the compilation platform, or to nothing otherwise. This + * is used to declare inline functions, for which the compiler should + * endeavour to include the code directly in the caller. Inline functions + * are typically defined in header files as replacement for macros. + */ +#define SPH_INLINE + +/** + * This macro is defined if the platform has been detected as using + * little-endian convention. This implies that the sph_u32 + * type (and the sph_u64 type also, if it is defined) has + * an exact width (i.e. exactly 32-bit, respectively 64-bit). + */ +#define SPH_LITTLE_ENDIAN + +/** + * This macro is defined if the platform has been detected as using + * big-endian convention. This implies that the sph_u32 + * type (and the sph_u64 type also, if it is defined) has + * an exact width (i.e. exactly 32-bit, respectively 64-bit). + */ +#define SPH_BIG_ENDIAN + +/** + * This macro is defined if 32-bit words (and 64-bit words, if defined) + * can be read from and written to memory efficiently in little-endian + * convention. This is the case for little-endian platforms, and also + * for the big-endian platforms which have special little-endian access + * opcodes (e.g. Ultrasparc). + */ +#define SPH_LITTLE_FAST + +/** + * This macro is defined if 32-bit words (and 64-bit words, if defined) + * can be read from and written to memory efficiently in big-endian + * convention. This is the case for little-endian platforms, and also + * for the little-endian platforms which have special big-endian access + * opcodes. + */ +#define SPH_BIG_FAST + +/** + * On some platforms, this macro is defined to an unsigned integer type + * into which pointer values may be cast. The resulting value can then + * be tested for being a multiple of 2, 4 or 8, indicating an aligned + * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses.
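The intended use of SPH_UPTR is an alignment test before a direct word access, in the same style as the encoders and decoders later in this file; a hedged sketch (the helper name is ours, and it is only meaningful on platforms where SPH_UPTR is defined):

```c
#include "sph_types.h"

#ifdef SPH_UPTR
/* A direct 32-bit access is aligned when the two low address bits are 0. */
static int is_aligned32(const void *p)
{
    return ((SPH_UPTR)p & 3) == 0;
}
#endif
```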
+ */ +#define SPH_UPTR + +/** + * When defined, this macro indicates that unaligned memory accesses + * are possible with only a minor penalty, and thus should be preferred + * over strategies which first copy data to an aligned buffer. + */ +#define SPH_UNALIGNED + +/** + * Byte-swap a 32-bit word (i.e. 0x12345678 becomes + * 0x78563412). This is an inline function which resorts + * to inline assembly on some platforms, for better performance. + * + * @param x the 32-bit value to byte-swap + * @return the byte-swapped value + */ +static inline sph_u32 sph_bswap32(sph_u32 x); + +/** + * Byte-swap a 64-bit word. This is an inline function which resorts + * to inline assembly on some platforms, for better performance. This + * function is defined only if a suitable 64-bit type was found for + * sph_u64. + * + * @param x the 64-bit value to byte-swap + * @return the byte-swapped value + */ +static inline sph_u64 sph_bswap64(sph_u64 x); + +/** + * Decode a 16-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline unsigned sph_dec16le(const void *src); + +/** + * Encode a 16-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc16le(void *dst, unsigned val); + +/** + * Decode a 16-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline unsigned sph_dec16be(const void *src); + +/** + * Encode a 16-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc16be(void *dst, unsigned val); + +/** + * Decode a 32-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32le(const void *src); + +/** + * Decode a 32-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec32le() function. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32le_aligned(const void *src); + +/** + * Encode a 32-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32le(void *dst, sph_u32 val); + +/** + * Encode a 32-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc32le() function.
+ * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32le_aligned(void *dst, sph_u32 val); + +/** + * Decode a 32-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32be(const void *src); + +/** + * Decode a 32-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec32be() function. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32be_aligned(const void *src); + +/** + * Encode a 32-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32be(void *dst, sph_u32 val); + +/** + * Encode a 32-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc32be() function. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32be_aligned(void *dst, sph_u32 val); + +/** + * Decode a 64-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64le(const void *src); + +/** + * Decode a 64-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec64le() function. This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64le_aligned(const void *src); + +/** + * Encode a 64-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64le(void *dst, sph_u64 val); + +/** + * Encode a 64-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc64le() function. This function is defined + * only if a suitable 64-bit type was detected and used for + * sph_u64. 
+ * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64le_aligned(void *dst, sph_u64 val); + +/** + * Decode a 64-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64be(const void *src); + +/** + * Decode a 64-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec64be() function. This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64be_aligned(const void *src); + +/** + * Encode a 64-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64be(void *dst, sph_u64 val); + +/** + * Encode a 64-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc64be() function. This function is defined + * only if a suitable 64-bit type was detected and used for + * sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64be_aligned(void *dst, sph_u64 val); + +#endif + +/* ============== END documentation block for Doxygen ============= */ + +#ifndef DOXYGEN_IGNORE + +/* + * We want to define the types "sph_u32" and "sph_u64" which hold + * unsigned values of at least, respectively, 32 and 64 bits. These + * tests should select appropriate types for most platforms. The + * macro "SPH_64" is defined if a 64-bit type is supported. + */ + +#undef SPH_64 +#undef SPH_64_TRUE + +#if defined __STDC__ && __STDC_VERSION__ >= 199901L + +/* + * On C99 implementations, we can use <stdint.h> to get an exact 64-bit + * type, if any, or otherwise use a wider type (which must exist, for + * C99 conformance). + */ + +#include <stdint.h> + +#ifdef UINT32_MAX +typedef uint32_t sph_u32; +typedef int32_t sph_s32; +#else +typedef uint_fast32_t sph_u32; +typedef int_fast32_t sph_s32; +#endif +#if !SPH_NO_64 +#ifdef UINT64_MAX +typedef uint64_t sph_u64; +typedef int64_t sph_s64; +#else +typedef uint_fast64_t sph_u64; +typedef int_fast64_t sph_s64; +#endif +#endif + +#define SPH_C32(x) ((sph_u32)(x)) +#if !SPH_NO_64 +#define SPH_C64(x) ((sph_u64)(x)) +#define SPH_64 1 +#endif + +#else + +/* + * On non-C99 systems, we use "unsigned int" if it is wide enough, + * "unsigned long" otherwise. This supports all "reasonable" architectures. + * We have to be cautious: pre-C99 preprocessors handle constants + * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
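Concretely, ((UINT_MAX >> 11) >> 11) >= 0x3FF holds exactly when UINT_MAX >= 0xFFFFFFFF, i.e. when unsigned int has at least 32 value bits, without ever writing a constant wider than old preprocessors reliably handle. A sketch of the same selection outside this header (the my_u32 name is illustrative only):

```c
#include <limits.h>

#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
typedef unsigned int my_u32;   /* unsigned int holds at least 32 bits */
#else
typedef unsigned long my_u32;  /* unsigned long always holds >= 32 bits */
#endif
```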
+ */ + +#if ((UINT_MAX >> 11) >> 11) >= 0x3FF + +typedef unsigned int sph_u32; +typedef int sph_s32; + +#define SPH_C32(x) ((sph_u32)(x ## U)) + +#else + +typedef unsigned long sph_u32; +typedef long sph_s32; + +#define SPH_C32(x) ((sph_u32)(x ## UL)) + +#endif + +#if !SPH_NO_64 + +/* + * We want a 64-bit type. We use "unsigned long" if it is wide enough (as + * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9), + * "unsigned long long" otherwise, if available. We use ULLONG_MAX to + * test whether "unsigned long long" is available; we also know that + * gcc features this type, even if the libc headers do not know it. + */ + +#if ((ULONG_MAX >> 31) >> 31) >= 3 + +typedef unsigned long sph_u64; +typedef long sph_s64; + +#define SPH_C64(x) ((sph_u64)(x ## UL)) + +#define SPH_64 1 + +#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ + +typedef unsigned long long sph_u64; +typedef long long sph_s64; + +#define SPH_C64(x) ((sph_u64)(x ## ULL)) + +#define SPH_64 1 + +#else + +/* + * No 64-bit type... + */ + +#endif + +#endif + +#endif + +/* + * If the "unsigned long" type has length 64 bits or more, then this is + * a "true" 64-bit architecture. This is also true with Visual C on + * amd64, even though the "long" type is limited to 32 bits. + */ +#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64) +#define SPH_64_TRUE 1 +#endif + +/* + * Implementation note: some processors have specific opcodes to perform + * a rotation. Recent versions of gcc recognize the expressions below and + * use the relevant opcodes, when appropriate. + */ + +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +#if SPH_64 + +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +#endif + +#ifndef DOXYGEN_IGNORE +/* + * Define SPH_INLINE to be an "inline" qualifier, if available. We define + * some small macro-like functions which benefit greatly from being inlined. + */ +#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__ +#define SPH_INLINE inline +#elif defined _MSC_VER +#define SPH_INLINE __inline +#else +#define SPH_INLINE +#endif +#endif + +/* + * We define some macros which qualify the architecture. These macros + * may be explicitly set externally (e.g. as compiler parameters). The + * code below sets those macros if they are not already defined. + * + * Most macros are boolean, thus evaluate to either zero or non-zero. + * The SPH_UPTR macro is special, in that it evaluates to a C type, + * or is not defined. + * + * SPH_UPTR if defined: unsigned type to cast pointers into + * + * SPH_UNALIGNED non-zero if unaligned accesses are efficient + * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian + * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian + * SPH_LITTLE_FAST non-zero if little-endian decoding is fast + * SPH_BIG_FAST non-zero if big-endian decoding is fast + * + * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit + * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN + * _must_ be non-zero in those situations. The 32-bit and 64-bit types + * _must_ also have an exact width.
+ * + * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode + * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode + * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc + * SPH_I386_GCC x86-compatible (32-bit) with gcc + * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C + * SPH_AMD64_GCC x86-compatible (64-bit) with gcc + * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C + * SPH_PPC32_GCC PowerPC, 32-bit, with gcc + * SPH_PPC64_GCC PowerPC, 64-bit, with gcc + * + * TODO: enhance automatic detection, for more architectures and compilers. + * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with + * some very fast functions (e.g. MD4) when using unaligned input data. + * The CPU-specific-with-GCC macros are useful only for inline assembly, + * normally restricted to this header file. + */ + +/* + * 32-bit x86, aka "i386 compatible". + */ +#if defined __i386__ || defined _M_IX86 + +#define SPH_DETECT_UNALIGNED 1 +#define SPH_DETECT_LITTLE_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u32 +#ifdef __GNUC__ +#define SPH_DETECT_I386_GCC 1 +#endif +#ifdef _MSC_VER +#define SPH_DETECT_I386_MSVC 1 +#endif + +/* + * 64-bit x86, hereafter known as "amd64". + */ +#elif defined __x86_64 || defined _M_X64 + +#define SPH_DETECT_UNALIGNED 1 +#define SPH_DETECT_LITTLE_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u64 +#ifdef __GNUC__ +#define SPH_DETECT_AMD64_GCC 1 +#endif +#ifdef _MSC_VER +#define SPH_DETECT_AMD64_MSVC 1 +#endif + +/* + * 64-bit Sparc architecture (implies v9). + */ +#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \ + || defined __sparcv9 + +#define SPH_DETECT_BIG_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u64 +#ifdef __GNUC__ +#define SPH_DETECT_SPARCV9_GCC_64 1 +#define SPH_DETECT_LITTLE_FAST 1 +#endif + +/* + * 32-bit Sparc. + */ +#elif (defined __sparc__ || defined __sparc) \ + && !(defined __sparcv9 || defined __arch64__) + +#define SPH_DETECT_BIG_ENDIAN 1 +#define SPH_DETECT_UPTR sph_u32 +#if defined __GNUC__ && defined __sparc_v9__ +#define SPH_DETECT_SPARCV9_GCC_32 1 +#define SPH_DETECT_LITTLE_FAST 1 +#endif + +/* + * ARM, little-endian. + */ +#elif defined __arm__ && __ARMEL__ + +#define SPH_DETECT_LITTLE_ENDIAN 1 + +/* + * MIPS, little-endian. + */ +#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ + +#define SPH_DETECT_LITTLE_ENDIAN 1 + +/* + * MIPS, big-endian. + */ +#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ + +#define SPH_DETECT_BIG_ENDIAN 1 + +/* + * PowerPC. + */ +#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \ + || defined _ARCH_PPC + +/* + * Note: we do not declare cross-endian access to be "fast": even if + * using inline assembly, implementations should still assume that + * keeping the decoded word in a temporary is faster than decoding + * it again. + */ +#if defined __GNUC__ +#if SPH_64_TRUE +#define SPH_DETECT_PPC64_GCC 1 +#else +#define SPH_DETECT_PPC32_GCC 1 +#endif +#endif + +#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN +#define SPH_DETECT_BIG_ENDIAN 1 +#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN +#define SPH_DETECT_LITTLE_ENDIAN 1 +#endif + +/* + * Itanium, 64-bit.
+ */ +#elif defined __ia64 || defined __ia64__ \ + || defined __itanium__ || defined _M_IA64 + +#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN +#define SPH_DETECT_BIG_ENDIAN 1 +#else +#define SPH_DETECT_LITTLE_ENDIAN 1 +#endif +#if defined __LP64__ || defined _LP64 +#define SPH_DETECT_UPTR sph_u64 +#else +#define SPH_DETECT_UPTR sph_u32 +#endif + +#endif + +#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 +#define SPH_DETECT_SPARCV9_GCC 1 +#endif + +#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED +#define SPH_UNALIGNED SPH_DETECT_UNALIGNED +#endif +#if defined SPH_DETECT_UPTR && !defined SPH_UPTR +#define SPH_UPTR SPH_DETECT_UPTR +#endif +#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN +#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN +#endif +#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN +#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN +#endif +#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST +#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST +#endif +#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST +#define SPH_BIG_FAST SPH_DETECT_BIG_FAST +#endif +#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 +#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 +#endif +#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64 +#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 +#endif +#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC +#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC +#endif +#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC +#define SPH_I386_GCC SPH_DETECT_I386_GCC +#endif +#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC +#define SPH_I386_MSVC SPH_DETECT_I386_MSVC +#endif +#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC +#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC +#endif +#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC +#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC +#endif +#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC +#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC +#endif +#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC +#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC +#endif + +#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST +#define SPH_LITTLE_FAST 1 +#endif +#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST +#define SPH_BIG_FAST 1 +#endif + +#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) +#error SPH_UPTR defined, but endianness is not known. +#endif + +#if SPH_I386_GCC && !SPH_NO_ASM + +/* + * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit + * values. + */ + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); + return x; +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + return ((sph_u64)sph_bswap32((sph_u32)x) << 32) + | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); +} + +#endif + +#elif SPH_AMD64_GCC && !SPH_NO_ASM + +/* + * On x86 64-bit, with gcc, we use the bswapl and bswapq opcodes to + * byte-swap 32-bit and 64-bit values, respectively. + */ + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); + return x; +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + __asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x)); + return x; +} + +#endif + +/* + * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough + * to generate proper opcodes for endianness swapping with the pure C + * implementation below.
+ * + +#elif SPH_I386_MSVC && !SPH_NO_ASM + +static __inline sph_u32 __declspec(naked) __fastcall +sph_bswap32(sph_u32 x) +{ + __asm { + bswap ecx + mov eax,ecx + ret + } +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + return ((sph_u64)sph_bswap32((sph_u32)x) << 32) + | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); +} + +#endif + + * + * [end of disabled code] + */ + +#else + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + x = SPH_T32((x << 16) | (x >> 16)); + x = ((x & SPH_C32(0xFF00FF00)) >> 8) + | ((x & SPH_C32(0x00FF00FF)) << 8); + return x; +} + +#if SPH_64 + +/** + * Byte-swap a 64-bit value. + * + * @param x the input value + * @return the byte-swapped value + */ +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + x = SPH_T64((x << 32) | (x >> 32)); + x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) + | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); + x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) + | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); + return x; +} + +#endif + +#endif + +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + +/* + * On UltraSPARC systems, native ordering is big-endian, but it is + * possible to perform little-endian read accesses by specifying the + * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use + * the opcode "lda [%reg]0x88,%dst", where %reg is the register which + * contains the source address and %dst is the destination register, + * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register + * to get the address space name. The latter format is better since it + * combines an addition and the actual access in a single opcode; but + * it requires the setting (and subsequent resetting) of %asi, which is + * slow. Some operations (e.g. the MD5 compression function) combine many + * successive little-endian read accesses, which may share the same + * %asi setting. The macros below contain the appropriate inline + * assembly. + */ + +#define SPH_SPARCV9_SET_ASI \ + sph_u32 sph_sparcv9_asi; \ + __asm__ __volatile__ ( \ + "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi)); + +#define SPH_SPARCV9_RESET_ASI \ + __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi)); + +#define SPH_SPARCV9_DEC32LE(base, idx) ({ \ + sph_u32 sph_sparcv9_tmp; \ + __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \ + : "=r" (sph_sparcv9_tmp) : "r" (base)); \ + sph_sparcv9_tmp; \ + }) + +#endif + +static SPH_INLINE void +sph_enc16be(void *dst, unsigned val) +{ + ((unsigned char *)dst)[0] = (val >> 8); + ((unsigned char *)dst)[1] = val; +} + +static SPH_INLINE unsigned +sph_dec16be(const void *src) +{ + return ((unsigned)(((const unsigned char *)src)[0]) << 8) + | (unsigned)(((const unsigned char *)src)[1]); +} + +static SPH_INLINE void +sph_enc16le(void *dst, unsigned val) +{ + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = val >> 8; +} + +static SPH_INLINE unsigned +sph_dec16le(const void *src) +{ + return (unsigned)(((const unsigned char *)src)[0]) + | ((unsigned)(((const unsigned char *)src)[1]) << 8); +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention).
+ * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static SPH_INLINE void +sph_enc32be(void *dst, sph_u32 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; +#else + if (((SPH_UPTR)dst & 3) == 0) { +#if SPH_LITTLE_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; + } else { + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; + } +#endif +#else + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (32-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc32be_aligned(void *dst, sph_u32 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u32 *)dst = sph_bswap32(val); +#elif SPH_BIG_ENDIAN + *(sph_u32 *)dst = val; +#else + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (big endian convention). + * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32be(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif +#else + if (((SPH_UPTR)src & 3) == 0) { +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif + } else { + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); + } +#endif +#else + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (big endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (32-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32be_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#elif SPH_BIG_ENDIAN + return *(const sph_u32 *)src; +#else + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (little endian convention). 
+ *
+ * @param dst   the destination buffer
+ * @param val   the 32-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc32le(void *dst, sph_u32 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	val = sph_bswap32(val);
+#endif
+	*(sph_u32 *)dst = val;
+#else
+	if (((SPH_UPTR)dst & 3) == 0) {
+#if SPH_BIG_ENDIAN
+		val = sph_bswap32(val);
+#endif
+		*(sph_u32 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = val;
+		((unsigned char *)dst)[1] = (val >> 8);
+		((unsigned char *)dst)[2] = (val >> 16);
+		((unsigned char *)dst)[3] = (val >> 24);
+	}
+#endif
+#else
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+#endif
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (little endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst   the destination buffer (32-bit aligned)
+ * @param val   the value to encode
+ */
+static SPH_INLINE void
+sph_enc32le_aligned(void *dst, sph_u32 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u32 *)dst = val;
+#elif SPH_BIG_ENDIAN
+	*(sph_u32 *)dst = sph_bswap32(val);
+#else
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ *
+ * @param src   the source buffer
+ * @return  the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32le(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#else
+	return *(const sph_u32 *)src;
+#endif
+#else
+	if (((SPH_UPTR)src & 3) == 0) {
+#if SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		/*
+		 * "__volatile__" is needed here because without it,
+		 * gcc-3.4.3 miscompiles the code and performs the
+		 * access before the test on the address, thus triggering
+		 * a bus error...
+		 */
+		__asm__ __volatile__ (
+			"lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+		return tmp;
+/*
+ * On PowerPC, this turns out not to be worth the effort: the inline
+ * assembly makes the GCC optimizer uncomfortable, which tends to
+ * nullify the decoding gains.
+ *
+ * For most hash functions, using this inline assembly trick changes
+ * hashing speed by less than 5% and often _reduces_ it. The biggest
+ * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is
+ * less than 10%. The speed gain on CubeHash is probably due to the
+ * chronic shortage of registers that CubeHash endures; for the other
+ * functions, the generic code appears to be efficient enough already.
+ *
+#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		__asm__ __volatile__ (
+			"lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+		return tmp;
+ */
+#else
+		return sph_bswap32(*(const sph_u32 *)src);
+#endif
+#else
+		return *(const sph_u32 *)src;
+#endif
+	} else {
+		return (sph_u32)(((const unsigned char *)src)[0])
+			| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+			| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+			| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+	}
+#endif
+#else
+	return (sph_u32)(((const unsigned char *)src)[0])
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
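+ *
+ * For instance, decoding the bytes 04 03 02 01 (in that order) yields
+ * SPH_C32(0x01020304); a union may be used to guarantee the alignment
+ * that this variant requires:
+ *
+ *   union { unsigned char bytes[4]; sph_u32 align; } buf =
+ *       { { 0x04, 0x03, 0x02, 0x01 } };
+ *   sph_u32 v = sph_dec32le_aligned(buf.bytes);
+ *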
+ * The source buffer must be properly aligned. + * + * @param src the source buffer (32-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32le_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return *(const sph_u32 *)src; +#elif SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. + * +#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap32(*(const sph_u32 *)src); +#endif +#else + return (sph_u32)(((const unsigned char *)src)[0]) + | ((sph_u32)(((const unsigned char *)src)[1]) << 8) + | ((sph_u32)(((const unsigned char *)src)[2]) << 16) + | ((sph_u32)(((const unsigned char *)src)[3]) << 24); +#endif +} + +#if SPH_64 + +/** + * Encode a 64-bit value into the provided buffer (big endian convention). + * + * @param dst the destination buffer + * @param val the 64-bit value to encode + */ +static SPH_INLINE void +sph_enc64be(void *dst, sph_u64 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; +#else + if (((SPH_UPTR)dst & 7) == 0) { +#if SPH_LITTLE_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; + } else { + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; + } +#endif +#else + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (big endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (64-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc64be_aligned(void *dst, sph_u64 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u64 *)dst = sph_bswap64(val); +#elif SPH_BIG_ENDIAN + *(sph_u64 *)dst = val; +#else + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (big endian convention). 
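+ *
+ * For instance, decoding the bytes 00 01 02 03 04 05 06 07 (in that
+ * order) yields SPH_C64(0x0001020304050607):
+ *
+ *   static const unsigned char buf[8] =
+ *       { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+ *   sph_u64 v = sph_dec64be(buf);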
+ * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64be(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif +#else + if (((SPH_UPTR)src & 7) == 0) { +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif + } else { + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); + } +#endif +#else + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (big endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (64-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64be_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#elif SPH_BIG_ENDIAN + return *(const sph_u64 *)src; +#else + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (little endian convention). + * + * @param dst the destination buffer + * @param val the 64-bit value to encode + */ +static SPH_INLINE void +sph_enc64le(void *dst, sph_u64 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; +#else + if (((SPH_UPTR)dst & 7) == 0) { +#if SPH_BIG_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; + } else { + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); + } +#endif +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (little endian convention). + * The destination buffer must be properly aligned. 
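+ *
+ * Unlike sph_enc64le(), no run-time alignment test is performed here; a
+ * union may be used to guarantee that the destination is suitably aligned:
+ *
+ *   union { unsigned char bytes[8]; sph_u64 align; } buf;
+ *   sph_enc64le_aligned(buf.bytes, SPH_C64(0x0001020304050607));
+ *   // buf.bytes[0] == 0x07, ..., buf.bytes[7] == 0x00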
+ * + * @param dst the destination buffer (64-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc64le_aligned(void *dst, sph_u64 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u64 *)dst = val; +#elif SPH_BIG_ENDIAN + *(sph_u64 *)dst = sph_bswap64(val); +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (little endian convention). + * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64le(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif +#else + if (((SPH_UPTR)src & 7) == 0) { +#if SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ( + "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. + * +#elif SPH_PPC32_GCC && !SPH_NO_ASM + return (sph_u64)sph_dec32le_aligned(src) + | ((sph_u64)sph_dec32le_aligned( + (const char *)src + 4) << 32); +#elif SPH_PPC64_GCC && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ( + "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap64(*(const sph_u64 *)src); +#endif +#else + return *(const sph_u64 *)src; +#endif + } else { + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); + } +#endif +#else + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (little endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (64-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64le_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return *(const sph_u64 *)src; +#elif SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. 
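+ * (for the same reason as given for sph_dec32le() above: the inline
+ * assembly tends to hamper the GCC optimizer, nullifying the gains).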
+ *
+#elif SPH_PPC32_GCC && !SPH_NO_ASM
+	return (sph_u64)sph_dec32le_aligned(src)
+		| ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32);
+#elif SPH_PPC64_GCC && !SPH_NO_ASM
+	sph_u64 tmp;
+
+	__asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+	return tmp;
+ */
+#else
+	return sph_bswap64(*(const sph_u64 *)src);
+#endif
+#else
+	return (sph_u64)(((const unsigned char *)src)[0])
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+#endif
+}
+
+#endif
+
+#endif /* Doxygen excluded block */
+
+#endif
diff --git a/skein.c b/skein.c
new file mode 100644
index 0000000..4aefa2a
--- /dev/null
+++ b/skein.c
@@ -0,0 +1,18 @@
+#include "skein.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "sha3/sph_skein.h"
+
+/* Skein-512 over a fixed 64-byte input; the name and prototype must
+ * match the skein_hash() declaration in skein.h. */
+void skein_hash(const char* input, char* output)
+{
+    sph_skein512_context ctx_skein;
+    sph_skein512_init(&ctx_skein);
+    sph_skein512 (&ctx_skein, input, 64);
+    sph_skein512_close(&ctx_skein, output);
+}
+
diff --git a/skein.h b/skein.h
new file mode 100644
index 0000000..bc4867a
--- /dev/null
+++ b/skein.h
@@ -0,0 +1,14 @@
+#ifndef SKEIN_H
+#define SKEIN_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void skein_hash(const char* input, char* output);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/stdint.h b/stdint.h
new file mode 100644
index 0000000..4fe0ef9
--- /dev/null
+++ b/stdint.h
@@ -0,0 +1,259 @@
+// ISO C9x compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006-2013 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+//   1. Redistributions of source code must retain the above copyright notice,
+//      this list of conditions and the following disclaimer.
+//
+//   2. Redistributions in binary form must reproduce the above copyright
+//      notice, this list of conditions and the following disclaimer in the
+//      documentation and/or other materials provided with the distribution.
+//
+//   3. Neither the name of the product nor the names of its contributors may
+//      be used to endorse or promote products derived from this software
+//      without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#if _MSC_VER >= 1600 // [
+#include <stdint.h>
+#else // ] _MSC_VER >= 1600 [
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we should wrap the <wchar.h> include with 'extern "C++" {}'
+// or the compiler gives many errors like this:
+//   error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#ifdef __cplusplus
+extern "C" {
+#endif
+#  include <wchar.h>
+#ifdef __cplusplus
+}
+#endif
+
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+#   define _W64 __w64
+# else
+#   define _W64
+# endif
+#endif
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 don't
+// realize that, e.g., char has the same size as __int8,
+// so we give up on __intX for them.
+#if (_MSC_VER < 1300)
+   typedef signed char       int8_t;
+   typedef signed short      int16_t;
+   typedef signed int        int32_t;
+   typedef unsigned char     uint8_t;
+   typedef unsigned short    uint16_t;
+   typedef unsigned int      uint32_t;
+#else
+   typedef signed __int8     int8_t;
+   typedef signed __int16    int16_t;
+   typedef signed __int32    int32_t;
+   typedef unsigned __int8   uint8_t;
+   typedef unsigned __int16  uint16_t;
+   typedef unsigned __int32  uint32_t;
+#endif
+typedef signed __int64       int64_t;
+typedef unsigned __int64     uint64_t;
+
+
+// 7.18.1.2 Minimum-width integer types
+typedef int8_t    int_least8_t;
+typedef int16_t   int_least16_t;
+typedef int32_t   int_least32_t;
+typedef int64_t   int_least64_t;
+typedef uint8_t   uint_least8_t;
+typedef uint16_t  uint_least16_t;
+typedef uint32_t  uint_least32_t;
+typedef uint64_t  uint_least64_t;
+
+// 7.18.1.3 Fastest minimum-width integer types
+typedef int8_t    int_fast8_t;
+typedef int16_t   int_fast16_t;
+typedef int32_t   int_fast32_t;
+typedef int64_t   int_fast64_t;
+typedef uint8_t   uint_fast8_t;
+typedef uint16_t  uint_fast16_t;
+typedef uint32_t  uint_fast32_t;
+typedef uint64_t  uint_fast64_t;
+
+// 7.18.1.4 Integer types capable of holding object pointers
+#ifdef _WIN64 // [
+   typedef signed __int64    intptr_t;
+   typedef unsigned __int64  uintptr_t;
+#else // _WIN64 ][
+   typedef _W64 signed int   intptr_t;
+   typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+
+// 7.18.1.5 Greatest-width integer types
+typedef int64_t   intmax_t;
+typedef uint64_t  uintmax_t;
+
+
+// 7.18.2 Limits of specified-width integer types
+
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
+
+// 7.18.2.1 Limits of exact-width integer types
+#define INT8_MIN     ((int8_t)_I8_MIN)
+#define INT8_MAX     _I8_MAX
+#define INT16_MIN    ((int16_t)_I16_MIN)
+#define INT16_MAX    _I16_MAX
+#define INT32_MIN    ((int32_t)_I32_MIN)
+#define INT32_MAX    _I32_MAX
+#define INT64_MIN    ((int64_t)_I64_MIN)
+#define INT64_MAX    _I64_MAX
+#define UINT8_MAX    _UI8_MAX
+#define UINT16_MAX   _UI16_MAX
+#define UINT32_MAX   _UI32_MAX
+#define UINT64_MAX   _UI64_MAX
+
+// 7.18.2.2 Limits of minimum-width integer types
+#define INT_LEAST8_MIN    INT8_MIN
+#define INT_LEAST8_MAX    INT8_MAX
+#define INT_LEAST16_MIN   INT16_MIN
+#define INT_LEAST16_MAX   INT16_MAX
+#define INT_LEAST32_MIN   INT32_MIN
+#define INT_LEAST32_MAX   INT32_MAX
+#define INT_LEAST64_MIN   INT64_MIN
+#define INT_LEAST64_MAX   INT64_MAX
+#define UINT_LEAST8_MAX   UINT8_MAX
+#define UINT_LEAST16_MAX  UINT16_MAX
+#define UINT_LEAST32_MAX  UINT32_MAX
+#define UINT_LEAST64_MAX  UINT64_MAX
+
+// 7.18.2.3 Limits of fastest minimum-width integer types
+#define INT_FAST8_MIN    INT8_MIN
+#define INT_FAST8_MAX    INT8_MAX
+#define INT_FAST16_MIN   INT16_MIN
+#define INT_FAST16_MAX   INT16_MAX
+#define INT_FAST32_MIN   INT32_MIN
+#define INT_FAST32_MAX   INT32_MAX
+#define INT_FAST64_MIN   INT64_MIN
+#define INT_FAST64_MAX   INT64_MAX
+#define UINT_FAST8_MAX   UINT8_MAX
+#define UINT_FAST16_MAX  UINT16_MAX
+#define UINT_FAST32_MAX  UINT32_MAX
+#define UINT_FAST64_MAX  UINT64_MAX
+
+// 7.18.2.4 Limits of integer types capable of holding object pointers
+#ifdef _WIN64 // [
+#  define INTPTR_MIN   INT64_MIN
+#  define INTPTR_MAX   INT64_MAX
+#  define UINTPTR_MAX  UINT64_MAX
+#else // _WIN64 ][
+#  define INTPTR_MIN   INT32_MIN
+#  define INTPTR_MAX   INT32_MAX
+#  define UINTPTR_MAX  UINT32_MAX
+#endif // _WIN64 ]
+
+// 7.18.2.5 Limits of greatest-width integer types
+#define INTMAX_MIN   INT64_MIN
+#define INTMAX_MAX   INT64_MAX
+#define UINTMAX_MAX  UINT64_MAX
+
+// 7.18.3 Limits of other integer types
+
+#ifdef _WIN64 // [
+#  define PTRDIFF_MIN  _I64_MIN
+#  define PTRDIFF_MAX  _I64_MAX
+#else // _WIN64 ][
+#  define PTRDIFF_MIN  _I32_MIN
+#  define PTRDIFF_MAX  _I32_MAX
+#endif // _WIN64 ]
+
+#define SIG_ATOMIC_MIN  INT_MIN
+#define SIG_ATOMIC_MAX  INT_MAX
+
+#ifndef SIZE_MAX // [
+#  ifdef _WIN64 // [
+#    define SIZE_MAX  _UI64_MAX
+#  else // _WIN64 ][
+#    define SIZE_MAX  _UI32_MAX
+#  endif // _WIN64 ]
+#endif // SIZE_MAX ]
+
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
+#ifndef WCHAR_MIN // [
+#  define WCHAR_MIN  0
+#endif // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+#  define WCHAR_MAX  _UI16_MAX
+#endif // WCHAR_MAX ]
+
+#define WINT_MIN  0
+#define WINT_MAX  _UI16_MAX
+
+#endif // __STDC_LIMIT_MACROS ]
+
+
+// 7.18.4 Macros for integer constants
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val)  val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val)  val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+// These #ifndef's are needed to prevent collisions with <stdint.h>.
+// Check out Issue 9 for the details.
+#ifndef INTMAX_C // [
+#  define INTMAX_C   INT64_C
+#endif // INTMAX_C ]
+#ifndef UINTMAX_C // [
+#  define UINTMAX_C  UINT64_C
+#endif // UINTMAX_C ]
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+#endif // _MSC_VER >= 1600 ]
+
+#endif // _MSC_STDINT_H_ ]
diff --git a/x11.c b/x11.c
new file mode 100644
index 0000000..9290587
--- /dev/null
+++ b/x11.c
@@ -0,0 +1,85 @@
+#include "x11.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "sha3/sph_blake.h"
+#include "sha3/sph_bmw.h"
+#include "sha3/sph_groestl.h"
+#include "sha3/sph_jh.h"
+#include "sha3/sph_keccak.h"
+#include "sha3/sph_skein.h"
+#include "sha3/sph_luffa.h"
+#include "sha3/sph_cubehash.h"
+#include "sha3/sph_shavite.h"
+#include "sha3/sph_simd.h"
+#include "sha3/sph_echo.h"
+
+
+void x11_hash(const char* input, char* output)
+{
+    sph_blake512_context ctx_blake;
+    sph_bmw512_context ctx_bmw;
+    sph_groestl512_context ctx_groestl;
+    sph_skein512_context ctx_skein;
+    sph_jh512_context ctx_jh;
+    sph_keccak512_context ctx_keccak;
+
+    sph_luffa512_context ctx_luffa1;
+    sph_cubehash512_context ctx_cubehash1;
+    sph_shavite512_context ctx_shavite1;
+    sph_simd512_context ctx_simd1;
+    sph_echo512_context ctx_echo1;
+
+    // the uint512 values in the C++ client source are backed by an array of uint32, hence these buffers
+    uint32_t hashA[16], hashB[16];
+
+    sph_blake512_init(&ctx_blake);
+    sph_blake512 (&ctx_blake, input, 80);    // the first round reads exactly 80 bytes (a block header)
+    sph_blake512_close (&ctx_blake, hashA);
+
+    sph_bmw512_init(&ctx_bmw);
+    sph_bmw512 (&ctx_bmw, hashA, 64);
+    sph_bmw512_close(&ctx_bmw, hashB);
+
+    sph_groestl512_init(&ctx_groestl);
+    sph_groestl512 (&ctx_groestl, hashB, 64);
+    sph_groestl512_close(&ctx_groestl, hashA);
+
+    sph_skein512_init(&ctx_skein);
+    sph_skein512 (&ctx_skein, hashA, 64);
+    sph_skein512_close (&ctx_skein, hashB);
+
+    sph_jh512_init(&ctx_jh);
+    sph_jh512 (&ctx_jh, hashB, 64);
+    sph_jh512_close(&ctx_jh, hashA);
+
+    sph_keccak512_init(&ctx_keccak);
+    sph_keccak512 (&ctx_keccak, hashA, 64);
+    sph_keccak512_close(&ctx_keccak, hashB);
+
+    sph_luffa512_init (&ctx_luffa1);
+    sph_luffa512 (&ctx_luffa1, hashB, 64);
+    sph_luffa512_close (&ctx_luffa1, hashA);
+
+    sph_cubehash512_init (&ctx_cubehash1);
+    sph_cubehash512 (&ctx_cubehash1, hashA, 64);
+    sph_cubehash512_close(&ctx_cubehash1, hashB);
+
+    sph_shavite512_init (&ctx_shavite1);
+    sph_shavite512 (&ctx_shavite1, hashB, 64);
+    sph_shavite512_close(&ctx_shavite1, hashA);
+
+    sph_simd512_init (&ctx_simd1);
+    sph_simd512 (&ctx_simd1, hashA, 64);
+    sph_simd512_close(&ctx_simd1, hashB);
+
+    sph_echo512_init (&ctx_echo1);
+    sph_echo512 (&ctx_echo1, hashB, 64);
+    sph_echo512_close(&ctx_echo1, hashA);
+
+    memcpy(output, hashA, 32);    // keep the first 256 bits of the final 512-bit digest
+
+}
+
diff --git a/x11.h b/x11.h
new file mode 100644
index 0000000..aefdc8c
--- /dev/null
+++ b/x11.h
@@ -0,0 +1,14 @@
+#ifndef X11_H
+#define X11_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void x11_hash(const char* input, char* output);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xcoin.c b/xcoin.c
new file mode 100644
index 0000000..e69de29
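A minimal C caller for the x11_hash() function above, shown as an
illustrative sketch only (the file name test_x11.c and the zero-filled
80-byte header are assumptions for demonstration, not part of this commit):

    /* test_x11.c -- hypothetical harness for x11_hash() */
    #include <stdio.h>
    #include <string.h>
    #include "x11.h"

    int main(void)
    {
        char header[80];   /* x11_hash() reads exactly 80 bytes */
        char digest[32];   /* ...and writes a 32-byte digest */
        int i;

        memset(header, 0, sizeof header);
        x11_hash(header, digest);

        /* print the digest as lowercase hex */
        for (i = 0; i < 32; i++)
            printf("%02x", (unsigned char)digest[i]);
        printf("\n");
        return 0;
    }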