Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/BLAKE2/BLAKE2.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeigh Brown <leigh@solinno.co.uk>2018-04-02 22:07:05 +0300
committerLeigh Brown <leigh@solinno.co.uk>2018-04-02 22:07:05 +0300
commit7965d3e6e1b4193438b8d3a656787587d2579227 (patch)
treeef8d4edef59ecaae3389baba8a8ab49e173ec9fb
parentbeb75f4512223e6a3a03a48992345256c5ef393a (diff)
Add ARM NEON versions of blake2s and blake2b
NOTE! The NEON version of blake2s is currently NO FASTER than the reference implementations. However, it is retained for reference and in case it can be further improved. The NEON version of blake2b is more than twice as fast as the reference implementation on the Raspberry PI 2 Model B.
-rw-r--r--.gitignore6
-rw-r--r--neon/blake2-impl.h160
-rw-r--r--neon/blake2.h195
-rw-r--r--neon/blake2b-load-neon.h211
-rw-r--r--neon/blake2b-neon.c621
-rw-r--r--neon/blake2b-round.h76
-rw-r--r--neon/blake2b.c342
-rw-r--r--neon/blake2bp.c361
-rw-r--r--neon/blake2s-load-neon.h193
-rw-r--r--neon/blake2s-neon.c693
-rw-r--r--neon/blake2s-round.h70
-rw-r--r--neon/blake2s.c331
-rw-r--r--neon/blake2sp.c358
-rw-r--r--neon/blake2xb.c241
-rw-r--r--neon/blake2xs.c239
-rw-r--r--neon/genkat-c.c139
-rw-r--r--neon/genkat-json.c154
-rw-r--r--neon/makefile41
18 files changed, 4431 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index 531c8f1..da7808f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,10 @@ ref/blake2xs
ref/blake2xb
sse/blake2xs
sse/blake2xb
+neon/blake2s
+neon/blake2b
+neon/blake2sp
+neon/blake2bp
+neon/blake2xs
+neon/blake2xb
**tags
diff --git a/neon/blake2-impl.h b/neon/blake2-impl.h
new file mode 100644
index 0000000..5dff7fc
--- /dev/null
+++ b/neon/blake2-impl.h
@@ -0,0 +1,160 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+#ifndef BLAKE2_IMPL_H
+#define BLAKE2_IMPL_H
+
+#include <stdint.h>
+#include <string.h>
+
+#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L)
+ #if defined(_MSC_VER)
+ #define BLAKE2_INLINE __inline
+ #elif defined(__GNUC__)
+ #define BLAKE2_INLINE __inline__
+ #else
+ #define BLAKE2_INLINE
+ #endif
+#else
+ #define BLAKE2_INLINE inline
+#endif
+
+static BLAKE2_INLINE uint32_t load32( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+ uint32_t w;
+ memcpy(&w, src, sizeof w);
+ return w;
+#else
+ const uint8_t *p = ( const uint8_t * )src;
+ return (( uint32_t )( p[0] ) << 0) |
+ (( uint32_t )( p[1] ) << 8) |
+ (( uint32_t )( p[2] ) << 16) |
+ (( uint32_t )( p[3] ) << 24) ;
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load64( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+ uint64_t w;
+ memcpy(&w, src, sizeof w);
+ return w;
+#else
+ const uint8_t *p = ( const uint8_t * )src;
+ return (( uint64_t )( p[0] ) << 0) |
+ (( uint64_t )( p[1] ) << 8) |
+ (( uint64_t )( p[2] ) << 16) |
+ (( uint64_t )( p[3] ) << 24) |
+ (( uint64_t )( p[4] ) << 32) |
+ (( uint64_t )( p[5] ) << 40) |
+ (( uint64_t )( p[6] ) << 48) |
+ (( uint64_t )( p[7] ) << 56) ;
+#endif
+}
+
+static BLAKE2_INLINE uint16_t load16( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+ uint16_t w;
+ memcpy(&w, src, sizeof w);
+ return w;
+#else
+ const uint8_t *p = ( const uint8_t * )src;
+ return (( uint16_t )( p[0] ) << 0) |
+ (( uint16_t )( p[1] ) << 8) ;
+#endif
+}
+
+static BLAKE2_INLINE void store16( void *dst, uint16_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+ memcpy(dst, &w, sizeof w);
+#else
+ uint8_t *p = ( uint8_t * )dst;
+ *p++ = ( uint8_t )w; w >>= 8;
+ *p++ = ( uint8_t )w;
+#endif
+}
+
+static BLAKE2_INLINE void store32( void *dst, uint32_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+ memcpy(dst, &w, sizeof w);
+#else
+ uint8_t *p = ( uint8_t * )dst;
+ p[0] = (uint8_t)(w >> 0);
+ p[1] = (uint8_t)(w >> 8);
+ p[2] = (uint8_t)(w >> 16);
+ p[3] = (uint8_t)(w >> 24);
+#endif
+}
+
+static BLAKE2_INLINE void store64( void *dst, uint64_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+ memcpy(dst, &w, sizeof w);
+#else
+ uint8_t *p = ( uint8_t * )dst;
+ p[0] = (uint8_t)(w >> 0);
+ p[1] = (uint8_t)(w >> 8);
+ p[2] = (uint8_t)(w >> 16);
+ p[3] = (uint8_t)(w >> 24);
+ p[4] = (uint8_t)(w >> 32);
+ p[5] = (uint8_t)(w >> 40);
+ p[6] = (uint8_t)(w >> 48);
+ p[7] = (uint8_t)(w >> 56);
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load48( const void *src )
+{
+ const uint8_t *p = ( const uint8_t * )src;
+ return (( uint64_t )( p[0] ) << 0) |
+ (( uint64_t )( p[1] ) << 8) |
+ (( uint64_t )( p[2] ) << 16) |
+ (( uint64_t )( p[3] ) << 24) |
+ (( uint64_t )( p[4] ) << 32) |
+ (( uint64_t )( p[5] ) << 40) ;
+}
+
+static BLAKE2_INLINE void store48( void *dst, uint64_t w )
+{
+ uint8_t *p = ( uint8_t * )dst;
+ p[0] = (uint8_t)(w >> 0);
+ p[1] = (uint8_t)(w >> 8);
+ p[2] = (uint8_t)(w >> 16);
+ p[3] = (uint8_t)(w >> 24);
+ p[4] = (uint8_t)(w >> 32);
+ p[5] = (uint8_t)(w >> 40);
+}
+
+static BLAKE2_INLINE uint32_t rotr32( const uint32_t w, const unsigned c )
+{
+ return ( w >> c ) | ( w << ( 32 - c ) );
+}
+
+static BLAKE2_INLINE uint64_t rotr64( const uint64_t w, const unsigned c )
+{
+ return ( w >> c ) | ( w << ( 64 - c ) );
+}
+
+/* prevents compiler optimizing out memset() */
+static BLAKE2_INLINE void secure_zero_memory(void *v, size_t n)
+{
+ static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
+ memset_v(v, 0, n);
+}
+
+#endif
diff --git a/neon/blake2.h b/neon/blake2.h
new file mode 100644
index 0000000..ad62f26
--- /dev/null
+++ b/neon/blake2.h
@@ -0,0 +1,195 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+#ifndef BLAKE2_H
+#define BLAKE2_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#define BLAKE2_PACKED(x) __pragma(pack(push, 1)) x __pragma(pack(pop))
+#else
+#define BLAKE2_PACKED(x) x __attribute__((packed))
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+ enum blake2s_constant
+ {
+ BLAKE2S_BLOCKBYTES = 64,
+ BLAKE2S_OUTBYTES = 32,
+ BLAKE2S_KEYBYTES = 32,
+ BLAKE2S_SALTBYTES = 8,
+ BLAKE2S_PERSONALBYTES = 8
+ };
+
+ enum blake2b_constant
+ {
+ BLAKE2B_BLOCKBYTES = 128,
+ BLAKE2B_OUTBYTES = 64,
+ BLAKE2B_KEYBYTES = 64,
+ BLAKE2B_SALTBYTES = 16,
+ BLAKE2B_PERSONALBYTES = 16
+ };
+
+ typedef struct blake2s_state__
+ {
+ uint32_t h[8];
+ uint32_t t[2];
+ uint32_t f[2];
+ uint8_t buf[BLAKE2S_BLOCKBYTES];
+ size_t buflen;
+ size_t outlen;
+ uint8_t last_node;
+ } blake2s_state;
+
+ typedef struct blake2b_state__
+ {
+ uint64_t h[8];
+ uint64_t t[2];
+ uint64_t f[2];
+ uint8_t buf[BLAKE2B_BLOCKBYTES];
+ size_t buflen;
+ size_t outlen;
+ uint8_t last_node;
+ } blake2b_state;
+
+ typedef struct blake2sp_state__
+ {
+ blake2s_state S[8][1];
+ blake2s_state R[1];
+ uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
+ size_t buflen;
+ size_t outlen;
+ } blake2sp_state;
+
+ typedef struct blake2bp_state__
+ {
+ blake2b_state S[4][1];
+ blake2b_state R[1];
+ uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
+ size_t buflen;
+ size_t outlen;
+ } blake2bp_state;
+
+
+ BLAKE2_PACKED(struct blake2s_param__
+ {
+ uint8_t digest_length; /* 1 */
+ uint8_t key_length; /* 2 */
+ uint8_t fanout; /* 3 */
+ uint8_t depth; /* 4 */
+ uint32_t leaf_length; /* 8 */
+ uint32_t node_offset; /* 12 */
+ uint16_t xof_length; /* 14 */
+ uint8_t node_depth; /* 15 */
+ uint8_t inner_length; /* 16 */
+ /* uint8_t reserved[0]; */
+ uint8_t salt[BLAKE2S_SALTBYTES]; /* 24 */
+ uint8_t personal[BLAKE2S_PERSONALBYTES]; /* 32 */
+ });
+
+ typedef struct blake2s_param__ blake2s_param;
+
+ BLAKE2_PACKED(struct blake2b_param__
+ {
+ uint8_t digest_length; /* 1 */
+ uint8_t key_length; /* 2 */
+ uint8_t fanout; /* 3 */
+ uint8_t depth; /* 4 */
+ uint32_t leaf_length; /* 8 */
+ uint32_t node_offset; /* 12 */
+ uint32_t xof_length; /* 16 */
+ uint8_t node_depth; /* 17 */
+ uint8_t inner_length; /* 18 */
+ uint8_t reserved[14]; /* 32 */
+ uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
+ uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
+ });
+
+ typedef struct blake2b_param__ blake2b_param;
+
+ typedef struct blake2xs_state__
+ {
+ blake2s_state S[1];
+ blake2s_param P[1];
+ } blake2xs_state;
+
+ typedef struct blake2xb_state__
+ {
+ blake2b_state S[1];
+ blake2b_param P[1];
+ } blake2xb_state;
+
+ /* Padded structs result in a compile-time error */
+ enum {
+ BLAKE2_DUMMY_1 = 1/(sizeof(blake2s_param) == BLAKE2S_OUTBYTES),
+ BLAKE2_DUMMY_2 = 1/(sizeof(blake2b_param) == BLAKE2B_OUTBYTES)
+ };
+
+ /* Streaming API */
+ int blake2s_init( blake2s_state *S, size_t outlen );
+ int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen );
+ int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
+ int blake2s_update( blake2s_state *S, const void *in, size_t inlen );
+ int blake2s_final( blake2s_state *S, void *out, size_t outlen );
+
+ int blake2b_init( blake2b_state *S, size_t outlen );
+ int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen );
+ int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
+ int blake2b_update( blake2b_state *S, const void *in, size_t inlen );
+ int blake2b_final( blake2b_state *S, void *out, size_t outlen );
+
+ int blake2sp_init( blake2sp_state *S, size_t outlen );
+ int blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen );
+ int blake2sp_update( blake2sp_state *S, const void *in, size_t inlen );
+ int blake2sp_final( blake2sp_state *S, void *out, size_t outlen );
+
+ int blake2bp_init( blake2bp_state *S, size_t outlen );
+ int blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen );
+ int blake2bp_update( blake2bp_state *S, const void *in, size_t inlen );
+ int blake2bp_final( blake2bp_state *S, void *out, size_t outlen );
+
+ /* Variable output length API */
+ int blake2xs_init( blake2xs_state *S, const size_t outlen );
+ int blake2xs_init_key( blake2xs_state *S, const size_t outlen, const void *key, size_t keylen );
+ int blake2xs_update( blake2xs_state *S, const void *in, size_t inlen );
+ int blake2xs_final(blake2xs_state *S, void *out, size_t outlen);
+
+ int blake2xb_init( blake2xb_state *S, const size_t outlen );
+ int blake2xb_init_key( blake2xb_state *S, const size_t outlen, const void *key, size_t keylen );
+ int blake2xb_update( blake2xb_state *S, const void *in, size_t inlen );
+ int blake2xb_final(blake2xb_state *S, void *out, size_t outlen);
+
+ /* Simple API */
+ int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+ int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+
+ int blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+ int blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+
+ int blake2xs( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+ int blake2xb( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+
+ /* This is simply an alias for blake2b */
+ int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/neon/blake2b-load-neon.h b/neon/blake2b-load-neon.h
new file mode 100644
index 0000000..5f75a05
--- /dev/null
+++ b/neon/blake2b-load-neon.h
@@ -0,0 +1,211 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+#ifndef BLAKE2B_LOAD_NEON_H
+#define BLAKE2B_LOAD_NEON_H
+
+#define LOAD_MSG_0_1(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); \
+ b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3));
+
+#define LOAD_MSG_0_2(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); \
+ b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3));
+
+#define LOAD_MSG_0_3(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); \
+ b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7));
+
+#define LOAD_MSG_0_4(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); \
+ b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7));
+
+#define LOAD_MSG_1_1(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); \
+ b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6));
+
+#define LOAD_MSG_1_2(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); \
+ b1 = vextq_u64(m7, m3, 1);
+
+#define LOAD_MSG_1_3(b0, b1) \
+ b0 = vextq_u64(m0, m0, 1); \
+ b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2));
+
+#define LOAD_MSG_1_4(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); \
+ b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1));
+
+#define LOAD_MSG_2_1(b0, b1) \
+ b0 = vextq_u64(m5, m6, 1); \
+ b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7));
+
+#define LOAD_MSG_2_2(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); \
+ b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6));
+
+#define LOAD_MSG_2_3(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); \
+ b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4));
+
+#define LOAD_MSG_2_4(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); \
+ b1 = vextq_u64(m0, m2, 1);
+
+#define LOAD_MSG_3_1(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); \
+ b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5));
+
+#define LOAD_MSG_3_2(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); \
+ b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7));
+
+#define LOAD_MSG_3_3(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); \
+ b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7));
+
+#define LOAD_MSG_3_4(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); \
+ b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4));
+
+#define LOAD_MSG_4_1(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); \
+ b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5));
+
+#define LOAD_MSG_4_2(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); \
+ b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7));
+
+#define LOAD_MSG_4_3(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); \
+ b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1));
+
+#define LOAD_MSG_4_4(b0, b1) \
+ b0 = vextq_u64(m0, m6, 1); \
+ b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6));
+
+#define LOAD_MSG_5_1(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); \
+ b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4));
+
+#define LOAD_MSG_5_2(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); \
+ b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1));
+
+#define LOAD_MSG_5_3(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); \
+ b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0));
+
+#define LOAD_MSG_5_4(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); \
+ b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4));
+
+#define LOAD_MSG_6_1(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); \
+ b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2));
+
+#define LOAD_MSG_6_2(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); \
+ b1 = vextq_u64(m6, m5, 1);
+
+#define LOAD_MSG_6_3(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); \
+ b1 = vextq_u64(m4, m4, 1);
+
+#define LOAD_MSG_6_4(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); \
+ b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5));
+
+#define LOAD_MSG_7_1(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); \
+ b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1));
+
+#define LOAD_MSG_7_2(b0, b1) \
+ b0 = vextq_u64(m5, m7, 1); \
+ b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4));
+
+#define LOAD_MSG_7_3(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); \
+ b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1));
+
+#define LOAD_MSG_7_4(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); \
+ b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5));
+
+#define LOAD_MSG_8_1(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); \
+ b1 = vextq_u64(m5, m0, 1);
+
+#define LOAD_MSG_8_2(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); \
+ b1 = vextq_u64(m1, m4, 1);
+
+#define LOAD_MSG_8_3(b0, b1) \
+ b0 = m6; \
+ b1 = vextq_u64(m0, m5, 1);
+
+#define LOAD_MSG_8_4(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); \
+ b1 = m2;
+
+#define LOAD_MSG_9_1(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); \
+ b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0));
+
+#define LOAD_MSG_9_2(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); \
+ b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2));
+
+#define LOAD_MSG_9_3(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); \
+ b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6));
+
+#define LOAD_MSG_9_4(b0, b1) \
+ b0 = vextq_u64(m5, m7, 1); \
+ b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0));
+
+#define LOAD_MSG_10_1(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); \
+ b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3));
+
+#define LOAD_MSG_10_2(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); \
+ b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3));
+
+#define LOAD_MSG_10_3(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); \
+ b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7));
+
+#define LOAD_MSG_10_4(b0, b1) \
+ b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); \
+ b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7));
+
+#define LOAD_MSG_11_1(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); \
+ b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6));
+
+#define LOAD_MSG_11_2(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); \
+ b1 = vextq_u64(m7, m3, 1);
+
+#define LOAD_MSG_11_3(b0, b1) \
+ b0 = vextq_u64(m0, m0, 1); \
+ b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2));
+
+#define LOAD_MSG_11_4(b0, b1) \
+ b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); \
+ b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1));
+
+
+#endif
diff --git a/neon/blake2b-neon.c b/neon/blake2b-neon.c
new file mode 100644
index 0000000..f202f07
--- /dev/null
+++ b/neon/blake2b-neon.c
@@ -0,0 +1,621 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <arm_neon.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+static const uint64_t blake2b_IV[8] =
+{
+ 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+ 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+ 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+ 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+/*
+static const uint8_t blake2b_sigma[12][16] =
+{
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+*/
+
+static void blake2b_set_lastnode( blake2b_state *S )
+{
+ S->f[1] = (uint64_t)-1;
+}
+
+/* Some helper functions, not necessarily useful */
+static int blake2b_is_lastblock( const blake2b_state *S )
+{
+ return S->f[0] != 0;
+}
+
+static void blake2b_set_lastblock( blake2b_state *S )
+{
+ if( S->last_node ) blake2b_set_lastnode( S );
+
+ S->f[0] = (uint64_t)-1;
+}
+
+static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
+{
+ S->t[0] += inc;
+ S->t[1] += ( S->t[0] < inc );
+}
+
+static void blake2b_init0( blake2b_state *S )
+{
+ size_t i;
+ memset( S, 0, sizeof( blake2b_state ) );
+
+ for( i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
+}
+
+/* init xors IV with input parameter block */
+int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
+{
+ const uint8_t *p = ( const uint8_t * )( P );
+ size_t i;
+
+ blake2b_init0( S );
+
+ /* IV XOR ParamBlock */
+ for( i = 0; i < 8; ++i )
+ S->h[i] ^= load64( p + sizeof( S->h[i] ) * i );
+
+ S->outlen = P->digest_length;
+ return 0;
+}
+
+
+
+int blake2b_init( blake2b_state *S, size_t outlen )
+{
+ blake2b_param P[1];
+
+ if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = 0;
+ P->fanout = 1;
+ P->depth = 1;
+ store32( &P->leaf_length, 0 );
+ store32( &P->node_offset, 0 );
+ store32( &P->xof_length, 0 );
+ P->node_depth = 0;
+ P->inner_length = 0;
+ memset( P->reserved, 0, sizeof( P->reserved ) );
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+ return blake2b_init_param( S, P );
+}
+
+
+int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
+{
+ blake2b_param P[1];
+
+ if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
+ if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1;
+
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = (uint8_t)keylen;
+ P->fanout = 1;
+ P->depth = 1;
+ store32( &P->leaf_length, 0 );
+ store32( &P->node_offset, 0 );
+ store32( &P->xof_length, 0 );
+ P->node_depth = 0;
+ P->inner_length = 0;
+ memset( P->reserved, 0, sizeof( P->reserved ) );
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+
+ if( blake2b_init_param( S, P ) < 0 ) return -1;
+
+ {
+ uint8_t block[BLAKE2B_BLOCKBYTES];
+ memset( block, 0, BLAKE2B_BLOCKBYTES );
+ memcpy( block, key, keylen );
+ blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
+ secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+ }
+ return 0;
+}
+
+#undef LOAD_MSG_0_1
+#define LOAD_MSG_0_1(b0, b1) \
+do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)
+
+#undef LOAD_MSG_0_2
+#define LOAD_MSG_0_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)
+
+#undef LOAD_MSG_0_3
+#define LOAD_MSG_0_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
+
+#undef LOAD_MSG_0_4
+#define LOAD_MSG_0_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
+
+#undef LOAD_MSG_1_1
+#define LOAD_MSG_1_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)
+
+#undef LOAD_MSG_1_2
+#define LOAD_MSG_1_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)
+
+#undef LOAD_MSG_1_3
+#define LOAD_MSG_1_3(b0, b1) \
+ do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)
+
+#undef LOAD_MSG_1_4
+#define LOAD_MSG_1_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
+
+#undef LOAD_MSG_2_1
+#define LOAD_MSG_2_1(b0, b1) \
+ do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)
+
+#undef LOAD_MSG_2_2
+#define LOAD_MSG_2_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)
+
+#undef LOAD_MSG_2_3
+#define LOAD_MSG_2_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)
+
+#undef LOAD_MSG_2_4
+#define LOAD_MSG_2_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)
+
+#undef LOAD_MSG_3_1
+#define LOAD_MSG_3_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)
+
+#undef LOAD_MSG_3_2
+#define LOAD_MSG_3_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
+
+#undef LOAD_MSG_3_3
+#define LOAD_MSG_3_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)
+
+#undef LOAD_MSG_3_4
+#define LOAD_MSG_3_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
+
+#undef LOAD_MSG_4_1
+#define LOAD_MSG_4_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)
+
+#undef LOAD_MSG_4_2
+#define LOAD_MSG_4_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)
+
+#undef LOAD_MSG_4_3
+#define LOAD_MSG_4_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)
+
+#undef LOAD_MSG_4_4
+#define LOAD_MSG_4_4(b0, b1) \
+ do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)
+
+#undef LOAD_MSG_5_1
+#define LOAD_MSG_5_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
+
+#undef LOAD_MSG_5_2
+#define LOAD_MSG_5_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)
+
+#undef LOAD_MSG_5_3
+#define LOAD_MSG_5_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)
+
+#undef LOAD_MSG_5_4
+#define LOAD_MSG_5_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)
+
+#undef LOAD_MSG_6_1
+#define LOAD_MSG_6_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)
+
+#undef LOAD_MSG_6_2
+#define LOAD_MSG_6_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)
+
+#undef LOAD_MSG_6_3
+#define LOAD_MSG_6_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)
+
+#undef LOAD_MSG_6_4
+#define LOAD_MSG_6_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)
+
+#undef LOAD_MSG_7_1
+#define LOAD_MSG_7_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)
+
+#undef LOAD_MSG_7_2
+#define LOAD_MSG_7_2(b0, b1) \
+ do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)
+
+#undef LOAD_MSG_7_3
+#define LOAD_MSG_7_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)
+
+#undef LOAD_MSG_7_4
+#define LOAD_MSG_7_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)
+
+#undef LOAD_MSG_8_1
+#define LOAD_MSG_8_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)
+
+#undef LOAD_MSG_8_2
+#define LOAD_MSG_8_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)
+
+#undef LOAD_MSG_8_3
+#define LOAD_MSG_8_3(b0, b1) \
+ do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)
+
+#undef LOAD_MSG_8_4
+#define LOAD_MSG_8_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)
+
+#undef LOAD_MSG_9_1
+#define LOAD_MSG_9_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)
+
+#undef LOAD_MSG_9_2
+#define LOAD_MSG_9_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)
+
+#undef LOAD_MSG_9_3
+#define LOAD_MSG_9_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)
+
+#undef LOAD_MSG_9_4
+#define LOAD_MSG_9_4(b0, b1) \
+ do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)
+
+#undef LOAD_MSG_10_1
+#define LOAD_MSG_10_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)
+
+#undef LOAD_MSG_10_2
+#define LOAD_MSG_10_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)
+
+#undef LOAD_MSG_10_3
+#define LOAD_MSG_10_3(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
+
+#undef LOAD_MSG_10_4
+#define LOAD_MSG_10_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
+
+#undef LOAD_MSG_11_1
+#define LOAD_MSG_11_1(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)
+
+#undef LOAD_MSG_11_2
+#define LOAD_MSG_11_2(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)
+
+#undef LOAD_MSG_11_3
+#define LOAD_MSG_11_3(b0, b1) \
+ do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)
+
+#undef LOAD_MSG_11_4
+#define LOAD_MSG_11_4(b0, b1) \
+ do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
+
+#define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))
+
+#define vrorq_n_u64_24(x) vcombine_u64( \
+ vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
+ vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))
+
+#define vrorq_n_u64_16(x) vcombine_u64( \
+ vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
+ vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))
+
+#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
+
+#undef G1
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+ do { \
+ row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
+ row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
+ row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
+ row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
+ row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
+ row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
+ row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
+ } while(0)
+
+#undef G2
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+ do { \
+ row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
+ row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
+ row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
+ row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
+ row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
+ row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
+ row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
+ } while(0)
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+ do { \
+ uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
+ uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
+ row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
+ t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
+ row4l = t0; row4h = t1; \
+ } while(0)
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+ do { \
+ uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
+ uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
+ row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
+ t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
+ row4l = t0; row4h = t1; \
+ } while(0)
+
+#undef ROUND
+#define ROUND(r) \
+ do { \
+ uint64x2_t b0, b1; \
+ LOAD_MSG_ ##r ##_1(b0, b1); \
+ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+ LOAD_MSG_ ##r ##_2(b0, b1); \
+ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+ DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+ LOAD_MSG_ ##r ##_3(b0, b1); \
+ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+ LOAD_MSG_ ##r ##_4(b0, b1); \
+ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+ UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+ } while(0)
+
+static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+{
+ const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(&block[ 0]));
+ const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(&block[ 16]));
+ const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(&block[ 32]));
+ const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(&block[ 48]));
+ const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(&block[ 64]));
+ const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(&block[ 80]));
+ const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(&block[ 96]));
+ const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(&block[112]));
+
+ uint64x2_t row1l, row1h, row2l, row2h;
+ uint64x2_t row3l, row3h, row4l, row4h;
+
+ const uint64x2_t h0 = row1l = vld1q_u64(&S->h[0]);
+ const uint64x2_t h1 = row1h = vld1q_u64(&S->h[2]);
+ const uint64x2_t h2 = row2l = vld1q_u64(&S->h[4]);
+ const uint64x2_t h3 = row2h = vld1q_u64(&S->h[6]);
+
+ row3l = vld1q_u64(&blake2b_IV[0]);
+ row3h = vld1q_u64(&blake2b_IV[2]);
+ row4l = veorq_u64(vld1q_u64(&blake2b_IV[4]), vld1q_u64(&S->t[0]));
+ row4h = veorq_u64(vld1q_u64(&blake2b_IV[6]), vld1q_u64(&S->f[0]));
+
+ ROUND( 0 );
+ ROUND( 1 );
+ ROUND( 2 );
+ ROUND( 3 );
+ ROUND( 4 );
+ ROUND( 5 );
+ ROUND( 6 );
+ ROUND( 7 );
+ ROUND( 8 );
+ ROUND( 9 );
+ ROUND( 10 );
+ ROUND( 11 );
+
+ vst1q_u64(&S->h[0], veorq_u64(h0, veorq_u64(row1l, row3l)));
+ vst1q_u64(&S->h[2], veorq_u64(h1, veorq_u64(row1h, row3h)));
+ vst1q_u64(&S->h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
+ vst1q_u64(&S->h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
+}
+
+#undef G
+#undef ROUND
+
+int blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
+{
+ const unsigned char * in = (const unsigned char *)pin;
+ if( inlen > 0 )
+ {
+ size_t left = S->buflen;
+ size_t fill = BLAKE2B_BLOCKBYTES - left;
+ if( inlen > fill )
+ {
+ S->buflen = 0;
+ memcpy( S->buf + left, in, fill ); /* Fill buffer */
+ blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
+ blake2b_compress( S, S->buf ); /* Compress */
+ in += fill; inlen -= fill;
+ while(inlen > BLAKE2B_BLOCKBYTES) {
+ blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+ blake2b_compress( S, in );
+ in += BLAKE2B_BLOCKBYTES;
+ inlen -= BLAKE2B_BLOCKBYTES;
+ }
+ }
+ memcpy( S->buf + S->buflen, in, inlen );
+ S->buflen += inlen;
+ }
+ return 0;
+}
+
+int blake2b_final( blake2b_state *S, void *out, size_t outlen )
+{
+ uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
+ size_t i;
+
+ if( out == NULL || outlen < S->outlen )
+ return -1;
+
+ if( blake2b_is_lastblock( S ) )
+ return -1;
+
+ blake2b_increment_counter( S, S->buflen );
+ blake2b_set_lastblock( S );
+ memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
+ blake2b_compress( S, S->buf );
+
+ for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
+ store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
+
+ memcpy( out, buffer, S->outlen );
+ secure_zero_memory(buffer, sizeof(buffer));
+ return 0;
+}
+
+/* inlen, at least, should be uint64_t. Others can be size_t. */
+int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+ blake2b_state S[1];
+
+ /* Verify parameters */
+ if ( NULL == in && inlen > 0 ) return -1;
+
+ if ( NULL == out ) return -1;
+
+ if( NULL == key && keylen > 0 ) return -1;
+
+ if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+ if( keylen > BLAKE2B_KEYBYTES ) return -1;
+
+ if( keylen > 0 )
+ {
+ if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+ }
+ else
+ {
+ if( blake2b_init( S, outlen ) < 0 ) return -1;
+ }
+
+ blake2b_update( S, ( const uint8_t * )in, inlen );
+ blake2b_final( S, out, outlen );
+ return 0;
+}
+
+int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) {
+ return blake2b(out, outlen, in, inlen, key, keylen);
+}
+
+#if defined(SUPERCOP)
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+ return blake2b( out, BLAKE2B_OUTBYTES, in, inlen, NULL, 0 );
+}
+#endif
+
+#if defined(BLAKE2B_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+ uint8_t key[BLAKE2B_KEYBYTES];
+ uint8_t buf[BLAKE2_KAT_LENGTH];
+ size_t i, step;
+
+ for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
+ key[i] = ( uint8_t )i;
+
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ buf[i] = ( uint8_t )i;
+
+ /* Test simple API */
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ {
+ uint8_t hash[BLAKE2B_OUTBYTES];
+ blake2b( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );
+
+ if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
+ {
+ goto fail;
+ }
+ }
+
+ /* Test streaming API */
+ for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) {
+ for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+ uint8_t hash[BLAKE2B_OUTBYTES];
+ blake2b_state S;
+ uint8_t * p = buf;
+ size_t mlen = i;
+ int err = 0;
+
+ if( (err = blake2b_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) {
+ goto fail;
+ }
+
+ while (mlen >= step) {
+ if ( (err = blake2b_update(&S, p, step)) < 0 ) {
+ goto fail;
+ }
+ mlen -= step;
+ p += step;
+ }
+ if ( (err = blake2b_update(&S, p, mlen)) < 0) {
+ goto fail;
+ }
+ if ( (err = blake2b_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) {
+ goto fail;
+ }
+
+ if (0 != memcmp(hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES)) {
+ goto fail;
+ }
+ }
+ }
+
+ puts( "ok" );
+ return 0;
+fail:
+ puts("error");
+ return -1;
+}
+#endif
diff --git a/neon/blake2b-round.h b/neon/blake2b-round.h
new file mode 100644
index 0000000..9abf25b
--- /dev/null
+++ b/neon/blake2b-round.h
@@ -0,0 +1,76 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+#ifndef BLAKE2B_ROUND_H
+#define BLAKE2B_ROUND_H
+
+#define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))
+
+#define vrorq_n_u64_24(x) vcombine_u64( \
+ vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
+ vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))
+
+#define vrorq_n_u64_16(x) vcombine_u64( \
+ vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
+ vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))
+
+#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+ row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
+ row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
+ row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
+ row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
+ row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
+ row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
+ row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h);
+
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+ row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
+ row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
+ row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
+ row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
+ row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
+ row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
+ row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h);
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+ t0 = vextq_u64(row2l, row2h, 1); \
+ t1 = vextq_u64(row2h, row2l, 1); \
+ row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
+ t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
+ row4l = t0; row4h = t1;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+ t0 = vextq_u64(row2h, row2l, 1); \
+ t1 = vextq_u64(row2l, row2h, 1); \
+ row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
+ t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
+ row4l = t0; row4h = t1;
+
+#include "blake2b-load-neon.h"
+
+#define ROUND(r) \
+ LOAD_MSG_ ##r ##_1(b0, b1); \
+ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+ LOAD_MSG_ ##r ##_2(b0, b1); \
+ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+ DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+ LOAD_MSG_ ##r ##_3(b0, b1); \
+ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+ LOAD_MSG_ ##r ##_4(b0, b1); \
+ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+ UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
+
+#endif
diff --git a/neon/blake2b.c b/neon/blake2b.c
new file mode 100644
index 0000000..b8c8ad0
--- /dev/null
+++ b/neon/blake2b.c
@@ -0,0 +1,342 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <arm_neon.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#include "blake2b-round.h"
+
+static const uint64_t blake2b_IV[8] =
+{
+ 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+ 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+ 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+ 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+/* Some helper functions */
+static void blake2b_set_lastnode( blake2b_state *S )
+{
+ S->f[1] = (uint64_t)-1;
+}
+
+static int blake2b_is_lastblock( const blake2b_state *S )
+{
+ return S->f[0] != 0;
+}
+
+static void blake2b_set_lastblock( blake2b_state *S )
+{
+ if( S->last_node ) blake2b_set_lastnode( S );
+
+ S->f[0] = (uint64_t)-1;
+}
+
+static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
+{
+ S->t[0] += inc;
+ S->t[1] += ( S->t[0] < inc );
+}
+
+static void blake2b_init0( blake2b_state *S )
+{
+ size_t i;
+ memset( S, 0, sizeof( blake2b_state ) );
+
+ for( i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
+}
+
+/* init xors IV with input parameter block */
+int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
+{
+ const uint8_t *p = ( const uint8_t * )( P );
+ size_t i;
+
+ blake2b_init0( S );
+
+ /* IV XOR ParamBlock */
+ for( i = 0; i < 8; ++i )
+ S->h[i] ^= load64( p + sizeof( S->h[i] ) * i );
+
+ S->outlen = P->digest_length;
+ return 0;
+}
+
+
+
+int blake2b_init( blake2b_state *S, size_t outlen )
+{
+ blake2b_param P[1];
+
+ if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = 0;
+ P->fanout = 1;
+ P->depth = 1;
+ store32( &P->leaf_length, 0 );
+ store32( &P->node_offset, 0 );
+ store32( &P->xof_length, 0 );
+ P->node_depth = 0;
+ P->inner_length = 0;
+ memset( P->reserved, 0, sizeof( P->reserved ) );
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+ return blake2b_init_param( S, P );
+}
+
+
+int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
+{
+ blake2b_param P[1];
+
+ if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
+ if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1;
+
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = (uint8_t)keylen;
+ P->fanout = 1;
+ P->depth = 1;
+ store32( &P->leaf_length, 0 );
+ store32( &P->node_offset, 0 );
+ store32( &P->xof_length, 0 );
+ P->node_depth = 0;
+ P->inner_length = 0;
+ memset( P->reserved, 0, sizeof( P->reserved ) );
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+
+ if( blake2b_init_param( S, P ) < 0 ) return -1;
+
+ {
+ uint8_t block[BLAKE2B_BLOCKBYTES];
+ memset( block, 0, BLAKE2B_BLOCKBYTES );
+ memcpy( block, key, keylen );
+ blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
+ secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+ }
+ return 0;
+}
+
+static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+{
+ const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(&block[ 0]));
+ const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(&block[ 16]));
+ const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(&block[ 32]));
+ const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(&block[ 48]));
+ const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(&block[ 64]));
+ const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(&block[ 80]));
+ const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(&block[ 96]));
+ const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(&block[112]));
+
+ uint64x2_t row1l, row1h, row2l, row2h;
+ uint64x2_t row3l, row3h, row4l, row4h;
+ uint64x2_t t0, t1, b0, b1;
+
+ const uint64x2_t h0 = row1l = vld1q_u64(&S->h[0]);
+ const uint64x2_t h1 = row1h = vld1q_u64(&S->h[2]);
+ const uint64x2_t h2 = row2l = vld1q_u64(&S->h[4]);
+ const uint64x2_t h3 = row2h = vld1q_u64(&S->h[6]);
+
+ row3l = vld1q_u64(&blake2b_IV[0]);
+ row3h = vld1q_u64(&blake2b_IV[2]);
+ row4l = veorq_u64(vld1q_u64(&blake2b_IV[4]), vld1q_u64(&S->t[0]));
+ row4h = veorq_u64(vld1q_u64(&blake2b_IV[6]), vld1q_u64(&S->f[0]));
+
+ ROUND( 0 );
+ ROUND( 1 );
+ ROUND( 2 );
+ ROUND( 3 );
+ ROUND( 4 );
+ ROUND( 5 );
+ ROUND( 6 );
+ ROUND( 7 );
+ ROUND( 8 );
+ ROUND( 9 );
+ ROUND( 10 );
+ ROUND( 11 );
+
+ vst1q_u64(&S->h[0], veorq_u64(h0, veorq_u64(row1l, row3l)));
+ vst1q_u64(&S->h[2], veorq_u64(h1, veorq_u64(row1h, row3h)));
+ vst1q_u64(&S->h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
+ vst1q_u64(&S->h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
+}
+
+
+int blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
+{
+ const unsigned char * in = (const unsigned char *)pin;
+ if( inlen > 0 )
+ {
+ size_t left = S->buflen;
+ size_t fill = BLAKE2B_BLOCKBYTES - left;
+ if( inlen > fill )
+ {
+ S->buflen = 0;
+ memcpy( S->buf + left, in, fill ); /* Fill buffer */
+ blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
+ blake2b_compress( S, S->buf ); /* Compress */
+ in += fill; inlen -= fill;
+ while(inlen > BLAKE2B_BLOCKBYTES) {
+ blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+ blake2b_compress( S, in );
+ in += BLAKE2B_BLOCKBYTES;
+ inlen -= BLAKE2B_BLOCKBYTES;
+ }
+ }
+ memcpy( S->buf + S->buflen, in, inlen );
+ S->buflen += inlen;
+ }
+ return 0;
+}
+
+int blake2b_final( blake2b_state *S, void *out, size_t outlen )
+{
+ uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
+ size_t i;
+
+ if( out == NULL || outlen < S->outlen )
+ return -1;
+
+ if( blake2b_is_lastblock( S ) )
+ return -1;
+
+ blake2b_increment_counter( S, S->buflen );
+ blake2b_set_lastblock( S );
+ memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
+ blake2b_compress( S, S->buf );
+
+ for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
+ store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
+
+ memcpy( out, buffer, S->outlen );
+ secure_zero_memory(buffer, sizeof(buffer));
+ return 0;
+}
+
+/* inlen, at least, should be uint64_t. Others can be size_t. */
+int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+ blake2b_state S[1];
+
+ /* Verify parameters */
+ if ( NULL == in && inlen > 0 ) return -1;
+
+ if ( NULL == out ) return -1;
+
+ if( NULL == key && keylen > 0 ) return -1;
+
+ if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+ if( keylen > BLAKE2B_KEYBYTES ) return -1;
+
+ if( keylen > 0 )
+ {
+ if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+ }
+ else
+ {
+ if( blake2b_init( S, outlen ) < 0 ) return -1;
+ }
+
+ blake2b_update( S, ( const uint8_t * )in, inlen );
+ blake2b_final( S, out, outlen );
+ return 0;
+}
+
+int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) {
+ return blake2b(out, outlen, in, inlen, key, keylen);
+}
+
+#if defined(SUPERCOP)
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+ return blake2b( out, BLAKE2B_OUTBYTES, in, inlen, NULL, 0 );
+}
+#endif
+
+#if defined(BLAKE2B_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+ uint8_t key[BLAKE2B_KEYBYTES];
+ uint8_t buf[BLAKE2_KAT_LENGTH];
+ size_t i, step;
+
+ for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
+ key[i] = ( uint8_t )i;
+
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ buf[i] = ( uint8_t )i;
+
+ /* Test simple API */
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ {
+ uint8_t hash[BLAKE2B_OUTBYTES];
+ blake2b( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );
+
+ if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
+ {
+ goto fail;
+ }
+ }
+
+ /* Test streaming API */
+ for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) {
+ for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+ uint8_t hash[BLAKE2B_OUTBYTES];
+ blake2b_state S;
+ uint8_t * p = buf;
+ size_t mlen = i;
+ int err = 0;
+
+ if( (err = blake2b_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) {
+ goto fail;
+ }
+
+ while (mlen >= step) {
+ if ( (err = blake2b_update(&S, p, step)) < 0 ) {
+ goto fail;
+ }
+ mlen -= step;
+ p += step;
+ }
+ if ( (err = blake2b_update(&S, p, mlen)) < 0) {
+ goto fail;
+ }
+ if ( (err = blake2b_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) {
+ goto fail;
+ }
+
+ if (0 != memcmp(hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES)) {
+ goto fail;
+ }
+ }
+ }
+
+ puts( "ok" );
+ return 0;
+fail:
+ puts("error");
+ return -1;
+}
+#endif
diff --git a/neon/blake2bp.c b/neon/blake2bp.c
new file mode 100644
index 0000000..3eb95d0
--- /dev/null
+++ b/neon/blake2bp.c
@@ -0,0 +1,361 @@
+/*
+ BLAKE2 reference source code package - optimized C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#define PARALLELISM_DEGREE 4
+
+/*
+ blake2b_init_param defaults to setting the expecting output length
+ from the digest_length parameter block field.
+
+ In some cases, however, we do not want this, as the output length
+ of these instances is given by inner_length instead.
+*/
+static int blake2bp_init_leaf_param( blake2b_state *S, const blake2b_param *P )
+{
+ int err = blake2b_init_param(S, P);
+ S->outlen = P->inner_length;
+ return err;
+}
+
+static int blake2bp_init_leaf( blake2b_state *S, size_t outlen, size_t keylen, uint64_t offset )
+{
+ blake2b_param P[1];
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = (uint8_t)keylen;
+ P->fanout = PARALLELISM_DEGREE;
+ P->depth = 2;
+ P->leaf_length = 0;
+ P->node_offset = offset;
+ P->xof_length = 0;
+ P->node_depth = 0;
+ P->inner_length = BLAKE2B_OUTBYTES;
+ memset( P->reserved, 0, sizeof( P->reserved ) );
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+ return blake2bp_init_leaf_param( S, P );
+}
+
+static int blake2bp_init_root( blake2b_state *S, size_t outlen, size_t keylen )
+{
+ blake2b_param P[1];
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = (uint8_t)keylen;
+ P->fanout = PARALLELISM_DEGREE;
+ P->depth = 2;
+ P->leaf_length = 0;
+ P->node_offset = 0;
+ P->xof_length = 0;
+ P->node_depth = 1;
+ P->inner_length = BLAKE2B_OUTBYTES;
+ memset( P->reserved, 0, sizeof( P->reserved ) );
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+ return blake2b_init_param( S, P );
+}
+
+
+int blake2bp_init( blake2bp_state *S, size_t outlen )
+{
+ size_t i;
+ if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+ memset( S->buf, 0, sizeof( S->buf ) );
+ S->buflen = 0;
+ S->outlen = outlen;
+
+ if( blake2bp_init_root( S->R, outlen, 0 ) < 0 )
+ return -1;
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ if( blake2bp_init_leaf( S->S[i], outlen, 0, i ) < 0 ) return -1;
+
+ S->R->last_node = 1;
+ S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
+ return 0;
+}
+
+int blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen )
+{
+ size_t i;
+
+ if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+ if( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1;
+
+ memset( S->buf, 0, sizeof( S->buf ) );
+ S->buflen = 0;
+ S->outlen = outlen;
+
+ if( blake2bp_init_root( S->R, outlen, keylen ) < 0 )
+ return -1;
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ if( blake2bp_init_leaf( S->S[i], outlen, keylen, i ) < 0 ) return -1;
+
+ S->R->last_node = 1;
+ S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
+ {
+ uint8_t block[BLAKE2B_BLOCKBYTES];
+ memset( block, 0, BLAKE2B_BLOCKBYTES );
+ memcpy( block, key, keylen );
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2b_update( S->S[i], block, BLAKE2B_BLOCKBYTES );
+
+ secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+ }
+ return 0;
+}
+
+
+int blake2bp_update( blake2bp_state *S, const void *pin, size_t inlen )
+{
+ const unsigned char * in = (const unsigned char *)pin;
+ size_t left = S->buflen;
+ size_t fill = sizeof( S->buf ) - left;
+ size_t i;
+
+ if( left && inlen >= fill )
+ {
+ memcpy( S->buf + left, in, fill );
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2b_update( S->S[i], S->buf + i * BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES );
+
+ in += fill;
+ inlen -= fill;
+ left = 0;
+ }
+
+#if defined(_OPENMP)
+ #pragma omp parallel shared(S), num_threads(PARALLELISM_DEGREE)
+#else
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+#endif
+ {
+#if defined(_OPENMP)
+ size_t i = omp_get_thread_num();
+#endif
+ size_t inlen__ = inlen;
+ const unsigned char *in__ = ( const unsigned char * )in;
+ in__ += i * BLAKE2B_BLOCKBYTES;
+
+ while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES )
+ {
+ blake2b_update( S->S[i], in__, BLAKE2B_BLOCKBYTES );
+ in__ += PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+ inlen__ -= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+ }
+ }
+
+ in += inlen - inlen % ( PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES );
+ inlen %= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+
+ if( inlen > 0 )
+ memcpy( S->buf + left, in, inlen );
+
+ S->buflen = left + inlen;
+ return 0;
+}
+
+
+
+int blake2bp_final( blake2bp_state *S, void *out, size_t outlen )
+{
+ uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES];
+ size_t i;
+
+ if(out == NULL || outlen < S->outlen) {
+ return -1;
+ }
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ {
+ if( S->buflen > i * BLAKE2B_BLOCKBYTES )
+ {
+ size_t left = S->buflen - i * BLAKE2B_BLOCKBYTES;
+
+ if( left > BLAKE2B_BLOCKBYTES ) left = BLAKE2B_BLOCKBYTES;
+
+ blake2b_update( S->S[i], S->buf + i * BLAKE2B_BLOCKBYTES, left );
+ }
+
+ blake2b_final( S->S[i], hash[i], BLAKE2B_OUTBYTES );
+ }
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2b_update( S->R, hash[i], BLAKE2B_OUTBYTES );
+
+ return blake2b_final( S->R, out, S->outlen );
+}
+
+int blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+ uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES];
+ blake2b_state S[PARALLELISM_DEGREE][1];
+ blake2b_state FS[1];
+ size_t i;
+
+ /* Verify parameters */
+ if ( NULL == in && inlen > 0 ) return -1;
+
+ if ( NULL == out ) return -1;
+
+ if( NULL == key && keylen > 0 ) return -1;
+
+ if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
+
+ if( keylen > BLAKE2B_KEYBYTES ) return -1;
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ if( blake2bp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
+
+ S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */
+
+ if( keylen > 0 )
+ {
+ uint8_t block[BLAKE2B_BLOCKBYTES];
+ memset( block, 0, BLAKE2B_BLOCKBYTES );
+ memcpy( block, key, keylen );
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2b_update( S[i], block, BLAKE2B_BLOCKBYTES );
+
+ secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
+ }
+
+#if defined(_OPENMP)
+ #pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE)
+#else
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+#endif
+ {
+#if defined(_OPENMP)
+ size_t i = omp_get_thread_num();
+#endif
+ size_t inlen__ = inlen;
+ const unsigned char *in__ = ( const unsigned char * )in;
+ in__ += i * BLAKE2B_BLOCKBYTES;
+
+ while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES )
+ {
+ blake2b_update( S[i], in__, BLAKE2B_BLOCKBYTES );
+ in__ += PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+ inlen__ -= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES;
+ }
+
+ if( inlen__ > i * BLAKE2B_BLOCKBYTES )
+ {
+ const size_t left = inlen__ - i * BLAKE2B_BLOCKBYTES;
+ const size_t len = left <= BLAKE2B_BLOCKBYTES ? left : BLAKE2B_BLOCKBYTES;
+ blake2b_update( S[i], in__, len );
+ }
+
+ blake2b_final( S[i], hash[i], BLAKE2B_OUTBYTES );
+ }
+
+ if( blake2bp_init_root( FS, outlen, keylen ) < 0 )
+ return -1;
+
+ FS->last_node = 1; /* Mark as last node */
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2b_update( FS, hash[i], BLAKE2B_OUTBYTES );
+
+ return blake2b_final( FS, out, outlen );
+}
+
+
+#if defined(BLAKE2BP_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+ uint8_t key[BLAKE2B_KEYBYTES];
+ uint8_t buf[BLAKE2_KAT_LENGTH];
+ size_t i, step;
+
+ for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
+ key[i] = ( uint8_t )i;
+
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ buf[i] = ( uint8_t )i;
+
+ /* Test simple API */
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ {
+ uint8_t hash[BLAKE2B_OUTBYTES];
+ blake2bp( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );
+
+ if( 0 != memcmp( hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES ) )
+ {
+ goto fail;
+ }
+ }
+
+ /* Test streaming API */
+ for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) {
+ for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+ uint8_t hash[BLAKE2B_OUTBYTES];
+ blake2bp_state S;
+ uint8_t * p = buf;
+ size_t mlen = i;
+ int err = 0;
+
+ if( (err = blake2bp_init_key(&S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES)) < 0 ) {
+ goto fail;
+ }
+
+ while (mlen >= step) {
+ if ( (err = blake2bp_update(&S, p, step)) < 0 ) {
+ goto fail;
+ }
+ mlen -= step;
+ p += step;
+ }
+ if ( (err = blake2bp_update(&S, p, mlen)) < 0) {
+ goto fail;
+ }
+ if ( (err = blake2bp_final(&S, hash, BLAKE2B_OUTBYTES)) < 0) {
+ goto fail;
+ }
+
+ if (0 != memcmp(hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES)) {
+ goto fail;
+ }
+ }
+ }
+
+ puts( "ok" );
+ return 0;
+fail:
+ puts("error");
+ return -1;
+}
+#endif
diff --git a/neon/blake2s-load-neon.h b/neon/blake2s-load-neon.h
new file mode 100644
index 0000000..852197e
--- /dev/null
+++ b/neon/blake2s-load-neon.h
@@ -0,0 +1,193 @@
+/*
+ BLAKE2 reference source code package - optimized C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+#ifndef BLAKE2S_LOAD_NEON_H
+#define BLAKE2S_LOAD_NEON_H
+
+#define LOAD_MSG_0_1(buf) \
+ t1 = vzip_u32(m0, m1); \
+ t2 = vzip_u32(m2, m3); \
+ buf = vcombine_u32(t1.val[0], t2.val[0]);
+
+#define LOAD_MSG_0_2(buf) \
+ t1 = vzip_u32(m0, m1); \
+ t2 = vzip_u32(m2, m3); \
+ buf = vcombine_u32(t1.val[1], t2.val[1]);
+
+#define LOAD_MSG_0_3(buf) \
+ t1 = vzip_u32(m4, m5); \
+ t2 = vzip_u32(m6, m7); \
+ buf = vcombine_u32(t1.val[0], t2.val[0]);
+
+#define LOAD_MSG_0_4(buf) \
+ t1 = vzip_u32(m4, m5); \
+ t2 = vzip_u32(m6, m7); \
+ buf = vcombine_u32(t1.val[1], t2.val[1]);
+
+#define LOAD_MSG_1_1(buf) \
+ t1 = vzip_u32(m7, m2); \
+ t2 = vzip_u32(m4, m6); \
+ buf = vcombine_u32(t1.val[0], t2.val[1]);
+
+#define LOAD_MSG_1_2(buf) \
+ t1 = vzip_u32(m5, m4); \
+ buf = vcombine_u32(t1.val[0], vext_u32(m7, m3, 1));
+
+#define LOAD_MSG_1_3(buf) \
+ t2 = vzip_u32(m5, m2); \
+ buf = vcombine_u32(vrev64_u32(m0), t2.val[1]);
+
+#define LOAD_MSG_1_4(buf) \
+ t1 = vzip_u32(m6, m1); \
+ t2 = vzip_u32(m3, m1); \
+ buf = vcombine_u32(t1.val[0], t2.val[1]);
+
+#define LOAD_MSG_2_1(buf) \
+ t2 = vzip_u32(m2, m7); \
+ buf = vcombine_u32(vext_u32(m5, m6, 1), t2.val[1]);
+
+#define LOAD_MSG_2_2(buf) \
+ t1 = vzip_u32(m4, m0); \
+ buf = vcombine_u32(t1.val[0], vrev64_u32(vext_u32(m6, m1, 1)));
+
+#define LOAD_MSG_2_3(buf) \
+ t2 = vzip_u32(m3, m4); \
+ buf = vcombine_u32(vrev64_u32(vext_u32(m1, m5, 1)), t2.val[1]);
+
+#define LOAD_MSG_2_4(buf) \
+ t1 = vzip_u32(m7, m3); \
+ buf = vcombine_u32(t1.val[0], vext_u32(m0, m2, 1));
+
+#define LOAD_MSG_3_1(buf) \
+ t1 = vzip_u32(m3, m1); \
+ t2 = vzip_u32(m6, m5); \
+ buf = vcombine_u32(t1.val[1], t2.val[1]);
+
+#define LOAD_MSG_3_2(buf) \
+ t1 = vzip_u32(m4, m0); \
+ t2 = vzip_u32(m6, m7); \
+ buf = vcombine_u32(t1.val[1], t2.val[0]);
+
+#define LOAD_MSG_3_3(buf) \
+ buf = vcombine_u32(vrev64_u32(vext_u32(m2, m1, 1)), \
+ vrev64_u32(vext_u32(m7, m2, 1)));
+
+#define LOAD_MSG_3_4(buf) \
+ t1 = vzip_u32(m3, m5); \
+ t2 = vzip_u32(m0, m4); \
+ buf = vcombine_u32(t1.val[0], t2.val[0]);
+
+#define LOAD_MSG_4_1(buf) \
+ t1 = vzip_u32(m4, m2); \
+ t2 = vzip_u32(m1, m5); \
+ buf = vcombine_u32(t1.val[1], t2.val[0]);
+
+#define LOAD_MSG_4_2(buf) \
+ buf = vcombine_u32(vrev64_u32(vext_u32(m3, m0, 1)), \
+ vrev64_u32(vext_u32(m7, m2, 1)));
+
+#define LOAD_MSG_4_3(buf) \
+ buf = vcombine_u32(vrev64_u32(vext_u32(m5, m7, 1)), \
+ vrev64_u32(vext_u32(m1, m3, 1)));
+
+#define LOAD_MSG_4_4(buf) \
+ buf = vcombine_u32(vext_u32(m0, m6, 1), \
+ vrev64_u32(vext_u32(m6, m4, 1)));
+
+#define LOAD_MSG_5_1(buf) \
+ t1 = vzip_u32(m1, m3); \
+ t2 = vzip_u32(m0, m4); \
+ buf = vcombine_u32(t1.val[0], t2.val[0]);
+
+#define LOAD_MSG_5_2(buf) \
+ t1 = vzip_u32(m6, m5); \
+ t2 = vzip_u32(m5, m1); \
+ buf = vcombine_u32(t1.val[0], t2.val[1]);
+
+#define LOAD_MSG_5_3(buf) \
+ t2 = vzip_u32(m7, m0); \
+ buf = vcombine_u32(vrev64_u32(vext_u32(m3, m2, 1)), t2.val[1]);
+
+#define LOAD_MSG_5_4(buf) \
+ t1 = vzip_u32(m6, m2); \
+ buf = vcombine_u32(t1.val[1], vrev64_u32(vext_u32(m4, m7, 1)));
+
+#define LOAD_MSG_6_1(buf) \
+ t2 = vzip_u32(m7, m2); \
+ buf = vcombine_u32(vrev64_u32(vext_u32(m0, m6, 1)), t2.val[0]);
+
+#define LOAD_MSG_6_2(buf) \
+ t1 = vzip_u32(m2, m7); \
+ buf = vcombine_u32(t1.val[1], vext_u32(m6, m5, 1));
+
+#define LOAD_MSG_6_3(buf) \
+ t1 = vzip_u32(m0, m3); \
+ buf = vcombine_u32(t1.val[0], vrev64_u32(m4));
+
+#define LOAD_MSG_6_4(buf) \
+ t1 = vzip_u32(m3, m1); \
+ buf = vcombine_u32(t1.val[1], vrev64_u32(vext_u32(m5, m1, 1)));
+
+#define LOAD_MSG_7_1(buf) \
+ t1 = vzip_u32(m6, m3); \
+ buf = vcombine_u32(t1.val[1], vrev64_u32(vext_u32(m1, m6, 1)));
+
+#define LOAD_MSG_7_2(buf) \
+ t2 = vzip_u32(m0, m4); \
+ buf = vcombine_u32(vext_u32(m5, m7, 1), t2.val[1]);
+
+#define LOAD_MSG_7_3(buf) \
+ t1 = vzip_u32(m2, m7); \
+ t2 = vzip_u32(m4, m1); \
+ buf = vcombine_u32(t1.val[1], t2.val[0]);
+
+#define LOAD_MSG_7_4(buf) \
+ t1 = vzip_u32(m0, m2); \
+ t2 = vzip_u32(m3, m5); \
+ buf = vcombine_u32(t1.val[0], t2.val[0]);
+
+#define LOAD_MSG_8_1(buf) \
+ t1 = vzip_u32(m3, m7); \
+ buf = vcombine_u32(t1.val[0], vext_u32(m5, m0, 1));
+
+#define LOAD_MSG_8_2(buf) \
+ t1 = vzip_u32(m7, m4); \
+ buf = vcombine_u32(t1.val[1], vext_u32(m1, m4, 1));
+
+#define LOAD_MSG_8_3(buf) \
+ buf = vcombine_u32(m6, vext_u32(m0, m5, 1));
+
+#define LOAD_MSG_8_4(buf) \
+ buf = vcombine_u32(vrev64_u32(vext_u32(m3, m1, 1)), m2);
+
+#define LOAD_MSG_9_1(buf) \
+ t1 = vzip_u32(m5, m4); \
+ t2 = vzip_u32(m3, m0); \
+ buf = vcombine_u32(t1.val[0], t2.val[1]);
+
+#define LOAD_MSG_9_2(buf) \
+ t1 = vzip_u32(m1, m2); \
+ buf = vcombine_u32(t1.val[0], vrev64_u32(vext_u32(m2, m3, 1)));
+
+#define LOAD_MSG_9_3(buf) \
+ t1 = vzip_u32(m7, m4); \
+ t2 = vzip_u32(m1, m6); \
+ buf = vcombine_u32(t1.val[1], t2.val[1]);
+
+#define LOAD_MSG_9_4(buf) \
+ t2 = vzip_u32(m6, m0); \
+ buf = vcombine_u32(vext_u32(m5, m7, 1), t2.val[0]);
+
+
+#endif
diff --git a/neon/blake2s-neon.c b/neon/blake2s-neon.c
new file mode 100644
index 0000000..96265b6
--- /dev/null
+++ b/neon/blake2s-neon.c
@@ -0,0 +1,693 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <arm_neon.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+static const uint32_t blake2s_IV[8] =
+{
+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static void blake2s_set_lastnode( blake2s_state *S )
+{
+ S->f[1] = (uint32_t)-1;
+}
+
+/* Some helper functions, not necessarily useful */
+static int blake2s_is_lastblock( const blake2s_state *S )
+{
+ return S->f[0] != 0;
+}
+
+static void blake2s_set_lastblock( blake2s_state *S )
+{
+ if( S->last_node ) blake2s_set_lastnode( S );
+
+ S->f[0] = (uint32_t)-1;
+}
+
+static void blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
+{
+ S->t[0] += inc;
+ S->t[1] += ( S->t[0] < inc );
+}
+
+static void blake2s_init0( blake2s_state *S )
+{
+ size_t i;
+ memset( S, 0, sizeof( blake2s_state ) );
+
+ for( i = 0; i < 8; ++i ) S->h[i] = blake2s_IV[i];
+}
+
+/* init2 xors IV with input parameter block */
+int blake2s_init_param( blake2s_state *S, const blake2s_param *P )
+{
+ const unsigned char *p = ( const unsigned char * )( P );
+ size_t i;
+
+ blake2s_init0( S );
+
+ /* IV XOR ParamBlock */
+ for( i = 0; i < 8; ++i )
+ S->h[i] ^= load32( &p[i * 4] );
+
+ S->outlen = P->digest_length;
+ return 0;
+}
+
+
+/* Sequential blake2s initialization */
+int blake2s_init( blake2s_state *S, size_t outlen )
+{
+ blake2s_param P[1];
+
+ /* Move interval verification here? */
+ if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = 0;
+ P->fanout = 1;
+ P->depth = 1;
+ store32( &P->leaf_length, 0 );
+ store32( &P->node_offset, 0 );
+ store16( &P->xof_length, 0 );
+ P->node_depth = 0;
+ P->inner_length = 0;
+ /* memset(P->reserved, 0, sizeof(P->reserved) ); */
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+ return blake2s_init_param( S, P );
+}
+
+int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen )
+{
+ blake2s_param P[1];
+
+ if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+ if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
+
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = (uint8_t)keylen;
+ P->fanout = 1;
+ P->depth = 1;
+ store32( &P->leaf_length, 0 );
+ store32( &P->node_offset, 0 );
+ store16( &P->xof_length, 0 );
+ P->node_depth = 0;
+ P->inner_length = 0;
+ /* memset(P->reserved, 0, sizeof(P->reserved) ); */
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+
+ if( blake2s_init_param( S, P ) < 0 ) return -1;
+
+ {
+ uint8_t block[BLAKE2S_BLOCKBYTES];
+ memset( block, 0, BLAKE2S_BLOCKBYTES );
+ memcpy( block, key, keylen );
+ blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
+ secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+ }
+ return 0;
+}
+
+/* Round 0 */
+#undef LOAD_MSG_0_1_
+#define LOAD_MSG_0_1_(x) \
+ do { \
+ t1 = vzip_u32(m0, m1); \
+ t2 = vzip_u32(m2, m3); \
+ x = vcombine_u32(t1.val[0], t2.val[0]); \
+ } while(0)
+
+#undef LOAD_MSG_0_2_
+#define LOAD_MSG_0_2_(x) \
+ do { \
+ t1 = vzip_u32(m0, m1); \
+ t2 = vzip_u32(m2, m3); \
+ x = vcombine_u32(t1.val[1], t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_0_3_
+#define LOAD_MSG_0_3_(x) \
+ do { \
+ t1 = vzip_u32(m4, m5); \
+ t2 = vzip_u32(m6, m7); \
+ x = vcombine_u32(t1.val[0], t2.val[0]); \
+ } while(0)
+
+#undef LOAD_MSG_0_4_
+#define LOAD_MSG_0_4_(x) \
+ do { \
+ t1 = vzip_u32(m4, m5); \
+ t2 = vzip_u32(m6, m7); \
+ x = vcombine_u32(t1.val[1], t2.val[1]); \
+ } while(0)
+
+/* Round 1 */
+#undef LOAD_MSG_1_1_
+#define LOAD_MSG_1_1_(x) \
+ do { \
+ t1 = vzip_u32(m7, m2); \
+ t2 = vzip_u32(m4, m6); \
+ x = vcombine_u32(t1.val[0], t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_1_2_
+#define LOAD_MSG_1_2_(x) \
+ do { \
+ t1 = vzip_u32(m5, m4); \
+ x = vcombine_u32(t1.val[0], vext_u32(m7, m3, 1)); \
+ } while(0)
+
+#undef LOAD_MSG_1_3_
+#define LOAD_MSG_1_3_(x) \
+ do { \
+ t2 = vzip_u32(m5, m2); \
+ x = vcombine_u32(vrev64_u32(m0), t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_1_4_
+#define LOAD_MSG_1_4_(x) \
+ do { \
+ t1 = vzip_u32(m6, m1); \
+ t2 = vzip_u32(m3, m1); \
+ x = vcombine_u32(t1.val[0], t2.val[1]); \
+ } while(0)
+
+/* Round 2 */
+#undef LOAD_MSG_2_1_
+#define LOAD_MSG_2_1_(x) \
+ do { \
+ t2 = vzip_u32(m2, m7); \
+ x = vcombine_u32(vext_u32(m5, m6, 1), t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_2_2_
+#define LOAD_MSG_2_2_(x) \
+ do { \
+ t1 = vzip_u32(m4, m0); \
+ x = vcombine_u32(t1.val[0], vrev64_u32(vext_u32(m6, m1, 1))); \
+ } while(0)
+
+#undef LOAD_MSG_2_3_
+#define LOAD_MSG_2_3_(x) \
+ do { \
+ t2 = vzip_u32(m3, m4); \
+ x = vcombine_u32(vrev64_u32(vext_u32(m1, m5, 1)), t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_2_4_
+#define LOAD_MSG_2_4_(x) \
+ do { \
+ t1 = vzip_u32(m7, m3); \
+ x = vcombine_u32(t1.val[0], vext_u32(m0, m2, 1)); \
+ } while(0)
+
+/* Round 3 */
+#undef LOAD_MSG_3_1_
+#define LOAD_MSG_3_1_(x) \
+ do { \
+ t1 = vzip_u32(m3, m1); \
+ t2 = vzip_u32(m6, m5); \
+ x = vcombine_u32(t1.val[1], t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_3_2_
+#define LOAD_MSG_3_2_(x) \
+ do { \
+ t1 = vzip_u32(m4, m0); \
+ t2 = vzip_u32(m6, m7); \
+ x = vcombine_u32(t1.val[1], t2.val[0]); \
+ } while(0)
+
+#undef LOAD_MSG_3_3_
+#define LOAD_MSG_3_3_(x) \
+ do { \
+ x = vcombine_u32(vrev64_u32(vext_u32(m2, m1, 1)), \
+ vrev64_u32(vext_u32(m7, m2, 1))); \
+ } while(0)
+
+#undef LOAD_MSG_3_4_
+#define LOAD_MSG_3_4_(x) \
+ do { \
+ t1 = vzip_u32(m3, m5); \
+ t2 = vzip_u32(m0, m4); \
+ x = vcombine_u32(t1.val[0], t2.val[0]); \
+ } while(0)
+
+/* Round 4 */
+#undef LOAD_MSG_4_1_
+#define LOAD_MSG_4_1_(x) \
+ do { \
+ t1 = vzip_u32(m4, m2); \
+ t2 = vzip_u32(m1, m5); \
+ x = vcombine_u32(t1.val[1], t2.val[0]); \
+ } while(0)
+
+#undef LOAD_MSG_4_2_
+#define LOAD_MSG_4_2_(x) \
+ do { \
+ x = vcombine_u32(vrev64_u32(vext_u32(m3, m0, 1)), \
+ vrev64_u32(vext_u32(m7, m2, 1))); \
+ } while(0)
+
+#undef LOAD_MSG_4_3_
+#define LOAD_MSG_4_3_(x) \
+ do { \
+ x = vcombine_u32(vrev64_u32(vext_u32(m5, m7, 1)), \
+ vrev64_u32(vext_u32(m1, m3, 1))); \
+ } while(0)
+
+#undef LOAD_MSG_4_4_
+#define LOAD_MSG_4_4_(x) \
+ do { \
+ x = vcombine_u32(vext_u32(m0, m6, 1), \
+ vrev64_u32(vext_u32(m6, m4, 1))); \
+ } while(0)
+
+/* Round 5 */
+#undef LOAD_MSG_5_1_
+#define LOAD_MSG_5_1_(x) \
+ do { \
+ t1 = vzip_u32(m1, m3); \
+ t2 = vzip_u32(m0, m4); \
+ x = vcombine_u32(t1.val[0], t2.val[0]); \
+ } while(0)
+
+#undef LOAD_MSG_5_2_
+#define LOAD_MSG_5_2_(x) \
+ do { \
+ t1 = vzip_u32(m6, m5); \
+ t2 = vzip_u32(m5, m1); \
+ x = vcombine_u32(t1.val[0], t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_5_3_
+#define LOAD_MSG_5_3_(x) \
+ do { \
+ t2 = vzip_u32(m7, m0); \
+ x = vcombine_u32(vrev64_u32(vext_u32(m3, m2, 1)), t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_5_4_
+#define LOAD_MSG_5_4_(x) \
+ do { \
+ t1 = vzip_u32(m6, m2); \
+ x = vcombine_u32(t1.val[1], vrev64_u32(vext_u32(m4, m7, 1))); \
+ } while(0)
+
+/* Round 6 */
+#undef LOAD_MSG_6_1_
+#define LOAD_MSG_6_1_(x) \
+ do { \
+ t2 = vzip_u32(m7, m2); \
+ x = vcombine_u32(vrev64_u32(vext_u32(m0, m6, 1)), t2.val[0]); \
+ } while(0)
+
+#undef LOAD_MSG_6_2_
+#define LOAD_MSG_6_2_(x) \
+ do { \
+ t1 = vzip_u32(m2, m7); \
+ x = vcombine_u32(t1.val[1], vext_u32(m6, m5, 1)); \
+ } while(0)
+
+#undef LOAD_MSG_6_3_
+#define LOAD_MSG_6_3_(x) \
+ do { \
+ t1 = vzip_u32(m0, m3); \
+ x = vcombine_u32(t1.val[0], vrev64_u32(m4)); \
+ } while(0)
+
+#undef LOAD_MSG_6_4_
+#define LOAD_MSG_6_4_(x) \
+ do { \
+ t1 = vzip_u32(m3, m1); \
+ x = vcombine_u32(t1.val[1], vrev64_u32(vext_u32(m5, m1, 1))); \
+ } while(0)
+
+/* Round 7 */
+#undef LOAD_MSG_7_1_
+#define LOAD_MSG_7_1_(x) \
+ do { \
+ t1 = vzip_u32(m6, m3); \
+ x = vcombine_u32(t1.val[1], vrev64_u32(vext_u32(m1, m6, 1))); \
+ } while(0)
+
+#undef LOAD_MSG_7_2_
+#define LOAD_MSG_7_2_(x) \
+ do { \
+ t2 = vzip_u32(m0, m4); \
+ x = vcombine_u32(vext_u32(m5, m7, 1), t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_7_3_
+#define LOAD_MSG_7_3_(x) \
+ do { \
+ t1 = vzip_u32(m2, m7); \
+ t2 = vzip_u32(m4, m1); \
+ x = vcombine_u32(t1.val[1], t2.val[0]); \
+ } while(0)
+
+#undef LOAD_MSG_7_4_
+#define LOAD_MSG_7_4_(x) \
+ do { \
+ t1 = vzip_u32(m0, m2); \
+ t2 = vzip_u32(m3, m5); \
+ x = vcombine_u32(t1.val[0], t2.val[0]); \
+ } while(0)
+
+/* Round 8 */
+#undef LOAD_MSG_8_1_
+#define LOAD_MSG_8_1_(x) \
+ do { \
+ t1 = vzip_u32(m3, m7); \
+ x = vcombine_u32(t1.val[0], vext_u32(m5, m0, 1)); \
+ } while(0)
+
+#undef LOAD_MSG_8_2_
+#define LOAD_MSG_8_2_(x) \
+ do { \
+ t1 = vzip_u32(m7, m4); \
+ x = vcombine_u32(t1.val[1], vext_u32(m1, m4, 1)); \
+ } while(0)
+
+#undef LOAD_MSG_8_3_
+#define LOAD_MSG_8_3_(x) \
+ do { \
+ x = vcombine_u32(m6, vext_u32(m0, m5, 1)); \
+ } while(0)
+
+#undef LOAD_MSG_8_4_
+#define LOAD_MSG_8_4_(x) \
+ do { \
+ x = vcombine_u32(vrev64_u32(vext_u32(m3, m1, 1)), m2); \
+ } while(0)
+
+/* Round 9 */
+#undef LOAD_MSG_9_1_
+#define LOAD_MSG_9_1_(x) \
+ do { \
+ t1 = vzip_u32(m5, m4); \
+ t2 = vzip_u32(m3, m0); \
+ x = vcombine_u32(t1.val[0], t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_9_2_
+#define LOAD_MSG_9_2_(x) \
+ do { \
+ t1 = vzip_u32(m1, m2); \
+ x = vcombine_u32(t1.val[0], vrev64_u32(vext_u32(m2, m3, 1))); \
+ } while(0)
+
+#undef LOAD_MSG_9_3_
+#define LOAD_MSG_9_3_(x) \
+ do { \
+ t1 = vzip_u32(m7, m4); \
+ t2 = vzip_u32(m1, m6); \
+ x = vcombine_u32(t1.val[1], t2.val[1]); \
+ } while(0)
+
+#undef LOAD_MSG_9_4_
+#define LOAD_MSG_9_4_(x) \
+ do { \
+ t2 = vzip_u32(m6, m0); \
+ x = vcombine_u32(vext_u32(m5, m7, 1), t2.val[0]); \
+ } while(0)
+
+#define vrorq_n_u32_16(x) vreinterpretq_u32_u16( \
+ vrev32q_u16( \
+ vreinterpretq_u16_u32(x)))
+
+#define vrorq_n_u32_12(x) vorrq_u32( \
+ vshrq_n_u32(x, 12), \
+ vshlq_n_u32(x, 20));
+
+#define vrorq_n_u32_8(x) vorrq_u32( \
+ vshrq_n_u32(x, 8), \
+ vshlq_n_u32(x, 24));
+
+#define vrorq_n_u32_7(x) vorrq_u32( \
+ vshrq_n_u32(x, 7), \
+ vshlq_n_u32(x, 25));
+
+#define DIAGONALIZE(row1, row2, row3, row4) \
+ do { \
+ /* do nothing to row1 */ \
+ row2 = vextq_u32(row2, row2, 1); \
+ row3 = vextq_u32(row3, row3, 2); \
+ row4 = vextq_u32(row4, row4, 3); \
+ } while(0)
+
+#define UNDIAGONALIZE(row1, row2, row3, row4) \
+ do { \
+ /* do nothing to row1 */ \
+ row2 = vextq_u32(row2, row2, 3); \
+ row3 = vextq_u32(row3, row3, 2); \
+ row4 = vextq_u32(row4, row4, 1); \
+ } while(0)
+
+#define G1(r, i, row1, row2, row3, row4) \
+ do { \
+ LOAD_MSG_##r##_##i##_(e1234); \
+ row1 = vaddq_u32(row1, vaddq_u32(row2, e1234)); \
+ row4 = vrorq_n_u32_16(veorq_u32(row4, row1)); \
+ row3 = vaddq_u32(row3, row4); \
+ row2 = vrorq_n_u32_12(veorq_u32(row2, row3)); \
+ } while(0)
+
+
+#define G2(r, i, row1, row2, row3, row4) \
+ do { \
+ LOAD_MSG_##r##_##i##_(e1234); \
+ row1 = vaddq_u32(row1, vaddq_u32(row2, e1234)); \
+ row4 = vrorq_n_u32_8(veorq_u32(row4, row1)); \
+ row3 = vaddq_u32(row3, row4); \
+ row2 = vrorq_n_u32_7(veorq_u32(row2, row3)); \
+ } while(0)
+
+#define ROUND(r) \
+ do { \
+ G1(r, 1, row1, row2, row3, row4); \
+ G2(r, 2, row1, row2, row3, row4); \
+ DIAGONALIZE(row1, row2, row3, row4); \
+ G1(r, 3, row1, row2, row3, row4); \
+ G2(r, 4, row1, row2, row3, row4); \
+ UNDIAGONALIZE(row1, row2, row3, row4); \
+ } while(0)
+
+static void blake2s_compress( blake2s_state *S,
+ const uint8_t in[BLAKE2S_BLOCKBYTES] )
+{
+ uint32x4_t row1, row2, row3, row4, e1234;
+ uint32x2x2_t t1, t2;
+ const uint32x4_t h1234 = row1 = vld1q_u32(&S->h[0]);
+ const uint32x4_t h5678 = row2 = vld1q_u32(&S->h[4]);
+
+ const uint32x2_t m0 = vreinterpret_u32_u8(vld1_u8(&in[ 0]));
+ const uint32x2_t m1 = vreinterpret_u32_u8(vld1_u8(&in[ 8]));
+ const uint32x2_t m2 = vreinterpret_u32_u8(vld1_u8(&in[16]));
+ const uint32x2_t m3 = vreinterpret_u32_u8(vld1_u8(&in[24]));
+ const uint32x2_t m4 = vreinterpret_u32_u8(vld1_u8(&in[32]));
+ const uint32x2_t m5 = vreinterpret_u32_u8(vld1_u8(&in[40]));
+ const uint32x2_t m6 = vreinterpret_u32_u8(vld1_u8(&in[48]));
+ const uint32x2_t m7 = vreinterpret_u32_u8(vld1_u8(&in[56]));
+
+ row3 = vld1q_u32(&blake2s_IV[0]);
+
+ row4 = veorq_u32(vcombine_u32(vld1_u32(&S->t[0]), vld1_u32(&S->f[0])),
+ vld1q_u32(&blake2s_IV[4]));
+
+ ROUND( 0 );
+ ROUND( 1 );
+ ROUND( 2 );
+ ROUND( 3 );
+ ROUND( 4 );
+ ROUND( 5 );
+ ROUND( 6 );
+ ROUND( 7 );
+ ROUND( 8 );
+ ROUND( 9 );
+
+ vst1q_u32(&S->h[0], veorq_u32(h1234, veorq_u32(row1, row3)));
+ vst1q_u32(&S->h[4], veorq_u32(h5678, veorq_u32(row2, row4)));
+}
+
+#undef G1234
+#undef ROUND
+
+int blake2s_update( blake2s_state *S, const void *pin, size_t inlen )
+{
+ const unsigned char * in = (const unsigned char *)pin;
+ if( inlen > 0 )
+ {
+ size_t left = S->buflen;
+ size_t fill = BLAKE2S_BLOCKBYTES - left;
+ if( inlen > fill )
+ {
+ S->buflen = 0;
+ memcpy( S->buf + left, in, fill ); /* Fill buffer */
+ blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
+ blake2s_compress( S, S->buf ); /* Compress */
+ in += fill; inlen -= fill;
+ while(inlen > BLAKE2S_BLOCKBYTES) {
+ blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES);
+ blake2s_compress( S, in );
+ in += BLAKE2S_BLOCKBYTES;
+ inlen -= BLAKE2S_BLOCKBYTES;
+ }
+ }
+ memcpy( S->buf + S->buflen, in, inlen );
+ S->buflen += inlen;
+ }
+ return 0;
+}
+
+int blake2s_final( blake2s_state *S, void *out, size_t outlen )
+{
+ uint8_t buffer[BLAKE2S_OUTBYTES] = {0};
+ size_t i;
+
+ if( out == NULL || outlen < S->outlen )
+ return -1;
+
+ if( blake2s_is_lastblock( S ) )
+ return -1;
+
+ blake2s_increment_counter( S, ( uint32_t )S->buflen );
+ blake2s_set_lastblock( S );
+ memset( S->buf + S->buflen, 0, BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
+ blake2s_compress( S, S->buf );
+
+ for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
+ store32( buffer + sizeof( S->h[i] ) * i, S->h[i] );
+
+ memcpy( out, buffer, outlen );
+ secure_zero_memory(buffer, sizeof(buffer));
+ return 0;
+}
+
+int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+ blake2s_state S[1];
+
+ /* Verify parameters */
+ if ( NULL == in && inlen > 0 ) return -1;
+
+ if ( NULL == out ) return -1;
+
+ if ( NULL == key && keylen > 0) return -1;
+
+ if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+ if( keylen > BLAKE2S_KEYBYTES ) return -1;
+
+ if( keylen > 0 )
+ {
+ if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+ }
+ else
+ {
+ if( blake2s_init( S, outlen ) < 0 ) return -1;
+ }
+
+ blake2s_update( S, ( const uint8_t * )in, inlen );
+ blake2s_final( S, out, outlen );
+ return 0;
+}
+
+#if defined(SUPERCOP)
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+ return blake2s( out, BLAKE2S_OUTBYTES, in, inlen, NULL, 0 );
+}
+#endif
+
+#if defined(BLAKE2S_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+ uint8_t key[BLAKE2S_KEYBYTES];
+ uint8_t buf[BLAKE2_KAT_LENGTH];
+ size_t i, step;
+
+ for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
+ key[i] = ( uint8_t )i;
+
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ buf[i] = ( uint8_t )i;
+
+ /* Test simple API */
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ {
+ uint8_t hash[BLAKE2S_OUTBYTES];
+ blake2s( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES );
+
+ if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
+ {
+ goto fail;
+ }
+ }
+
+ /* Test streaming API */
+ for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) {
+ for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+ uint8_t hash[BLAKE2S_OUTBYTES];
+ blake2s_state S;
+ uint8_t * p = buf;
+ size_t mlen = i;
+ int err = 0;
+
+ if( (err = blake2s_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) {
+ goto fail;
+ }
+
+ while (mlen >= step) {
+ if ( (err = blake2s_update(&S, p, step)) < 0 ) {
+ goto fail;
+ }
+ mlen -= step;
+ p += step;
+ }
+ if ( (err = blake2s_update(&S, p, mlen)) < 0) {
+ goto fail;
+ }
+ if ( (err = blake2s_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) {
+ goto fail;
+ }
+
+ if (0 != memcmp(hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES)) {
+ goto fail;
+ }
+ }
+ }
+
+ puts( "ok" );
+ return 0;
+fail:
+ puts("error");
+ return -1;
+}
+#endif
diff --git a/neon/blake2s-round.h b/neon/blake2s-round.h
new file mode 100644
index 0000000..b32c3ad
--- /dev/null
+++ b/neon/blake2s-round.h
@@ -0,0 +1,70 @@
+/*
+ BLAKE2 reference source code package - optimized C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+#ifndef BLAKE2S_ROUND_H
+#define BLAKE2S_ROUND_H
+
+#define vrorq_n_u32_16(x) vreinterpretq_u32_u16( \
+ vrev32q_u16( \
+ vreinterpretq_u16_u32(x)))
+
+#define vrorq_n_u32_12(x) vorrq_u32( \
+ vshrq_n_u32(x, 12), \
+ vshlq_n_u32(x, 20));
+
+#define vrorq_n_u32_8(x) vorrq_u32( \
+ vshrq_n_u32(x, 8), \
+ vshlq_n_u32(x, 24));
+
+#define vrorq_n_u32_7(x) vorrq_u32( \
+ vshrq_n_u32(x, 7), \
+ vshlq_n_u32(x, 25));
+
+#define G1(row1,row2,row3,row4,buf) \
+ row1 = vaddq_u32(row1, vaddq_u32(row2, buf)); \
+ row4 = vrorq_n_u32_16(veorq_u32(row4, row1)); \
+ row3 = vaddq_u32(row3, row4); \
+ row2 = vrorq_n_u32_12(veorq_u32(row2, row3));
+
+#define G2(row1, row2, row3, row4,buf) \
+ row1 = vaddq_u32(row1, vaddq_u32(row2, buf)); \
+ row4 = vrorq_n_u32_8(veorq_u32(row4, row1)); \
+ row3 = vaddq_u32(row3, row4); \
+ row2 = vrorq_n_u32_7(veorq_u32(row2, row3));
+
+#define DIAGONALIZE(row1,row2,row3,row4) \
+ row2 = vextq_u32(row2, row2, 1); \
+ row3 = vextq_u32(row3, row3, 2); \
+ row4 = vextq_u32(row4, row4, 3);
+
+#define UNDIAGONALIZE(row1,row2,row3,row4) \
+ row2 = vextq_u32(row2, row2, 3); \
+ row3 = vextq_u32(row3, row3, 2); \
+ row4 = vextq_u32(row4, row4, 1);
+
+#include "blake2s-load-neon.h"
+
+#define ROUND(r) \
+ LOAD_MSG_ ##r ##_1(buf1); \
+ G1(row1, row2, row3, row4, buf1); \
+ LOAD_MSG_ ##r ##_2(buf2); \
+ G2(row1, row2, row3, row4, buf2); \
+ DIAGONALIZE(row1, row2, row3, row4); \
+ LOAD_MSG_ ##r ##_3(buf3); \
+ G1(row1, row2, row3, row4, buf3); \
+ LOAD_MSG_ ##r ##_4(buf4); \
+ G2(row1, row2, row3, row4, buf4); \
+ UNDIAGONALIZE(row1, row2, row3, row4);
+
+#endif
diff --git a/neon/blake2s.c b/neon/blake2s.c
new file mode 100644
index 0000000..1b52420
--- /dev/null
+++ b/neon/blake2s.c
@@ -0,0 +1,331 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <arm_neon.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#include "blake2s-round.h"
+
+static const uint32_t blake2s_IV[8] =
+{
+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static void blake2s_set_lastnode( blake2s_state *S )
+{
+ S->f[1] = (uint32_t)-1;
+}
+
+/* Some helper functions, not necessarily useful */
+static int blake2s_is_lastblock( const blake2s_state *S )
+{
+ return S->f[0] != 0;
+}
+
+static void blake2s_set_lastblock( blake2s_state *S )
+{
+ if( S->last_node ) blake2s_set_lastnode( S );
+
+ S->f[0] = (uint32_t)-1;
+}
+
+static void blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
+{
+ S->t[0] += inc;
+ S->t[1] += ( S->t[0] < inc );
+}
+
+static void blake2s_init0( blake2s_state *S )
+{
+ size_t i;
+ memset( S, 0, sizeof( blake2s_state ) );
+
+ for( i = 0; i < 8; ++i ) S->h[i] = blake2s_IV[i];
+}
+
+/* init2 xors IV with input parameter block */
+int blake2s_init_param( blake2s_state *S, const blake2s_param *P )
+{
+ const unsigned char *p = ( const unsigned char * )( P );
+ size_t i;
+
+ blake2s_init0( S );
+
+ /* IV XOR ParamBlock */
+ for( i = 0; i < 8; ++i )
+ S->h[i] ^= load32( &p[i * 4] );
+
+ S->outlen = P->digest_length;
+ return 0;
+}
+
+
+/* Sequential blake2s initialization */
+int blake2s_init( blake2s_state *S, size_t outlen )
+{
+ blake2s_param P[1];
+
+ /* Move interval verification here? */
+ if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = 0;
+ P->fanout = 1;
+ P->depth = 1;
+ store32( &P->leaf_length, 0 );
+ store32( &P->node_offset, 0 );
+ store16( &P->xof_length, 0 );
+ P->node_depth = 0;
+ P->inner_length = 0;
+ /* memset(P->reserved, 0, sizeof(P->reserved) ); */
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+ return blake2s_init_param( S, P );
+}
+
+
+int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen )
+{
+ blake2s_param P[1];
+
+ if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+ if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
+
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = (uint8_t)keylen;
+ P->fanout = 1;
+ P->depth = 1;
+ store32( &P->leaf_length, 0 );
+ store32( &P->node_offset, 0 );
+ store16( &P->xof_length, 0 );
+ P->node_depth = 0;
+ P->inner_length = 0;
+ /* memset(P->reserved, 0, sizeof(P->reserved) ); */
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+
+ if( blake2s_init_param( S, P ) < 0 ) return -1;
+
+ {
+ uint8_t block[BLAKE2S_BLOCKBYTES];
+ memset( block, 0, BLAKE2S_BLOCKBYTES );
+ memcpy( block, key, keylen );
+ blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
+ secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+ }
+ return 0;
+}
+
+
+static void blake2s_compress( blake2s_state *S,
+ const uint8_t in[BLAKE2S_BLOCKBYTES] )
+{
+ uint32x4_t row1, row2, row3, row4;
+ uint32x4_t buf1, buf2, buf3, buf4;
+ uint32x2x2_t t1, t2;
+ const uint32x4_t h1234 = row1 = vld1q_u32(&S->h[0]);
+ const uint32x4_t h5678 = row2 = vld1q_u32(&S->h[4]);
+
+ const uint32x2_t m0 = vreinterpret_u32_u8(vld1_u8(&in[ 0]));
+ const uint32x2_t m1 = vreinterpret_u32_u8(vld1_u8(&in[ 8]));
+ const uint32x2_t m2 = vreinterpret_u32_u8(vld1_u8(&in[16]));
+ const uint32x2_t m3 = vreinterpret_u32_u8(vld1_u8(&in[24]));
+ const uint32x2_t m4 = vreinterpret_u32_u8(vld1_u8(&in[32]));
+ const uint32x2_t m5 = vreinterpret_u32_u8(vld1_u8(&in[40]));
+ const uint32x2_t m6 = vreinterpret_u32_u8(vld1_u8(&in[48]));
+ const uint32x2_t m7 = vreinterpret_u32_u8(vld1_u8(&in[56]));
+
+ row3 = vld1q_u32(&blake2s_IV[0]);
+
+ row4 = veorq_u32(vcombine_u32(vld1_u32(&S->t[0]), vld1_u32(&S->f[0])),
+ vld1q_u32(&blake2s_IV[4]));
+
+ ROUND( 0 );
+ ROUND( 1 );
+ ROUND( 2 );
+ ROUND( 3 );
+ ROUND( 4 );
+ ROUND( 5 );
+ ROUND( 6 );
+ ROUND( 7 );
+ ROUND( 8 );
+ ROUND( 9 );
+
+ vst1q_u32(&S->h[0], veorq_u32(h1234, veorq_u32(row1, row3)));
+ vst1q_u32(&S->h[4], veorq_u32(h5678, veorq_u32(row2, row4)));
+}
+
+
+int blake2s_update( blake2s_state *S, const void *pin, size_t inlen )
+{
+ const unsigned char * in = (const unsigned char *)pin;
+ if( inlen > 0 )
+ {
+ size_t left = S->buflen;
+ size_t fill = BLAKE2S_BLOCKBYTES - left;
+ if( inlen > fill )
+ {
+ S->buflen = 0;
+ memcpy( S->buf + left, in, fill ); /* Fill buffer */
+ blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
+ blake2s_compress( S, S->buf ); /* Compress */
+ in += fill; inlen -= fill;
+ while(inlen > BLAKE2S_BLOCKBYTES) {
+ blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES);
+ blake2s_compress( S, in );
+ in += BLAKE2S_BLOCKBYTES;
+ inlen -= BLAKE2S_BLOCKBYTES;
+ }
+ }
+ memcpy( S->buf + S->buflen, in, inlen );
+ S->buflen += inlen;
+ }
+ return 0;
+}
+
+int blake2s_final( blake2s_state *S, void *out, size_t outlen )
+{
+ uint8_t buffer[BLAKE2S_OUTBYTES] = {0};
+ size_t i;
+
+ if( out == NULL || outlen < S->outlen )
+ return -1;
+
+ if( blake2s_is_lastblock( S ) )
+ return -1;
+
+ blake2s_increment_counter( S, ( uint32_t )S->buflen );
+ blake2s_set_lastblock( S );
+ memset( S->buf + S->buflen, 0, BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
+ blake2s_compress( S, S->buf );
+
+ for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
+ store32( buffer + sizeof( S->h[i] ) * i, S->h[i] );
+
+ memcpy( out, buffer, outlen );
+ secure_zero_memory(buffer, sizeof(buffer));
+ return 0;
+}
+
+int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+ blake2s_state S[1];
+
+ /* Verify parameters */
+ if ( NULL == in && inlen > 0 ) return -1;
+
+ if ( NULL == out ) return -1;
+
+ if ( NULL == key && keylen > 0) return -1;
+
+ if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+ if( keylen > BLAKE2S_KEYBYTES ) return -1;
+
+ if( keylen > 0 )
+ {
+ if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+ }
+ else
+ {
+ if( blake2s_init( S, outlen ) < 0 ) return -1;
+ }
+
+ blake2s_update( S, ( const uint8_t * )in, inlen );
+ blake2s_final( S, out, outlen );
+ return 0;
+}
+
+#if defined(SUPERCOP)
+int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
+{
+ return blake2s( out, BLAKE2S_OUTBYTES, in, inlen, NULL, 0 );
+}
+#endif
+
+#if defined(BLAKE2S_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+ uint8_t key[BLAKE2S_KEYBYTES];
+ uint8_t buf[BLAKE2_KAT_LENGTH];
+ size_t i, step;
+
+ for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
+ key[i] = ( uint8_t )i;
+
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ buf[i] = ( uint8_t )i;
+
+ /* Test simple API */
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ {
+ uint8_t hash[BLAKE2S_OUTBYTES];
+ blake2s( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES );
+
+ if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
+ {
+ goto fail;
+ }
+ }
+
+ /* Test streaming API */
+ for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) {
+ for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+ uint8_t hash[BLAKE2S_OUTBYTES];
+ blake2s_state S;
+ uint8_t * p = buf;
+ size_t mlen = i;
+ int err = 0;
+
+ if( (err = blake2s_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) {
+ goto fail;
+ }
+
+ while (mlen >= step) {
+ if ( (err = blake2s_update(&S, p, step)) < 0 ) {
+ goto fail;
+ }
+ mlen -= step;
+ p += step;
+ }
+ if ( (err = blake2s_update(&S, p, mlen)) < 0) {
+ goto fail;
+ }
+ if ( (err = blake2s_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) {
+ goto fail;
+ }
+
+ if (0 != memcmp(hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES)) {
+ goto fail;
+ }
+ }
+ }
+
+ puts( "ok" );
+ return 0;
+fail:
+ puts("error");
+ return -1;
+}
+#endif
diff --git a/neon/blake2sp.c b/neon/blake2sp.c
new file mode 100644
index 0000000..ed0e1ad
--- /dev/null
+++ b/neon/blake2sp.c
@@ -0,0 +1,358 @@
+/*
+ BLAKE2 reference source code package - optimized C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+#define PARALLELISM_DEGREE 8
+
+/*
+ blake2sp_init_param defaults to setting the expecting output length
+ from the digest_length parameter block field.
+
+ In some cases, however, we do not want this, as the output length
+ of these instances is given by inner_length instead.
+*/
+static int blake2sp_init_leaf_param( blake2s_state *S, const blake2s_param *P )
+{
+ int err = blake2s_init_param(S, P);
+ S->outlen = P->inner_length;
+ return err;
+}
+
+static int blake2sp_init_leaf( blake2s_state *S, size_t outlen, size_t keylen, uint64_t offset )
+{
+ blake2s_param P[1];
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = (uint8_t)keylen;
+ P->fanout = PARALLELISM_DEGREE;
+ P->depth = 2;
+ P->leaf_length = 0;
+ P->node_offset = offset;
+ P->xof_length = 0;
+ P->node_depth = 0;
+ P->inner_length = BLAKE2S_OUTBYTES;
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+ return blake2sp_init_leaf_param( S, P );
+}
+
+static int blake2sp_init_root( blake2s_state *S, size_t outlen, size_t keylen )
+{
+ blake2s_param P[1];
+ P->digest_length = (uint8_t)outlen;
+ P->key_length = (uint8_t)keylen;
+ P->fanout = PARALLELISM_DEGREE;
+ P->depth = 2;
+ P->leaf_length = 0;
+ P->node_offset = 0;
+ P->xof_length = 0;
+ P->node_depth = 1;
+ P->inner_length = BLAKE2S_OUTBYTES;
+ memset( P->salt, 0, sizeof( P->salt ) );
+ memset( P->personal, 0, sizeof( P->personal ) );
+ return blake2s_init_param( S, P );
+}
+
+
+int blake2sp_init( blake2sp_state *S, size_t outlen )
+{
+ size_t i;
+
+ if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+ memset( S->buf, 0, sizeof( S->buf ) );
+ S->buflen = 0;
+ S->outlen = outlen;
+
+ if( blake2sp_init_root( S->R, outlen, 0 ) < 0 )
+ return -1;
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ if( blake2sp_init_leaf( S->S[i], outlen, 0, i ) < 0 ) return -1;
+
+ S->R->last_node = 1;
+ S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
+ return 0;
+}
+
+int blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen )
+{
+ size_t i;
+
+ if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+ if( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
+
+ memset( S->buf, 0, sizeof( S->buf ) );
+ S->buflen = 0;
+ S->outlen = outlen;
+
+ if( blake2sp_init_root( S->R, outlen, keylen ) < 0 )
+ return -1;
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ if( blake2sp_init_leaf( S->S[i], outlen, keylen, i ) < 0 ) return -1;
+
+ S->R->last_node = 1;
+ S->S[PARALLELISM_DEGREE - 1]->last_node = 1;
+ {
+ uint8_t block[BLAKE2S_BLOCKBYTES];
+ memset( block, 0, BLAKE2S_BLOCKBYTES );
+ memcpy( block, key, keylen );
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2s_update( S->S[i], block, BLAKE2S_BLOCKBYTES );
+
+ secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+ }
+ return 0;
+}
+
+
+int blake2sp_update( blake2sp_state *S, const void *pin, size_t inlen )
+{
+ const unsigned char * in = (const unsigned char *)pin;
+ size_t left = S->buflen;
+ size_t fill = sizeof( S->buf ) - left;
+ size_t i;
+
+ if( left && inlen >= fill )
+ {
+ memcpy( S->buf + left, in, fill );
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2s_update( S->S[i], S->buf + i * BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES );
+
+ in += fill;
+ inlen -= fill;
+ left = 0;
+ }
+
+#if defined(_OPENMP)
+ #pragma omp parallel shared(S), num_threads(PARALLELISM_DEGREE)
+#else
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+#endif
+ {
+#if defined(_OPENMP)
+ size_t i = omp_get_thread_num();
+#endif
+ size_t inlen__ = inlen;
+ const unsigned char *in__ = ( const unsigned char * )in;
+ in__ += i * BLAKE2S_BLOCKBYTES;
+
+ while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES )
+ {
+ blake2s_update( S->S[i], in__, BLAKE2S_BLOCKBYTES );
+ in__ += PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+ inlen__ -= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+ }
+ }
+
+ in += inlen - inlen % ( PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES );
+ inlen %= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+
+ if( inlen > 0 )
+ memcpy( S->buf + left, in, inlen );
+
+ S->buflen = left + inlen;
+ return 0;
+}
+
+
+int blake2sp_final( blake2sp_state *S, void *out, size_t outlen )
+{
+ uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES];
+ size_t i;
+
+ if(out == NULL || outlen < S->outlen) {
+ return -1;
+ }
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ {
+ if( S->buflen > i * BLAKE2S_BLOCKBYTES )
+ {
+ size_t left = S->buflen - i * BLAKE2S_BLOCKBYTES;
+
+ if( left > BLAKE2S_BLOCKBYTES ) left = BLAKE2S_BLOCKBYTES;
+
+ blake2s_update( S->S[i], S->buf + i * BLAKE2S_BLOCKBYTES, left );
+ }
+
+ blake2s_final( S->S[i], hash[i], BLAKE2S_OUTBYTES );
+ }
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2s_update( S->R, hash[i], BLAKE2S_OUTBYTES );
+
+ return blake2s_final( S->R, out, S->outlen );
+}
+
+
+int blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
+{
+ uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES];
+ blake2s_state S[PARALLELISM_DEGREE][1];
+ blake2s_state FS[1];
+ size_t i;
+
+ /* Verify parameters */
+ if ( NULL == in && inlen > 0 ) return -1;
+
+ if ( NULL == out ) return -1;
+
+ if ( NULL == key && keylen > 0) return -1;
+
+ if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;
+
+ if( keylen > BLAKE2S_KEYBYTES ) return -1;
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ if( blake2sp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
+
+ S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */
+
+ if( keylen > 0 )
+ {
+ uint8_t block[BLAKE2S_BLOCKBYTES];
+ memset( block, 0, BLAKE2S_BLOCKBYTES );
+ memcpy( block, key, keylen );
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2s_update( S[i], block, BLAKE2S_BLOCKBYTES );
+
+ secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+ }
+
+#if defined(_OPENMP)
+ #pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE)
+#else
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+#endif
+ {
+#if defined(_OPENMP)
+ size_t i = omp_get_thread_num();
+#endif
+ size_t inlen__ = inlen;
+ const unsigned char *in__ = ( const unsigned char * )in;
+ in__ += i * BLAKE2S_BLOCKBYTES;
+
+ while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES )
+ {
+ blake2s_update( S[i], in__, BLAKE2S_BLOCKBYTES );
+ in__ += PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+ inlen__ -= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
+ }
+
+ if( inlen__ > i * BLAKE2S_BLOCKBYTES )
+ {
+ const size_t left = inlen__ - i * BLAKE2S_BLOCKBYTES;
+ const size_t len = left <= BLAKE2S_BLOCKBYTES ? left : BLAKE2S_BLOCKBYTES;
+ blake2s_update( S[i], in__, len );
+ }
+
+ blake2s_final( S[i], hash[i], BLAKE2S_OUTBYTES );
+ }
+
+ if( blake2sp_init_root( FS, outlen, keylen ) < 0 )
+ return -1;
+
+ FS->last_node = 1;
+
+ for( i = 0; i < PARALLELISM_DEGREE; ++i )
+ blake2s_update( FS, hash[i], BLAKE2S_OUTBYTES );
+
+ return blake2s_final( FS, out, outlen );
+}
+
+#if defined(BLAKE2SP_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+ uint8_t key[BLAKE2S_KEYBYTES];
+ uint8_t buf[BLAKE2_KAT_LENGTH];
+ size_t i, step;
+
+ for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
+ key[i] = ( uint8_t )i;
+
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ buf[i] = ( uint8_t )i;
+
+ /* Test simple API */
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
+ {
+ uint8_t hash[BLAKE2S_OUTBYTES];
+ blake2sp( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES );
+
+ if( 0 != memcmp( hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES ) )
+ {
+ goto fail;
+ }
+ }
+
+ /* Test streaming API */
+ for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) {
+ for (i = 0; i < BLAKE2_KAT_LENGTH; ++i) {
+ uint8_t hash[BLAKE2S_OUTBYTES];
+ blake2sp_state S;
+ uint8_t * p = buf;
+ size_t mlen = i;
+ int err = 0;
+
+ if( (err = blake2sp_init_key(&S, BLAKE2S_OUTBYTES, key, BLAKE2S_KEYBYTES)) < 0 ) {
+ goto fail;
+ }
+
+ while (mlen >= step) {
+ if ( (err = blake2sp_update(&S, p, step)) < 0 ) {
+ goto fail;
+ }
+ mlen -= step;
+ p += step;
+ }
+ if ( (err = blake2sp_update(&S, p, mlen)) < 0) {
+ goto fail;
+ }
+ if ( (err = blake2sp_final(&S, hash, BLAKE2S_OUTBYTES)) < 0) {
+ goto fail;
+ }
+
+ if (0 != memcmp(hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES)) {
+ goto fail;
+ }
+ }
+ }
+
+ puts( "ok" );
+ return 0;
+fail:
+ puts("error");
+ return -1;
+}
+#endif
diff --git a/neon/blake2xb.c b/neon/blake2xb.c
new file mode 100644
index 0000000..2da56ae
--- /dev/null
+++ b/neon/blake2xb.c
@@ -0,0 +1,241 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2016, JP Aumasson <jeanphilippe.aumasson@gmail.com>.
+ Copyright 2016, Samuel Neves <sneves@dei.uc.pt>.
+
+ You may use this under the terms of the CC0, the OpenSSL Licence, or
+ the Apache Public License 2.0, at your option. The terms of these
+ licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+int blake2xb_init( blake2xb_state *S, const size_t outlen ) {
+ return blake2xb_init_key(S, outlen, NULL, 0);
+}
+
+int blake2xb_init_key( blake2xb_state *S, const size_t outlen, const void *key, size_t keylen)
+{
+ if ( outlen == 0 || outlen > 0xFFFFFFFFUL ) {
+ return -1;
+ }
+
+ if (NULL != key && keylen > BLAKE2B_KEYBYTES) {
+ return -1;
+ }
+
+ if (NULL == key && keylen > 0) {
+ return -1;
+ }
+
+ /* Initialize parameter block */
+ S->P->digest_length = BLAKE2B_OUTBYTES;
+ S->P->key_length = keylen;
+ S->P->fanout = 1;
+ S->P->depth = 1;
+ store32( &S->P->leaf_length, 0 );
+ store32( &S->P->node_offset, 0 );
+ store32( &S->P->xof_length, outlen );
+ S->P->node_depth = 0;
+ S->P->inner_length = 0;
+ memset( S->P->reserved, 0, sizeof( S->P->reserved ) );
+ memset( S->P->salt, 0, sizeof( S->P->salt ) );
+ memset( S->P->personal, 0, sizeof( S->P->personal ) );
+
+ if( blake2b_init_param( S->S, S->P ) < 0 ) {
+ return -1;
+ }
+
+ if (keylen > 0) {
+ uint8_t block[BLAKE2B_BLOCKBYTES];
+ memset(block, 0, BLAKE2B_BLOCKBYTES);
+ memcpy(block, key, keylen);
+ blake2b_update(S->S, block, BLAKE2B_BLOCKBYTES);
+ secure_zero_memory(block, BLAKE2B_BLOCKBYTES);
+ }
+ return 0;
+}
+
+int blake2xb_update( blake2xb_state *S, const void *in, size_t inlen ) {
+ return blake2b_update( S->S, in, inlen );
+}
+
+int blake2xb_final( blake2xb_state *S, void *out, size_t outlen) {
+
+ blake2b_state C[1];
+ blake2b_param P[1];
+ uint32_t xof_length = load32(&S->P->xof_length);
+ uint8_t root[BLAKE2B_BLOCKBYTES];
+ size_t i;
+
+ if (NULL == out) {
+ return -1;
+ }
+
+ /* outlen must match the output size defined in xof_length, */
+ /* unless it was -1, in which case anything goes except 0. */
+ if(xof_length == 0xFFFFFFFFUL) {
+ if(outlen == 0) {
+ return -1;
+ }
+ } else {
+ if(outlen != xof_length) {
+ return -1;
+ }
+ }
+
+ /* Finalize the root hash */
+ if (blake2b_final(S->S, root, BLAKE2B_OUTBYTES) < 0) {
+ return -1;
+ }
+
+ /* Set common block structure values */
+ /* Copy values from parent instance, and only change the ones below */
+ memcpy(P, S->P, sizeof(blake2b_param));
+ P->key_length = 0;
+ P->fanout = 0;
+ P->depth = 0;
+ store32(&P->leaf_length, BLAKE2B_OUTBYTES);
+ P->inner_length = BLAKE2B_OUTBYTES;
+ P->node_depth = 0;
+
+ for (i = 0; outlen > 0; ++i) {
+ const size_t block_size = (outlen < BLAKE2B_OUTBYTES) ? outlen : BLAKE2B_OUTBYTES;
+ /* Initialize state */
+ P->digest_length = block_size;
+ store32(&P->node_offset, i);
+ blake2b_init_param(C, P);
+ /* Process key if needed */
+ blake2b_update(C, root, BLAKE2B_OUTBYTES);
+ if (blake2b_final(C, (uint8_t *)out + i * BLAKE2B_OUTBYTES, block_size) < 0 ) {
+ return -1;
+ }
+ outlen -= block_size;
+ }
+ secure_zero_memory(root, sizeof(root));
+ secure_zero_memory(P, sizeof(P));
+ secure_zero_memory(C, sizeof(C));
+ /* Put blake2xb in an invalid state? cf. blake2s_is_lastblock */
+ return 0;
+
+}
+
+int blake2xb(void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen)
+{
+ blake2xb_state S[1];
+
+ /* Verify parameters */
+ if (NULL == in && inlen > 0)
+ return -1;
+
+ if (NULL == out)
+ return -1;
+
+ if (NULL == key && keylen > 0)
+ return -1;
+
+ if (keylen > BLAKE2B_KEYBYTES)
+ return -1;
+
+ if (outlen == 0)
+ return -1;
+
+ /* Initialize the root block structure */
+ if (blake2xb_init_key(S, outlen, key, keylen) < 0) {
+ return -1;
+ }
+
+ /* Absorb the input message */
+ blake2xb_update(S, in, inlen);
+
+ /* Compute the root node of the tree and the final hash using the counter construction */
+ return blake2xb_final(S, out, outlen);
+}
+
+#if defined(BLAKE2XB_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+ uint8_t key[BLAKE2B_KEYBYTES];
+ uint8_t buf[BLAKE2_KAT_LENGTH];
+ size_t i, step, outlen;
+
+ for( i = 0; i < BLAKE2B_KEYBYTES; ++i ) {
+ key[i] = ( uint8_t )i;
+ }
+
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) {
+ buf[i] = ( uint8_t )i;
+ }
+
+ /* Testing length of ouputs rather than inputs */
+ /* (Test of input lengths mostly covered by blake2s tests) */
+
+ /* Test simple API */
+ for( outlen = 1; outlen <= BLAKE2_KAT_LENGTH; ++outlen )
+ {
+ uint8_t hash[BLAKE2_KAT_LENGTH] = {0};
+ if( blake2xb( hash, outlen, buf, BLAKE2_KAT_LENGTH, key, BLAKE2B_KEYBYTES ) < 0 ) {
+ goto fail;
+ }
+
+ if( 0 != memcmp( hash, blake2xb_keyed_kat[outlen-1], outlen ) )
+ {
+ goto fail;
+ }
+ }
+
+ /* Test streaming API */
+ for(step = 1; step < BLAKE2B_BLOCKBYTES; ++step) {
+ for (outlen = 1; outlen <= BLAKE2_KAT_LENGTH; ++outlen) {
+ uint8_t hash[BLAKE2_KAT_LENGTH];
+ blake2xb_state S;
+ uint8_t * p = buf;
+ size_t mlen = BLAKE2_KAT_LENGTH;
+ int err = 0;
+
+ if( (err = blake2xb_init_key(&S, outlen, key, BLAKE2B_KEYBYTES)) < 0 ) {
+ goto fail;
+ }
+
+ while (mlen >= step) {
+ if ( (err = blake2xb_update(&S, p, step)) < 0 ) {
+ goto fail;
+ }
+ mlen -= step;
+ p += step;
+ }
+ if ( (err = blake2xb_update(&S, p, mlen)) < 0) {
+ goto fail;
+ }
+ if ( (err = blake2xb_final(&S, hash, outlen)) < 0) {
+ goto fail;
+ }
+
+ if (0 != memcmp(hash, blake2xb_keyed_kat[outlen-1], outlen)) {
+ goto fail;
+ }
+ }
+ }
+
+ puts( "ok" );
+ return 0;
+fail:
+ puts("error");
+ return -1;
+}
+#endif
diff --git a/neon/blake2xs.c b/neon/blake2xs.c
new file mode 100644
index 0000000..625693e
--- /dev/null
+++ b/neon/blake2xs.c
@@ -0,0 +1,239 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2016, JP Aumasson <jeanphilippe.aumasson@gmail.com>.
+ Copyright 2016, Samuel Neves <sneves@dei.uc.pt>.
+
+ You may use this under the terms of the CC0, the OpenSSL Licence, or
+ the Apache Public License 2.0, at your option. The terms of these
+ licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+int blake2xs_init( blake2xs_state *S, const size_t outlen ) {
+ return blake2xs_init_key(S, outlen, NULL, 0);
+}
+
+int blake2xs_init_key( blake2xs_state *S, const size_t outlen, const void *key, size_t keylen )
+{
+ if ( outlen == 0 || outlen > 0xFFFFUL ) {
+ return -1;
+ }
+
+ if (NULL != key && keylen > BLAKE2B_KEYBYTES) {
+ return -1;
+ }
+
+ if (NULL == key && keylen > 0) {
+ return -1;
+ }
+
+ /* Initialize parameter block */
+ S->P->digest_length = BLAKE2S_OUTBYTES;
+ S->P->key_length = keylen;
+ S->P->fanout = 1;
+ S->P->depth = 1;
+ store32( &S->P->leaf_length, 0 );
+ store32( &S->P->node_offset, 0 );
+ store16( &S->P->xof_length, outlen );
+ S->P->node_depth = 0;
+ S->P->inner_length = 0;
+ memset( S->P->salt, 0, sizeof( S->P->salt ) );
+ memset( S->P->personal, 0, sizeof( S->P->personal ) );
+
+ if( blake2s_init_param( S->S, S->P ) < 0 ) {
+ return -1;
+ }
+
+ if (keylen > 0) {
+ uint8_t block[BLAKE2S_BLOCKBYTES];
+ memset(block, 0, BLAKE2S_BLOCKBYTES);
+ memcpy(block, key, keylen);
+ blake2s_update(S->S, block, BLAKE2S_BLOCKBYTES);
+ secure_zero_memory(block, BLAKE2S_BLOCKBYTES);
+ }
+ return 0;
+}
+
+int blake2xs_update( blake2xs_state *S, const void *in, size_t inlen ) {
+ return blake2s_update( S->S, in, inlen );
+}
+
+int blake2xs_final(blake2xs_state *S, void *out, size_t outlen) {
+
+ blake2s_state C[1];
+ blake2s_param P[1];
+ uint16_t xof_length = load16(&S->P->xof_length);
+ uint8_t root[BLAKE2S_BLOCKBYTES];
+ size_t i;
+
+ if (NULL == out) {
+ return -1;
+ }
+
+ /* outlen must match the output size defined in xof_length, */
+ /* unless it was -1, in which case anything goes except 0. */
+ if(xof_length == 0xFFFFUL) {
+ if(outlen == 0) {
+ return -1;
+ }
+ } else {
+ if(outlen != xof_length) {
+ return -1;
+ }
+ }
+
+ /* Finalize the root hash */
+ if (blake2s_final(S->S, root, BLAKE2S_OUTBYTES) < 0) {
+ return -1;
+ }
+
+ /* Set common block structure values */
+ /* Copy values from parent instance, and only change the ones below */
+ memcpy(P, S->P, sizeof(blake2s_param));
+ P->key_length = 0;
+ P->fanout = 0;
+ P->depth = 0;
+ store32(&P->leaf_length, BLAKE2S_OUTBYTES);
+ P->inner_length = BLAKE2S_OUTBYTES;
+ P->node_depth = 0;
+
+ for (i = 0; outlen > 0; ++i) {
+ const size_t block_size = (outlen < BLAKE2S_OUTBYTES) ? outlen : BLAKE2S_OUTBYTES;
+ /* Initialize state */
+ P->digest_length = block_size;
+ store32(&P->node_offset, i);
+ blake2s_init_param(C, P);
+ /* Process key if needed */
+ blake2s_update(C, root, BLAKE2S_OUTBYTES);
+ if (blake2s_final(C, (uint8_t *)out + i * BLAKE2S_OUTBYTES, block_size) < 0) {
+ return -1;
+ }
+ outlen -= block_size;
+ }
+ secure_zero_memory(root, sizeof(root));
+ secure_zero_memory(P, sizeof(P));
+ secure_zero_memory(C, sizeof(C));
+ /* Put blake2xs in an invalid state? cf. blake2s_is_lastblock */
+ return 0;
+}
+
+int blake2xs(void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen)
+{
+ blake2xs_state S[1];
+
+ /* Verify parameters */
+ if (NULL == in && inlen > 0)
+ return -1;
+
+ if (NULL == out)
+ return -1;
+
+ if (NULL == key && keylen > 0)
+ return -1;
+
+ if (keylen > BLAKE2S_KEYBYTES)
+ return -1;
+
+ if (outlen == 0)
+ return -1;
+
+ /* Initialize the root block structure */
+ if (blake2xs_init_key(S, outlen, key, keylen) < 0) {
+ return -1;
+ }
+
+ /* Absorb the input message */
+ blake2xs_update(S, in, inlen);
+
+ /* Compute the root node of the tree and the final hash using the counter construction */
+ return blake2xs_final(S, out, outlen);
+}
+
+#if defined(BLAKE2XS_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h"
+int main( void )
+{
+ uint8_t key[BLAKE2S_KEYBYTES];
+ uint8_t buf[BLAKE2_KAT_LENGTH];
+ size_t i, step, outlen;
+
+ for( i = 0; i < BLAKE2S_KEYBYTES; ++i ) {
+ key[i] = ( uint8_t )i;
+ }
+
+ for( i = 0; i < BLAKE2_KAT_LENGTH; ++i ) {
+ buf[i] = ( uint8_t )i;
+ }
+
+ /* Testing length of ouputs rather than inputs */
+ /* (Test of input lengths mostly covered by blake2s tests) */
+
+ /* Test simple API */
+ for( outlen = 1; outlen <= BLAKE2_KAT_LENGTH; ++outlen )
+ {
+ uint8_t hash[BLAKE2_KAT_LENGTH] = {0};
+ if( blake2xs( hash, outlen, buf, BLAKE2_KAT_LENGTH, key, BLAKE2S_KEYBYTES ) < 0 ) {
+ goto fail;
+ }
+
+ if( 0 != memcmp( hash, blake2xs_keyed_kat[outlen-1], outlen ) )
+ {
+ goto fail;
+ }
+ }
+
+ /* Test streaming API */
+ for(step = 1; step < BLAKE2S_BLOCKBYTES; ++step) {
+ for (outlen = 1; outlen <= BLAKE2_KAT_LENGTH; ++outlen) {
+ uint8_t hash[BLAKE2_KAT_LENGTH];
+ blake2xs_state S;
+ uint8_t * p = buf;
+ size_t mlen = BLAKE2_KAT_LENGTH;
+ int err = 0;
+
+ if( (err = blake2xs_init_key(&S, outlen, key, BLAKE2S_KEYBYTES)) < 0 ) {
+ goto fail;
+ }
+
+ while (mlen >= step) {
+ if ( (err = blake2xs_update(&S, p, step)) < 0 ) {
+ goto fail;
+ }
+ mlen -= step;
+ p += step;
+ }
+ if ( (err = blake2xs_update(&S, p, mlen)) < 0) {
+ goto fail;
+ }
+ if ( (err = blake2xs_final(&S, hash, outlen)) < 0) {
+ goto fail;
+ }
+
+ if (0 != memcmp(hash, blake2xs_keyed_kat[outlen-1], outlen)) {
+ goto fail;
+ }
+ }
+ }
+
+ puts( "ok" );
+ return 0;
+fail:
+ puts("error");
+ return -1;
+}
+#endif
diff --git a/neon/genkat-c.c b/neon/genkat-c.c
new file mode 100644
index 0000000..58a48fd
--- /dev/null
+++ b/neon/genkat-c.c
@@ -0,0 +1,139 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "blake2.h"
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+
+#define LENGTH 256
+
+#define MAKE_KAT(name, size_prefix) \
+ do { \
+ printf("static const uint8_t " #name "_kat[BLAKE2_KAT_LENGTH][" #size_prefix \
+ "_OUTBYTES] = \n{\n"); \
+ \
+ for (i = 0; i < LENGTH; ++i) { \
+ name(hash, size_prefix##_OUTBYTES, in, i, NULL, 0); \
+ printf("\t{\n\t\t"); \
+ \
+ for (j = 0; j < size_prefix##_OUTBYTES; ++j) \
+ printf("0x%02X%s", hash[j], \
+ (j + 1) == size_prefix##_OUTBYTES ? "\n" : j && !((j + 1) % 8) ? ",\n\t\t" : ", "); \
+ \
+ printf("\t},\n"); \
+ } \
+ \
+ printf("};\n\n\n\n\n"); \
+ } while (0)
+
+#define MAKE_KEYED_KAT(name, size_prefix) \
+ do { \
+ printf("static const uint8_t " #name "_keyed_kat[BLAKE2_KAT_LENGTH][" #size_prefix \
+ "_OUTBYTES] = \n{\n"); \
+ \
+ for (i = 0; i < LENGTH; ++i) { \
+ name(hash, size_prefix##_OUTBYTES, in, i, key, size_prefix##_KEYBYTES); \
+ printf("\t{\n\t\t"); \
+ \
+ for (j = 0; j < size_prefix##_OUTBYTES; ++j) \
+ printf("0x%02X%s", hash[j], \
+ (j + 1) == size_prefix##_OUTBYTES ? "\n" : j && !((j + 1) % 8) ? ",\n\t\t" : ", "); \
+ \
+ printf("\t},\n"); \
+ } \
+ \
+ printf("};\n\n\n\n\n"); \
+ } while (0)
+
+#define MAKE_XOF_KAT(name) \
+ do { \
+ printf("static const uint8_t " #name "_kat[BLAKE2_KAT_LENGTH][BLAKE2_KAT_LENGTH] = \n{\n"); \
+ \
+ for (i = 1; i <= LENGTH; ++i) { \
+ name(hash, i, in, LENGTH, NULL, 0); \
+ printf("\t{\n\t\t"); \
+ \
+ for (j = 0; j < i; ++j) \
+ printf("0x%02X%s", hash[j], \
+ (j + 1) == LENGTH ? "\n" : j && !((j + 1) % 8) ? ",\n\t\t" : ", "); \
+ \
+ for (j = i; j < LENGTH; ++j) \
+ printf("0x00%s", (j + 1) == LENGTH ? "\n" : j && !((j + 1) % 8) ? ",\n\t\t" : ", "); \
+ \
+ printf("\t},\n"); \
+ } \
+ \
+ printf("};\n\n\n\n\n"); \
+ } while (0)
+
+#define MAKE_XOF_KEYED_KAT(name, size_prefix) \
+ do { \
+ printf("static const uint8_t " #name \
+ "_keyed_kat[BLAKE2_KAT_LENGTH][BLAKE2_KAT_LENGTH] = \n{\n"); \
+ \
+ for (i = 1; i <= LENGTH; ++i) { \
+ name(hash, i, in, LENGTH, key, size_prefix##_KEYBYTES); \
+ printf("\t{\n\t\t"); \
+ \
+ for (j = 0; j < i; ++j) \
+ printf("0x%02X%s", hash[j], \
+ (j + 1) == LENGTH ? "\n" : j && !((j + 1) % 8) ? ",\n\t\t" : ", "); \
+ \
+ for (j = i; j < LENGTH; ++j) \
+ printf("0x00%s", (j + 1) == LENGTH ? "\n" : j && !((j + 1) % 8) ? ",\n\t\t" : ", "); \
+ \
+ printf("\t},\n"); \
+ } \
+ \
+ printf("};\n\n\n\n\n"); \
+ } while (0)
+
+int main() {
+ uint8_t key[64] = {0};
+ uint8_t in[LENGTH] = {0};
+ uint8_t hash[LENGTH] = {0};
+ size_t i, j;
+
+ for (i = 0; i < sizeof(in); ++i)
+ in[i] = i;
+
+ for (i = 0; i < sizeof(key); ++i)
+ key[i] = i;
+
+ puts("#ifndef BLAKE2_KAT_H\n"
+ "#define BLAKE2_KAT_H\n\n\n"
+ "#include <stdint.h>\n\n"
+ "#define BLAKE2_KAT_LENGTH " STR(LENGTH) "\n\n\n");
+ MAKE_KAT(blake2s, BLAKE2S);
+ MAKE_KEYED_KAT(blake2s, BLAKE2S);
+ MAKE_KAT(blake2b, BLAKE2B);
+ MAKE_KEYED_KAT(blake2b, BLAKE2B);
+ MAKE_KAT(blake2sp, BLAKE2S);
+ MAKE_KEYED_KAT(blake2sp, BLAKE2S);
+ MAKE_KAT(blake2bp, BLAKE2B);
+ MAKE_KEYED_KAT(blake2bp, BLAKE2B);
+ MAKE_XOF_KAT(blake2xs);
+ MAKE_XOF_KEYED_KAT(blake2xs, BLAKE2S);
+ MAKE_XOF_KAT(blake2xb);
+ MAKE_XOF_KEYED_KAT(blake2xb, BLAKE2B);
+ puts("#endif");
+ return 0;
+}
diff --git a/neon/genkat-json.c b/neon/genkat-json.c
new file mode 100644
index 0000000..0275fb5
--- /dev/null
+++ b/neon/genkat-json.c
@@ -0,0 +1,154 @@
+/*
+ BLAKE2 reference source code package - reference C implementations
+
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+ your option. The terms of these licenses can be found at:
+
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ - OpenSSL license : https://www.openssl.org/source/license.html
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
+
+ More information about the BLAKE2 hash function can be found at
+ https://blake2.net.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "blake2.h"
+
+#define STR_(x) #x
+#define STR(x) STR_(x)
+
+#define LENGTH 256
+
+#define MAKE_KAT(name, size_prefix, first) \
+ do { \
+ for (i = 0; i < LENGTH; ++i) { \
+ printf("%s\n{\n", i == 0 && first ? "" : ","); \
+ \
+ printf(" \"hash\": \"" #name "\",\n"); \
+ printf(" \"in\": \""); \
+ for (j = 0; j < i; ++j) \
+ printf("%02x", in[j]); \
+ \
+ printf("\",\n"); \
+ printf(" \"key\": \"\",\n"); \
+ printf(" \"out\": \""); \
+ \
+ name(hash, size_prefix##_OUTBYTES, in, i, NULL, 0); \
+ \
+ for (j = 0; j < size_prefix##_OUTBYTES; ++j) \
+ printf("%02x", hash[j]); \
+ printf("\"\n"); \
+ printf("}"); \
+ } \
+ } while (0)
+
+#define MAKE_KEYED_KAT(name, size_prefix, first) \
+ do { \
+ for (i = 0; i < LENGTH; ++i) { \
+ printf("%s\n{\n", i == 0 && first ? "" : ","); \
+ \
+ printf(" \"hash\": \"" #name "\",\n"); \
+ printf(" \"in\": \""); \
+ for (j = 0; j < i; ++j) \
+ printf("%02x", in[j]); \
+ \
+ printf("\",\n"); \
+ printf(" \"key\": \""); \
+ for (j = 0; j < size_prefix##_KEYBYTES; ++j) \
+ printf("%02x", key[j]); \
+ printf("\",\n"); \
+ printf(" \"out\": \""); \
+ \
+ name(hash, size_prefix##_OUTBYTES, in, i, key, size_prefix##_KEYBYTES); \
+ \
+ for (j = 0; j < size_prefix##_OUTBYTES; ++j) \
+ printf("%02x", hash[j]); \
+ printf("\"\n"); \
+ printf("}"); \
+ } \
+ } while (0)
+
+#define MAKE_XOF_KAT(name, first) \
+ do { \
+ for (i = 1; i <= LENGTH; ++i) { \
+ printf("%s\n{\n", i == 1 && first ? "" : ","); \
+ \
+ printf(" \"hash\": \"" #name "\",\n"); \
+ printf(" \"in\": \""); \
+ for (j = 0; j < LENGTH; ++j) \
+ printf("%02x", in[j]); \
+ \
+ printf("\",\n"); \
+ printf(" \"key\": \"\",\n"); \
+ printf(" \"out\": \""); \
+ \
+ name(hash, i, in, LENGTH, NULL, 0); \
+ \
+ for (j = 0; j < i; ++j) \
+ printf("%02x", hash[j]); \
+ printf("\"\n"); \
+ printf("}"); \
+ } \
+ } while (0)
+
+#define MAKE_XOF_KEYED_KAT(name, size_prefix, first) \
+ do { \
+ for (i = 1; i <= LENGTH; ++i) { \
+ printf("%s\n{\n", i == 1 && first ? "" : ","); \
+ \
+ printf(" \"hash\": \"" #name "\",\n"); \
+ printf(" \"in\": \""); \
+ for (j = 0; j < LENGTH; ++j) \
+ printf("%02x", in[j]); \
+ \
+ printf("\",\n"); \
+ printf(" \"key\": \""); \
+ for (j = 0; j < size_prefix##_KEYBYTES; ++j) \
+ printf("%02x", key[j]); \
+ printf("\",\n"); \
+ printf(" \"out\": \""); \
+ \
+ name(hash, i, in, LENGTH, key, size_prefix##_KEYBYTES); \
+ \
+ for (j = 0; j < i; ++j) \
+ printf("%02x", hash[j]); \
+ printf("\"\n"); \
+ printf("}"); \
+ } \
+ } while (0)
+
+int main() {
+ uint8_t key[64] = {0};
+ uint8_t in[LENGTH] = {0};
+ uint8_t hash[LENGTH] = {0};
+ size_t i, j;
+
+ for (i = 0; i < sizeof(in); ++i)
+ in[i] = i;
+
+ for (i = 0; i < sizeof(key); ++i)
+ key[i] = i;
+
+ printf("[");
+ MAKE_KAT(blake2s, BLAKE2S, 1);
+ MAKE_KEYED_KAT(blake2s, BLAKE2S, 0);
+ MAKE_KAT(blake2b, BLAKE2B, 0);
+ MAKE_KEYED_KAT(blake2b, BLAKE2B, 0);
+ MAKE_KAT(blake2sp, BLAKE2S, 0);
+ MAKE_KEYED_KAT(blake2sp, BLAKE2S, 0);
+ MAKE_KAT(blake2bp, BLAKE2B, 0);
+ MAKE_KEYED_KAT(blake2bp, BLAKE2B, 0);
+ MAKE_XOF_KAT(blake2xs, 0);
+ MAKE_XOF_KEYED_KAT(blake2xs, BLAKE2S, 0);
+ MAKE_XOF_KAT(blake2xb, 0);
+ MAKE_XOF_KEYED_KAT(blake2xb, BLAKE2B, 0);
+ printf("\n]\n");
+ fflush(stdout);
+ return 0;
+}
diff --git a/neon/makefile b/neon/makefile
new file mode 100644
index 0000000..c817204
--- /dev/null
+++ b/neon/makefile
@@ -0,0 +1,41 @@
+CC=gcc
+CFLAGS=-march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard -O3 -I../testvectors -Wall -Wextra -std=c89 -pedantic -Wno-long-long
+CFLAGSOMP=-fopenmp
+BLAKEBINS=blake2s blake2b blake2sp blake2bp blake2xs blake2xb
+
+all: $(BLAKEBINS) check
+
+blake2s: blake2s.c blake2s-round.h blake2s-load-neon.h
+ $(CC) blake2s.c -o $@ $(CFLAGS) -DBLAKE2S_SELFTEST
+
+blake2b: blake2b.c blake2b-round.h blake2b-load-neon.h
+ $(CC) blake2b.c -o $@ $(CFLAGS) -DBLAKE2B_SELFTEST
+
+blake2sp: blake2sp.c blake2s.c blake2s-round.h blake2s-load-neon.h
+ $(CC) blake2sp.c blake2s.c -o $@ $(CFLAGS) -fopenmp -DBLAKE2SP_SELFTEST
+
+blake2bp: blake2bp.c blake2b.c blake2b-round.h blake2b-load-neon.h
+ $(CC) blake2bp.c blake2b.c -o $@ $(CFLAGS) -fopenmp -DBLAKE2BP_SELFTEST
+
+blake2xs: blake2xs.c blake2s.c blake2s-round.h blake2s-load-neon.h
+ $(CC) blake2xs.c blake2s.c -o $@ $(CFLAGS) -DBLAKE2XS_SELFTEST
+
+blake2xb: blake2xb.c blake2b.c blake2b-round.h blake2b-load-neon.h
+ $(CC) blake2xb.c blake2b.c -o $@ $(CFLAGS) -DBLAKE2XB_SELFTEST
+
+check: blake2s blake2b blake2sp blake2bp blake2xs blake2xb
+ ./blake2s
+ ./blake2b
+ ./blake2sp
+ ./blake2bp
+ ./blake2xs
+ ./blake2xb
+
+kat:
+ $(CC) $(CFLAGS) -o genkat-c genkat-c.c blake2b.c blake2s.c blake2sp.c blake2bp.c blake2xs.c blake2xb.c
+ $(CC) $(CFLAGS) -o genkat-json genkat-json.c blake2b.c blake2s.c blake2sp.c blake2bp.c blake2xs.c blake2xb.c
+ ./genkat-c > blake2-kat.h
+ ./genkat-json > blake2-kat.json
+
+clean:
+ rm -rf *.o genkat-c genkat-json blake2-kat.h blake2-kat.json $(BLAKEBINS)