Add ARM NEON versions of blake2s and blake2b

NOTE! The NEON version of blake2s is currently NO FASTER than the reference implementations. However, it is retained for reference and in case it can be further improved. The NEON version of blake2b is more than twice as fast as the reference implementation on the Raspberry PI 2 Model B.
author: Leigh Brown <leigh@solinno.co.uk> 2018-04-02 22:07:05 +0300
committer: Leigh Brown <leigh@solinno.co.uk> 2018-04-02 22:07:05 +0300
commit: 7965d3e6e1b4193438b8d3a656787587d2579227 (patch)
tree: ef8d4edef59ecaae3389baba8a8ab49e173ec9fb /neon/blake2b-round.h
parent: beb75f4512223e6a3a03a48992345256c5ef393a (diff)
1 files changed, 76 insertions, 0 deletions
diff --git a/neon/blake2b-round.h b/neon/blake2b-round.h
new file mode 100644
index 0000000..9abf25b
--- /dev/null
+++ b/neon/blake2b-round.h
@@ -0,0 +1,76 @@
+/*
+   BLAKE2 reference source code package - reference C implementations
+
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
+*/
+#ifndef BLAKE2B_ROUND_H
+#define BLAKE2B_ROUND_H
+
+#define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))
+
+#define vrorq_n_u64_24(x) vcombine_u64( \
+      vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
+      vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))
+
+#define vrorq_n_u64_16(x) vcombine_u64( \
+      vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
+      vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))
+
+#define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
+  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
+  row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
+  row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
+  row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
+  row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
+  row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h);
+
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
+  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
+  row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
+  row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
+  row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
+  row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
+  row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h);
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+    t0 = vextq_u64(row2l, row2h, 1); \
+    t1 = vextq_u64(row2h, row2l, 1); \
+    row2l = t0; row2h = t1; t0 = row3l;  row3l = row3h; row3h = t0; \
+    t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
+    row4l = t0; row4h = t1;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+    t0 = vextq_u64(row2h, row2l, 1); \
+    t1 = vextq_u64(row2l, row2h, 1); \
+    row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
+    t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
+    row4l = t0; row4h = t1;
+
+#include "blake2b-load-neon.h"
+
+#define ROUND(r) \
+  LOAD_MSG_ ##r ##_1(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_2(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+  LOAD_MSG_ ##r ##_3(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_4(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
+
+#endif
author	Leigh Brown <leigh@solinno.co.uk>	2018-04-02 22:07:05 +0300
committer	Leigh Brown <leigh@solinno.co.uk>	2018-04-02 22:07:05 +0300
commit	7965d3e6e1b4193438b8d3a656787587d2579227 (patch)
tree	ef8d4edef59ecaae3389baba8a8ab49e173ec9fb /neon/blake2b-round.h
parent	beb75f4512223e6a3a03a48992345256c5ef393a (diff)