From 75c6ae28fb35a3c825b55d67a9f86aa01f93265e Mon Sep 17 00:00:00 2001 From: sean-sn Date: Mon, 22 Jul 2019 15:02:28 -0400 Subject: Updated Blake2s code to match diagonal shuffle tweak done in blake2-avx for Blake2b --- sse/blake2s-load-sse2.h | 40 ++++++++++++++++---------------- sse/blake2s-load-sse41.h | 59 +++++++++++++++++++++++++++--------------------- sse/blake2s-load-xop.h | 40 ++++++++++++++++---------------- sse/blake2s-round.h | 12 +++++----- 4 files changed, 79 insertions(+), 72 deletions(-) diff --git a/sse/blake2s-load-sse2.h b/sse/blake2s-load-sse2.h index d2e9a09..8359e81 100644 --- a/sse/blake2s-load-sse2.h +++ b/sse/blake2s-load-sse2.h @@ -17,44 +17,44 @@ #define LOAD_MSG_0_1(buf) buf = _mm_set_epi32(m6,m4,m2,m0) #define LOAD_MSG_0_2(buf) buf = _mm_set_epi32(m7,m5,m3,m1) -#define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m14,m12,m10,m8) -#define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m15,m13,m11,m9) +#define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m12,m10,m8,m14) +#define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m13,m11,m9,m15) #define LOAD_MSG_1_1(buf) buf = _mm_set_epi32(m13,m9,m4,m14) #define LOAD_MSG_1_2(buf) buf = _mm_set_epi32(m6,m15,m8,m10) -#define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m5,m11,m0,m1) -#define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m3,m7,m2,m12) +#define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m11,m0,m1,m5) +#define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m7,m2,m12,m3) #define LOAD_MSG_2_1(buf) buf = _mm_set_epi32(m15,m5,m12,m11) #define LOAD_MSG_2_2(buf) buf = _mm_set_epi32(m13,m2,m0,m8) -#define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m9,m7,m3,m10) -#define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m4,m1,m6,m14) +#define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m7,m3,m10,m9) +#define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m1,m6,m14,m4) #define LOAD_MSG_3_1(buf) buf = _mm_set_epi32(m11,m13,m3,m7) #define LOAD_MSG_3_2(buf) buf = _mm_set_epi32(m14,m12,m1,m9) -#define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m15,m4,m5,m2) -#define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m8,m0,m10,m6) +#define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m4,m5,m2,m15) +#define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m0,m10,m6,m8) #define LOAD_MSG_4_1(buf) buf = _mm_set_epi32(m10,m2,m5,m9) #define LOAD_MSG_4_2(buf) buf = _mm_set_epi32(m15,m4,m7,m0) -#define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m3,m6,m11,m14) -#define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m13,m8,m12,m1) +#define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m6,m11,m14,m3) +#define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m8,m12,m1,m13) #define LOAD_MSG_5_1(buf) buf = _mm_set_epi32(m8,m0,m6,m2) #define LOAD_MSG_5_2(buf) buf = _mm_set_epi32(m3,m11,m10,m12) -#define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m1,m15,m7,m4) -#define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m9,m14,m5,m13) +#define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m15,m7,m4,m1) +#define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m14,m5,m13,m9) #define LOAD_MSG_6_1(buf) buf = _mm_set_epi32(m4,m14,m1,m12) #define LOAD_MSG_6_2(buf) buf = _mm_set_epi32(m10,m13,m15,m5) -#define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m8,m9,m6,m0) -#define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m11,m2,m3,m7) +#define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m9,m6,m0,m8) +#define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m2,m3,m7,m11) #define LOAD_MSG_7_1(buf) buf = _mm_set_epi32(m3,m12,m7,m13) #define LOAD_MSG_7_2(buf) buf = _mm_set_epi32(m9,m1,m14,m11) -#define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m2,m8,m15,m5) -#define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m10,m6,m4,m0) +#define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m8,m15,m5,m2) +#define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m6,m4,m0,m10) #define LOAD_MSG_8_1(buf) buf = _mm_set_epi32(m0,m11,m14,m6) #define LOAD_MSG_8_2(buf) buf = _mm_set_epi32(m8,m3,m9,m15) -#define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m10,m1,m13,m12) -#define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m5,m4,m7,m2) +#define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m1,m13,m12,m10) +#define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m4,m7,m2,m5) #define LOAD_MSG_9_1(buf) buf = _mm_set_epi32(m1,m7,m8,m10) #define LOAD_MSG_9_2(buf) buf = _mm_set_epi32(m5,m6,m4,m2) -#define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m13,m3,m9,m15) -#define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m0,m12,m14,m11) +#define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m3,m9,m15,m13) +#define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m12,m14,m11,m0) #endif diff --git a/sse/blake2s-load-sse41.h b/sse/blake2s-load-sse41.h index c316fb5..8d2b6b1 100644 --- a/sse/blake2s-load-sse41.h +++ b/sse/blake2s-load-sse41.h @@ -22,10 +22,13 @@ buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0))); buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1))); #define LOAD_MSG_0_3(buf) \ -buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0))); +t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3,2,0,1)); \ +t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,1,3,2)); \ +buf = _mm_blend_epi16(t0, t1, 0xC3); #define LOAD_MSG_0_4(buf) \ -buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1))); +t0 = _mm_blend_epi16(t0, t1, 0x3C); \ +buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2,3,0,1)); #define LOAD_MSG_1_1(buf) \ t0 = _mm_blend_epi16(m1, m2, 0x0C); \ @@ -43,13 +46,13 @@ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); t0 = _mm_slli_si128(m1, 4); \ t1 = _mm_blend_epi16(m2, t0, 0x30); \ t2 = _mm_blend_epi16(m0, t1, 0xF0); \ -buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,1,2)); #define LOAD_MSG_1_4(buf) \ t0 = _mm_unpackhi_epi32(m0,m1); \ t1 = _mm_slli_si128(m3, 4); \ t2 = _mm_blend_epi16(t0, t1, 0x0C); \ -buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,1,2)); #define LOAD_MSG_2_1(buf) \ t0 = _mm_unpackhi_epi32(m2,m3); \ @@ -67,13 +70,13 @@ buf = _mm_blend_epi16(t1, t2, 0xC0); t0 = _mm_blend_epi16(m0, m2, 0x3C); \ t1 = _mm_srli_si128(m1, 12); \ t2 = _mm_blend_epi16(t0,t1,0x03); \ -buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,3,2,1)); #define LOAD_MSG_2_4(buf) \ t0 = _mm_slli_si128(m3, 4); \ t1 = _mm_blend_epi16(m0, m1, 0x33); \ t2 = _mm_blend_epi16(t1, t0, 0xC0); \ -buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); #define LOAD_MSG_3_1(buf) \ t0 = _mm_unpackhi_epi32(m0,m1); \ @@ -90,12 +93,11 @@ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); #define LOAD_MSG_3_3(buf) \ t0 = _mm_blend_epi16(m0,m1,0x0F); \ t1 = _mm_blend_epi16(t0, m3, 0xC0); \ -buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); +buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,1,2,3)); #define LOAD_MSG_3_4(buf) \ -t0 = _mm_unpacklo_epi32(m0,m2); \ -t1 = _mm_unpackhi_epi32(m1,m2); \ -buf = _mm_unpacklo_epi64(t1,t0); +t0 = _mm_alignr_epi8(m0, m1, 4); \ +buf = _mm_blend_epi16(t0, m2, 0x33); #define LOAD_MSG_4_1(buf) \ t0 = _mm_unpacklo_epi64(m1,m2); \ @@ -111,13 +113,14 @@ buf = _mm_blend_epi16(t0,t1,0x33); #define LOAD_MSG_4_3(buf) \ t0 = _mm_unpackhi_epi64(m3,m1); \ t1 = _mm_unpackhi_epi64(m2,m0); \ -buf = _mm_blend_epi16(t1,t0,0x33); +t2 = _mm_blend_epi16(t1,t0,0x33); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); #define LOAD_MSG_4_4(buf) \ t0 = _mm_blend_epi16(m0,m2,0x03); \ t1 = _mm_slli_si128(t0, 8); \ t2 = _mm_blend_epi16(t1,m3,0x0F); \ -buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,3,1)); #define LOAD_MSG_5_1(buf) \ t0 = _mm_unpackhi_epi32(m0,m1); \ @@ -133,12 +136,13 @@ buf = _mm_blend_epi16(t1,t0,0x3C); t0 = _mm_blend_epi16(m1,m0,0x0C); \ t1 = _mm_srli_si128(m3, 4); \ t2 = _mm_blend_epi16(t0,t1,0x30); \ -buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); #define LOAD_MSG_5_4(buf) \ -t0 = _mm_unpacklo_epi64(m1,m2); \ -t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \ -buf = _mm_blend_epi16(t0,t1,0x33); +t0 = _mm_unpacklo_epi64(m2,m1); \ +t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2,0,1,0)); \ +t2 = _mm_srli_si128(t0, 4); \ +buf = _mm_blend_epi16(t1,t2,0x33); #define LOAD_MSG_6_1(buf) \ t0 = _mm_slli_si128(m1, 12); \ @@ -154,12 +158,13 @@ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); #define LOAD_MSG_6_3(buf) \ t0 = _mm_unpacklo_epi64(m0,m2); \ t1 = _mm_srli_si128(m1, 4); \ -buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0)); +t2 = _mm_blend_epi16(t0,t1,0x0C); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); #define LOAD_MSG_6_4(buf) \ t0 = _mm_unpackhi_epi32(m1,m2); \ t1 = _mm_unpackhi_epi64(m0,t0); \ -buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); +buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,1,2,3)); #define LOAD_MSG_7_1(buf) \ t0 = _mm_unpackhi_epi32(m0,m1); \ @@ -176,12 +181,13 @@ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); t0 = _mm_unpackhi_epi64(m0,m3); \ t1 = _mm_unpacklo_epi64(m1,m2); \ t2 = _mm_blend_epi16(t0,t1,0x3C); \ -buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1)); +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(2,3,1,0)); #define LOAD_MSG_7_4(buf) \ t0 = _mm_unpacklo_epi32(m0,m1); \ t1 = _mm_unpackhi_epi32(m1,m2); \ -buf = _mm_unpacklo_epi64(t0,t1); +t2 = _mm_unpacklo_epi64(t0,t1); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); #define LOAD_MSG_8_1(buf) \ t0 = _mm_unpackhi_epi32(m1,m3); \ @@ -195,13 +201,14 @@ t1 = _mm_blend_epi16(m2,t0,0xF0); \ buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3)); #define LOAD_MSG_8_3(buf) \ -t0 = _mm_blend_epi16(m2,m0,0x0C); \ -t1 = _mm_slli_si128(t0,4); \ -buf = _mm_blend_epi16(t1,m3,0x0F); +t0 = _mm_unpacklo_epi64(m0,m3); \ +t1 = _mm_srli_si128(m2,8); \ +t2 = _mm_blend_epi16(t0,t1,0x03); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,3,2,0)); #define LOAD_MSG_8_4(buf) \ t0 = _mm_blend_epi16(m1,m0,0x30); \ -buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2)); +buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(0,3,2,1)); #define LOAD_MSG_9_1(buf) \ t0 = _mm_blend_epi16(m0,m2,0x03); \ @@ -218,12 +225,12 @@ buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3)); t0 = _mm_unpackhi_epi32(m0,m3); \ t1 = _mm_unpacklo_epi32(m2,m3); \ t2 = _mm_unpackhi_epi64(t0,t1); \ -buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1)); +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,1,3)); #define LOAD_MSG_9_4(buf) \ t0 = _mm_blend_epi16(m3,m2,0xC0); \ t1 = _mm_unpacklo_epi32(m0,m3); \ t2 = _mm_blend_epi16(t0,t1,0x0F); \ -buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3)); +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,2,3,0)); #endif diff --git a/sse/blake2s-load-xop.h b/sse/blake2s-load-xop.h index a97ddcc..426edc1 100644 --- a/sse/blake2s-load-xop.h +++ b/sse/blake2s-load-xop.h @@ -37,10 +37,10 @@ buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); #define LOAD_MSG_0_3(buf) \ -buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); +buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(4),TOB(2),TOB(0),TOB(6)) ); #define LOAD_MSG_0_4(buf) \ -buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); +buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(5),TOB(3),TOB(1),TOB(7)) ); #define LOAD_MSG_1_1(buf) \ t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \ @@ -52,11 +52,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); #define LOAD_MSG_1_3(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(0),TOB(0),TOB(1)) ); \ -buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(1),TOB(0),TOB(3)) ); #define LOAD_MSG_1_4(buf) \ t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(7),TOB(2),TOB(0)) ); \ -buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) ); +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(4),TOB(3)) ); #define LOAD_MSG_2_1(buf) \ t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(1),TOB(0),TOB(7)) ); \ @@ -68,11 +68,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(0)) ); #define LOAD_MSG_2_3(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(7),TOB(3),TOB(0)) ); \ -buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) ); +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(2),TOB(1),TOB(6),TOB(5)) ); #define LOAD_MSG_2_4(buf) \ t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \ -buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) ); +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(6),TOB(3)) ); #define LOAD_MSG_3_1(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \ @@ -85,11 +85,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) ); #define LOAD_MSG_3_3(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(5),TOB(2)) ); \ -buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(0),TOB(7)) ); #define LOAD_MSG_3_4(buf) \ t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ -buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(6),TOB(0)) ); +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(2),TOB(6),TOB(0),TOB(4)) ); #define LOAD_MSG_4_1(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(5),TOB(0)) ); \ @@ -102,11 +102,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); #define LOAD_MSG_4_3(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \ t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \ -buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) ); +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(6),TOB(3)) ); #define LOAD_MSG_4_4(buf) \ t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \ -buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(4),TOB(0)) ); +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(2),TOB(4),TOB(0),TOB(5)) ); #define LOAD_MSG_5_1(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(2)) ); \ @@ -118,11 +118,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) ); #define LOAD_MSG_5_3(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(0),TOB(7),TOB(4)) ); \ -buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(1),TOB(0),TOB(3)) ); #define LOAD_MSG_5_4(buf) \ t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(5),TOB(0),TOB(1),TOB(0)) ); \ -buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(5)) ); +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(1),TOB(5),TOB(3)) ); #define LOAD_MSG_6_1(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(0),TOB(1),TOB(0)) ); \ @@ -134,11 +134,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(7),TOB(0)) ); #define LOAD_MSG_6_3(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(0)) ); \ -buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(5),TOB(1),TOB(0)) ); +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(4)) ); #define LOAD_MSG_6_4(buf) \ t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(3),TOB(7)) ); \ -buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(2),TOB(1),TOB(0),TOB(7)) ); #define LOAD_MSG_7_1(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(0),TOB(7),TOB(0)) ); \ @@ -151,11 +151,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); #define LOAD_MSG_7_3(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \ t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \ -buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(7),TOB(0),TOB(3)) ); #define LOAD_MSG_7_4(buf) \ t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \ -buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(0)) ); +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(2),TOB(1),TOB(0),TOB(6)) ); #define LOAD_MSG_8_1(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ @@ -168,10 +168,10 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) ); #define LOAD_MSG_8_3(buf) \ t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \ -buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(5),TOB(4)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(5),TOB(4),TOB(3)) ); \ #define LOAD_MSG_8_4(buf) \ -buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(4),TOB(7),TOB(2)) ); +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(7),TOB(2),TOB(5)) ); #define LOAD_MSG_9_1(buf) \ t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \ @@ -182,10 +182,10 @@ buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) ); #define LOAD_MSG_9_3(buf) \ t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \ -buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(7)) ); +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(7),TOB(5)) ); #define LOAD_MSG_9_4(buf) \ t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \ -buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(6),TOB(0)) ); +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(4),TOB(6),TOB(0),TOB(3)) ); #endif diff --git a/sse/blake2s-round.h b/sse/blake2s-round.h index 44a5574..b75c669 100644 --- a/sse/blake2s-round.h +++ b/sse/blake2s-round.h @@ -56,14 +56,14 @@ row2 = _mm_roti_epi32(row2, -7); #define DIAGONALIZE(row1,row2,row3,row4) \ - row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \ - row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ - row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) ); + row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE(2,1,0,3) ); \ + row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(1,0,3,2) ); \ + row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(0,3,2,1) ); #define UNDIAGONALIZE(row1,row2,row3,row4) \ - row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \ - row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ - row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) ); + row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE(0,3,2,1) ); \ + row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(1,0,3,2) ); \ + row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(2,1,0,3) ); #if defined(HAVE_XOP) #include "blake2s-load-xop.h" -- cgit v1.2.3