Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/BLAKE2/BLAKE2.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSamuel Neves <sneves@users.noreply.github.com>2019-07-23 23:13:49 +0300
committerGitHub <noreply@github.com>2019-07-23 23:13:49 +0300
commit997fa5ba1e14b52c554fb03ce39e579e6f27b90c (patch)
tree2e8259ffc03be97d6023cc45b02ea9ecd17421b3
parent320c325437539ae91091ce62efec1913cd8093c2 (diff)
parent75c6ae28fb35a3c825b55d67a9f86aa01f93265e (diff)
Merge pull request #57 from sean-sn/master20190724
Updated Blake2s code to match diagonal shuffle tweak done in blake2-a…
-rw-r--r--sse/blake2s-load-sse2.h40
-rw-r--r--sse/blake2s-load-sse41.h59
-rw-r--r--sse/blake2s-load-xop.h40
-rw-r--r--sse/blake2s-round.h12
4 files changed, 79 insertions, 72 deletions
diff --git a/sse/blake2s-load-sse2.h b/sse/blake2s-load-sse2.h
index d2e9a09..8359e81 100644
--- a/sse/blake2s-load-sse2.h
+++ b/sse/blake2s-load-sse2.h
@@ -17,44 +17,44 @@
#define LOAD_MSG_0_1(buf) buf = _mm_set_epi32(m6,m4,m2,m0)
#define LOAD_MSG_0_2(buf) buf = _mm_set_epi32(m7,m5,m3,m1)
-#define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m14,m12,m10,m8)
-#define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m15,m13,m11,m9)
+#define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m12,m10,m8,m14)
+#define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m13,m11,m9,m15)
#define LOAD_MSG_1_1(buf) buf = _mm_set_epi32(m13,m9,m4,m14)
#define LOAD_MSG_1_2(buf) buf = _mm_set_epi32(m6,m15,m8,m10)
-#define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m5,m11,m0,m1)
-#define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m3,m7,m2,m12)
+#define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m11,m0,m1,m5)
+#define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m7,m2,m12,m3)
#define LOAD_MSG_2_1(buf) buf = _mm_set_epi32(m15,m5,m12,m11)
#define LOAD_MSG_2_2(buf) buf = _mm_set_epi32(m13,m2,m0,m8)
-#define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m9,m7,m3,m10)
-#define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m4,m1,m6,m14)
+#define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m7,m3,m10,m9)
+#define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m1,m6,m14,m4)
#define LOAD_MSG_3_1(buf) buf = _mm_set_epi32(m11,m13,m3,m7)
#define LOAD_MSG_3_2(buf) buf = _mm_set_epi32(m14,m12,m1,m9)
-#define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m15,m4,m5,m2)
-#define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m8,m0,m10,m6)
+#define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m4,m5,m2,m15)
+#define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m0,m10,m6,m8)
#define LOAD_MSG_4_1(buf) buf = _mm_set_epi32(m10,m2,m5,m9)
#define LOAD_MSG_4_2(buf) buf = _mm_set_epi32(m15,m4,m7,m0)
-#define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m3,m6,m11,m14)
-#define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m13,m8,m12,m1)
+#define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m6,m11,m14,m3)
+#define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m8,m12,m1,m13)
#define LOAD_MSG_5_1(buf) buf = _mm_set_epi32(m8,m0,m6,m2)
#define LOAD_MSG_5_2(buf) buf = _mm_set_epi32(m3,m11,m10,m12)
-#define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m1,m15,m7,m4)
-#define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m9,m14,m5,m13)
+#define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m15,m7,m4,m1)
+#define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m14,m5,m13,m9)
#define LOAD_MSG_6_1(buf) buf = _mm_set_epi32(m4,m14,m1,m12)
#define LOAD_MSG_6_2(buf) buf = _mm_set_epi32(m10,m13,m15,m5)
-#define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m8,m9,m6,m0)
-#define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m11,m2,m3,m7)
+#define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m9,m6,m0,m8)
+#define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m2,m3,m7,m11)
#define LOAD_MSG_7_1(buf) buf = _mm_set_epi32(m3,m12,m7,m13)
#define LOAD_MSG_7_2(buf) buf = _mm_set_epi32(m9,m1,m14,m11)
-#define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m2,m8,m15,m5)
-#define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m10,m6,m4,m0)
+#define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m8,m15,m5,m2)
+#define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m6,m4,m0,m10)
#define LOAD_MSG_8_1(buf) buf = _mm_set_epi32(m0,m11,m14,m6)
#define LOAD_MSG_8_2(buf) buf = _mm_set_epi32(m8,m3,m9,m15)
-#define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m10,m1,m13,m12)
-#define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m5,m4,m7,m2)
+#define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m1,m13,m12,m10)
+#define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m4,m7,m2,m5)
#define LOAD_MSG_9_1(buf) buf = _mm_set_epi32(m1,m7,m8,m10)
#define LOAD_MSG_9_2(buf) buf = _mm_set_epi32(m5,m6,m4,m2)
-#define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m13,m3,m9,m15)
-#define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m0,m12,m14,m11)
+#define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m3,m9,m15,m13)
+#define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m12,m14,m11,m0)
#endif
diff --git a/sse/blake2s-load-sse41.h b/sse/blake2s-load-sse41.h
index c316fb5..8d2b6b1 100644
--- a/sse/blake2s-load-sse41.h
+++ b/sse/blake2s-load-sse41.h
@@ -22,10 +22,13 @@ buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));
buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));
#define LOAD_MSG_0_3(buf) \
-buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));
+t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3,2,0,1)); \
+t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,1,3,2)); \
+buf = _mm_blend_epi16(t0, t1, 0xC3);
#define LOAD_MSG_0_4(buf) \
-buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));
+t0 = _mm_blend_epi16(t0, t1, 0x3C); \
+buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2,3,0,1));
#define LOAD_MSG_1_1(buf) \
t0 = _mm_blend_epi16(m1, m2, 0x0C); \
@@ -43,13 +46,13 @@ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
t0 = _mm_slli_si128(m1, 4); \
t1 = _mm_blend_epi16(m2, t0, 0x30); \
t2 = _mm_blend_epi16(m0, t1, 0xF0); \
-buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,1,2));
#define LOAD_MSG_1_4(buf) \
t0 = _mm_unpackhi_epi32(m0,m1); \
t1 = _mm_slli_si128(m3, 4); \
t2 = _mm_blend_epi16(t0, t1, 0x0C); \
-buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,1,2));
#define LOAD_MSG_2_1(buf) \
t0 = _mm_unpackhi_epi32(m2,m3); \
@@ -67,13 +70,13 @@ buf = _mm_blend_epi16(t1, t2, 0xC0);
t0 = _mm_blend_epi16(m0, m2, 0x3C); \
t1 = _mm_srli_si128(m1, 12); \
t2 = _mm_blend_epi16(t0,t1,0x03); \
-buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,3,2,1));
#define LOAD_MSG_2_4(buf) \
t0 = _mm_slli_si128(m3, 4); \
t1 = _mm_blend_epi16(m0, m1, 0x33); \
t2 = _mm_blend_epi16(t1, t0, 0xC0); \
-buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
#define LOAD_MSG_3_1(buf) \
t0 = _mm_unpackhi_epi32(m0,m1); \
@@ -90,12 +93,11 @@ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
#define LOAD_MSG_3_3(buf) \
t0 = _mm_blend_epi16(m0,m1,0x0F); \
t1 = _mm_blend_epi16(t0, m3, 0xC0); \
-buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
+buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,1,2,3));
#define LOAD_MSG_3_4(buf) \
-t0 = _mm_unpacklo_epi32(m0,m2); \
-t1 = _mm_unpackhi_epi32(m1,m2); \
-buf = _mm_unpacklo_epi64(t1,t0);
+t0 = _mm_alignr_epi8(m0, m1, 4); \
+buf = _mm_blend_epi16(t0, m2, 0x33);
#define LOAD_MSG_4_1(buf) \
t0 = _mm_unpacklo_epi64(m1,m2); \
@@ -111,13 +113,14 @@ buf = _mm_blend_epi16(t0,t1,0x33);
#define LOAD_MSG_4_3(buf) \
t0 = _mm_unpackhi_epi64(m3,m1); \
t1 = _mm_unpackhi_epi64(m2,m0); \
-buf = _mm_blend_epi16(t1,t0,0x33);
+t2 = _mm_blend_epi16(t1,t0,0x33); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
#define LOAD_MSG_4_4(buf) \
t0 = _mm_blend_epi16(m0,m2,0x03); \
t1 = _mm_slli_si128(t0, 8); \
t2 = _mm_blend_epi16(t1,m3,0x0F); \
-buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,3,1));
#define LOAD_MSG_5_1(buf) \
t0 = _mm_unpackhi_epi32(m0,m1); \
@@ -133,12 +136,13 @@ buf = _mm_blend_epi16(t1,t0,0x3C);
t0 = _mm_blend_epi16(m1,m0,0x0C); \
t1 = _mm_srli_si128(m3, 4); \
t2 = _mm_blend_epi16(t0,t1,0x30); \
-buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
#define LOAD_MSG_5_4(buf) \
-t0 = _mm_unpacklo_epi64(m1,m2); \
-t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
-buf = _mm_blend_epi16(t0,t1,0x33);
+t0 = _mm_unpacklo_epi64(m2,m1); \
+t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2,0,1,0)); \
+t2 = _mm_srli_si128(t0, 4); \
+buf = _mm_blend_epi16(t1,t2,0x33);
#define LOAD_MSG_6_1(buf) \
t0 = _mm_slli_si128(m1, 12); \
@@ -154,12 +158,13 @@ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
#define LOAD_MSG_6_3(buf) \
t0 = _mm_unpacklo_epi64(m0,m2); \
t1 = _mm_srli_si128(m1, 4); \
-buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
+t2 = _mm_blend_epi16(t0,t1,0x0C); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
#define LOAD_MSG_6_4(buf) \
t0 = _mm_unpackhi_epi32(m1,m2); \
t1 = _mm_unpackhi_epi64(m0,t0); \
-buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
+buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,1,2,3));
#define LOAD_MSG_7_1(buf) \
t0 = _mm_unpackhi_epi32(m0,m1); \
@@ -176,12 +181,13 @@ buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
t0 = _mm_unpackhi_epi64(m0,m3); \
t1 = _mm_unpacklo_epi64(m1,m2); \
t2 = _mm_blend_epi16(t0,t1,0x3C); \
-buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
+buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(2,3,1,0));
#define LOAD_MSG_7_4(buf) \
t0 = _mm_unpacklo_epi32(m0,m1); \
t1 = _mm_unpackhi_epi32(m1,m2); \
-buf = _mm_unpacklo_epi64(t0,t1);
+t2 = _mm_unpacklo_epi64(t0,t1); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
#define LOAD_MSG_8_1(buf) \
t0 = _mm_unpackhi_epi32(m1,m3); \
@@ -195,13 +201,14 @@ t1 = _mm_blend_epi16(m2,t0,0xF0); \
buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
#define LOAD_MSG_8_3(buf) \
-t0 = _mm_blend_epi16(m2,m0,0x0C); \
-t1 = _mm_slli_si128(t0,4); \
-buf = _mm_blend_epi16(t1,m3,0x0F);
+t0 = _mm_unpacklo_epi64(m0,m3); \
+t1 = _mm_srli_si128(m2,8); \
+t2 = _mm_blend_epi16(t0,t1,0x03); \
+buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,3,2,0));
#define LOAD_MSG_8_4(buf) \
t0 = _mm_blend_epi16(m1,m0,0x30); \
-buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
+buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(0,3,2,1));
#define LOAD_MSG_9_1(buf) \
t0 = _mm_blend_epi16(m0,m2,0x03); \
@@ -218,12 +225,12 @@ buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
t0 = _mm_unpackhi_epi32(m0,m3); \
t1 = _mm_unpacklo_epi32(m2,m3); \
t2 = _mm_unpackhi_epi64(t0,t1); \
-buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
+buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,1,3));
#define LOAD_MSG_9_4(buf) \
t0 = _mm_blend_epi16(m3,m2,0xC0); \
t1 = _mm_unpacklo_epi32(m0,m3); \
t2 = _mm_blend_epi16(t0,t1,0x0F); \
-buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
+buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,2,3,0));
#endif
diff --git a/sse/blake2s-load-xop.h b/sse/blake2s-load-xop.h
index a97ddcc..426edc1 100644
--- a/sse/blake2s-load-xop.h
+++ b/sse/blake2s-load-xop.h
@@ -37,10 +37,10 @@ buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );
buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) );
#define LOAD_MSG_0_3(buf) \
-buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );
+buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(4),TOB(2),TOB(0),TOB(6)) );
#define LOAD_MSG_0_4(buf) \
-buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) );
+buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(5),TOB(3),TOB(1),TOB(7)) );
#define LOAD_MSG_1_1(buf) \
t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \
@@ -52,11 +52,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );
#define LOAD_MSG_1_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(0),TOB(0),TOB(1)) ); \
-buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(1),TOB(0),TOB(3)) );
#define LOAD_MSG_1_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(7),TOB(2),TOB(0)) ); \
-buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) );
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(4),TOB(3)) );
#define LOAD_MSG_2_1(buf) \
t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(1),TOB(0),TOB(7)) ); \
@@ -68,11 +68,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(0)) );
#define LOAD_MSG_2_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(7),TOB(3),TOB(0)) ); \
-buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) );
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(2),TOB(1),TOB(6),TOB(5)) );
#define LOAD_MSG_2_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \
-buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) );
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(6),TOB(3)) );
#define LOAD_MSG_3_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \
@@ -85,11 +85,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) );
#define LOAD_MSG_3_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(5),TOB(2)) ); \
-buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(0),TOB(7)) );
#define LOAD_MSG_3_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \
-buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(6),TOB(0)) );
+buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(2),TOB(6),TOB(0),TOB(4)) );
#define LOAD_MSG_4_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(5),TOB(0)) ); \
@@ -102,11 +102,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );
#define LOAD_MSG_4_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \
t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \
-buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) );
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(6),TOB(3)) );
#define LOAD_MSG_4_4(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \
-buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(4),TOB(0)) );
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(2),TOB(4),TOB(0),TOB(5)) );
#define LOAD_MSG_5_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(2)) ); \
@@ -118,11 +118,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) );
#define LOAD_MSG_5_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(0),TOB(7),TOB(4)) ); \
-buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(1),TOB(0),TOB(3)) );
#define LOAD_MSG_5_4(buf) \
t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(5),TOB(0),TOB(1),TOB(0)) ); \
-buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(5)) );
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(1),TOB(5),TOB(3)) );
#define LOAD_MSG_6_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(0),TOB(1),TOB(0)) ); \
@@ -134,11 +134,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(7),TOB(0)) );
#define LOAD_MSG_6_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(0)) ); \
-buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(5),TOB(1),TOB(0)) );
+buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(4)) );
#define LOAD_MSG_6_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(3),TOB(7)) ); \
-buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );
+buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(2),TOB(1),TOB(0),TOB(7)) );
#define LOAD_MSG_7_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(0),TOB(7),TOB(0)) ); \
@@ -151,11 +151,11 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) );
#define LOAD_MSG_7_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \
t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \
-buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) );
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(7),TOB(0),TOB(3)) );
#define LOAD_MSG_7_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \
-buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(0)) );
+buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(2),TOB(1),TOB(0),TOB(6)) );
#define LOAD_MSG_8_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \
@@ -168,10 +168,10 @@ buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) );
#define LOAD_MSG_8_3(buf) \
t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \
-buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(5),TOB(4)) ); \
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(5),TOB(4),TOB(3)) ); \
#define LOAD_MSG_8_4(buf) \
-buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(4),TOB(7),TOB(2)) );
+buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(7),TOB(2),TOB(5)) );
#define LOAD_MSG_9_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \
@@ -182,10 +182,10 @@ buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) );
#define LOAD_MSG_9_3(buf) \
t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \
-buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(7)) );
+buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(2),TOB(1),TOB(7),TOB(5)) );
#define LOAD_MSG_9_4(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \
-buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(6),TOB(0)) );
+buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(4),TOB(6),TOB(0),TOB(3)) );
#endif
diff --git a/sse/blake2s-round.h b/sse/blake2s-round.h
index 44a5574..b75c669 100644
--- a/sse/blake2s-round.h
+++ b/sse/blake2s-round.h
@@ -56,14 +56,14 @@
row2 = _mm_roti_epi32(row2, -7);
#define DIAGONALIZE(row1,row2,row3,row4) \
- row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \
- row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
- row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) );
+ row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE(2,1,0,3) ); \
+ row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(1,0,3,2) ); \
+ row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(0,3,2,1) );
#define UNDIAGONALIZE(row1,row2,row3,row4) \
- row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \
- row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
- row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) );
+ row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE(0,3,2,1) ); \
+ row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(1,0,3,2) ); \
+ row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(2,1,0,3) );
#if defined(HAVE_XOP)
#include "blake2s-load-xop.h"