From 22a1ce9b2f81115068688989a1c325662e026b52 Mon Sep 17 00:00:00 2001 From: Samuel Neves Date: Tue, 14 Jan 2014 19:02:48 +0000 Subject: Use unaligned instructions for non speed-critical memory accesses --- sse/blake2b.c | 24 ++++++++++++------------ sse/blake2s.c | 10 +++++----- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sse/blake2b.c b/sse/blake2b.c index 526943b..1d5ad24 100644 --- a/sse/blake2b.c +++ b/sse/blake2b.c @@ -284,14 +284,14 @@ static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2 const uint64_t m14 = ( ( uint64_t * )block )[14]; const uint64_t m15 = ( ( uint64_t * )block )[15]; #endif - row1l = LOAD( &S->h[0] ); - row1h = LOAD( &S->h[2] ); - row2l = LOAD( &S->h[4] ); - row2h = LOAD( &S->h[6] ); - row3l = LOAD( &blake2b_IV[0] ); - row3h = LOAD( &blake2b_IV[2] ); - row4l = _mm_xor_si128( LOAD( &blake2b_IV[4] ), LOAD( &S->t[0] ) ); - row4h = _mm_xor_si128( LOAD( &blake2b_IV[6] ), LOAD( &S->f[0] ) ); + row1l = LOADU( &S->h[0] ); + row1h = LOADU( &S->h[2] ); + row2l = LOADU( &S->h[4] ); + row2h = LOADU( &S->h[6] ); + row3l = LOADU( &blake2b_IV[0] ); + row3h = LOADU( &blake2b_IV[2] ); + row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) ); + row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) ); ROUND( 0 ); ROUND( 1 ); ROUND( 2 ); @@ -306,12 +306,12 @@ static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2 ROUND( 11 ); row1l = _mm_xor_si128( row3l, row1l ); row1h = _mm_xor_si128( row3h, row1h ); - STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); - STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); + STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) ); + STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) ); row2l = _mm_xor_si128( row4l, row2l ); row2h = _mm_xor_si128( row4h, row2h ); - STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); - STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); + STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) ); + STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) ); return 0; } diff --git a/sse/blake2s.c b/sse/blake2s.c index 03744ac..9ec2df1 100644 --- a/sse/blake2s.c +++ b/sse/blake2s.c @@ -274,10 +274,10 @@ static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2 const uint32_t m14 = ( ( uint32_t * )block )[14]; const uint32_t m15 = ( ( uint32_t * )block )[15]; #endif - row1 = ff0 = LOAD( &S->h[0] ); - row2 = ff1 = LOAD( &S->h[4] ); + row1 = ff0 = LOADU( &S->h[0] ); + row2 = ff1 = LOADU( &S->h[4] ); row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A ); - row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOAD( &S->t[0] ) ); + row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOADU( &S->t[0] ) ); ROUND( 0 ); ROUND( 1 ); ROUND( 2 ); @@ -288,8 +288,8 @@ static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2 ROUND( 7 ); ROUND( 8 ); ROUND( 9 ); - STORE( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); - STORE( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); + STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); + STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); return 0; } -- cgit v1.2.3