github.com/elfmz/far2l.git
Diffstat (limited to 'multiarc/src/formats/7z/C/Sha256Opt.c')
-rwxr-xr-x  multiarc/src/formats/7z/C/Sha256Opt.c | 373
1 file changed, 373 insertions(+), 0 deletions(-)
diff --git a/multiarc/src/formats/7z/C/Sha256Opt.c b/multiarc/src/formats/7z/C/Sha256Opt.c
new file mode 100755
index 00000000..decc1382
--- /dev/null
+++ b/multiarc/src/formats/7z/C/Sha256Opt.c
@@ -0,0 +1,373 @@
+/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
+2021-04-01 : Igor Pavlov : Public domain */
+
+#include "Precomp.h"
+
+#if defined(_MSC_VER)
+#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
+// #define USE_MY_MM
+#endif
+#endif
+
+#include "CpuArch.h"
+
+#ifdef MY_CPU_X86_OR_AMD64
+ #if defined(__clang__)
+ #if (__clang_major__ >= 8) // fix that check
+ #define USE_HW_SHA
+ #ifndef __SHA__
+ #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
+ #if defined(_MSC_VER)
+ // SSSE3: for clang-cl:
+ #include <tmmintrin.h>
+ #define __SHA__
+ #endif
+ #endif
+
+ #endif
+ #elif defined(__GNUC__)
+ #if (__GNUC__ >= 8) // fix that check
+ #define USE_HW_SHA
+ #ifndef __SHA__
+ #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
+ // #pragma GCC target("sha,ssse3")
+ #endif
+ #endif
+ #elif defined(__INTEL_COMPILER)
+ #if (__INTEL_COMPILER >= 1800) // fix that check
+ #define USE_HW_SHA
+ #endif
+ #elif defined(_MSC_VER)
+ #ifdef USE_MY_MM
+ #define USE_VER_MIN 1300
+ #else
+ #define USE_VER_MIN 1910
+ #endif
+ #if _MSC_VER >= USE_VER_MIN
+ #define USE_HW_SHA
+ #endif
+ #endif
+// #endif // MY_CPU_X86_OR_AMD64
+
+#ifdef USE_HW_SHA
+
+// #pragma message("Sha256 HW")
+// #include <wmmintrin.h>
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+#include <immintrin.h>
+#else
+#include <emmintrin.h>
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+// #include <intrin.h>
+#endif
+
+#ifdef USE_MY_MM
+#include "My_mm.h"
+#endif
+
+#endif
+
+/*
+SHA256 uses:
+SSE2:
+ _mm_loadu_si128
+ _mm_storeu_si128
+ _mm_set_epi32
+ _mm_add_epi32
+ _mm_shuffle_epi32 / pshufd
+
+
+
+SSSE3:
+ _mm_shuffle_epi8 / pshufb
+ _mm_alignr_epi8
+SHA:
+ _mm_sha256*
+*/
+
+// The K array must be aligned to at least 16 bytes.
+// The compiler can see the align attribute and select:
+//   movdqu - for code without the align attribute
+//   movdqa - for code with the align attribute
+extern
+MY_ALIGN(64)
+const UInt32 SHA256_K_ARRAY[64];
+
+#define K SHA256_K_ARRAY
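+// (Editor's note: SHA256_K_ARRAY is the standard SHA-256 round-constant
+// table; in upstream 7-Zip it is defined in Sha256.c. The direct aligned
+// dereference of &K[(k) * 4] in RND2_0 below relies on that alignment.)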
+
+
+#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
+#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
+#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
+
+
+#define LOAD_SHUFFLE(m, k) \
+ m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
+ m = _mm_shuffle_epi8(m, mask); \
+
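+// SM1/SM2 update the message schedule: SM1 applies sha256msg1 (the
+// sigma0 term); SM2 adds the W[t-7] words (gathered with alignr) and
+// finishes with sha256msg2 (the sigma1 term), yielding four new words.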
+#define SM1(g0, g1, g2, g3) \
+ SHA256_MSG1(g3, g0); \
+
+#define SM2(g0, g1, g2, g3) \
+ tmp = _mm_alignr_epi8(g1, g0, 4); \
+ ADD_EPI32(g2, tmp); \
+ SHA256_MSG2(g2, g1); \
+
+// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
+// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
+
+
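+// NNN is an empty placeholder for round slots that need no
+// message-schedule update.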
+#define NNN(g0, g1, g2, g3)
+
+
+#define RND2(t0, t1) \
+ t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
+
+#define RND2_0(m, k) \
+ msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
+ RND2(state0, state1); \
+ msg = _mm_shuffle_epi32(msg, 0x0E); \
+
+
+#define RND2_1 \
+ RND2(state1, state0); \
+
+
+// We use a scheme with SHA256_MSG1 running 3 rounds ahead and SHA256_MSG2 running 2 rounds ahead.
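+//
+// Example: in the first R16 below, rounds 0-3 do no schedule work,
+// rounds 4-7 start SHA256_MSG1 for the next m0 (needed again at rounds
+// 16-19), and rounds 8-11 finish it with SHA256_MSG2, i.e. 3 and 2
+// four-round groups ahead of its use.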
+
+#define R4(k, g0, g1, g2, g3, OP0, OP1) \
+ RND2_0(g0, k); \
+ OP0(g0, g1, g2, g3); \
+ RND2_1; \
+ OP1(g0, g1, g2, g3); \
+
+#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
+ R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
+ R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
+ R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
+ R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \
+
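+// PREPARE_STATE converts between the natural (abcd, efgh) word order and
+// the (cdgh, abef) lane layout that the sha256rnds2 instruction expects;
+// applying it again after the loop restores the original order.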
+#define PREPARE_STATE \
+ tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
+ state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \
+ state1 = state0; \
+ state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \
+ state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \
+
+
+void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
+#ifdef ATTRIB_SHA
+ATTRIB_SHA
+#endif
+void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
+{
+ const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
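+  // The mask byte-reverses each 32-bit lane in LOAD_SHUFFLE, converting
+  // the big-endian message words of the input block to host order.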
+ __m128i tmp;
+ __m128i state0, state1;
+
+ if (numBlocks == 0)
+ return;
+
+ state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
+ state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);
+
+ PREPARE_STATE
+
+ do
+ {
+ __m128i state0_save, state1_save;
+ __m128i m0, m1, m2, m3;
+ __m128i msg;
+ // #define msg tmp
+
+ state0_save = state0;
+ state1_save = state1;
+
+ LOAD_SHUFFLE (m0, 0)
+ LOAD_SHUFFLE (m1, 1)
+ LOAD_SHUFFLE (m2, 2)
+ LOAD_SHUFFLE (m3, 3)
+
+
+
+ R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
+ R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
+ R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
+ R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );
+
+ ADD_EPI32(state0, state0_save);
+ ADD_EPI32(state1, state1_save);
+
+ data += 64;
+ }
+ while (--numBlocks);
+
+ PREPARE_STATE
+
+ _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
+ _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
+}
+
+#endif // USE_HW_SHA
+
+#elif defined(MY_CPU_ARM_OR_ARM64)
+
+ #if defined(__clang__)
+ #if (__clang_major__ >= 8) // fix that check
+ #define USE_HW_SHA
+ #endif
+ #elif defined(__GNUC__)
+ #if (__GNUC__ >= 6) // fix that check
+ #define USE_HW_SHA
+ #endif
+ #elif defined(_MSC_VER)
+ #if _MSC_VER >= 1910
+ #define USE_HW_SHA
+ #endif
+ #endif
+
+#ifdef USE_HW_SHA
+
+// #pragma message("=== Sha256 HW === ")
+
+#if defined(__clang__) || defined(__GNUC__)
+ #ifdef MY_CPU_ARM64
+ #define ATTRIB_SHA __attribute__((__target__("+crypto")))
+ #else
+ #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
+ #endif
+#else
+ // _MSC_VER
+ // for arm32
+ #define _ARM_USE_NEW_NEON_INTRINSICS
+#endif
+
+#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+typedef uint32x4_t v128;
+// typedef __n128 v128; // MSVC
+
+#ifdef MY_CPU_BE
+ #define MY_rev32_for_LE(x)
+#else
+ #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
+#endif
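+// On little-endian targets each 32-bit word is byte-reversed to match
+// SHA-256's big-endian message order; on big-endian targets the words
+// are already in order, so MY_rev32_for_LE is a no-op there.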
+
+#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
+#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
+
+#define LOAD_SHUFFLE(m, k) \
+ m = LOAD_128((data + (k) * 16)); \
+ MY_rev32_for_LE(m); \
+
+// The K array must be aligned to at least 16 bytes.
+extern
+MY_ALIGN(64)
+const UInt32 SHA256_K_ARRAY[64];
+
+#define K SHA256_K_ARRAY
+
+
+#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
+#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
+
+#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
+#define SM2(g0, g1, g2, g3) SHA256_SU1(g2, g0, g1)
+#define NNN(g0, g1, g2, g3)
+
+
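+// vsha256hq_u32 / vsha256h2q_u32 each perform four rounds, updating the
+// (abcd) and (efgh) halves of the state respectively; unlike the x86
+// path, no reordering of the state words is required.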
+#define R4(k, g0, g1, g2, g3, OP0, OP1) \
+ msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
+ tmp = state0; \
+ state0 = vsha256hq_u32( state0, state1, msg ); \
+ state1 = vsha256h2q_u32( state1, tmp, msg ); \
+ OP0(g0, g1, g2, g3); \
+ OP1(g0, g1, g2, g3); \
+
+
+#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
+ R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
+ R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
+ R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
+ R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \
+
+
+void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
+#ifdef ATTRIB_SHA
+ATTRIB_SHA
+#endif
+void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
+{
+ v128 state0, state1;
+
+ if (numBlocks == 0)
+ return;
+
+ state0 = LOAD_128(&state[0]);
+ state1 = LOAD_128(&state[4]);
+
+ do
+ {
+ v128 state0_save, state1_save;
+ v128 m0, m1, m2, m3;
+ v128 msg, tmp;
+
+ state0_save = state0;
+ state1_save = state1;
+
+ LOAD_SHUFFLE (m0, 0)
+ LOAD_SHUFFLE (m1, 1)
+ LOAD_SHUFFLE (m2, 2)
+ LOAD_SHUFFLE (m3, 3)
+
+ R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
+ R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
+ R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
+ R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );
+
+ state0 = vaddq_u32(state0, state0_save);
+ state1 = vaddq_u32(state1, state1_save);
+
+ data += 64;
+ }
+ while (--numBlocks);
+
+ STORE_128(&state[0], state0);
+ STORE_128(&state[4], state1);
+}
+
+#endif // USE_HW_SHA
+
+#endif // MY_CPU_ARM_OR_ARM64
+
+
+#ifndef USE_HW_SHA
+
+// #error Stop_Compiling_UNSUPPORTED_SHA
+// #include <stdlib.h>
+
+// #include "Sha256.h"
+void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);
+
+#pragma message("Sha256 HW-SW stub was used")
+
+void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
+void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
+{
+ Sha256_UpdateBlocks(state, data, numBlocks);
+ /*
+ UNUSED_VAR(state);
+ UNUSED_VAR(data);
+ UNUSED_VAR(numBlocks);
+ exit(1);
+ return;
+ */
+}
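+/*
+Editor's sketch: how a caller might dispatch between the portable and the
+HW implementation at runtime. It assumes CPU_IsSupported_SHA() from
+CpuArch.h, as used by upstream 7-Zip's Sha256.c; a sketch, not the
+library's actual dispatch code:
+
+  static void UpdateBlocks_Dispatch(UInt32 state[8], const Byte *data, size_t numBlocks)
+  {
+    if (CPU_IsSupported_SHA())
+      Sha256_UpdateBlocks_HW(state, data, numBlocks); // SHA-NI / ARMv8 crypto path
+    else
+      Sha256_UpdateBlocks(state, data, numBlocks);    // portable C path
+  }
+*/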
+
+#endif