From 8f0845cad7bfc46939132b33f9cd0753b261b953 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 18 Jul 2023 16:41:12 +0200 Subject: libbb: rename source files, no code changes Signed-off-by: Denys Vlasenko --- libbb/Kbuild.src | 10 +- libbb/hash_md5_sha256_x86-32_shaNI.S | 284 ------- libbb/hash_md5_sha256_x86-64_shaNI.S | 290 ------- libbb/hash_md5_sha_x86-32_shaNI.S | 234 ------ libbb/hash_md5_sha_x86-64.S | 1489 ---------------------------------- libbb/hash_md5_sha_x86-64.S.sh | 478 ----------- libbb/hash_md5_sha_x86-64_shaNI.S | 232 ------ libbb/hash_sha1_hwaccel_x86-32.S | 234 ++++++ libbb/hash_sha1_hwaccel_x86-64.S | 232 ++++++ libbb/hash_sha1_x86-64.S | 1489 ++++++++++++++++++++++++++++++++++ libbb/hash_sha1_x86-64.S.sh | 478 +++++++++++ libbb/hash_sha256_hwaccel_x86-32.S | 284 +++++++ libbb/hash_sha256_hwaccel_x86-64.S | 290 +++++++ 13 files changed, 3012 insertions(+), 3012 deletions(-) delete mode 100644 libbb/hash_md5_sha256_x86-32_shaNI.S delete mode 100644 libbb/hash_md5_sha256_x86-64_shaNI.S delete mode 100644 libbb/hash_md5_sha_x86-32_shaNI.S delete mode 100644 libbb/hash_md5_sha_x86-64.S delete mode 100755 libbb/hash_md5_sha_x86-64.S.sh delete mode 100644 libbb/hash_md5_sha_x86-64_shaNI.S create mode 100644 libbb/hash_sha1_hwaccel_x86-32.S create mode 100644 libbb/hash_sha1_hwaccel_x86-64.S create mode 100644 libbb/hash_sha1_x86-64.S create mode 100755 libbb/hash_sha1_x86-64.S.sh create mode 100644 libbb/hash_sha256_hwaccel_x86-32.S create mode 100644 libbb/hash_sha256_hwaccel_x86-64.S diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 653025e56..c3b30003f 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -56,11 +56,11 @@ lib-y += login.o lib-y += make_directory.o lib-y += makedev.o lib-y += hash_md5_sha.o -lib-y += hash_md5_sha_x86-64.o -lib-y += hash_md5_sha_x86-64_shaNI.o -lib-y += hash_md5_sha_x86-32_shaNI.o -lib-y += hash_md5_sha256_x86-64_shaNI.o -lib-y += hash_md5_sha256_x86-32_shaNI.o +lib-y += hash_sha1_x86-64.o +lib-y += hash_sha1_hwaccel_x86-64.o +lib-y += hash_sha1_hwaccel_x86-32.o +lib-y += hash_sha256_hwaccel_x86-64.o +lib-y += hash_sha256_hwaccel_x86-32.o # Alternative (disabled) MD5 implementation #lib-y += hash_md5prime.o lib-y += messages.o diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S deleted file mode 100644 index a0e4a571a..000000000 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ /dev/null @@ -1,284 +0,0 @@ -#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -// pshufb and palignr are SSSE3 insns. -// We do not check SSSE3 in cpuid, -// all SHA-capable CPUs support it as well. 
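The comment block above notes that SSSE3 availability is implied by SHA support, so only the SHA feature bit itself needs a run-time test before this routine may be called. A minimal sketch of that test using GCC/clang's <cpuid.h> follows; the helper name cpu_has_sha_ni is illustrative only and this is not the dispatch code busybox itself uses (CPUID leaf 7, sub-leaf 0, EBX bit 29 is the SHA-extensions flag):

#include <cpuid.h>

/* Illustrative only: return nonzero if the CPU implements the SHA extensions
 * (CPUID.(EAX=7,ECX=0):EBX bit 29), the precondition for calling
 * sha256_process_block64_shaNI / sha1_process_block64_shaNI. */
static int cpu_has_sha_ni(void)
{
	unsigned eax, ebx, ecx, edx;
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 0;	/* CPUID leaf 7 not available */
	return (ebx >> 29) & 1;
}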
- -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha256_process_block64_shaNI, "ax", @progbits - .globl sha256_process_block64_shaNI - .hidden sha256_process_block64_shaNI - .type sha256_process_block64_shaNI, @function - -#define DATA_PTR %eax - -#define SHA256CONSTANTS %ecx - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 - -#define XMMTMP %xmm7 - -#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) - - .balign 8 # allow decoders to fetch at least 2 first insns -sha256_process_block64_shaNI: - - movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ - movu128 76+1*16(%eax), STATE1 /* EFGH */ -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE1, STATE0 - /* --- -------------- ABCD -- EFGH */ - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ - -/* XMMTMP holds flip mask from here... */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP - movl $K256+8*16, SHA256CONSTANTS - - /* Rounds 0-3 */ - movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP0 - paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 4-7 */ - movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP1 - paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP2 - paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP, MSG -/* ...to here */ - mova128 MSG, MSGTMP3 - paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - mova128 MSGTMP0, MSG - paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - mova128 MSGTMP1, MSG - paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - mova128 MSGTMP2, MSG - paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - mova128 MSGTMP3, MSG - paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 
MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - mova128 MSGTMP0, MSG - paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - mova128 MSGTMP1, MSG - paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - mova128 MSGTMP2, MSG - paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - mova128 MSGTMP3, MSG - paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - mova128 MSGTMP0, MSG - paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - mova128 MSGTMP1, MSG - paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 56-59 */ - mova128 MSGTMP2, MSG - paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 60-63 */ - mova128 MSGTMP3, MSG - paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Write hash values back in the correct order */ - mova128 STATE0, XMMTMP -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - /* --- -------------- HGDC -- FEBA */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ - /* add current hash values to previous ones */ - movu128 76+1*16(%eax), STATE1 - paddd XMMTMP, STATE1 - movu128 STATE1, 76+1*16(%eax) - movu128 76+0*16(%eax), XMMTMP - paddd XMMTMP, STATE0 - movu128 STATE0, 76+0*16(%eax) - - ret - .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI - - .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - 
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -#endif diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S deleted file mode 100644 index 172c2eae2..000000000 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ /dev/null @@ -1,290 +0,0 @@ -#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -// pshufb and palignr are SSSE3 insns. -// We do not check SSSE3 in cpuid, -// all SHA-capable CPUs support it as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha256_process_block64_shaNI, "ax", @progbits - .globl sha256_process_block64_shaNI - .hidden sha256_process_block64_shaNI - .type sha256_process_block64_shaNI, @function - -#define DATA_PTR %rdi - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 - -#define XMMTMP %xmm7 - -#define SAVE0 %xmm8 -#define SAVE1 %xmm9 - -#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) - - .balign 8 # allow decoders to fetch at least 2 first insns -sha256_process_block64_shaNI: - - movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ - movu128 80+1*16(%rdi), STATE1 /* EFGH */ -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE1, STATE0 - /* --- -------------- ABCD -- EFGH */ - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ - -/* XMMTMP holds flip mask from here... 
*/ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP - leaq K256+8*16(%rip), SHA256CONSTANTS - - /* Save hash values for addition after rounds */ - mova128 STATE0, SAVE0 - mova128 STATE1, SAVE1 - - /* Rounds 0-3 */ - movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP0 - paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 4-7 */ - movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP1 - paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP2 - paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP, MSG -/* ...to here */ - mova128 MSG, MSGTMP3 - paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - mova128 MSGTMP0, MSG - paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - mova128 MSGTMP1, MSG - paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - mova128 MSGTMP2, MSG - paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - mova128 MSGTMP3, MSG - paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - mova128 MSGTMP0, MSG - paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - mova128 MSGTMP1, MSG - paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - mova128 MSGTMP2, MSG - paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - 
sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - mova128 MSGTMP3, MSG - paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - mova128 MSGTMP0, MSG - paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - mova128 MSGTMP1, MSG - paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 56-59 */ - mova128 MSGTMP2, MSG - paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 60-63 */ - mova128 MSGTMP3, MSG - paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Add current hash values with previously saved */ - paddd SAVE0, STATE0 - paddd SAVE1, STATE1 - - /* Write hash values back in the correct order */ - mova128 STATE0, XMMTMP -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - /* --- -------------- HGDC -- FEBA */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ - movu128 STATE0, 80+0*16(%rdi) - movu128 XMMTMP, 80+1*16(%rdi) - - ret - .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI - - .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -#endif diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S deleted file mode 100644 index 7455a29f0..000000000 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ /dev/null @@ -1,234 +0,0 @@ -#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). 
-// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define xor128 pxor -#define xor128 xorps -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -#define extr128_32 pextrd -//#define extr128_32 extractps # not shorter - -// pshufb is a SSSE3 insn. -// pinsrd, pextrd, extractps are SSE4.1 insns. -// We do not check SSSE3/SSE4.1 in cpuid, -// all SHA-capable CPUs support them as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64_shaNI, "ax", @progbits - .globl sha1_process_block64_shaNI - .hidden sha1_process_block64_shaNI - .type sha1_process_block64_shaNI, @function - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 - - .balign 8 # allow decoders to fetch at least 2 first insns -sha1_process_block64_shaNI: - /* load initial hash values */ - movu128 76(%eax), ABCD - xor128 E0, E0 - pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word - shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - - mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 - - movu128 0*16(%eax), MSG0 - pshufb %xmm7, MSG0 - movu128 1*16(%eax), MSG1 - pshufb %xmm7, MSG1 - movu128 2*16(%eax), MSG2 - pshufb %xmm7, MSG2 - movu128 3*16(%eax), MSG3 - pshufb %xmm7, MSG3 - - /* Save hash values for addition after rounds */ - mova128 E0, %xmm7 - /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ - - /* Rounds 0-3 */ - paddd MSG0, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 12-15 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - 
xor128 MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - xor128 MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte %xmm7, E0 - /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ - movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... - - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, ABCD, ABCD - paddd %xmm7, ABCD # ...add it to final ABCD - movu128 ABCD, 76(%eax) - extr128_32 $3, E0, 76+4*4(%eax) - - ret - .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI - - .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -#endif diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S deleted file mode 100644 index 2cdd22015..000000000 --- a/libbb/hash_md5_sha_x86-64.S +++ /dev/null @@ -1,1489 +0,0 @@ -### Generated by hash_md5_sha_x86-64.S.sh ### - -#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64, "ax", @progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 - .type sha1_process_block64, @function - - .balign 8 # allow decoders to fetch at least 5 first insns -sha1_process_block64: - pushq %rbp # 1 byte insn - pushq %rbx # 1 byte insn -# pushq %r15 # 2 byte insn - pushq %r14 # 2 byte insn - pushq %r13 # 2 byte insn - pushq %r12 # 2 byte insn - pushq %rdi # we need ctx at the end - -#Register and stack use: -# eax..edx: a..d -# ebp: e -# esi,edi,r8..r14: temps -# r15: unused -# xmm0..xmm3: W[] -# xmm4,xmm5: temps -# xmm6: current round constant -# xmm7: all round constants -# -64(%rsp): area for passing RCONST + W[] from vector to integer units - - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - - movaps sha1const(%rip), %xmm7 - pshufd $0x00, %xmm7, %xmm6 - - # Load W[] to xmm0..3, byteswapping on the fly. - # - # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. - # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
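Before the unrolled rounds start, it may help to see the scalar operation they encode. Each of the "# 0".."# 19" blocks below performs the following, with a..e held in eax, ebx, ecx, edx, ebp (a reference C formulation, not code from the tree; rotl32 and sha1_round_0_19 are illustrative names):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One SHA-1 round for t = 0..19, updated in place the way the assembly does. */
static void sha1_round_0_19(uint32_t *a, uint32_t *b, uint32_t *c,
			uint32_t *d, uint32_t *e, uint32_t w)
{
	*e += 0x5A827999 + w;          /* leal RCONST(e,Wn), e            */
	*e += ((*c ^ *d) & *b) ^ *d;   /* "choice" function for t < 20    */
	*e += rotl32(*a, 5);           /* e += rotl32(a,5)                */
	*b = rotl32(*b, 30);           /* rorl $2, b == rotate left by 30 */
	/* the next round then renames the registers: (a,b,c,d,e) <- (e,a,b,c,d) */
}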
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq $32, %rsi # rsi = W[1]:W[0] - rolq $32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, %xmm4 - punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, %xmm4 # add RCONST, spill to stack -# paddd %xmm6, %xmm4 -# movups %xmm4, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq $32, %r9 # r9 = W[5]:W[4] - rolq $32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq $32, %r11 # r11 = W[9]:W[8] - rolq $32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, %xmm4 - punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq $32, %r13 # r13 = W[13]:W[12] - rolq $32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, %xmm4 - punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) - -# 0 - leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] - shrq $32, %rsi - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 1 - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 2 - leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] - shrq $32, %r8 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 3 - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 4 - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] - shrq $32, %r9 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 5 - leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 6 - leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] - shrq $32, %r10 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 7 - leal 
0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 8 - leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] - shrq $32, %r11 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 9 - leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 10 - leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] - shrq $32, %r12 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 11 - leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0x55, %xmm7, %xmm6 -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: 
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 12 - leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] - shrq $32, %r13 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 13 - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 14 - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] - shrq $32, %r14 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 15 - leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 16 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 17 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 18 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 19 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 20 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 21 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 22 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 23 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 24 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 25 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 26 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 27 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 28 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 29 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 30 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 31 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xaa, %xmm7, %xmm6 -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 32 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 33 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 34 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 35 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 36 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 37 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 38 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 39 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 40 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 41 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 42 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 43 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 44 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 45 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 46 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 47 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 48 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 49 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 50 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 51 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xff, %xmm7, %xmm6 -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 52 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 53 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 54 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 55 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 56 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 57 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 58 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 59 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 60 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 61 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 62 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 63 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 64 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 65 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 66 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 67 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 68 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 69 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 70 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 71 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 72 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 73 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 74 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 75 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 76 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl 
$2, %eax # b = rotl32(b,30) -# 77 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 78 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 79 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) - - popq %rdi # - popq %r12 # - addl %eax, 80(%rdi) # ctx->hash[0] += a - popq %r13 # - addl %ebx, 84(%rdi) # ctx->hash[1] += b - popq %r14 # - addl %ecx, 88(%rdi) # ctx->hash[2] += c -# popq %r15 # - addl %edx, 92(%rdi) # ctx->hash[3] += d - popq %rbx # - addl %ebp, 96(%rdi) # ctx->hash[4] += e - popq %rbp # - - ret - .size sha1_process_block64, .-sha1_process_block64 - - .section .rodata.cst16.sha1const, "aM", @progbits, 16 - .balign 16 -sha1const: - .long 0x5A827999 - .long 0x6ED9EBA1 - .long 0x8F1BBCDC - .long 0xCA62C1D6 - -#endif diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh deleted file mode 100755 index 653fe4989..000000000 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ /dev/null @@ -1,478 +0,0 @@ -#!/bin/sh - -# We don't regenerate it on every "make" invocation - only by hand. -# The reason is that the changes to generated code are difficult -# to visualize by looking only at this script, it helps when the commit -# also contains the diff of the generated file. -exec >hash_md5_sha_x86-64.S - -# Based on http://arctic.org/~dean/crypto/sha1.html. -# ("This SHA1 implementation is public domain.") -# -# x86-64 has at least SSE2 vector insns always available. -# We can use them without any CPUID checks (and without a need -# for a fallback code if needed insns are not available). -# This code uses them to calculate W[] ahead of time. -# -# Unfortunately, results are passed from vector unit to -# integer ALUs on the stack. MOVD/Q insns to move them directly -# from vector to integer registers are slower than store-to-load -# forwarding in LSU (on Skylake at least). -# -# The win against a purely integer code is small on Skylake, -# only about 7-8%. We offload about 1/3 of our operations to the vector unit. -# It can do 4 ops at once in one 128-bit register, -# but we have to use x2 of them because of W[0] complication, -# SSE2 has no "rotate each word by N bits" insns, -# moving data to/from vector unit is clunky, and Skylake -# has four integer ALUs unified with three vector ALUs, -# which makes pure integer code rather fast, and makes -# vector ops compete with integer ones. -# -# Zen3, with its separate vector ALUs, wins more, about 12%. - -xmmT1="%xmm4" -xmmT2="%xmm5" -xmmRCONST="%xmm6" -xmmALLRCONST="%xmm7" -T=`printf '\t'` - -# SSE instructions are longer than 4 bytes on average. -# Intel CPUs (up to Tiger Lake at least) can't decode -# more than 16 bytes of code in one cycle. -# By interleaving SSE code and integer code -# we mostly achieve a situation where 16-byte decode fetch window -# contains 4 (or more) insns. -# -# However. 
On Skylake, there was no observed difference, -# but on Zen3, non-interleaved code is ~3% faster -# (822 Mb/s versus 795 Mb/s hashing speed). -# Off for now: -interleave=false - -INTERLEAVE() { - $interleave || \ - { - # Generate non-interleaved code - # (it should work correctly too) - echo "$1" - echo "$2" - return - } - ( - echo "$1" | grep -v '^$' >"$0.temp1" - echo "$2" | grep -v '^$' >"$0.temp2" - exec 3<"$0.temp1" - exec 4<"$0.temp2" - IFS='' - while :; do - line1='' - line2='' - while :; do - read -r line1 <&3 - if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then - break - fi - echo "$line1" - done - while :; do - read -r line2 <&4 - if test "${line2:0:4}" = "${T}lea"; then - # We use 7-8 byte long forms of LEA. - # Do not interleave them with SSE insns - # which are also long. - echo "$line2" - read -r line2 <&4 - echo "$line2" - continue - fi - if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then - break - fi - echo "$line2" - done - test "$line1$line2" || break - echo "$line1" - echo "$line2" - done - rm "$0.temp1" "$0.temp2" - ) -} - -# movaps bswap32_mask(%rip), $xmmT1 -# Load W[] to xmm0..3, byteswapping on the fly. -# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 -# for use in RD1As instead of spilling them to stack. -# (We use rsi instead of rN because this makes two -# ADDs in two first RD1As shorter by one byte). -# movups 16*0(%rdi), %xmm0 -# pshufb $xmmT1, %xmm0 #SSSE3 insn -# movaps %xmm0, $xmmT2 -# paddd $xmmRCONST, $xmmT2 -# movq $xmmT2, %rsi -# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn -# #movhpd $xmmT2, %r8 #can only move to mem, not to reg -# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence -# movq $xmmT2, %r8 # instead -# ... -# -# ... -#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -#+ addl %esi, %e$e # e += RCONST + W[n] -# ^^^^^^^^^^^^^^^^^^^^^^^^ -# The above is -97 bytes of code... -# ...but pshufb is a SSSE3 insn. Can't use it. - -echo \ -"### Generated by hash_md5_sha_x86-64.S.sh ### - -#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) -#ifdef __linux__ - .section .note.GNU-stack, \"\", @progbits -#endif - .section .text.sha1_process_block64, \"ax\", @progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 - .type sha1_process_block64, @function - - .balign 8 # allow decoders to fetch at least 5 first insns -sha1_process_block64: - pushq %rbp # 1 byte insn - pushq %rbx # 1 byte insn -# pushq %r15 # 2 byte insn - pushq %r14 # 2 byte insn - pushq %r13 # 2 byte insn - pushq %r12 # 2 byte insn - pushq %rdi # we need ctx at the end - -#Register and stack use: -# eax..edx: a..d -# ebp: e -# esi,edi,r8..r14: temps -# r15: unused -# xmm0..xmm3: W[] -# xmm4,xmm5: temps -# xmm6: current round constant -# xmm7: all round constants -# -64(%rsp): area for passing RCONST + W[] from vector to integer units - - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - - movaps sha1const(%rip), $xmmALLRCONST - pshufd \$0x00, $xmmALLRCONST, $xmmRCONST - - # Load W[] to xmm0..3, byteswapping on the fly. - # - # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. - # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
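(Editorial aside, not part of the patch or of the generated .S: the PREP helper defined just below emits SSE2 code that advances the standard SHA-1 message schedule four words at a time. For reference, a minimal scalar C sketch of the same recurrence, using the 16-word window indices from PREP's own comments; rol32 and schedule_next4 are hypothetical names introduced only for this illustration.)

	#include <stdint.h>

	static inline uint32_t rol32(uint32_t x, unsigned n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/* Update the 16-word rolling window (kept in %xmm0..3 by the generated
	 * code) for rounds t..t+3.  Computing w before storing makes the i==3
	 * step pick up the freshly written W[t], which is the "W[3] fixup"
	 * PREP's comments refer to. */
	static void schedule_next4(uint32_t W[16], unsigned t)	/* t = 16, 20, 24, ... */
	{
		unsigned i;
		for (i = 0; i < 4; i++) {
			uint32_t w = W[(t + i - 3) & 15]	/* win[13+i] */
				   ^ W[(t + i - 8) & 15]	/* win[8+i]  */
				   ^ W[(t + i - 14) & 15]	/* win[2+i]  */
				   ^ W[(t + i - 16) & 15];	/* win[0+i]  */
			W[(t + i) & 15] = rol32(w, 1);
		}
	}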
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq \$32, %rsi # rsi = W[1]:W[0] - rolq \$32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, $xmmT1 - punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, $xmmT1 # add RCONST, spill to stack -# paddd $xmmRCONST, $xmmT1 -# movups $xmmT1, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq \$32, %r9 # r9 = W[5]:W[4] - rolq \$32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq \$32, %r11 # r11 = W[9]:W[8] - rolq \$32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, $xmmT1 - punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq \$32, %r13 # r13 = W[13]:W[12] - rolq \$32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, $xmmT1 - punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) -" - -PREP() { -local xmmW0=$1 -local xmmW4=$2 -local xmmW8=$3 -local xmmW12=$4 -# the above must be %xmm0..3 in some permutation -local dstmem=$5 -#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); -#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); -#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); -#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); -#W[3] ^= rol(W[0], 1); -echo "# PREP $@ - movaps $xmmW12, $xmmT1 - psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - -# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps $xmmW0, $xmmT2 - shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - - xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps $xmmT2, $xmmW0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps $xmmW0, $xmmT2 - - xorps $xmmT1, $xmmT1 # rol(W0,1): - pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) - paddd $xmmW0, $xmmW0 # shift left by 1 - psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - - pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps $xmmT2, $xmmT1 - pslld \$2, $xmmT2 - psrld \$30, $xmmT1 -# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) - xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 - - xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) -" -# movq $xmmW0, %r8 # high latency (~6 cycles) -# movaps $xmmW0, $xmmT1 -# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower -# movq $xmmT1, %r10 # high latency -# movq %r8, %r9 -# movq %r10, %r11 -# shrq \$32, %r9 -# shrq \$32, %r11 -# ^^^ slower than passing the results on stack (!!!) 
-echo " - movaps $xmmW0, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movups $xmmT2, $dstmem -" -} - -# It's possible to interleave integer insns in rounds to mostly eliminate -# dependency chains, but this likely to only help old Pentium-based -# CPUs (ones without OOO, which can only simultaneously execute a pair -# of _adjacent_ insns). -# Testing on old-ish Silvermont CPU (which has OOO window of only -# about ~8 insns) shows very small (~1%) speedup. - -RD1A() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n0=$(((n+0) & 15)) -local rN=$((7+n0/2)) -echo " -# $n -";test $n0 = 0 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] - shrq \$32, %rsi -";test $n0 = 1 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] - shrq \$32, %r$rN -";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] -";echo " - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - andl %e$b, %edi # &b - xorl %e$d, %edi # (((c ^ d) & b) ^ d) - addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %edi # - roll \$5, %edi # rotl32(a,5) - addl %edi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} -RD1B() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - andl %e$b, %edi # &b - xorl %e$d, %edi # (((c ^ d) & b) ^ d) - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -RD2() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - xorl %e$b, %edi # ^b - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - addl %edi, %e$e # e += (c ^ d ^ b) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -RD3() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$b, %edi # di: b - movl %e$b, %esi # si: b - orl %e$c, %edi # di: b | c - andl %e$c, %esi # si: b & c - andl %e$d, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %e$e # += ((b | c) & d) | (b & c) - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -{ -# Round 1 -RCONST=0x5A827999 -RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; -RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" - PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 
15;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` -INTERLEAVE "$a" "$b" - -# Round 2 -RCONST=0x6ED9EBA1 -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" - PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` -INTERLEAVE "$a" "$b" - -# Round 3 -RCONST=0x8F1BBCDC -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" - PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` -INTERLEAVE "$a" "$b" - -# Round 4 has the same logic as round 2, only n and RCONST are different -RCONST=0xCA62C1D6 -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` -INTERLEAVE "$a" "$b" -RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; -RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; -} | grep -v '^$' - -echo " - popq %rdi # - popq %r12 # - addl %eax, 80(%rdi) # ctx->hash[0] += a - popq %r13 # - addl %ebx, 84(%rdi) # ctx->hash[1] += b - popq %r14 # - addl %ecx, 88(%rdi) # ctx->hash[2] += c -# popq %r15 # - addl %edx, 92(%rdi) # ctx->hash[3] += d - popq %rbx # - addl %ebp, 96(%rdi) # ctx->hash[4] += e - popq %rbp # - - ret - .size sha1_process_block64, .-sha1_process_block64 - - .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 - .balign 16 -sha1const: - .long 0x5A827999 - .long 0x6ED9EBA1 - .long 0x8F1BBCDC - .long 0xCA62C1D6 - -#endif" diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S deleted file mode 100644 index 
2f03e1ce4..000000000 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ /dev/null @@ -1,232 +0,0 @@ -#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define xor128 pxor -#define xor128 xorps -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -#define extr128_32 pextrd -//#define extr128_32 extractps # not shorter - -// pshufb is a SSSE3 insn. -// pinsrd, pextrd, extractps are SSE4.1 insns. -// We do not check SSSE3/SSE4.1 in cpuid, -// all SHA-capable CPUs support them as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64_shaNI, "ax", @progbits - .globl sha1_process_block64_shaNI - .hidden sha1_process_block64_shaNI - .type sha1_process_block64_shaNI, @function - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 - - .balign 8 # allow decoders to fetch at least 2 first insns -sha1_process_block64_shaNI: - /* load initial hash values */ - movu128 80(%rdi), ABCD - xor128 E0, E0 - pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word - shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - - mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 - - movu128 0*16(%rdi), MSG0 - pshufb %xmm7, MSG0 - movu128 1*16(%rdi), MSG1 - pshufb %xmm7, MSG1 - movu128 2*16(%rdi), MSG2 - pshufb %xmm7, MSG2 - movu128 3*16(%rdi), MSG3 - pshufb %xmm7, MSG3 - - /* Save hash values for addition after rounds */ - mova128 E0, %xmm7 - mova128 ABCD, %xmm8 - - /* Rounds 0-3 */ - paddd MSG0, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 12-15 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* 
Rounds 44-47 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - xor128 MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte %xmm7, E0 - paddd %xmm8, ABCD - - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, ABCD, ABCD - movu128 ABCD, 80(%rdi) - extr128_32 $3, E0, 80+4*4(%rdi) - - ret - .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI - - .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -#endif diff --git a/libbb/hash_sha1_hwaccel_x86-32.S b/libbb/hash_sha1_hwaccel_x86-32.S new file mode 100644 index 000000000..7455a29f0 --- /dev/null +++ b/libbb/hash_sha1_hwaccel_x86-32.S @@ -0,0 +1,234 @@ +#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define xor128 pxor +#define xor128 xorps +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +#define extr128_32 pextrd +//#define extr128_32 extractps # not shorter + +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
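(Editorial aside, not part of the patch: the sha1rnds4 immediates used in this file, $0..$3, select SHA-1's four 20-round groups. For reference, a scalar C sketch of the boolean function each group uses; sha1_f is a hypothetical name, and the expressions are the same ((c^d)&b)^d, c^d^b and ((b|c)&d)|(b&c) forms spelled out by the integer implementation elsewhere in this patch.)

	#include <stdint.h>

	/* Round groups 0..3 correspond to rounds 0-19, 20-39, 40-59, 60-79. */
	static uint32_t sha1_f(int group, uint32_t b, uint32_t c, uint32_t d)
	{
		switch (group) {
		case 0:  return ((c ^ d) & b) ^ d;		/* Ch,     K = 0x5A827999 */
		case 1:  return c ^ d ^ b;			/* Parity, K = 0x6ED9EBA1 */
		case 2:  return ((b | c) & d) | (b & c);	/* Maj,    K = 0x8F1BBCDC */
		default: return c ^ d ^ b;			/* Parity, K = 0xCA62C1D6 */
		}
	}
	/* One scalar round then does: e += rol32(a,5) + sha1_f(group,b,c,d) + K + W[t];
	 * b = rol32(b,30); and the (a,b,c,d,e) roles rotate.  sha1rnds4 performs
	 * four such rounds per instruction. */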
+ +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64_shaNI, "ax", @progbits + .globl sha1_process_block64_shaNI + .hidden sha1_process_block64_shaNI + .type sha1_process_block64_shaNI, @function + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha1_process_block64_shaNI: + /* load initial hash values */ + movu128 76(%eax), ABCD + xor128 E0, E0 + pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word + shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD + + mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 + + movu128 0*16(%eax), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%eax), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%eax), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%eax), MSG3 + pshufb %xmm7, MSG3 + + /* Save hash values for addition after rounds */ + mova128 E0, %xmm7 + /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ + + /* Rounds 0-3 */ + paddd MSG0, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 12-15 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 
+ mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + xor128 MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte %xmm7, E0 + /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ + movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, ABCD, ABCD + paddd %xmm7, ABCD # ...add it to final ABCD + movu128 ABCD, 76(%eax) + extr128_32 $3, E0, 76+4*4(%eax) + + ret + .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI + + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +#endif diff --git a/libbb/hash_sha1_hwaccel_x86-64.S b/libbb/hash_sha1_hwaccel_x86-64.S new file mode 100644 index 000000000..2f03e1ce4 --- /dev/null +++ b/libbb/hash_sha1_hwaccel_x86-64.S @@ -0,0 +1,232 @@ +#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define xor128 pxor +#define xor128 xorps +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +#define extr128_32 pextrd +//#define extr128_32 extractps # not shorter + +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
+ +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64_shaNI, "ax", @progbits + .globl sha1_process_block64_shaNI + .hidden sha1_process_block64_shaNI + .type sha1_process_block64_shaNI, @function + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha1_process_block64_shaNI: + /* load initial hash values */ + movu128 80(%rdi), ABCD + xor128 E0, E0 + pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word + shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD + + mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 + + movu128 0*16(%rdi), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%rdi), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%rdi), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%rdi), MSG3 + pshufb %xmm7, MSG3 + + /* Save hash values for addition after rounds */ + mova128 E0, %xmm7 + mova128 ABCD, %xmm8 + + /* Rounds 0-3 */ + paddd MSG0, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 12-15 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + 
sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + xor128 MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte %xmm7, E0 + paddd %xmm8, ABCD + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, ABCD, ABCD + movu128 ABCD, 80(%rdi) + extr128_32 $3, E0, 80+4*4(%rdi) + + ret + .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI + + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +#endif diff --git a/libbb/hash_sha1_x86-64.S b/libbb/hash_sha1_x86-64.S new file mode 100644 index 000000000..b1968fff6 --- /dev/null +++ b/libbb/hash_sha1_x86-64.S @@ -0,0 +1,1489 @@ +### Generated by hash_sha1_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64, "ax", @progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 5 first insns +sha1_process_block64: + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn +# pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi,r8..r14: temps +# r15: unused +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# xmm7: all round constants +# -64(%rsp): area for passing RCONST + W[] from vector to integer units + + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + + movaps sha1const(%rip), %xmm7 + pshufd $0x00, %xmm7, %xmm6 + + # Load W[] to xmm0..3, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1As shorter by one byte). 
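(Editorial aside, not part of the patch: a small C sketch of what each movq/bswapq/rolq triple below accomplishes, assuming a little-endian host as this x86-64 code does; load_w_pair is a hypothetical name and __builtin_bswap64 is the GCC builtin, which matches the __GNUC__ guard above. One 8-byte load grabs two big-endian message words, the 64-bit byteswap yields W[n]:W[n+1] in high:low order, and the rotate by 32 swaps them into the W[n+1]:W[n] layout that the LEA rounds and the punpcklqdq packing expect.)

	#include <stdint.h>
	#include <string.h>

	static uint64_t load_w_pair(const unsigned char *block, unsigned n)	/* n even */
	{
		uint64_t v;
		memcpy(&v, block + 4 * n, 8);	/* like movq 4*n(%rdi), %reg: raw little-endian load */
		v = __builtin_bswap64(v);	/* bswapq: high 32 bits = W[n], low 32 bits = W[n+1] */
		return (v << 32) | (v >> 32);	/* rolq $32: now W[n+1]:W[n], low dword = W[n] */
	}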
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq $32, %rsi # rsi = W[1]:W[0] + rolq $32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, %xmm4 # add RCONST, spill to stack +# paddd %xmm6, %xmm4 +# movups %xmm4, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq $32, %r9 # r9 = W[5]:W[4] + rolq $32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq $32, %r13 # r13 = W[13]:W[12] + rolq $32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, %xmm4 + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) + +# 0 + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] + shrq $32, %rsi + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 1 + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 2 + leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] + shrq $32, %r8 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 3 + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 4 + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + shrq $32, %r9 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 5 + leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 6 + leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] + shrq $32, %r10 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 7 + leal 
0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 8 + leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] + shrq $32, %r11 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 9 + leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 10 + leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] + shrq $32, %r12 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 11 + leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0x55, %xmm7, %xmm6 +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: 
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 12 + leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] + shrq $32, %r13 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 13 + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 14 + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] + shrq $32, %r14 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 15 + leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
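+# What every PREP block computes, as a C sketch: the scalar recurrence
+# W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) done for four t
+# values at once. X[], t and rotl32(x,n) = (x<<n)|(x>>(32-n)) are
+# illustrative names, not symbols from this file:
+#
+#	for (j = 0; j < 4; j++)		/* lane 3 has no W[t] yet, use 0 */
+#		X[j] = W[t-16+j] ^ W[t-14+j] ^ W[t-8+j] ^ (j < 3 ? W[t-3+j] : 0);
+#	for (j = 0; j < 4; j++)
+#		W[t+j] = rotl32(X[j], 1);
+#	W[t+3] ^= rotl32(W[t], 1);	/* the "W[3] fixup": == rotl32(X[0], 2) */
+#
+# Rotation distributes over XOR, which is why the missing W[t] term of
+# lane 3 can be patched in after the rotate.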
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 16 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 17 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 18 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 19 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 20 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 21 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 22 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 23 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
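+# The pcmpgtd/paddd/psubd triple used in every PREP block stands in for
+# a per-lane "rotate left by 1", which SSE2 lacks. Equivalent C with
+# intrinsics (names from <emmintrin.h>; a sketch, not code from this file):
+#
+#	__m128i rol1(__m128i x)
+#	{
+#		__m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x); /* -1 in lanes with the sign bit set */
+#		x = _mm_add_epi32(x, x);	/* x <<= 1 */
+#		return _mm_sub_epi32(x, msb);	/* carry the old msb into bit 0 */
+#	}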
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 24 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 25 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 26 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 27 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 28 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 29 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 30 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 31 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0xaa, %xmm7, %xmm6 +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
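+# The pshufd $0x00/$0x55/$0xaa/$0xff on %xmm7 seen between round groups
+# broadcasts one dword of sha1const into all four lanes of %xmm6, so a
+# single paddd adds the proper round constant to a whole W[] batch. The
+# switch happens ahead of the rounds that use the constant because each
+# batch is precomputed in advance. Intrinsics sketch (illustrative):
+#
+#	__m128i all_k = _mm_load_si128((const __m128i *)sha1const);
+#	__m128i k     = _mm_shuffle_epi32(all_k, 0xaa);	/* lane 2 = 0x8F1BBCDC */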
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 32 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 33 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 34 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 35 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 36 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 37 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 38 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 39 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 40 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 41 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 42 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 43 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
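+# The boolean selector computed into %edi by the scalar rounds, in C
+# (same algebra as the per-line comments; rotl32(x,n) = (x<<n)|(x>>(32-n))):
+#
+#	f1 = ((c ^ d) & b) ^ d;			/* == (b & c) | (~b & d), rounds  0..19 */
+#	f2 = b ^ c ^ d;				/* parity,                rounds 20..39 and 60..79 */
+#	f3 = ((b | c) & d) | (b & c);		/* == majority(b,c,d),    rounds 40..59 */
+#	e += rotl32(a, 5) + f + RCONST + W[n & 15];
+#	b = rotl32(b, 30);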
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 44 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 45 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 46 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 47 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 48 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 49 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 50 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 51 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0xff, %xmm7, %xmm6 +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 52 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 53 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 54 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 55 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 56 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 57 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 58 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 59 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 60 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 61 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 62 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 63 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 64 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 65 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 66 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 67 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 68 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 69 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 70 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 71 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 72 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 73 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 74 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 75 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 76 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl 
$2, %eax # b = rotl32(b,30) +# 77 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 78 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 79 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) + + popq %rdi # + popq %r12 # + addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # + addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # + addl %ecx, 88(%rdi) # ctx->hash[2] += c +# popq %r15 # + addl %edx, 92(%rdi) # ctx->hash[3] += d + popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbp # + + ret + .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, "aM", @progbits, 16 + .balign 16 +sha1const: + .long 0x5A827999 + .long 0x6ED9EBA1 + .long 0x8F1BBCDC + .long 0xCA62C1D6 + +#endif diff --git a/libbb/hash_sha1_x86-64.S.sh b/libbb/hash_sha1_x86-64.S.sh new file mode 100755 index 000000000..3fc125d51 --- /dev/null +++ b/libbb/hash_sha1_x86-64.S.sh @@ -0,0 +1,478 @@ +#!/bin/sh + +# We don't regenerate it on every "make" invocation - only by hand. +# The reason is that the changes to generated code are difficult +# to visualize by looking only at this script, it helps when the commit +# also contains the diff of the generated file. +exec >hash_sha1_x86-64.S + +# Based on http://arctic.org/~dean/crypto/sha1.html. +# ("This SHA1 implementation is public domain.") +# +# x86-64 has at least SSE2 vector insns always available. +# We can use them without any CPUID checks (and without a need +# for a fallback code if needed insns are not available). +# This code uses them to calculate W[] ahead of time. +# +# Unfortunately, results are passed from vector unit to +# integer ALUs on the stack. MOVD/Q insns to move them directly +# from vector to integer registers are slower than store-to-load +# forwarding in LSU (on Skylake at least). +# +# The win against a purely integer code is small on Skylake, +# only about 7-8%. We offload about 1/3 of our operations to the vector unit. +# It can do 4 ops at once in one 128-bit register, +# but we have to use x2 of them because of W[0] complication, +# SSE2 has no "rotate each word by N bits" insns, +# moving data to/from vector unit is clunky, and Skylake +# has four integer ALUs unified with three vector ALUs, +# which makes pure integer code rather fast, and makes +# vector ops compete with integer ones. +# +# Zen3, with its separate vector ALUs, wins more, about 12%. + +xmmT1="%xmm4" +xmmT2="%xmm5" +xmmRCONST="%xmm6" +xmmALLRCONST="%xmm7" +T=`printf '\t'` + +# SSE instructions are longer than 4 bytes on average. +# Intel CPUs (up to Tiger Lake at least) can't decode +# more than 16 bytes of code in one cycle. +# By interleaving SSE code and integer code +# we mostly achieve a situation where 16-byte decode fetch window +# contains 4 (or more) insns. +# +# However. 
On Skylake, there was no observed difference, +# but on Zen3, non-interleaved code is ~3% faster +# (822 Mb/s versus 795 Mb/s hashing speed). +# Off for now: +interleave=false + +INTERLEAVE() { + $interleave || \ + { + # Generate non-interleaved code + # (it should work correctly too) + echo "$1" + echo "$2" + return + } + ( + echo "$1" | grep -v '^$' >"$0.temp1" + echo "$2" | grep -v '^$' >"$0.temp2" + exec 3<"$0.temp1" + exec 4<"$0.temp2" + IFS='' + while :; do + line1='' + line2='' + while :; do + read -r line1 <&3 + if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then + break + fi + echo "$line1" + done + while :; do + read -r line2 <&4 + if test "${line2:0:4}" = "${T}lea"; then + # We use 7-8 byte long forms of LEA. + # Do not interleave them with SSE insns + # which are also long. + echo "$line2" + read -r line2 <&4 + echo "$line2" + continue + fi + if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then + break + fi + echo "$line2" + done + test "$line1$line2" || break + echo "$line1" + echo "$line2" + done + rm "$0.temp1" "$0.temp2" + ) +} + +# movaps bswap32_mask(%rip), $xmmT1 +# Load W[] to xmm0..3, byteswapping on the fly. +# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 +# for use in RD1As instead of spilling them to stack. +# (We use rsi instead of rN because this makes two +# ADDs in two first RD1As shorter by one byte). +# movups 16*0(%rdi), %xmm0 +# pshufb $xmmT1, %xmm0 #SSSE3 insn +# movaps %xmm0, $xmmT2 +# paddd $xmmRCONST, $xmmT2 +# movq $xmmT2, %rsi +# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# #movhpd $xmmT2, %r8 #can only move to mem, not to reg +# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence +# movq $xmmT2, %r8 # instead +# ... +# +# ... +#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +#+ addl %esi, %e$e # e += RCONST + W[n] +# ^^^^^^^^^^^^^^^^^^^^^^^^ +# The above is -97 bytes of code... +# ...but pshufb is a SSSE3 insn. Can't use it. + +echo \ +"### Generated by hash_sha1_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) +#ifdef __linux__ + .section .note.GNU-stack, \"\", @progbits +#endif + .section .text.sha1_process_block64, \"ax\", @progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 5 first insns +sha1_process_block64: + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn +# pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi,r8..r14: temps +# r15: unused +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# xmm7: all round constants +# -64(%rsp): area for passing RCONST + W[] from vector to integer units + + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + + movaps sha1const(%rip), $xmmALLRCONST + pshufd \$0x00, $xmmALLRCONST, $xmmRCONST + + # Load W[] to xmm0..3, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1As shorter by one byte). 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq \$32, %rsi # rsi = W[1]:W[0] + rolq \$32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, $xmmT1 # add RCONST, spill to stack +# paddd $xmmRCONST, $xmmT1 +# movups $xmmT1, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq \$32, %r9 # r9 = W[5]:W[4] + rolq \$32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq \$32, %r13 # r13 = W[13]:W[12] + rolq \$32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, $xmmT1 + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) +" + +PREP() { +local xmmW0=$1 +local xmmW4=$2 +local xmmW8=$3 +local xmmW12=$4 +# the above must be %xmm0..3 in some permutation +local dstmem=$5 +#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); +#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); +#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); +#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); +#W[3] ^= rol(W[0], 1); +echo "# PREP $@ + movaps $xmmW12, $xmmT1 + psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + +# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps $xmmW0, $xmmT2 + shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + + xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps $xmmT2, $xmmW0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps $xmmW0, $xmmT2 + + xorps $xmmT1, $xmmT1 # rol(W0,1): + pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) + paddd $xmmW0, $xmmW0 # shift left by 1 + psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + + pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps $xmmT2, $xmmT1 + pslld \$2, $xmmT2 + psrld \$30, $xmmT1 +# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) + xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 + + xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) +" +# movq $xmmW0, %r8 # high latency (~6 cycles) +# movaps $xmmW0, $xmmT1 +# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower +# movq $xmmT1, %r10 # high latency +# movq %r8, %r9 +# movq %r10, %r11 +# shrq \$32, %r9 +# shrq \$32, %r11 +# ^^^ slower than passing the results on stack (!!!) 
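+# In C terms, the echo below stores the freshly rotated schedule words
+# with the round constant already folded in (illustrative; dstmem is the
+# -64+16*n(%rsp) slot passed as $5):
+#	for (j = 0; j < 4; j++)
+#		((uint32_t *)dstmem)[j] = W0[j] + RCONST;
+# which is why RD1B/RD2/RD3 can fetch "RCONST + W[n & 15]" with one addl.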
+echo " + movaps $xmmW0, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movups $xmmT2, $dstmem +" +} + +# It's possible to interleave integer insns in rounds to mostly eliminate +# dependency chains, but this likely to only help old Pentium-based +# CPUs (ones without OOO, which can only simultaneously execute a pair +# of _adjacent_ insns). +# Testing on old-ish Silvermont CPU (which has OOO window of only +# about ~8 insns) shows very small (~1%) speedup. + +RD1A() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n0=$(((n+0) & 15)) +local rN=$((7+n0/2)) +echo " +# $n +";test $n0 = 0 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] + shrq \$32, %rsi +";test $n0 = 1 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + shrq \$32, %r$rN +";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] +";echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %edi # + roll \$5, %edi # rotl32(a,5) + addl %edi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +RD1B() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +RD2() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + xorl %e$b, %edi # ^b + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + addl %edi, %e$e # e += (c ^ d ^ b) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +RD3() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$b, %edi # di: b + movl %e$b, %esi # si: b + orl %e$c, %edi # di: b | c + andl %e$c, %esi # si: b & c + andl %e$d, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %e$e # += ((b | c) & d) | (b & c) + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +{ +# Round 1 +RCONST=0x5A827999 +RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; +RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" + PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 
15;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` +INTERLEAVE "$a" "$b" + +# Round 2 +RCONST=0x6ED9EBA1 +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" + PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` +INTERLEAVE "$a" "$b" + +# Round 3 +RCONST=0x8F1BBCDC +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" + PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` +INTERLEAVE "$a" "$b" + +# Round 4 has the same logic as round 2, only n and RCONST are different +RCONST=0xCA62C1D6 +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` +INTERLEAVE "$a" "$b" +RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; +RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; +} | grep -v '^$' + +echo " + popq %rdi # + popq %r12 # + addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # + addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # + addl %ecx, 88(%rdi) # ctx->hash[2] += c +# popq %r15 # + addl %edx, 92(%rdi) # ctx->hash[3] += d + popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbp # + + ret + .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 + .balign 16 +sha1const: + .long 0x5A827999 + .long 0x6ED9EBA1 + .long 0x8F1BBCDC + .long 0xCA62C1D6 + +#endif" diff --git a/libbb/hash_sha256_hwaccel_x86-32.S b/libbb/hash_sha256_hwaccel_x86-32.S new file mode 100644 index 
000000000..a0e4a571a --- /dev/null +++ b/libbb/hash_sha256_hwaccel_x86-32.S @@ -0,0 +1,284 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %eax + +#define SHA256CONSTANTS %ecx + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 + +#define XMMTMP %xmm7 + +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + + movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ + movu128 76+1*16(%eax), STATE1 /* EFGH */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + mova128 STATE1, STATE0 + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ + +/* XMMTMP holds flip mask from here... 
*/ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP + movl $K256+8*16, SHA256CONSTANTS + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP0 + paddd 0*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP1 + paddd 1*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP2 + paddd 2*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb XMMTMP, MSG +/* ...to here */ + mova128 MSG, MSGTMP3 + paddd 3*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 
11*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Write hash values back in the correct order */ + mova128 STATE0, XMMTMP +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ + /* add current hash values to previous ones */ + movu128 76+1*16(%eax), STATE1 + paddd XMMTMP, STATE1 + movu128 STATE1, 76+1*16(%eax) + movu128 76+0*16(%eax), XMMTMP + paddd XMMTMP, STATE0 + movu128 STATE0, 76+0*16(%eax) + + ret + .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif diff --git a/libbb/hash_sha256_hwaccel_x86-64.S b/libbb/hash_sha256_hwaccel_x86-64.S new file mode 100644 index 000000000..172c2eae2 --- /dev/null +++ b/libbb/hash_sha256_hwaccel_x86-64.S @@ -0,0 +1,290 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). 
+// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %rdi + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 + +#define XMMTMP %xmm7 + +#define SAVE0 %xmm8 +#define SAVE1 %xmm9 + +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + + movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ + movu128 80+1*16(%rdi), STATE1 /* EFGH */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + mova128 STATE1, STATE0 + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ + +/* XMMTMP holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP + leaq K256+8*16(%rip), SHA256CONSTANTS + + /* Save hash values for addition after rounds */ + mova128 STATE0, SAVE0 + mova128 STATE1, SAVE1 + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP0 + paddd 0*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP1 + paddd 1*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP2 + paddd 2*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb XMMTMP, MSG +/* ...to here */ + mova128 MSG, MSGTMP3 + paddd 3*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + 
sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 11*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd SAVE0, STATE0 + paddd SAVE1, STATE1 + + /* Write hash values back in the correct order */ + mova128 STATE0, XMMTMP +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ + movu128 STATE0, 80+0*16(%rdi) + movu128 XMMTMP, 80+1*16(%rdi) + + ret + .size 
sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif -- cgit v1.2.3
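
For reference, the scalar SHA-1 logic that hash_sha1_x86-64.S.sh unrolls can be read off the comments above: RD1A/RD1B use (((c ^ d) & b) ^ d), RD2 uses (c ^ d ^ b), RD3 uses ((b | c) & d) | (b & c), and PREP() computes the rolling 16-word schedule W[n & 15] = rol(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1) four lanes at a time. The sketch below is illustrative only (sha1_block_ref and rol32 are not busybox identifiers); it assumes W[] has already been byteswapped to host order, as the movq/bswapq/punpcklqdq prologue does.

#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One 64-byte block; W[0..15] already in host dword order, hash[0..4] = a..e */
static void sha1_block_ref(uint32_t hash[5], uint32_t W[16])
{
	uint32_t a = hash[0], b = hash[1], c = hash[2], d = hash[3], e = hash[4];
	int n;

	for (n = 0; n < 80; n++) {
		uint32_t f, rconst, t;
		if (n >= 16) /* rolling schedule: what PREP() does four words at a time */
			W[n & 15] = rol32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1);
		if (n < 20)      { f = ((c ^ d) & b) ^ d;       rconst = 0x5A827999; } /* RD1A/RD1B */
		else if (n < 40) { f = c ^ d ^ b;               rconst = 0x6ED9EBA1; } /* RD2 */
		else if (n < 60) { f = ((b | c) & d) | (b & c); rconst = 0x8F1BBCDC; } /* RD3 */
		else             { f = c ^ d ^ b;               rconst = 0xCA62C1D6; } /* RD2 again */
		e += rol32(a, 5) + f + W[n & 15] + rconst;
		t = e; e = d; d = c; c = rol32(b, 30); b = a; a = t;
	}
	hash[0] += a; hash[1] += b; hash[2] += c; hash[3] += d; hash[4] += e;
}

int main(void)
{
	/* single padded block for "abc"; expected digest a9993e36...9cd0d89d */
	uint32_t W[16] = { 0x61626380 }; /* remaining words stay zero */
	uint32_t hash[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
	int i;

	W[15] = 24; /* message length in bits */
	sha1_block_ref(hash, W);
	for (i = 0; i < 5; i++)
		printf("%08x", (unsigned)hash[i]);
	printf("\n");
	return 0;
}

The generated assembly differs mainly in scheduling: for rounds 0..15 it passes raw W[] in rsi/r8..r14 and folds RCONST in with lea (RD1A), while for later rounds PREP() adds RCONST with paddd and spills the sums to -64+16*n(%rsp), which is why RD1B/RD2/RD3 read "RCONST + W[n & 15]" from the stack.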
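
The one step in PREP() that is easy to misread is the rotate-left-by-1 of all four lanes: SSE2 has no packed rotate, so the script builds it from pcmpgtd (sign mask), paddd (shift left by 1) and psubd (add 1 back where the old msb was set). A minimal intrinsics sketch of just that idiom, assuming SSE2 and using the made-up helper name rol1_epi32:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* rol(x,1) per 32-bit lane, same trick as PREP():
 *   msb = (0 > x) ? 0xffffffff : 0   -- pcmpgtd
 *   x  += x                          -- paddd (shift left by 1)
 *   x  -= msb                        -- psubd (adds 1 where the old msb was set)
 */
static __m128i rol1_epi32(__m128i x)
{
	__m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
	x = _mm_add_epi32(x, x);
	return _mm_sub_epi32(x, msb);
}

int main(void)
{
	uint32_t in[4] = { 0x80000000u, 0x00000001u, 0xdeadbeefu, 0x7fffffffu };
	uint32_t out[4];
	int i;

	_mm_storeu_si128((__m128i *)out, rol1_epi32(_mm_loadu_si128((const __m128i *)in)));
	for (i = 0; i < 4; i++)
		printf("%08x -> %08x\n", (unsigned)in[i], (unsigned)out[i]);
	return 0;
}

The W[3] fixup that follows in PREP() (pslldq $12, then the pslld/psrld pair) xors rol(unrotated W[0], 2) into lane 3, which is algebraically the same as the scalar "W[3] ^= rol(W[0], 1)" comment once the outer rol-by-1 has already been applied to the whole vector.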
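
Both hash_sha256_hwaccel files rearrange ctx->hash[] with shufps into the FEBA / HGDC dword order used by the sha256rnds2 rounds, relying on the property the comments call out: in AT&T syntax, shufps keeps result dwords 0,1 from the destination register and takes dwords 2,3 from the source, with SHUF(a,b,c,d) packing the four 2-bit selectors into the immediate. A small C model of that shuffle (shufps_emul and the test values are illustrative, not busybox code):

#include <stdint.h>
#include <stdio.h>

/* Same packing as the assembler macro: four 2-bit dword selectors. */
#define SHUF(a,b,c,d) ((a) + ((b) << 2) + ((c) << 4) + ((d) << 6))

/* Model of AT&T "shufps $imm, src, dst": result dwords 0,1 are selected
 * from dst, dwords 2,3 from src. */
static void shufps_emul(uint32_t dst[4], const uint32_t src[4], unsigned imm)
{
	uint32_t r[4];
	int i;

	r[0] = dst[imm & 3];
	r[1] = dst[(imm >> 2) & 3];
	r[2] = src[(imm >> 4) & 3];
	r[3] = src[(imm >> 6) & 3];
	for (i = 0; i < 4; i++)
		dst[i] = r[i];
}

int main(void)
{
	/* dw0..dw3 as loaded from ctx->hash[]: XMMTMP = ABCD, STATE0 = copy of EFGH */
	uint32_t abcd[4] = { 'A', 'B', 'C', 'D' };
	uint32_t efgh[4] = { 'E', 'F', 'G', 'H' };

	shufps_emul(efgh, abcd, SHUF(1,0,1,0));
	printf("%c%c%c%c\n", (int)efgh[0], (int)efgh[1], (int)efgh[2], (int)efgh[3]); /* FEBA */
	return 0;
}

With STATE0 holding a copy of EFGH and XMMTMP holding ABCD, SHUF(1,0,1,0) yields FEBA and SHUF(3,2,3,2) yields HGDC; the mirror shuffles at the end of both routines restore ABCD/EFGH before the results are stored back.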