From 8f0845cad7bfc46939132b33f9cd0753b261b953 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 18 Jul 2023 16:41:12 +0200 Subject: libbb: rename source files, no code changes Signed-off-by: Denys Vlasenko --- libbb/Kbuild.src | 10 +- libbb/hash_md5_sha256_x86-32_shaNI.S | 284 ------- libbb/hash_md5_sha256_x86-64_shaNI.S | 290 ------- libbb/hash_md5_sha_x86-32_shaNI.S | 234 ------ libbb/hash_md5_sha_x86-64.S | 1489 ---------------------------------- libbb/hash_md5_sha_x86-64.S.sh | 478 ----------- libbb/hash_md5_sha_x86-64_shaNI.S | 232 ------ libbb/hash_sha1_hwaccel_x86-32.S | 234 ++++++ libbb/hash_sha1_hwaccel_x86-64.S | 232 ++++++ libbb/hash_sha1_x86-64.S | 1489 ++++++++++++++++++++++++++++++++++ libbb/hash_sha1_x86-64.S.sh | 478 +++++++++++ libbb/hash_sha256_hwaccel_x86-32.S | 284 +++++++ libbb/hash_sha256_hwaccel_x86-64.S | 290 +++++++ 13 files changed, 3012 insertions(+), 3012 deletions(-) delete mode 100644 libbb/hash_md5_sha256_x86-32_shaNI.S delete mode 100644 libbb/hash_md5_sha256_x86-64_shaNI.S delete mode 100644 libbb/hash_md5_sha_x86-32_shaNI.S delete mode 100644 libbb/hash_md5_sha_x86-64.S delete mode 100755 libbb/hash_md5_sha_x86-64.S.sh delete mode 100644 libbb/hash_md5_sha_x86-64_shaNI.S create mode 100644 libbb/hash_sha1_hwaccel_x86-32.S create mode 100644 libbb/hash_sha1_hwaccel_x86-64.S create mode 100644 libbb/hash_sha1_x86-64.S create mode 100755 libbb/hash_sha1_x86-64.S.sh create mode 100644 libbb/hash_sha256_hwaccel_x86-32.S create mode 100644 libbb/hash_sha256_hwaccel_x86-64.S diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 653025e56..c3b30003f 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -56,11 +56,11 @@ lib-y += login.o lib-y += make_directory.o lib-y += makedev.o lib-y += hash_md5_sha.o -lib-y += hash_md5_sha_x86-64.o -lib-y += hash_md5_sha_x86-64_shaNI.o -lib-y += hash_md5_sha_x86-32_shaNI.o -lib-y += hash_md5_sha256_x86-64_shaNI.o -lib-y += hash_md5_sha256_x86-32_shaNI.o +lib-y += hash_sha1_x86-64.o +lib-y += hash_sha1_hwaccel_x86-64.o +lib-y += hash_sha1_hwaccel_x86-32.o +lib-y += hash_sha256_hwaccel_x86-64.o +lib-y += hash_sha256_hwaccel_x86-32.o # Alternative (disabled) MD5 implementation #lib-y += hash_md5prime.o lib-y += messages.o diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S deleted file mode 100644 index a0e4a571a..000000000 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ /dev/null @@ -1,284 +0,0 @@ -#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -// pshufb and palignr are SSSE3 insns. -// We do not check SSSE3 in cpuid, -// all SHA-capable CPUs support it as well. 
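The comment block above notes that SSSE3 availability is implied by SHA support, so only the SHA feature bit itself needs a run-time test before this routine may be called. A minimal sketch of that test using GCC/clang's <cpuid.h> follows; the helper name cpu_has_sha_ni is illustrative only and this is not the dispatch code busybox itself uses (CPUID leaf 7, sub-leaf 0, EBX bit 29 is the SHA-extensions flag):

#include <cpuid.h>

/* Illustrative only: return nonzero if the CPU implements the SHA extensions
 * (CPUID.(EAX=7,ECX=0):EBX bit 29), the precondition for calling
 * sha256_process_block64_shaNI / sha1_process_block64_shaNI. */
static int cpu_has_sha_ni(void)
{
	unsigned eax, ebx, ecx, edx;
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 0;	/* CPUID leaf 7 not available */
	return (ebx >> 29) & 1;
}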
- -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha256_process_block64_shaNI, "ax", @progbits - .globl sha256_process_block64_shaNI - .hidden sha256_process_block64_shaNI - .type sha256_process_block64_shaNI, @function - -#define DATA_PTR %eax - -#define SHA256CONSTANTS %ecx - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 - -#define XMMTMP %xmm7 - -#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) - - .balign 8 # allow decoders to fetch at least 2 first insns -sha256_process_block64_shaNI: - - movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ - movu128 76+1*16(%eax), STATE1 /* EFGH */ -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE1, STATE0 - /* --- -------------- ABCD -- EFGH */ - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ - -/* XMMTMP holds flip mask from here... */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP - movl $K256+8*16, SHA256CONSTANTS - - /* Rounds 0-3 */ - movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP0 - paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 4-7 */ - movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP1 - paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP2 - paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP, MSG -/* ...to here */ - mova128 MSG, MSGTMP3 - paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - mova128 MSGTMP0, MSG - paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - mova128 MSGTMP1, MSG - paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - mova128 MSGTMP2, MSG - paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - mova128 MSGTMP3, MSG - paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 
MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - mova128 MSGTMP0, MSG - paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - mova128 MSGTMP1, MSG - paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - mova128 MSGTMP2, MSG - paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - mova128 MSGTMP3, MSG - paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - mova128 MSGTMP0, MSG - paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - mova128 MSGTMP1, MSG - paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 56-59 */ - mova128 MSGTMP2, MSG - paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 60-63 */ - mova128 MSGTMP3, MSG - paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Write hash values back in the correct order */ - mova128 STATE0, XMMTMP -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - /* --- -------------- HGDC -- FEBA */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ - /* add current hash values to previous ones */ - movu128 76+1*16(%eax), STATE1 - paddd XMMTMP, STATE1 - movu128 STATE1, 76+1*16(%eax) - movu128 76+0*16(%eax), XMMTMP - paddd XMMTMP, STATE0 - movu128 STATE0, 76+0*16(%eax) - - ret - .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI - - .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - 
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -#endif diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S deleted file mode 100644 index 172c2eae2..000000000 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ /dev/null @@ -1,290 +0,0 @@ -#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -// pshufb and palignr are SSSE3 insns. -// We do not check SSSE3 in cpuid, -// all SHA-capable CPUs support it as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha256_process_block64_shaNI, "ax", @progbits - .globl sha256_process_block64_shaNI - .hidden sha256_process_block64_shaNI - .type sha256_process_block64_shaNI, @function - -#define DATA_PTR %rdi - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 - -#define XMMTMP %xmm7 - -#define SAVE0 %xmm8 -#define SAVE1 %xmm9 - -#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) - - .balign 8 # allow decoders to fetch at least 2 first insns -sha256_process_block64_shaNI: - - movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ - movu128 80+1*16(%rdi), STATE1 /* EFGH */ -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE1, STATE0 - /* --- -------------- ABCD -- EFGH */ - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ - -/* XMMTMP holds flip mask from here... 
*/ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP - leaq K256+8*16(%rip), SHA256CONSTANTS - - /* Save hash values for addition after rounds */ - mova128 STATE0, SAVE0 - mova128 STATE1, SAVE1 - - /* Rounds 0-3 */ - movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP0 - paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 4-7 */ - movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP1 - paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP2 - paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP, MSG -/* ...to here */ - mova128 MSG, MSGTMP3 - paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - mova128 MSGTMP0, MSG - paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - mova128 MSGTMP1, MSG - paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - mova128 MSGTMP2, MSG - paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - mova128 MSGTMP3, MSG - paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - mova128 MSGTMP0, MSG - paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - mova128 MSGTMP1, MSG - paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - mova128 MSGTMP2, MSG - paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - 
sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - mova128 MSGTMP3, MSG - paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - mova128 MSGTMP0, MSG - paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - mova128 MSGTMP1, MSG - paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 56-59 */ - mova128 MSGTMP2, MSG - paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 60-63 */ - mova128 MSGTMP3, MSG - paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Add current hash values with previously saved */ - paddd SAVE0, STATE0 - paddd SAVE1, STATE1 - - /* Write hash values back in the correct order */ - mova128 STATE0, XMMTMP -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - /* --- -------------- HGDC -- FEBA */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ - movu128 STATE0, 80+0*16(%rdi) - movu128 XMMTMP, 80+1*16(%rdi) - - ret - .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI - - .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -#endif diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S deleted file mode 100644 index 7455a29f0..000000000 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ /dev/null @@ -1,234 +0,0 @@ -#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). 
-// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define xor128 pxor -#define xor128 xorps -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -#define extr128_32 pextrd -//#define extr128_32 extractps # not shorter - -// pshufb is a SSSE3 insn. -// pinsrd, pextrd, extractps are SSE4.1 insns. -// We do not check SSSE3/SSE4.1 in cpuid, -// all SHA-capable CPUs support them as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64_shaNI, "ax", @progbits - .globl sha1_process_block64_shaNI - .hidden sha1_process_block64_shaNI - .type sha1_process_block64_shaNI, @function - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 - - .balign 8 # allow decoders to fetch at least 2 first insns -sha1_process_block64_shaNI: - /* load initial hash values */ - movu128 76(%eax), ABCD - xor128 E0, E0 - pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word - shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - - mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 - - movu128 0*16(%eax), MSG0 - pshufb %xmm7, MSG0 - movu128 1*16(%eax), MSG1 - pshufb %xmm7, MSG1 - movu128 2*16(%eax), MSG2 - pshufb %xmm7, MSG2 - movu128 3*16(%eax), MSG3 - pshufb %xmm7, MSG3 - - /* Save hash values for addition after rounds */ - mova128 E0, %xmm7 - /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ - - /* Rounds 0-3 */ - paddd MSG0, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 12-15 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - 
xor128 MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - xor128 MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte %xmm7, E0 - /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ - movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... - - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, ABCD, ABCD - paddd %xmm7, ABCD # ...add it to final ABCD - movu128 ABCD, 76(%eax) - extr128_32 $3, E0, 76+4*4(%eax) - - ret - .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI - - .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -#endif diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S deleted file mode 100644 index 2cdd22015..000000000 --- a/libbb/hash_md5_sha_x86-64.S +++ /dev/null @@ -1,1489 +0,0 @@ -### Generated by hash_md5_sha_x86-64.S.sh ### - -#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64, "ax", @progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 - .type sha1_process_block64, @function - - .balign 8 # allow decoders to fetch at least 5 first insns -sha1_process_block64: - pushq %rbp # 1 byte insn - pushq %rbx # 1 byte insn -# pushq %r15 # 2 byte insn - pushq %r14 # 2 byte insn - pushq %r13 # 2 byte insn - pushq %r12 # 2 byte insn - pushq %rdi # we need ctx at the end - -#Register and stack use: -# eax..edx: a..d -# ebp: e -# esi,edi,r8..r14: temps -# r15: unused -# xmm0..xmm3: W[] -# xmm4,xmm5: temps -# xmm6: current round constant -# xmm7: all round constants -# -64(%rsp): area for passing RCONST + W[] from vector to integer units - - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - - movaps sha1const(%rip), %xmm7 - pshufd $0x00, %xmm7, %xmm6 - - # Load W[] to xmm0..3, byteswapping on the fly. - # - # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. - # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
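Before the unrolled rounds start, it may help to see the scalar operation they encode. Each of the "# 0".."# 19" blocks below performs the following, with a..e held in eax, ebx, ecx, edx, ebp (a reference C formulation, not code from the tree; rotl32 and sha1_round_0_19 are illustrative names):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One SHA-1 round for t = 0..19, updated in place the way the assembly does. */
static void sha1_round_0_19(uint32_t *a, uint32_t *b, uint32_t *c,
			uint32_t *d, uint32_t *e, uint32_t w)
{
	*e += 0x5A827999 + w;          /* leal RCONST(e,Wn), e            */
	*e += ((*c ^ *d) & *b) ^ *d;   /* "choice" function for t < 20    */
	*e += rotl32(*a, 5);           /* e += rotl32(a,5)                */
	*b = rotl32(*b, 30);           /* rorl $2, b == rotate left by 30 */
	/* the next round then renames the registers: (a,b,c,d,e) <- (e,a,b,c,d) */
}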
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq $32, %rsi # rsi = W[1]:W[0] - rolq $32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, %xmm4 - punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, %xmm4 # add RCONST, spill to stack -# paddd %xmm6, %xmm4 -# movups %xmm4, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq $32, %r9 # r9 = W[5]:W[4] - rolq $32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq $32, %r11 # r11 = W[9]:W[8] - rolq $32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, %xmm4 - punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq $32, %r13 # r13 = W[13]:W[12] - rolq $32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, %xmm4 - punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) - -# 0 - leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] - shrq $32, %rsi - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 1 - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 2 - leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] - shrq $32, %r8 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 3 - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 4 - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] - shrq $32, %r9 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 5 - leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 6 - leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] - shrq $32, %r10 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 7 - leal 
0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 8 - leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] - shrq $32, %r11 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 9 - leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 10 - leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] - shrq $32, %r12 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 11 - leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0x55, %xmm7, %xmm6 -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: 
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 12 - leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] - shrq $32, %r13 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 13 - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 14 - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] - shrq $32, %r14 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 15 - leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 16 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 17 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 18 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 19 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 20 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 21 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 22 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 23 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 24 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 25 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 26 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 27 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 28 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 29 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 30 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 31 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xaa, %xmm7, %xmm6 -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 32 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 33 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 34 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 35 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 36 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 37 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 38 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 39 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 40 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 41 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 42 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 43 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 44 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 45 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 46 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 47 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 48 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 49 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 50 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 51 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xff, %xmm7, %xmm6 -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 52 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 53 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 54 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 55 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 56 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 57 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 58 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 59 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 60 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 61 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 62 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 63 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 64 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 65 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 66 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 67 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 68 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 69 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 70 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 71 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 72 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 73 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 74 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 75 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 76 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl 
$2, %eax # b = rotl32(b,30) -# 77 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 78 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 79 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) - - popq %rdi # - popq %r12 # - addl %eax, 80(%rdi) # ctx->hash[0] += a - popq %r13 # - addl %ebx, 84(%rdi) # ctx->hash[1] += b - popq %r14 # - addl %ecx, 88(%rdi) # ctx->hash[2] += c -# popq %r15 # - addl %edx, 92(%rdi) # ctx->hash[3] += d - popq %rbx # - addl %ebp, 96(%rdi) # ctx->hash[4] += e - popq %rbp # - - ret - .size sha1_process_block64, .-sha1_process_block64 - - .section .rodata.cst16.sha1const, "aM", @progbits, 16 - .balign 16 -sha1const: - .long 0x5A827999 - .long 0x6ED9EBA1 - .long 0x8F1BBCDC - .long 0xCA62C1D6 - -#endif diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh deleted file mode 100755 index 653fe4989..000000000 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ /dev/null @@ -1,478 +0,0 @@ -#!/bin/sh - -# We don't regenerate it on every "make" invocation - only by hand. -# The reason is that the changes to generated code are difficult -# to visualize by looking only at this script, it helps when the commit -# also contains the diff of the generated file. -exec >hash_md5_sha_x86-64.S - -# Based on http://arctic.org/~dean/crypto/sha1.html. -# ("This SHA1 implementation is public domain.") -# -# x86-64 has at least SSE2 vector insns always available. -# We can use them without any CPUID checks (and without a need -# for a fallback code if needed insns are not available). -# This code uses them to calculate W[] ahead of time. -# -# Unfortunately, results are passed from vector unit to -# integer ALUs on the stack. MOVD/Q insns to move them directly -# from vector to integer registers are slower than store-to-load -# forwarding in LSU (on Skylake at least). -# -# The win against a purely integer code is small on Skylake, -# only about 7-8%. We offload about 1/3 of our operations to the vector unit. -# It can do 4 ops at once in one 128-bit register, -# but we have to use x2 of them because of W[0] complication, -# SSE2 has no "rotate each word by N bits" insns, -# moving data to/from vector unit is clunky, and Skylake -# has four integer ALUs unified with three vector ALUs, -# which makes pure integer code rather fast, and makes -# vector ops compete with integer ones. -# -# Zen3, with its separate vector ALUs, wins more, about 12%. - -xmmT1="%xmm4" -xmmT2="%xmm5" -xmmRCONST="%xmm6" -xmmALLRCONST="%xmm7" -T=`printf '\t'` - -# SSE instructions are longer than 4 bytes on average. -# Intel CPUs (up to Tiger Lake at least) can't decode -# more than 16 bytes of code in one cycle. -# By interleaving SSE code and integer code -# we mostly achieve a situation where 16-byte decode fetch window -# contains 4 (or more) insns. -# -# However. 
On Skylake, there was no observed difference, -# but on Zen3, non-interleaved code is ~3% faster -# (822 Mb/s versus 795 Mb/s hashing speed). -# Off for now: -interleave=false - -INTERLEAVE() { - $interleave || \ - { - # Generate non-interleaved code - # (it should work correctly too) - echo "$1" - echo "$2" - return - } - ( - echo "$1" | grep -v '^$' >"$0.temp1" - echo "$2" | grep -v '^$' >"$0.temp2" - exec 3<"$0.temp1" - exec 4<"$0.temp2" - IFS='' - while :; do - line1='' - line2='' - while :; do - read -r line1 <&3 - if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then - break - fi - echo "$line1" - done - while :; do - read -r line2 <&4 - if test "${line2:0:4}" = "${T}lea"; then - # We use 7-8 byte long forms of LEA. - # Do not interleave them with SSE insns - # which are also long. - echo "$line2" - read -r line2 <&4 - echo "$line2" - continue - fi - if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then - break - fi - echo "$line2" - done - test "$line1$line2" || break - echo "$line1" - echo "$line2" - done - rm "$0.temp1" "$0.temp2" - ) -} - -# movaps bswap32_mask(%rip), $xmmT1 -# Load W[] to xmm0..3, byteswapping on the fly. -# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 -# for use in RD1As instead of spilling them to stack. -# (We use rsi instead of rN because this makes two -# ADDs in two first RD1As shorter by one byte). -# movups 16*0(%rdi), %xmm0 -# pshufb $xmmT1, %xmm0 #SSSE3 insn -# movaps %xmm0, $xmmT2 -# paddd $xmmRCONST, $xmmT2 -# movq $xmmT2, %rsi -# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn -# #movhpd $xmmT2, %r8 #can only move to mem, not to reg -# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence -# movq $xmmT2, %r8 # instead -# ... -# -# ... -#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -#+ addl %esi, %e$e # e += RCONST + W[n] -# ^^^^^^^^^^^^^^^^^^^^^^^^ -# The above is -97 bytes of code... -# ...but pshufb is a SSSE3 insn. Can't use it. - -echo \ -"### Generated by hash_md5_sha_x86-64.S.sh ### - -#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) -#ifdef __linux__ - .section .note.GNU-stack, \"\", @progbits -#endif - .section .text.sha1_process_block64, \"ax\", @progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 - .type sha1_process_block64, @function - - .balign 8 # allow decoders to fetch at least 5 first insns -sha1_process_block64: - pushq %rbp # 1 byte insn - pushq %rbx # 1 byte insn -# pushq %r15 # 2 byte insn - pushq %r14 # 2 byte insn - pushq %r13 # 2 byte insn - pushq %r12 # 2 byte insn - pushq %rdi # we need ctx at the end - -#Register and stack use: -# eax..edx: a..d -# ebp: e -# esi,edi,r8..r14: temps -# r15: unused -# xmm0..xmm3: W[] -# xmm4,xmm5: temps -# xmm6: current round constant -# xmm7: all round constants -# -64(%rsp): area for passing RCONST + W[] from vector to integer units - - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - - movaps sha1const(%rip), $xmmALLRCONST - pshufd \$0x00, $xmmALLRCONST, $xmmRCONST - - # Load W[] to xmm0..3, byteswapping on the fly. - # - # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. - # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
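(Editorial aside, not part of the patch or of the generated .S: the PREP helper defined just below emits SSE2 code that advances the standard SHA-1 message schedule four words at a time. For reference, a minimal scalar C sketch of the same recurrence, using the 16-word window indices from PREP's own comments; rol32 and schedule_next4 are hypothetical names introduced only for this illustration.)

	#include <stdint.h>

	static inline uint32_t rol32(uint32_t x, unsigned n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/* Update the 16-word rolling window (kept in %xmm0..3 by the generated
	 * code) for rounds t..t+3.  Computing w before storing makes the i==3
	 * step pick up the freshly written W[t], which is the "W[3] fixup"
	 * PREP's comments refer to. */
	static void schedule_next4(uint32_t W[16], unsigned t)	/* t = 16, 20, 24, ... */
	{
		unsigned i;
		for (i = 0; i < 4; i++) {
			uint32_t w = W[(t + i - 3) & 15]	/* win[13+i] */
				   ^ W[(t + i - 8) & 15]	/* win[8+i]  */
				   ^ W[(t + i - 14) & 15]	/* win[2+i]  */
				   ^ W[(t + i - 16) & 15];	/* win[0+i]  */
			W[(t + i) & 15] = rol32(w, 1);
		}
	}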
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq \$32, %rsi # rsi = W[1]:W[0] - rolq \$32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, $xmmT1 - punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, $xmmT1 # add RCONST, spill to stack -# paddd $xmmRCONST, $xmmT1 -# movups $xmmT1, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq \$32, %r9 # r9 = W[5]:W[4] - rolq \$32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq \$32, %r11 # r11 = W[9]:W[8] - rolq \$32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, $xmmT1 - punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq \$32, %r13 # r13 = W[13]:W[12] - rolq \$32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, $xmmT1 - punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) -" - -PREP() { -local xmmW0=$1 -local xmmW4=$2 -local xmmW8=$3 -local xmmW12=$4 -# the above must be %xmm0..3 in some permutation -local dstmem=$5 -#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); -#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); -#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); -#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); -#W[3] ^= rol(W[0], 1); -echo "# PREP $@ - movaps $xmmW12, $xmmT1 - psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - -# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps $xmmW0, $xmmT2 - shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - - xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps $xmmT2, $xmmW0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps $xmmW0, $xmmT2 - - xorps $xmmT1, $xmmT1 # rol(W0,1): - pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) - paddd $xmmW0, $xmmW0 # shift left by 1 - psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - - pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps $xmmT2, $xmmT1 - pslld \$2, $xmmT2 - psrld \$30, $xmmT1 -# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) - xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 - - xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) -" -# movq $xmmW0, %r8 # high latency (~6 cycles) -# movaps $xmmW0, $xmmT1 -# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower -# movq $xmmT1, %r10 # high latency -# movq %r8, %r9 -# movq %r10, %r11 -# shrq \$32, %r9 -# shrq \$32, %r11 -# ^^^ slower than passing the results on stack (!!!) 
-echo " - movaps $xmmW0, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movups $xmmT2, $dstmem -" -} - -# It's possible to interleave integer insns in rounds to mostly eliminate -# dependency chains, but this likely to only help old Pentium-based -# CPUs (ones without OOO, which can only simultaneously execute a pair -# of _adjacent_ insns). -# Testing on old-ish Silvermont CPU (which has OOO window of only -# about ~8 insns) shows very small (~1%) speedup. - -RD1A() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n0=$(((n+0) & 15)) -local rN=$((7+n0/2)) -echo " -# $n -";test $n0 = 0 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] - shrq \$32, %rsi -";test $n0 = 1 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] - shrq \$32, %r$rN -";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] -";echo " - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - andl %e$b, %edi # &b - xorl %e$d, %edi # (((c ^ d) & b) ^ d) - addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %edi # - roll \$5, %edi # rotl32(a,5) - addl %edi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} -RD1B() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - andl %e$b, %edi # &b - xorl %e$d, %edi # (((c ^ d) & b) ^ d) - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -RD2() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - xorl %e$b, %edi # ^b - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - addl %edi, %e$e # e += (c ^ d ^ b) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -RD3() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$b, %edi # di: b - movl %e$b, %esi # si: b - orl %e$c, %edi # di: b | c - andl %e$c, %esi # si: b & c - andl %e$d, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %e$e # += ((b | c) & d) | (b & c) - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -{ -# Round 1 -RCONST=0x5A827999 -RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; -RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" - PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 
15;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` -INTERLEAVE "$a" "$b" - -# Round 2 -RCONST=0x6ED9EBA1 -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" - PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` -INTERLEAVE "$a" "$b" - -# Round 3 -RCONST=0x8F1BBCDC -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" - PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` -INTERLEAVE "$a" "$b" - -# Round 4 has the same logic as round 2, only n and RCONST are different -RCONST=0xCA62C1D6 -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` -INTERLEAVE "$a" "$b" -RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; -RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; -} | grep -v '^$' - -echo " - popq %rdi # - popq %r12 # - addl %eax, 80(%rdi) # ctx->hash[0] += a - popq %r13 # - addl %ebx, 84(%rdi) # ctx->hash[1] += b - popq %r14 # - addl %ecx, 88(%rdi) # ctx->hash[2] += c -# popq %r15 # - addl %edx, 92(%rdi) # ctx->hash[3] += d - popq %rbx # - addl %ebp, 96(%rdi) # ctx->hash[4] += e - popq %rbp # - - ret - .size sha1_process_block64, .-sha1_process_block64 - - .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 - .balign 16 -sha1const: - .long 0x5A827999 - .long 0x6ED9EBA1 - .long 0x8F1BBCDC - .long 0xCA62C1D6 - -#endif" diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S deleted file mode 100644 index 
2f03e1ce4..000000000 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ /dev/null @@ -1,232 +0,0 @@ -#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define xor128 pxor -#define xor128 xorps -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -#define extr128_32 pextrd -//#define extr128_32 extractps # not shorter - -// pshufb is a SSSE3 insn. -// pinsrd, pextrd, extractps are SSE4.1 insns. -// We do not check SSSE3/SSE4.1 in cpuid, -// all SHA-capable CPUs support them as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64_shaNI, "ax", @progbits - .globl sha1_process_block64_shaNI - .hidden sha1_process_block64_shaNI - .type sha1_process_block64_shaNI, @function - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 - - .balign 8 # allow decoders to fetch at least 2 first insns -sha1_process_block64_shaNI: - /* load initial hash values */ - movu128 80(%rdi), ABCD - xor128 E0, E0 - pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word - shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - - mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 - - movu128 0*16(%rdi), MSG0 - pshufb %xmm7, MSG0 - movu128 1*16(%rdi), MSG1 - pshufb %xmm7, MSG1 - movu128 2*16(%rdi), MSG2 - pshufb %xmm7, MSG2 - movu128 3*16(%rdi), MSG3 - pshufb %xmm7, MSG3 - - /* Save hash values for addition after rounds */ - mova128 E0, %xmm7 - mova128 ABCD, %xmm8 - - /* Rounds 0-3 */ - paddd MSG0, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 12-15 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* 
Rounds 44-47 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - xor128 MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte %xmm7, E0 - paddd %xmm8, ABCD - - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, ABCD, ABCD - movu128 ABCD, 80(%rdi) - extr128_32 $3, E0, 80+4*4(%rdi) - - ret - .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI - - .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -#endif diff --git a/libbb/hash_sha1_hwaccel_x86-32.S b/libbb/hash_sha1_hwaccel_x86-32.S new file mode 100644 index 000000000..7455a29f0 --- /dev/null +++ b/libbb/hash_sha1_hwaccel_x86-32.S @@ -0,0 +1,234 @@ +#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define xor128 pxor +#define xor128 xorps +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +#define extr128_32 pextrd +//#define extr128_32 extractps # not shorter + +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
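(Editorial aside, not part of the patch: the sha1rnds4 immediates used in this file, $0..$3, select SHA-1's four 20-round groups. For reference, a scalar C sketch of the boolean function each group uses; sha1_f is a hypothetical name, and the expressions are the same ((c^d)&b)^d, c^d^b and ((b|c)&d)|(b&c) forms spelled out by the integer implementation elsewhere in this patch.)

	#include <stdint.h>

	/* Round groups 0..3 correspond to rounds 0-19, 20-39, 40-59, 60-79. */
	static uint32_t sha1_f(int group, uint32_t b, uint32_t c, uint32_t d)
	{
		switch (group) {
		case 0:  return ((c ^ d) & b) ^ d;		/* Ch,     K = 0x5A827999 */
		case 1:  return c ^ d ^ b;			/* Parity, K = 0x6ED9EBA1 */
		case 2:  return ((b | c) & d) | (b & c);	/* Maj,    K = 0x8F1BBCDC */
		default: return c ^ d ^ b;			/* Parity, K = 0xCA62C1D6 */
		}
	}
	/* One scalar round then does: e += rol32(a,5) + sha1_f(group,b,c,d) + K + W[t];
	 * b = rol32(b,30); and the (a,b,c,d,e) roles rotate.  sha1rnds4 performs
	 * four such rounds per instruction. */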
+ +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64_shaNI, "ax", @progbits + .globl sha1_process_block64_shaNI + .hidden sha1_process_block64_shaNI + .type sha1_process_block64_shaNI, @function + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha1_process_block64_shaNI: + /* load initial hash values */ + movu128 76(%eax), ABCD + xor128 E0, E0 + pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word + shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD + + mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 + + movu128 0*16(%eax), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%eax), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%eax), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%eax), MSG3 + pshufb %xmm7, MSG3 + + /* Save hash values for addition after rounds */ + mova128 E0, %xmm7 + /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ + + /* Rounds 0-3 */ + paddd MSG0, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 12-15 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 
+ mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + xor128 MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte %xmm7, E0 + /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ + movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, ABCD, ABCD + paddd %xmm7, ABCD # ...add it to final ABCD + movu128 ABCD, 76(%eax) + extr128_32 $3, E0, 76+4*4(%eax) + + ret + .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI + + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +#endif diff --git a/libbb/hash_sha1_hwaccel_x86-64.S b/libbb/hash_sha1_hwaccel_x86-64.S new file mode 100644 index 000000000..2f03e1ce4 --- /dev/null +++ b/libbb/hash_sha1_hwaccel_x86-64.S @@ -0,0 +1,232 @@ +#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define xor128 pxor +#define xor128 xorps +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +#define extr128_32 pextrd +//#define extr128_32 extractps # not shorter + +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
+ +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64_shaNI, "ax", @progbits + .globl sha1_process_block64_shaNI + .hidden sha1_process_block64_shaNI + .type sha1_process_block64_shaNI, @function + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha1_process_block64_shaNI: + /* load initial hash values */ + movu128 80(%rdi), ABCD + xor128 E0, E0 + pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word + shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD + + mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 + + movu128 0*16(%rdi), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%rdi), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%rdi), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%rdi), MSG3 + pshufb %xmm7, MSG3 + + /* Save hash values for addition after rounds */ + mova128 E0, %xmm7 + mova128 ABCD, %xmm8 + + /* Rounds 0-3 */ + paddd MSG0, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 12-15 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + 
sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + xor128 MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte %xmm7, E0 + paddd %xmm8, ABCD + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, ABCD, ABCD + movu128 ABCD, 80(%rdi) + extr128_32 $3, E0, 80+4*4(%rdi) + + ret + .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI + + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +#endif diff --git a/libbb/hash_sha1_x86-64.S b/libbb/hash_sha1_x86-64.S new file mode 100644 index 000000000..b1968fff6 --- /dev/null +++ b/libbb/hash_sha1_x86-64.S @@ -0,0 +1,1489 @@ +### Generated by hash_sha1_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64, "ax", @progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 5 first insns +sha1_process_block64: + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn +# pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi,r8..r14: temps +# r15: unused +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# xmm7: all round constants +# -64(%rsp): area for passing RCONST + W[] from vector to integer units + + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + + movaps sha1const(%rip), %xmm7 + pshufd $0x00, %xmm7, %xmm6 + + # Load W[] to xmm0..3, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1As shorter by one byte). 
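(Editorial aside, not part of the patch: a small C sketch of what each movq/bswapq/rolq triple below accomplishes, assuming a little-endian host as this x86-64 code does; load_w_pair is a hypothetical name and __builtin_bswap64 is the GCC builtin, which matches the __GNUC__ guard above. One 8-byte load grabs two big-endian message words, the 64-bit byteswap yields W[n]:W[n+1] in high:low order, and the rotate by 32 swaps them into the W[n+1]:W[n] layout that the LEA rounds and the punpcklqdq packing expect.)

	#include <stdint.h>
	#include <string.h>

	static uint64_t load_w_pair(const unsigned char *block, unsigned n)	/* n even */
	{
		uint64_t v;
		memcpy(&v, block + 4 * n, 8);	/* like movq 4*n(%rdi), %reg: raw little-endian load */
		v = __builtin_bswap64(v);	/* bswapq: high 32 bits = W[n], low 32 bits = W[n+1] */
		return (v << 32) | (v >> 32);	/* rolq $32: now W[n+1]:W[n], low dword = W[n] */
	}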
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq $32, %rsi # rsi = W[1]:W[0] + rolq $32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, %xmm4 # add RCONST, spill to stack +# paddd %xmm6, %xmm4 +# movups %xmm4, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq $32, %r9 # r9 = W[5]:W[4] + rolq $32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq $32, %r13 # r13 = W[13]:W[12] + rolq $32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, %xmm4 + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) + +# 0 + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] + shrq $32, %rsi + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 1 + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 2 + leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] + shrq $32, %r8 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 3 + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 4 + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + shrq $32, %r9 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 5 + leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 6 + leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] + shrq $32, %r10 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 7 + leal 
0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 8 + leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] + shrq $32, %r11 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 9 + leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 10 + leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] + shrq $32, %r12 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 11 + leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0x55, %xmm7, %xmm6 +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: 
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 12 + leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] + shrq $32, %r13 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 13 + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 14 + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] + shrq $32, %r14 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 15 + leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
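+# What every PREP block computes, as a C sketch: the scalar recurrence
+# W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) done for four t
+# values at once. X[], t and rotl32(x,n) = (x<<n)|(x>>(32-n)) are
+# illustrative names, not symbols from this file:
+#
+#	for (j = 0; j < 4; j++)		/* lane 3 has no W[t] yet, use 0 */
+#		X[j] = W[t-16+j] ^ W[t-14+j] ^ W[t-8+j] ^ (j < 3 ? W[t-3+j] : 0);
+#	for (j = 0; j < 4; j++)
+#		W[t+j] = rotl32(X[j], 1);
+#	W[t+3] ^= rotl32(W[t], 1);	/* the "W[3] fixup": == rotl32(X[0], 2) */
+#
+# Rotation distributes over XOR, which is why the missing W[t] term of
+# lane 3 can be patched in after the rotate.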
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 16 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 17 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 18 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 19 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 20 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 21 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 22 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 23 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
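+# The pcmpgtd/paddd/psubd triple used in every PREP block stands in for
+# a per-lane "rotate left by 1", which SSE2 lacks. Equivalent C with
+# intrinsics (names from <emmintrin.h>; a sketch, not code from this file):
+#
+#	__m128i rol1(__m128i x)
+#	{
+#		__m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x); /* -1 in lanes with the sign bit set */
+#		x = _mm_add_epi32(x, x);	/* x <<= 1 */
+#		return _mm_sub_epi32(x, msb);	/* carry the old msb into bit 0 */
+#	}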
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 24 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 25 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 26 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 27 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 28 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 29 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 30 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 31 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0xaa, %xmm7, %xmm6 +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
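+# The pshufd $0x00/$0x55/$0xaa/$0xff on %xmm7 seen between round groups
+# broadcasts one dword of sha1const into all four lanes of %xmm6, so a
+# single paddd adds the proper round constant to a whole W[] batch. The
+# switch happens ahead of the rounds that use the constant because each
+# batch is precomputed in advance. Intrinsics sketch (illustrative):
+#
+#	__m128i all_k = _mm_load_si128((const __m128i *)sha1const);
+#	__m128i k     = _mm_shuffle_epi32(all_k, 0xaa);	/* lane 2 = 0x8F1BBCDC */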
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 32 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 33 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 34 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 35 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 36 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 37 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 38 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 39 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 40 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 41 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 42 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 43 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
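+# The boolean selector computed into %edi by the scalar rounds, in C
+# (same algebra as the per-line comments; rotl32(x,n) = (x<<n)|(x>>(32-n))):
+#
+#	f1 = ((c ^ d) & b) ^ d;			/* == (b & c) | (~b & d), rounds  0..19 */
+#	f2 = b ^ c ^ d;				/* parity,                rounds 20..39 and 60..79 */
+#	f3 = ((b | c) & d) | (b & c);		/* == majority(b,c,d),    rounds 40..59 */
+#	e += rotl32(a, 5) + f + RCONST + W[n & 15];
+#	b = rotl32(b, 30);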
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 44 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 45 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 46 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 47 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 48 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 49 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 50 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 51 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0xff, %xmm7, %xmm6 +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 52 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 53 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 54 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 55 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 56 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 57 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 58 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 59 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 60 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 61 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 62 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 63 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 64 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 65 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 66 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 67 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 68 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 69 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 70 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 71 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 72 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 73 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 74 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 75 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 76 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl 
$2, %eax # b = rotl32(b,30) +# 77 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 78 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 79 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) + + popq %rdi # + popq %r12 # + addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # + addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # + addl %ecx, 88(%rdi) # ctx->hash[2] += c +# popq %r15 # + addl %edx, 92(%rdi) # ctx->hash[3] += d + popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbp # + + ret + .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, "aM", @progbits, 16 + .balign 16 +sha1const: + .long 0x5A827999 + .long 0x6ED9EBA1 + .long 0x8F1BBCDC + .long 0xCA62C1D6 + +#endif diff --git a/libbb/hash_sha1_x86-64.S.sh b/libbb/hash_sha1_x86-64.S.sh new file mode 100755 index 000000000..3fc125d51 --- /dev/null +++ b/libbb/hash_sha1_x86-64.S.sh @@ -0,0 +1,478 @@ +#!/bin/sh + +# We don't regenerate it on every "make" invocation - only by hand. +# The reason is that the changes to generated code are difficult +# to visualize by looking only at this script, it helps when the commit +# also contains the diff of the generated file. +exec >hash_sha1_x86-64.S + +# Based on http://arctic.org/~dean/crypto/sha1.html. +# ("This SHA1 implementation is public domain.") +# +# x86-64 has at least SSE2 vector insns always available. +# We can use them without any CPUID checks (and without a need +# for a fallback code if needed insns are not available). +# This code uses them to calculate W[] ahead of time. +# +# Unfortunately, results are passed from vector unit to +# integer ALUs on the stack. MOVD/Q insns to move them directly +# from vector to integer registers are slower than store-to-load +# forwarding in LSU (on Skylake at least). +# +# The win against a purely integer code is small on Skylake, +# only about 7-8%. We offload about 1/3 of our operations to the vector unit. +# It can do 4 ops at once in one 128-bit register, +# but we have to use x2 of them because of W[0] complication, +# SSE2 has no "rotate each word by N bits" insns, +# moving data to/from vector unit is clunky, and Skylake +# has four integer ALUs unified with three vector ALUs, +# which makes pure integer code rather fast, and makes +# vector ops compete with integer ones. +# +# Zen3, with its separate vector ALUs, wins more, about 12%. + +xmmT1="%xmm4" +xmmT2="%xmm5" +xmmRCONST="%xmm6" +xmmALLRCONST="%xmm7" +T=`printf '\t'` + +# SSE instructions are longer than 4 bytes on average. +# Intel CPUs (up to Tiger Lake at least) can't decode +# more than 16 bytes of code in one cycle. +# By interleaving SSE code and integer code +# we mostly achieve a situation where 16-byte decode fetch window +# contains 4 (or more) insns. +# +# However. 
On Skylake, there was no observed difference, +# but on Zen3, non-interleaved code is ~3% faster +# (822 Mb/s versus 795 Mb/s hashing speed). +# Off for now: +interleave=false + +INTERLEAVE() { + $interleave || \ + { + # Generate non-interleaved code + # (it should work correctly too) + echo "$1" + echo "$2" + return + } + ( + echo "$1" | grep -v '^$' >"$0.temp1" + echo "$2" | grep -v '^$' >"$0.temp2" + exec 3<"$0.temp1" + exec 4<"$0.temp2" + IFS='' + while :; do + line1='' + line2='' + while :; do + read -r line1 <&3 + if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then + break + fi + echo "$line1" + done + while :; do + read -r line2 <&4 + if test "${line2:0:4}" = "${T}lea"; then + # We use 7-8 byte long forms of LEA. + # Do not interleave them with SSE insns + # which are also long. + echo "$line2" + read -r line2 <&4 + echo "$line2" + continue + fi + if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then + break + fi + echo "$line2" + done + test "$line1$line2" || break + echo "$line1" + echo "$line2" + done + rm "$0.temp1" "$0.temp2" + ) +} + +# movaps bswap32_mask(%rip), $xmmT1 +# Load W[] to xmm0..3, byteswapping on the fly. +# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 +# for use in RD1As instead of spilling them to stack. +# (We use rsi instead of rN because this makes two +# ADDs in two first RD1As shorter by one byte). +# movups 16*0(%rdi), %xmm0 +# pshufb $xmmT1, %xmm0 #SSSE3 insn +# movaps %xmm0, $xmmT2 +# paddd $xmmRCONST, $xmmT2 +# movq $xmmT2, %rsi +# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# #movhpd $xmmT2, %r8 #can only move to mem, not to reg +# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence +# movq $xmmT2, %r8 # instead +# ... +# +# ... +#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +#+ addl %esi, %e$e # e += RCONST + W[n] +# ^^^^^^^^^^^^^^^^^^^^^^^^ +# The above is -97 bytes of code... +# ...but pshufb is a SSSE3 insn. Can't use it. + +echo \ +"### Generated by hash_sha1_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) +#ifdef __linux__ + .section .note.GNU-stack, \"\", @progbits +#endif + .section .text.sha1_process_block64, \"ax\", @progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 5 first insns +sha1_process_block64: + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn +# pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi,r8..r14: temps +# r15: unused +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# xmm7: all round constants +# -64(%rsp): area for passing RCONST + W[] from vector to integer units + + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + + movaps sha1const(%rip), $xmmALLRCONST + pshufd \$0x00, $xmmALLRCONST, $xmmRCONST + + # Load W[] to xmm0..3, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1As shorter by one byte). 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq \$32, %rsi # rsi = W[1]:W[0] + rolq \$32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, $xmmT1 # add RCONST, spill to stack +# paddd $xmmRCONST, $xmmT1 +# movups $xmmT1, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq \$32, %r9 # r9 = W[5]:W[4] + rolq \$32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq \$32, %r13 # r13 = W[13]:W[12] + rolq \$32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, $xmmT1 + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) +" + +PREP() { +local xmmW0=$1 +local xmmW4=$2 +local xmmW8=$3 +local xmmW12=$4 +# the above must be %xmm0..3 in some permutation +local dstmem=$5 +#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); +#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); +#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); +#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); +#W[3] ^= rol(W[0], 1); +echo "# PREP $@ + movaps $xmmW12, $xmmT1 + psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + +# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps $xmmW0, $xmmT2 + shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + + xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps $xmmT2, $xmmW0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps $xmmW0, $xmmT2 + + xorps $xmmT1, $xmmT1 # rol(W0,1): + pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) + paddd $xmmW0, $xmmW0 # shift left by 1 + psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + + pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps $xmmT2, $xmmT1 + pslld \$2, $xmmT2 + psrld \$30, $xmmT1 +# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) + xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 + + xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) +" +# movq $xmmW0, %r8 # high latency (~6 cycles) +# movaps $xmmW0, $xmmT1 +# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower +# movq $xmmT1, %r10 # high latency +# movq %r8, %r9 +# movq %r10, %r11 +# shrq \$32, %r9 +# shrq \$32, %r11 +# ^^^ slower than passing the results on stack (!!!) 
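+# In C terms, the echo below stores the freshly rotated schedule words
+# with the round constant already folded in (illustrative; dstmem is the
+# -64+16*n(%rsp) slot passed as $5):
+#	for (j = 0; j < 4; j++)
+#		((uint32_t *)dstmem)[j] = W0[j] + RCONST;
+# which is why RD1B/RD2/RD3 can fetch "RCONST + W[n & 15]" with one addl.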
+echo " + movaps $xmmW0, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movups $xmmT2, $dstmem +" +} + +# It's possible to interleave integer insns in rounds to mostly eliminate +# dependency chains, but this likely to only help old Pentium-based +# CPUs (ones without OOO, which can only simultaneously execute a pair +# of _adjacent_ insns). +# Testing on old-ish Silvermont CPU (which has OOO window of only +# about ~8 insns) shows very small (~1%) speedup. + +RD1A() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n0=$(((n+0) & 15)) +local rN=$((7+n0/2)) +echo " +# $n +";test $n0 = 0 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] + shrq \$32, %rsi +";test $n0 = 1 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + shrq \$32, %r$rN +";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] +";echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %edi # + roll \$5, %edi # rotl32(a,5) + addl %edi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +RD1B() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +RD2() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + xorl %e$b, %edi # ^b + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + addl %edi, %e$e # e += (c ^ d ^ b) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +RD3() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$b, %edi # di: b + movl %e$b, %esi # si: b + orl %e$c, %edi # di: b | c + andl %e$c, %esi # si: b & c + andl %e$d, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %e$e # += ((b | c) & d) | (b & c) + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +{ +# Round 1 +RCONST=0x5A827999 +RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; +RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" + PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 
15;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` +INTERLEAVE "$a" "$b" + +# Round 2 +RCONST=0x6ED9EBA1 +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" + PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` +INTERLEAVE "$a" "$b" + +# Round 3 +RCONST=0x8F1BBCDC +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" + PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` +INTERLEAVE "$a" "$b" + +# Round 4 has the same logic as round 2, only n and RCONST are different +RCONST=0xCA62C1D6 +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` +INTERLEAVE "$a" "$b" +RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; +RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; +} | grep -v '^$' + +echo " + popq %rdi # + popq %r12 # + addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # + addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # + addl %ecx, 88(%rdi) # ctx->hash[2] += c +# popq %r15 # + addl %edx, 92(%rdi) # ctx->hash[3] += d + popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbp # + + ret + .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 + .balign 16 +sha1const: + .long 0x5A827999 + .long 0x6ED9EBA1 + .long 0x8F1BBCDC + .long 0xCA62C1D6 + +#endif" diff --git a/libbb/hash_sha256_hwaccel_x86-32.S b/libbb/hash_sha256_hwaccel_x86-32.S new file mode 100644 index 
000000000..a0e4a571a --- /dev/null +++ b/libbb/hash_sha256_hwaccel_x86-32.S @@ -0,0 +1,284 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %eax + +#define SHA256CONSTANTS %ecx + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 + +#define XMMTMP %xmm7 + +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + + movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ + movu128 76+1*16(%eax), STATE1 /* EFGH */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + mova128 STATE1, STATE0 + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ + +/* XMMTMP holds flip mask from here... 
*/ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP + movl $K256+8*16, SHA256CONSTANTS + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP0 + paddd 0*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP1 + paddd 1*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP2 + paddd 2*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb XMMTMP, MSG +/* ...to here */ + mova128 MSG, MSGTMP3 + paddd 3*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 
11*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Write hash values back in the correct order */ + mova128 STATE0, XMMTMP +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ + /* add current hash values to previous ones */ + movu128 76+1*16(%eax), STATE1 + paddd XMMTMP, STATE1 + movu128 STATE1, 76+1*16(%eax) + movu128 76+0*16(%eax), XMMTMP + paddd XMMTMP, STATE0 + movu128 STATE0, 76+0*16(%eax) + + ret + .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif diff --git a/libbb/hash_sha256_hwaccel_x86-64.S b/libbb/hash_sha256_hwaccel_x86-64.S new file mode 100644 index 000000000..172c2eae2 --- /dev/null +++ b/libbb/hash_sha256_hwaccel_x86-64.S @@ -0,0 +1,290 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). 
+// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %rdi + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 + +#define XMMTMP %xmm7 + +#define SAVE0 %xmm8 +#define SAVE1 %xmm9 + +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + + movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ + movu128 80+1*16(%rdi), STATE1 /* EFGH */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + mova128 STATE1, STATE0 + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ + +/* XMMTMP holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP + leaq K256+8*16(%rip), SHA256CONSTANTS + + /* Save hash values for addition after rounds */ + mova128 STATE0, SAVE0 + mova128 STATE1, SAVE1 + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP0 + paddd 0*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP1 + paddd 1*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP2 + paddd 2*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb XMMTMP, MSG +/* ...to here */ + mova128 MSG, MSGTMP3 + paddd 3*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + 
sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 11*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd SAVE0, STATE0 + paddd SAVE1, STATE1 + + /* Write hash values back in the correct order */ + mova128 STATE0, XMMTMP +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ + movu128 STATE0, 80+0*16(%rdi) + movu128 XMMTMP, 80+1*16(%rdi) + + ret + .size 
sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif -- cgit v1.2.3
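
For reference, the scalar SHA-1 logic that hash_sha1_x86-64.S.sh unrolls can be read off the comments above: RD1A/RD1B use (((c ^ d) & b) ^ d), RD2 uses (c ^ d ^ b), RD3 uses ((b | c) & d) | (b & c), and PREP() computes the rolling 16-word schedule W[n & 15] = rol(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1) four lanes at a time. The sketch below is illustrative only (sha1_block_ref and rol32 are not busybox identifiers); it assumes W[] has already been byteswapped to host order, as the movq/bswapq/punpcklqdq prologue does.

#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One 64-byte block; W[0..15] already in host dword order, hash[0..4] = a..e */
static void sha1_block_ref(uint32_t hash[5], uint32_t W[16])
{
	uint32_t a = hash[0], b = hash[1], c = hash[2], d = hash[3], e = hash[4];
	int n;

	for (n = 0; n < 80; n++) {
		uint32_t f, rconst, t;
		if (n >= 16) /* rolling schedule: what PREP() does four words at a time */
			W[n & 15] = rol32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1);
		if (n < 20)      { f = ((c ^ d) & b) ^ d;       rconst = 0x5A827999; } /* RD1A/RD1B */
		else if (n < 40) { f = c ^ d ^ b;               rconst = 0x6ED9EBA1; } /* RD2 */
		else if (n < 60) { f = ((b | c) & d) | (b & c); rconst = 0x8F1BBCDC; } /* RD3 */
		else             { f = c ^ d ^ b;               rconst = 0xCA62C1D6; } /* RD2 again */
		e += rol32(a, 5) + f + W[n & 15] + rconst;
		t = e; e = d; d = c; c = rol32(b, 30); b = a; a = t;
	}
	hash[0] += a; hash[1] += b; hash[2] += c; hash[3] += d; hash[4] += e;
}

int main(void)
{
	/* single padded block for "abc"; expected digest a9993e36...9cd0d89d */
	uint32_t W[16] = { 0x61626380 }; /* remaining words stay zero */
	uint32_t hash[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
	int i;

	W[15] = 24; /* message length in bits */
	sha1_block_ref(hash, W);
	for (i = 0; i < 5; i++)
		printf("%08x", (unsigned)hash[i]);
	printf("\n");
	return 0;
}

The generated assembly differs mainly in scheduling: for rounds 0..15 it passes raw W[] in rsi/r8..r14 and folds RCONST in with lea (RD1A), while for later rounds PREP() adds RCONST with paddd and spills the sums to -64+16*n(%rsp), which is why RD1B/RD2/RD3 read "RCONST + W[n & 15]" from the stack.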
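
The one step in PREP() that is easy to misread is the rotate-left-by-1 of all four lanes: SSE2 has no packed rotate, so the script builds it from pcmpgtd (sign mask), paddd (shift left by 1) and psubd (add 1 back where the old msb was set). A minimal intrinsics sketch of just that idiom, assuming SSE2 and using the made-up helper name rol1_epi32:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* rol(x,1) per 32-bit lane, same trick as PREP():
 *   msb = (0 > x) ? 0xffffffff : 0   -- pcmpgtd
 *   x  += x                          -- paddd (shift left by 1)
 *   x  -= msb                        -- psubd (adds 1 where the old msb was set)
 */
static __m128i rol1_epi32(__m128i x)
{
	__m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
	x = _mm_add_epi32(x, x);
	return _mm_sub_epi32(x, msb);
}

int main(void)
{
	uint32_t in[4] = { 0x80000000u, 0x00000001u, 0xdeadbeefu, 0x7fffffffu };
	uint32_t out[4];
	int i;

	_mm_storeu_si128((__m128i *)out, rol1_epi32(_mm_loadu_si128((const __m128i *)in)));
	for (i = 0; i < 4; i++)
		printf("%08x -> %08x\n", (unsigned)in[i], (unsigned)out[i]);
	return 0;
}

The W[3] fixup that follows in PREP() (pslldq $12, then the pslld/psrld pair) xors rol(unrotated W[0], 2) into lane 3, which is algebraically the same as the scalar "W[3] ^= rol(W[0], 1)" comment once the outer rol-by-1 has already been applied to the whole vector.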
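
Both hash_sha256_hwaccel files rearrange ctx->hash[] with shufps into the FEBA / HGDC dword order used by the sha256rnds2 rounds, relying on the property the comments call out: in AT&T syntax, shufps keeps result dwords 0,1 from the destination register and takes dwords 2,3 from the source, with SHUF(a,b,c,d) packing the four 2-bit selectors into the immediate. A small C model of that shuffle (shufps_emul and the test values are illustrative, not busybox code):

#include <stdint.h>
#include <stdio.h>

/* Same packing as the assembler macro: four 2-bit dword selectors. */
#define SHUF(a,b,c,d) ((a) + ((b) << 2) + ((c) << 4) + ((d) << 6))

/* Model of AT&T "shufps $imm, src, dst": result dwords 0,1 are selected
 * from dst, dwords 2,3 from src. */
static void shufps_emul(uint32_t dst[4], const uint32_t src[4], unsigned imm)
{
	uint32_t r[4];
	int i;

	r[0] = dst[imm & 3];
	r[1] = dst[(imm >> 2) & 3];
	r[2] = src[(imm >> 4) & 3];
	r[3] = src[(imm >> 6) & 3];
	for (i = 0; i < 4; i++)
		dst[i] = r[i];
}

int main(void)
{
	/* dw0..dw3 as loaded from ctx->hash[]: XMMTMP = ABCD, STATE0 = copy of EFGH */
	uint32_t abcd[4] = { 'A', 'B', 'C', 'D' };
	uint32_t efgh[4] = { 'E', 'F', 'G', 'H' };

	shufps_emul(efgh, abcd, SHUF(1,0,1,0));
	printf("%c%c%c%c\n", (int)efgh[0], (int)efgh[1], (int)efgh[2], (int)efgh[3]); /* FEBA */
	return 0;
}

With STATE0 holding a copy of EFGH and XMMTMP holding ABCD, SHUF(1,0,1,0) yields FEBA and SHUF(3,2,3,2) yields HGDC; the mirror shuffles at the end of both routines restore ABCD/EFGH before the results are stored back.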