block-sha1: split the different "hacks" to be individually selected

This is to make it easier for them to be selected individually depending on the architecture instead of the other way around, i.e. having each architecture select a list of hacks up front. That makes for clearer documentation as well.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
author: Nicolas Pitre <nico@cam.org> 2009-08-12 23:46:41 +0400
committer: Junio C Hamano <gitster@pobox.com> 2009-08-13 00:35:54 +0400
commit: dc52fd29738c2af98f3e986691eca34addfd4914 (patch)
tree: 89cb72a4bdaa2a152022e1eece078c5a07558fb5 /block-sha1
parent: 30ba0de726d92ccfc93009eb60f2c30b0886f61b (diff)
1 files changed, 18 insertions, 5 deletions
diff --git a/block-sha1/sha1.c b/block-sha1/sha1.c
index c3f1ae59b9..67c9bd0723 100644
--- a/block-sha1/sha1.c
+++ b/block-sha1/sha1.c
@@ -11,10 +11,16 @@
 
 #if defined(__i386__) || defined(__x86_64__)
 
+/*
+ * Force usage of rol or ror by selecting the one with the smaller constant.
+ * It _can_ generate slightly smaller code (a constant of 1 is special), but
+ * perhaps more importantly it's possibly faster on any uarch that does a
+ * rotate with a loop.
+ */
+
 #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; })
 #define SHA_ROL(x,n)	SHA_ASM("rol", x, n)
 #define SHA_ROR(x,n)	SHA_ASM("ror", x, n)
-#define SMALL_REGISTER_SET
 
 #else
 
@@ -24,9 +30,6 @@
 
 #endif
 
-/* This "rolls" over the 512-bit array */
-#define W(x) (array[(x)&15])
-
 /*
  * If you have 32 registers or more, the compiler can (and should)
  * try to change the array[] accesses into registers. However, on
@@ -43,13 +46,23 @@
  * Ben Herrenschmidt reports that on PPC, the C version comes close
  * to the optimized asm with this (ie on PPC you don't want that
  * 'volatile', since there are lots of registers).
+ *
+ * On ARM we get the best code generation by forcing a full memory barrier
+ * between each SHA_ROUND, otherwise gcc happily goes wild with spilling and
+ * the stack frame size simply explodes and performance goes down the drain.
  */
-#ifdef SMALL_REGISTER_SET
+
+#if defined(__i386__) || defined(__x86_64__)
   #define setW(x, val) (*(volatile unsigned int *)&W(x) = (val))
+#elif defined(__arm__)
+  #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0)
 #else
   #define setW(x, val) (W(x) = (val))
 #endif
 
+/* This "rolls" over the 512-bit array */
+#define W(x) (array[(x)&15])
+
 /*
  * Where do we get the source from? The first 16 iterations get it from
  * the input data, the next mix it from the 512-bit array.
author	Nicolas Pitre <nico@cam.org>	2009-08-12 23:46:41 +0400
committer	Junio C Hamano <gitster@pobox.com>	2009-08-13 00:35:54 +0400
commit	dc52fd29738c2af98f3e986691eca34addfd4914 (patch)
tree	89cb72a4bdaa2a152022e1eece078c5a07558fb5 /block-sha1
parent	30ba0de726d92ccfc93009eb60f2c30b0886f61b (diff)