Improve the speed of blend8 on AVR by 20-30%

The SCALE8_FIXED version of blend8 uses the formula: result = (A*(255-amountOfB) + A + B*amountOfB + B) >> 8. However, by rearranging this to: result = (256*A + B - A*amountOfB + B*amountOfB) >> 8, we can save 4 or 5 cycles (4 or 5 depending on how the optimiser sets up the a and b inputs). This rearrangement of the formula may be advantageous for the C implementation too, but I haven't tried that.
author: Ben Isaacs <75862+ben-xo@users.noreply.github.com> 2021-08-26 15:25:09 +0300
committer: Ben Isaacs <75862+ben-xo@users.noreply.github.com> 2021-08-26 16:11:46 +0300
commit: f705f549d83d0d97530581e6470eed0490cc942a (patch)
tree: 9bd3e92fd7d9b2476a60ffbf78d46675c25c3caf
parent: 5cc17b2be88982eb34b1998de22b83fee56d4f07 (diff)
1 files changed, 35 insertions, 11 deletions
diff --git a/src/lib8tion/math8.h b/src/lib8tion/math8.h
index a83b1ad2..fe355e89 100644
--- a/src/lib8tion/math8.h
+++ b/src/lib8tion/math8.h
@@ -495,6 +495,38 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
     uint16_t partial;
     uint8_t result;
 
+#if (FASTLED_SCALE8_FIXED == 1)
+
+    // with SCALE8_FIXED, the algorithm above is:
+    // result = A*(255-amountOfB) + A + B*(amountOfB) + B
+
+    // however, we can rearrange that to:
+    // result = 256*A + B - A*amountOfB + B*amountOfB
+
+    // 1 or 2 cycles depending on how the compiler optimises
+    partial = (a << 8) + b;
+
+    // 7 cycles
+    asm volatile (
+        "  mul %[a], %[amountOfB]        \n\t"
+        "  sub %A[partial], r0           \n\t"
+        "  sbc %B[partial], r1           \n\t"
+        "  mul %[b], %[amountOfB]        \n\t"
+        "  add %A[partial], r0           \n\t"
+        "  adc %B[partial], r1           \n\t"
+        "  clr __zero_reg__              \n\t"
+        : [partial] "+r" (partial)
+        : [amountOfB] "r" (amountOfB),
+          [a] "r" (a),
+          [b] "r" (b)
+        : "r0", "r1"
+    );
+
+#else
+
+    // non-SCALE8-fixed version
+
+    // 7 cycles
     asm volatile (
         /* partial = b * amountOfB */
         "  mul %[b], %[amountOfB]        \n\t"
@@ -510,23 +542,15 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
         "  adc %B[partial], r1           \n\t"
                   
         "  clr __zero_reg__              \n\t"
-                  
-#if (FASTLED_SCALE8_FIXED == 1)
-        /* partial += a */
-        "  add %A[partial], %[a]         \n\t"
-        "  adc %B[partial], __zero_reg__ \n\t"
-                  
-        // partial += b
-        "  add %A[partial], %[b]         \n\t"
-        "  adc %B[partial], __zero_reg__ \n\t"
-#endif
-                  
+                        
         : [partial] "=r" (partial),
           [amountOfB] "+a" (amountOfB)
         : [a] "a" (a),
           [b] "a" (b)
         : "r0", "r1"
     );
+
+#endif
     
     result = partial >> 8;
author	Ben Isaacs <75862+ben-xo@users.noreply.github.com>	2021-08-26 15:25:09 +0300
committer	Ben Isaacs <75862+ben-xo@users.noreply.github.com>	2021-08-26 16:11:46 +0300
commit	f705f549d83d0d97530581e6470eed0490cc942a (patch)
tree	9bd3e92fd7d9b2476a60ffbf78d46675c25c3caf
parent	5cc17b2be88982eb34b1998de22b83fee56d4f07 (diff)