From f705f549d83d0d97530581e6470eed0490cc942a Mon Sep 17 00:00:00 2001
From: Ben Isaacs <75862+ben-xo@users.noreply.github.com>
Date: Thu, 26 Aug 2021 13:25:09 +0100
Subject: Improve the speed of blend8 on AVR by 20-30%

The SCALE8_FIXED version of blend8 uses the formula

  result = (A*(255-amountOfB) + A + B*amountOfB + B) >> 8

However, by rearranging this to

  result = (256*A + B - A*amountOfB + B*amountOfB) >> 8

We can save 4 or 5 cycles (depending on how the optimiser sets up a and b inputs for that extra cycle)

This formula rearrangement may be advantageous for the C implementation too, but I haven't tried that.
---
 src/lib8tion/math8.h | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/src/lib8tion/math8.h b/src/lib8tion/math8.h
index a83b1ad2..fe355e89 100644
--- a/src/lib8tion/math8.h
+++ b/src/lib8tion/math8.h
@@ -495,6 +495,38 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
     uint16_t partial;
     uint8_t result;
 
+#if (FASTLED_SCALE8_FIXED == 1)
+
+    // with SCALE8_FIXED, the algorithm above is:
+    // result = A*(255-amountOfB) + A + B*(amountOfB) + B
+
+    // however, we can rearrange that to:
+    // result = 256*A + B - A*amountOfB + B*amountOfB
+
+    // 1 or 2 cycles depending on how the compiler optimises
+    partial = (a << 8) + b;
+
+    // 7 cycles
+    asm volatile (
+        "  mul %[a], %[amountOfB]        \n\t"
+        "  sub %A[partial], r0           \n\t"
+        "  sbc %B[partial], r1           \n\t"
+        "  mul %[b], %[amountOfB]        \n\t"
+        "  add %A[partial], r0           \n\t"
+        "  adc %B[partial], r1           \n\t"
+        "  clr __zero_reg__              \n\t"
+        : [partial] "+r" (partial)
+        : [amountOfB] "r" (amountOfB),
+          [a] "r" (a),
+          [b] "r" (b)
+        : "r0", "r1"
+    );
+
+#else
+
+    // non-SCALE8-fixed version
+
+    // 7 cycles
     asm volatile (
         /* partial = b * amountOfB */
         "  mul %[b], %[amountOfB]        \n\t"
@@ -510,23 +542,15 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
         "  adc %B[partial], r1           \n\t"
                   
         "  clr __zero_reg__              \n\t"
-                  
-#if (FASTLED_SCALE8_FIXED == 1)
-        /* partial += a */
-        "  add %A[partial], %[a]         \n\t"
-        "  adc %B[partial], __zero_reg__ \n\t"
-                  
-        // partial += b
-        "  add %A[partial], %[b]         \n\t"
-        "  adc %B[partial], __zero_reg__ \n\t"
-#endif
-                  
+                        
         : [partial] "=r" (partial),
           [amountOfB] "+a" (amountOfB)
         : [a] "a" (a),
           [b] "a" (b)
         : "r0", "r1"
     );
+
+#endif
     
     result = partial >> 8;
     
-- 
cgit v1.2.3


From ed0a7edd2f89b80c84a950b9b046f40a96737034 Mon Sep 17 00:00:00 2001
From: Ben Isaacs <75862+ben-xo@users.noreply.github.com>
Date: Thu, 26 Aug 2021 17:19:39 +0100
Subject: Using an | instead of + is 4 instructions faster.

---
 src/lib8tion/math8.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib8tion/math8.h b/src/lib8tion/math8.h
index fe355e89..61b12e6a 100644
--- a/src/lib8tion/math8.h
+++ b/src/lib8tion/math8.h
@@ -504,7 +504,7 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
     // result = 256*A + B - A*amountOfB + B*amountOfB
 
     // 1 or 2 cycles depending on how the compiler optimises
-    partial = (a << 8) + b;
+    partial = (a << 8) | b;
 
     // 7 cycles
     asm volatile (
-- 
cgit v1.2.3


From ca59a45a2be94c5bc1f8067b538cbced3ba049d1 Mon Sep 17 00:00:00 2001
From: Ben Isaacs <75862+ben-xo@users.noreply.github.com>
Date: Fri, 27 Aug 2021 12:20:14 +0100
Subject: Refactor blend8() to apply the same optimisations on all platforms
 (in BLEND_C).

As noted on https://github.com/FastLED/FastLED/pull/1288, many platforms can eliminate an entire multiplication this way.
---
 src/lib8tion/math8.h | 61 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/src/lib8tion/math8.h b/src/lib8tion/math8.h
index 61b12e6a..f95697bd 100644
--- a/src/lib8tion/math8.h
+++ b/src/lib8tion/math8.h
@@ -469,39 +469,52 @@ LIB8STATIC uint8_t sqrt16(uint16_t x)
 #if (FASTLED_BLEND_FIXED == 1)
 LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
 {
-#if BLEND8_C == 1
+
+    // The BLEND_FIXED formula is
+    //
+    //   result = (  A*(amountOfA) + B*(amountOfB)              )/ 256
+    //
+    // …where amountOfA = 255-amountOfB.
+    //
+    // This formula will never return 255, which is why the BLEND_FIXED + SCALE8_FIXED version is
+    //
+    //   result = (  A*(amountOfA) + A + B*(amountOfB) + B      ) / 256
+    //
+    // We can rearrange this formula for some great optimisations.
+    //
+    //   result = (  A*(amountOfA) + A + B*(amountOfB) + B      ) / 256
+    //          = (  A*(255-amountOfB) + A + B*(amountOfB) + B  ) / 256
+    //          = (  A*(256-amountOfB) + B*(amountOfB) + B      ) / 256
+    //          = (  A*256 + B + B*(amountOfB) - A*(amountOfB)  ) / 256  // this is the version used in SCALE8_FIXED AVR below
+    //          = (  A*256 + B + (B-A)*(amountOfB)              ) / 256  // this is the version used in SCALE8_FIXED C below
+
     uint16_t partial;
     uint8_t result;
-    
+
+#if BLEND8_C == 1
+
+#   if (FASTLED_SCALE8_FIXED == 1)
+    partial = (a << 8) | b; // A*256 + B
+
+    // on many platforms this compiles to a single multiply of (B-A) * amountOfB
+    partial += (b * amountOfB);
+    partial -= (a * amountOfB);
+
+#   else
     uint8_t amountOfA = 255 - amountOfB;
-    
+
+    // on the other hand, this compiles to two multiplies, and gives the "wrong" answer :]
     partial = (a * amountOfA);
-#if (FASTLED_SCALE8_FIXED == 1)
-    partial += a;
-    //partial = add8to16( a, partial);
-#endif
-    
     partial += (b * amountOfB);
-#if (FASTLED_SCALE8_FIXED == 1)
-    partial += b;
-    //partial = add8to16( b, partial);
-#endif
+#   endif
     
     result = partial >> 8;
     
     return result;
 
 #elif BLEND8_AVRASM == 1
-    uint16_t partial;
-    uint8_t result;
-
-#if (FASTLED_SCALE8_FIXED == 1)
-
-    // with SCALE8_FIXED, the algorithm above is:
-    // result = A*(255-amountOfB) + A + B*(amountOfB) + B
 
-    // however, we can rearrange that to:
-    // result = 256*A + B - A*amountOfB + B*amountOfB
+#   if (FASTLED_SCALE8_FIXED == 1)
 
     // 1 or 2 cycles depending on how the compiler optimises
     partial = (a << 8) | b;
@@ -522,7 +535,7 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
         : "r0", "r1"
     );
 
-#else
+#   else
 
     // non-SCALE8-fixed version
 
@@ -550,14 +563,14 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
         : "r0", "r1"
     );
 
-#endif
+#   endif
     
     result = partial >> 8;
     
     return result;
     
 #else
-#error "No implementation for blend8 available."
+#   error "No implementation for blend8 available."
 #endif
 }
 
-- 
cgit v1.2.3