From f705f549d83d0d97530581e6470eed0490cc942a Mon Sep 17 00:00:00 2001 From: Ben Isaacs <75862+ben-xo@users.noreply.github.com> Date: Thu, 26 Aug 2021 13:25:09 +0100 Subject: Improve the speed of blend8 on AVR by 20-30% The SCALE8_FIXED version of blend8 uses the formula result = (A*(255-amountOfB) + A + B*amountOfB + B) >> 8 However, by rearranging this to result = (256*A + B - A*amountOfB + B*amountOfB) >> 8 We can save 4 or 5 cycles (depending on how the optimiser sets up a and b inputs for that extra cycle) This formula rearrangement may be advantageous for the C implementation too, but I haven't tried that. --- src/lib8tion/math8.h | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/src/lib8tion/math8.h b/src/lib8tion/math8.h index a83b1ad2..fe355e89 100644 --- a/src/lib8tion/math8.h +++ b/src/lib8tion/math8.h @@ -495,6 +495,38 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB) uint16_t partial; uint8_t result; +#if (FASTLED_SCALE8_FIXED == 1) + + // with SCALE8_FIXED, the algorithm above is: + // result = A*(255-amountOfB) + A + B*(amountOfB) + B + + // however, we can rearrange that to: + // result = 256*A + B - A*amountOfB + B*amountOfB + + // 1 or 2 cycles depending on how the compiler optimises + partial = (a << 8) + b; + + // 7 cycles + asm volatile ( + " mul %[a], %[amountOfB] \n\t" + " sub %A[partial], r0 \n\t" + " sbc %B[partial], r1 \n\t" + " mul %[b], %[amountOfB] \n\t" + " add %A[partial], r0 \n\t" + " adc %B[partial], r1 \n\t" + " clr __zero_reg__ \n\t" + : [partial] "+r" (partial) + : [amountOfB] "r" (amountOfB), + [a] "r" (a), + [b] "r" (b) + : "r0", "r1" + ); + +#else + + // non-SCALE8-fixed version + + // 7 cycles asm volatile ( /* partial = b * amountOfB */ " mul %[b], %[amountOfB] \n\t" @@ -510,23 +542,15 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB) " adc %B[partial], r1 \n\t" " clr __zero_reg__ \n\t" - -#if (FASTLED_SCALE8_FIXED == 1) - /* partial += a */ - " add %A[partial], %[a] \n\t" - " adc %B[partial], __zero_reg__ \n\t" - - // partial += b - " add %A[partial], %[b] \n\t" - " adc %B[partial], __zero_reg__ \n\t" -#endif - + : [partial] "=r" (partial), [amountOfB] "+a" (amountOfB) : [a] "a" (a), [b] "a" (b) : "r0", "r1" ); + +#endif result = partial >> 8; -- cgit v1.2.3 From ed0a7edd2f89b80c84a950b9b046f40a96737034 Mon Sep 17 00:00:00 2001 From: Ben Isaacs <75862+ben-xo@users.noreply.github.com> Date: Thu, 26 Aug 2021 17:19:39 +0100 Subject: Using an | instead of + is 4 instructions faster. --- src/lib8tion/math8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib8tion/math8.h b/src/lib8tion/math8.h index fe355e89..61b12e6a 100644 --- a/src/lib8tion/math8.h +++ b/src/lib8tion/math8.h @@ -504,7 +504,7 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB) // result = 256*A + B - A*amountOfB + B*amountOfB // 1 or 2 cycles depending on how the compiler optimises - partial = (a << 8) + b; + partial = (a << 8) | b; // 7 cycles asm volatile ( -- cgit v1.2.3 From ca59a45a2be94c5bc1f8067b538cbced3ba049d1 Mon Sep 17 00:00:00 2001 From: Ben Isaacs <75862+ben-xo@users.noreply.github.com> Date: Fri, 27 Aug 2021 12:20:14 +0100 Subject: Refactor blend8() to apply the same optimisations on all platforms (in BLEND_C). As noted on https://github.com/FastLED/FastLED/pull/1288, many platforms can eliminate an entire multiplication this way. --- src/lib8tion/math8.h | 61 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/src/lib8tion/math8.h b/src/lib8tion/math8.h index 61b12e6a..f95697bd 100644 --- a/src/lib8tion/math8.h +++ b/src/lib8tion/math8.h @@ -469,39 +469,52 @@ LIB8STATIC uint8_t sqrt16(uint16_t x) #if (FASTLED_BLEND_FIXED == 1) LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB) { -#if BLEND8_C == 1 + + // The BLEND_FIXED formula is + // + // result = ( A*(amountOfA) + B*(amountOfB) )/ 256 + // + // …where amountOfA = 255-amountOfB. + // + // This formula will never return 255, which is why the BLEND_FIXED + SCALE8_FIXED version is + // + // result = ( A*(amountOfA) + A + B*(amountOfB) + B ) / 256 + // + // We can rearrange this formula for some great optimisations. + // + // result = ( A*(amountOfA) + A + B*(amountOfB) + B ) / 256 + // = ( A*(255-amountOfB) + A + B*(amountOfB) + B ) / 256 + // = ( A*(256-amountOfB) + B*(amountOfB) + B ) / 256 + // = ( A*256 + B + B*(amountOfB) - A*(amountOfB) ) / 256 // this is the version used in SCALE8_FIXED AVR below + // = ( A*256 + B + (B-A)*(amountOfB) ) / 256 // this is the version used in SCALE8_FIXED C below + uint16_t partial; uint8_t result; - + +#if BLEND8_C == 1 + +# if (FASTLED_SCALE8_FIXED == 1) + partial = (a << 8) | b; // A*256 + B + + // on many platforms this compiles to a single multiply of (B-A) * amountOfB + partial += (b * amountOfB); + partial -= (a * amountOfB); + +# else uint8_t amountOfA = 255 - amountOfB; - + + // on the other hand, this compiles to two multiplies, and gives the "wrong" answer :] partial = (a * amountOfA); -#if (FASTLED_SCALE8_FIXED == 1) - partial += a; - //partial = add8to16( a, partial); -#endif - partial += (b * amountOfB); -#if (FASTLED_SCALE8_FIXED == 1) - partial += b; - //partial = add8to16( b, partial); -#endif +# endif result = partial >> 8; return result; #elif BLEND8_AVRASM == 1 - uint16_t partial; - uint8_t result; - -#if (FASTLED_SCALE8_FIXED == 1) - - // with SCALE8_FIXED, the algorithm above is: - // result = A*(255-amountOfB) + A + B*(amountOfB) + B - // however, we can rearrange that to: - // result = 256*A + B - A*amountOfB + B*amountOfB +# if (FASTLED_SCALE8_FIXED == 1) // 1 or 2 cycles depending on how the compiler optimises partial = (a << 8) | b; @@ -522,7 +535,7 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB) : "r0", "r1" ); -#else +# else // non-SCALE8-fixed version @@ -550,14 +563,14 @@ LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB) : "r0", "r1" ); -#endif +# endif result = partial >> 8; return result; #else -#error "No implementation for blend8 available." +# error "No implementation for blend8 available." #endif } -- cgit v1.2.3