diff options
-rw-r--r--  lib8tion/scale8.h | 18
-rw-r--r--  noise.cpp         | 40
2 files changed, 54 insertions, 4 deletions
diff --git a/lib8tion/scale8.h b/lib8tion/scale8.h index 239e9dea..e6bdeefd 100644 --- a/lib8tion/scale8.h +++ b/lib8tion/scale8.h @@ -388,8 +388,7 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale ) result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536; return result; #elif SCALE16_AVRASM == 1 - uint32_t result = 0; - const uint8_t zero = 0; + uint32_t result; asm volatile( // result.A-B = i.A x scale.A " mul %A[i], %A[scale] \n\t" @@ -406,12 +405,26 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale ) // well, in case we want to use this code for // a generic 16x16 multiply somewhere. + : [result] "=r" (result) + : [i] "r" (i), + [scale] "r" (scale) + : "r0", "r1" + ); + + asm volatile( // result.C-D = i.B x scale.B " mul %B[i], %B[scale] \n\t" //" mov %C[result], r0 \n\t" //" mov %D[result], r1 \n\t" " movw %C[result], r0 \n\t" + : [result] "+r" (result) + : [i] "r" (i), + [scale] "r" (scale) + : "r0", "r1" + ); + const uint8_t zero = 0; + asm volatile( // result.B-D += i.B x scale.A " mul %B[i], %A[scale] \n\t" @@ -435,6 +448,7 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale ) [zero] "r" (zero) : "r0", "r1" ); + result = result >> 16; return result; #else @@ -24,8 +24,32 @@ FL_PROGMEM static uint8_t const p[] = { 151,160,137,91,90,15, #if FASTLED_NOISE_ALLOW_AVERAGE_TO_OVERFLOW == 1 #define AVG15(U,V) (((U)+(V)) >> 1) #else +// See if we should use the inlined avg15 for AVR with MUL instruction +#if defined(__AVR__) && (LIB8_ATTINY == 0) +#define AVG15(U,V) (avg15_inline_avr_mul((U),(V))) +// inlined copy of avg15 for AVR with MUL instruction; cloned from math8.h +// Forcing this inline in the 3-D 16bit noise produces a 12% speedup overall, +// at a cost of just +8 bytes of net code size. 
+static int16_t inline __attribute__((always_inline)) avg15_inline_avr_mul( int16_t i, int16_t j) +{ + asm volatile( + /* first divide j by 2, throwing away lowest bit */ + "asr %B[j] \n\t" + "ror %A[j] \n\t" + /* now divide i by 2, with lowest bit going into C */ + "asr %B[i] \n\t" + "ror %A[i] \n\t" + /* add j + C to i */ + "adc %A[i], %A[j] \n\t" + "adc %B[i], %B[j] \n\t" + : [i] "+a" (i) + : [j] "a" (j) ); + return i; +} +#else #define AVG15(U,V) (avg15((U),(V))) #endif +#endif // // #define FADE_12 @@ -297,7 +321,13 @@ uint16_t inoise16(uint32_t x, uint32_t y, uint32_t z) { int32_t ans = inoise16_raw(x,y,z); ans = ans + 19052L; uint32_t pan = ans; - return (pan*220L)>>7; + // pan = (ans * 220L) >> 7. That's the same as: + // pan = (ans * 440L) >> 8. And this way avoids a 7X four-byte shift-loop on AVR. + // Identical math, except for the highest bit, which we don't care about anyway, + // since we're returning the 'middle' 16 out of a 32-bit value anyway. + pan *= 440L; + return (pan>>8); + // // return scale16by8(pan,220)<<1; // return ((inoise16_raw(x,y,z)+19052)*220)>>7; // return scale16by8(inoise16_raw(x,y,z)+19052,220)<<1; @@ -340,7 +370,13 @@ uint16_t inoise16(uint32_t x, uint32_t y) { int32_t ans = inoise16_raw(x,y); ans = ans + 17308L; uint32_t pan = ans; - return (pan*242L)>>7; + // pan = (ans * 242L) >> 7. That's the same as: + // pan = (ans * 484L) >> 8. And this way avoids a 7X four-byte shift-loop on AVR. + // Identical math, except for the highest bit, which we don't care about anyway, + // since we're returning the 'middle' 16 out of a 32-bit value anyway. + pan *= 484L; + return (pan>>8); + // return (uint32_t)(((int32_t)inoise16_raw(x,y)+(uint32_t)17308)*242)>>7; // return scale16by8(inoise16_raw(x,y)+17308,242)<<1; } |