diff options
-rw-r--r--  lib8tion/scale8.h | 18
-rw-r--r--  noise.cpp         | 40
2 files changed, 54 insertions, 4 deletions
diff --git a/lib8tion/scale8.h b/lib8tion/scale8.h index 239e9dea..e6bdeefd 100644 --- a/lib8tion/scale8.h +++ b/lib8tion/scale8.h @@ -388,8 +388,7 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale ) result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536; return result; #elif SCALE16_AVRASM == 1 - uint32_t result = 0; - const uint8_t zero = 0; + uint32_t result; asm volatile( // result.A-B = i.A x scale.A " mul %A[i], %A[scale] \n\t" @@ -406,12 +405,26 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale ) // well, in case we want to use this code for // a generic 16x16 multiply somewhere. + : [result] "=r" (result) + : [i] "r" (i), + [scale] "r" (scale) + : "r0", "r1" + ); + + asm volatile( // result.C-D = i.B x scale.B " mul %B[i], %B[scale] \n\t" //" mov %C[result], r0 \n\t" //" mov %D[result], r1 \n\t" " movw %C[result], r0 \n\t" + : [result] "+r" (result) + : [i] "r" (i), + [scale] "r" (scale) + : "r0", "r1" + ); + const uint8_t zero = 0; + asm volatile( // result.B-D += i.B x scale.A " mul %B[i], %A[scale] \n\t" @@ -435,6 +448,7 @@ LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale ) [zero] "r" (zero) : "r0", "r1" ); + result = result >> 16; return result; #else @@ -24,8 +24,32 @@ FL_PROGMEM static uint8_t const p[] = { 151,160,137,91,90,15, #if FASTLED_NOISE_ALLOW_AVERAGE_TO_OVERFLOW == 1 #define AVG15(U,V) (((U)+(V)) >> 1) #else +// See if we should use the inlined avg15 for AVR with MUL instruction +#if defined(__AVR__) && (LIB8_ATTINY == 0) +#define AVG15(U,V) (avg15_inline_avr_mul((U),(V))) +// inlined copy of avg15 for AVR with MUL instruction; cloned from math8.h +// Forcing this inline in the 3-D 16bit noise produces a 12% speedup overall, +// at a cost of just +8 bytes of net code size. 
+static int16_t inline __attribute__((always_inline)) avg15_inline_avr_mul( int16_t i, int16_t j) +{ + asm volatile( + /* first divide j by 2, throwing away lowest bit */ + "asr %B[j] \n\t" + "ror %A[j] \n\t" + /* now divide i by 2, with lowest bit going into C */ + "asr %B[i] \n\t" + "ror %A[i] \n\t" + /* add j + C to i */ + "adc %A[i], %A[j] \n\t" + "adc %B[i], %B[j] \n\t" + : [i] "+a" (i) + : [j] "a" (j) ); + return i; +} +#else #define AVG15(U,V) (avg15((U),(V))) #endif +#endif // // #define FADE_12 @@ -297,7 +321,13 @@ uint16_t inoise16(uint32_t x, uint32_t y, uint32_t z) { int32_t ans = inoise16_raw(x,y,z); ans = ans + 19052L; uint32_t pan = ans; - return (pan*220L)>>7; + // pan = (ans * 220L) >> 7. That's the same as: + // pan = (ans * 440L) >> 8. And this way avoids a 7X four-byte shift-loop on AVR. + // Identical math, except for the highest bit, which we don't care about anyway, + // since we're returning the 'middle' 16 out of a 32-bit value anyway. + pan *= 440L; + return (pan>>8); + // // return scale16by8(pan,220)<<1; // return ((inoise16_raw(x,y,z)+19052)*220)>>7; // return scale16by8(inoise16_raw(x,y,z)+19052,220)<<1; @@ -340,7 +370,13 @@ uint16_t inoise16(uint32_t x, uint32_t y) { int32_t ans = inoise16_raw(x,y); ans = ans + 17308L; uint32_t pan = ans; - return (pan*242L)>>7; + // pan = (ans * 242L) >> 7. That's the same as: + // pan = (ans * 484L) >> 8. And this way avoids a 7X four-byte shift-loop on AVR. + // Identical math, except for the highest bit, which we don't care about anyway, + // since we're returning the 'middle' 16 out of a 32-bit value anyway. + pan *= 484L; + return (pan>>8); + // return (uint32_t)(((int32_t)inoise16_raw(x,y)+(uint32_t)17308)*242)>>7; // return scale16by8(inoise16_raw(x,y)+17308,242)<<1; } |