6 files changed, 1222 insertions, 1094 deletions
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 71d7b104..52b0e98b 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -753,7 +753,7 @@ WARN_LOGFILE           =
 # spaces.
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  =
+INPUT                  = . lib8tion
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/lib8tion.h b/lib8tion.h
index 87bd8bda..4874aa93 100644
--- a/lib8tion.h
+++ b/lib8tion.h
@@ -288,6 +288,10 @@ Lib8tion is pronounced like 'libation': lie-BAY-shun
 
 #endif
 
+///@defgroup lib8tion Fast math functions
+///A variety of functions for working with numbers.
+///@{
+
 
 ///////////////////////////////////////////////////////////////////////
 //
@@ -356,724 +360,17 @@ typedef union {
     };
 } IEEE754binary32_t;
 
-
+#include "lib8tion/math8.h"
+#include "lib8tion/scale8.h"
+#include "lib8tion/random8.h"
+#include "lib8tion/trig8.h"
 
 ///////////////////////////////////////////////////////////////////////
 
-// qadd8: add one byte to another, saturating at 0xFF
-LIB8STATIC uint8_t qadd8( uint8_t i, uint8_t j)
-{
-#if QADD8_C == 1
-    unsigned int t = i + j;
-    if( t > 255) t = 255;
-    return t;
-#elif QADD8_AVRASM == 1
-    asm volatile(
-         /* First, add j to i, conditioning the C flag */
-         "add %0, %1    \n\t"
-
-         /* Now test the C flag.
-           If C is clear, we branch around a load of 0xFF into i.
-           If C is set, we go ahead and load 0xFF into i.
-         */
-         "brcc L_%=     \n\t"
-         "ldi %0, 0xFF  \n\t"
-         "L_%=: "
-         : "+a" (i)
-         : "a"  (j) );
-    return i;
-#elif QADD8_ARM_DSP_ASM == 1
-    asm volatile( "uqadd8 %0, %0, %1" : "+r" (i) : "r" (j));
-    return i;
-#else
-#error "No implementation for qadd8 available."
-#endif
-}
-
-
-// qadd7: add one signed byte to another,
-//        saturating at 0x7F.
-LIB8STATIC int8_t qadd7( int8_t i, int8_t j)
-{
-#if QADD7_C == 1
-    int16_t t = i + j;
-    if( t > 127) t = 127;
-    return t;
-#elif QADD7_AVRASM == 1
-    asm volatile(
-         /* First, add j to i, conditioning the V flag */
-         "add %0, %1    \n\t"
-
-         /* Now test the V flag.
-          If V is clear, we branch around a load of 0x7F into i.
-          If V is set, we go ahead and load 0x7F into i.
-          */
-         "brvc L_%=     \n\t"
-         "ldi %0, 0x7F  \n\t"
-         "L_%=: "
-         : "+a" (i)
-         : "a"  (j) );
-
-    return i;
-#elif QADD7_ARM_DSP_ASM == 1
-    asm volatile( "qadd8 %0, %0, %1" : "+r" (i) : "r" (j));
-    return i;
-#else
-#error "No implementation for qadd7 available."
-#endif
-}
-
-// qsub8: subtract one byte from another, saturating at 0x00
-LIB8STATIC uint8_t qsub8( uint8_t i, uint8_t j)
-{
-#if QSUB8_C == 1
-    int t = i - j;
-    if( t < 0) t = 0;
-    return t;
-#elif QSUB8_AVRASM == 1
-
-    asm volatile(
-         /* First, subtract j from i, conditioning the C flag */
-         "sub %0, %1    \n\t"
-
-         /* Now test the C flag.
-          If C is clear, we branch around a load of 0x00 into i.
-          If C is set, we go ahead and load 0x00 into i.
-          */
-         "brcc L_%=     \n\t"
-         "ldi %0, 0x00  \n\t"
-         "L_%=: "
-         : "+a" (i)
-         : "a"  (j) );
-
-    return i;
-#else
-#error "No implementation for qsub8 available."
-#endif
-}
-
-// add8: add one byte to another, with one byte result
-LIB8STATIC uint8_t add8( uint8_t i, uint8_t j)
-{
-#if ADD8_C == 1
-    int t = i + j;
-    return t;
-#elif ADD8_AVRASM == 1
-    // Add j to i, period.
-    asm volatile( "add %0, %1" : "+a" (i) : "a" (j));
-    return i;
-#else
-#error "No implementation for add8 available."
-#endif
-}
-
-
-// sub8: subtract one byte from another, 8-bit result
-LIB8STATIC uint8_t sub8( uint8_t i, uint8_t j)
-{
-#if SUB8_C == 1
-    int t = i - j;
-    return t;
-#elif SUB8_AVRASM == 1
-    // Subtract j from i, period.
-    asm volatile( "sub %0, %1" : "+a" (i) : "a" (j));
-    return i;
-#else
-#error "No implementation for sub8 available."
-#endif
-}
-
-// avg8: Calculate an integer average of two unsigned
-//       8-bit integer values (uint8_t).
-//       Fractional results are rounded down, e.g. avg8(20,41) = 30
-LIB8STATIC uint8_t avg8( uint8_t i, uint8_t j)
-{
-#if AVG8_C == 1
-    return (i + j) >> 1;
-#elif AVG8_AVRASM == 1
-    asm volatile(
-         /* First, add j to i, 9th bit overflows into C flag */
-         "add %0, %1    \n\t"
-         /* Divide by two, moving C flag into high 8th bit */
-         "ror %0        \n\t"
-         : "+a" (i)
-         : "a"  (j) );
-    return i;
-#else
-#error "No implementation for avg8 available."
-#endif
-}
-
-
-// avg7: Calculate an integer average of two signed 7-bit
-//       integers (int8_t)
-//       If the first argument is even, result is rounded down.
-//       If the first argument is odd, result is result up.
-LIB8STATIC int8_t avg7( int8_t i, int8_t j)
-{
-#if AVG7_C == 1
-    return ((i + j) >> 1) + (i & 0x1);
-#elif AVG7_AVRASM == 1
-    asm volatile(
-                 "asr %1        \n\t"
-                 "asr %0        \n\t"
-                 "adc %0, %1    \n\t"
-                 : "+a" (i)
-                 : "a"  (j) );
-    return i;
-#else
-#error "No implementation for avg7 available."
-#endif
-}
-
-// mod8: Calculate the remainder of one unsigned 8-bit
-//       value divided by anoter, aka A % M.
-//       Implemented by repeated subtraction, which is
-//       very compact, and very fast if A is 'probably'
-//       less than M.  If A is a large multiple of M,
-//       the loop has to execute multiple times.  However,
-//       even in that case, the loop is only two
-//       instructions long on AVR, i.e., quick.
-LIB8STATIC uint8_t mod8( uint8_t a, uint8_t m)
-{
-#if defined(__AVR__)
-    asm volatile (
-                  "L_%=:  sub %[a],%[m]    \n\t"
-                  "       brcc L_%=        \n\t"
-                  "       add %[a],%[m]    \n\t"
-                  : [a] "+r" (a)
-                  : [m] "r"  (m)
-                  );
-#else
-    while( a >= m) a -= m;
-#endif
-    return a;
-}
-
-// addmod8: Add two numbers, and calculate the modulo
-//          of the sum and a third number, M.
-//          In other words, it returns (A+B) % M.
-//          It is designed as a compact mechanism for
-//          incrementing a 'mode' switch and wrapping
-//          around back to 'mode 0' when the switch
-//          goes past the end of the available range.
-//          e.g. if you have seven modes, this switches
-//          to the next one and wraps around if needed:
-//            mode = addmod8( mode, 1, 7);
-//          See 'mod8' for notes on performance.
-LIB8STATIC uint8_t addmod8( uint8_t a, uint8_t b, uint8_t m)
-{
-#if defined(__AVR__)
-    asm volatile (
-                  "       add %[a],%[b]    \n\t"
-                  "L_%=:  sub %[a],%[m]    \n\t"
-                  "       brcc L_%=        \n\t"
-                  "       add %[a],%[m]    \n\t"
-                  : [a] "+r" (a)
-                  : [b] "r"  (b), [m] "r" (m)
-                  );
-#else
-    a += b;
-    while( a >= m) a -= m;
-#endif
-    return a;
-}
-
-
-// scale8: scale one byte by a second one, which is treated as
-//         the numerator of a fraction whose denominator is 256
-//         In other words, it computes i * (scale / 256)
-//         4 clocks AVR with MUL, 2 clocks ARM
-LIB8STATIC uint8_t scale8( uint8_t i, fract8 scale)
-{
-#if SCALE8_C == 1
-    return ((uint16_t)i * (uint16_t)(scale) ) >> 8;
-#elif SCALE8_AVRASM == 1
-#if defined(LIB8_ATTINY)
-    uint8_t work=0;
-    uint8_t cnt=0x80;
-    asm volatile(
-        "LOOP_%=:                             \n\t"
-        /*"  sbrc %[scale], 0             \n\t"
-        "  add %[work], %[i]            \n\t"
-        "  ror %[work]                  \n\t"
-        "  lsr %[scale]                 \n\t"
-        "  clc                          \n\t"*/
-        "  sbrc %[scale], 0             \n\t"
-        "  add %[work], %[i]            \n\t"
-        "  ror %[work]                  \n\t"
-        "  lsr %[scale]                 \n\t"
-        "  lsr %[cnt]                   \n\t"
-        "brcc LOOP_%="
-        : [work] "+r" (work), [cnt] "+r" (cnt)
-        : [scale] "r" (scale), [i] "r" (i)
-        :
-      );
-    return work;
-#else
-    asm volatile(
-         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
-         "mul %0, %1          \n\t"
-         /* Move the high 8-bits of the product (r1) back to i */
-         "mov %0, r1          \n\t"
-         /* Restore r1 to "0"; it's expected to always be that */
-         "clr __zero_reg__    \n\t"
-
-         : "+a" (i)      /* writes to i */
-         : "a"  (scale)  /* uses scale */
-         : "r0", "r1"    /* clobbers r0, r1 */ );
-
-    /* Return the result */
-    return i;
-#endif
-#else
-#error "No implementation for scale8 available."
-#endif
-}
-
-
-//  The "video" version of scale8 guarantees that the output will
-//  be only be zero if one or both of the inputs are zero.  If both
-//  inputs are non-zero, the output is guaranteed to be non-zero.
-//  This makes for better 'video'/LED dimming, at the cost of
-//  several additional cycles.
-LIB8STATIC uint8_t scale8_video( uint8_t i, fract8 scale)
-{
-#if SCALE8_C == 1 || defined(LIB8_ATTINY)
-    uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
-    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
-    // uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
-    return j;
-#elif SCALE8_AVRASM == 1
-    uint8_t j=0;
-    asm volatile(
-        "  tst %[i]\n\t"
-        "  breq L_%=\n\t"
-        "  mul %[i], %[scale]\n\t"
-        "  mov %[j], r1\n\t"
-        "  clr __zero_reg__\n\t"
-        "  cpse %[scale], r1\n\t"
-        "  subi %[j], 0xFF\n\t"
-        "L_%=: \n\t"
-        : [j] "+a" (j)
-        : [i] "a" (i), [scale] "a" (scale)
-        : "r0", "r1");
-
-    return j;
-    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
-    // asm volatile(
-    //      "      tst %0           \n"
-    //      "      breq L_%=        \n"
-    //      "      mul %0, %1       \n"
-    //      "      mov %0, r1       \n"
-    //      "      add %0, %2       \n"
-    //      "      clr __zero_reg__ \n"
-    //      "L_%=:                  \n"
-
-    //      : "+a" (i)
-    //      : "a" (scale), "a" (nonzeroscale)
-    //      : "r0", "r1");
-
-    // // Return the result
-    // return i;
-#else
-#error "No implementation for scale8_video available."
-#endif
-}
-
-
-// This version of scale8 does not clean up the R1 register on AVR
-// If you are doing several 'scale8's in a row, use this, and
-// then explicitly call cleanup_R1.
-LIB8STATIC uint8_t scale8_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
-{
-#if SCALE8_C == 1
-    return ((int)i * (int)(scale) ) >> 8;
-#elif SCALE8_AVRASM == 1
-    asm volatile(
-         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
-         "mul %0, %1    \n\t"
-         /* Move the high 8-bits of the product (r1) back to i */
-         "mov %0, r1    \n\t"
-         /* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF  */
-         /* "clr __zero_reg__    \n\t" */
-
-         : "+a" (i)      /* writes to i */
-         : "a"  (scale)  /* uses scale */
-         : "r0", "r1"    /* clobbers r0, r1 */ );
-
-    // Return the result
-    return i;
-#else
-#error "No implementation for scale8_LEAVING_R1_DIRTY available."
-#endif
-}
-
-//   THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENT DIRECTLY IN PLACE
-
-LIB8STATIC void nscale8_LEAVING_R1_DIRTY( uint8_t& i, fract8 scale)
-{
-#if SCALE8_C == 1
-    i = ((int)i * (int)(scale) ) >> 8;
-#elif SCALE8_AVRASM == 1
-    asm volatile(
-         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
-         "mul %0, %1    \n\t"
-         /* Move the high 8-bits of the product (r1) back to i */
-         "mov %0, r1    \n\t"
-         /* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF */
-         /* "clr __zero_reg__    \n\t" */
-
-         : "+a" (i)      /* writes to i */
-         : "a"  (scale)  /* uses scale */
-         : "r0", "r1"    /* clobbers r0, r1 */ );
-#else
-#error "No implementation for nscale8_LEAVING_R1_DIRTY available."
-#endif
-}
-
-
-
-LIB8STATIC uint8_t scale8_video_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
-{
-#if SCALE8_C == 1 || defined(LIB8_ATTINY)
-    uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
-    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
-    // uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
-    return j;
-#elif SCALE8_AVRASM == 1
-    uint8_t j=0;
-    asm volatile(
-        "  tst %[i]\n\t"
-        "  breq L_%=\n\t"
-        "  mul %[i], %[scale]\n\t"
-        "  mov %[j], r1\n\t"
-        "  breq L_%=\n\t"
-        "  subi %[j], 0xFF\n\t"
-        "L_%=: \n\t"
-        : [j] "+a" (j)
-        : [i] "a" (i), [scale] "a" (scale)
-        : "r0", "r1");
-
-    return j;
-    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
-    // asm volatile(
-    //      "      tst %0           \n"
-    //      "      breq L_%=        \n"
-    //      "      mul %0, %1       \n"
-    //      "      mov %0, r1       \n"
-    //      "      add %0, %2       \n"
-    //      "      clr __zero_reg__ \n"
-    //      "L_%=:                  \n"
-
-    //      : "+a" (i)
-    //      : "a" (scale), "a" (nonzeroscale)
-    //      : "r0", "r1");
-
-    // // Return the result
-    // return i;
-#else
-#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."
-#endif
-}
-
-LIB8STATIC void nscale8_video_LEAVING_R1_DIRTY( uint8_t & i, fract8 scale)
-{
-#if SCALE8_C == 1 || defined(LIB8_ATTINY)
-    i = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
-#elif SCALE8_AVRASM == 1
-    asm volatile(
-        "  tst %[i]\n\t"
-        "  breq L_%=\n\t"
-        "  mul %[i], %[scale]\n\t"
-        "  mov %[i], r1\n\t"
-        "  breq L_%=\n\t"
-        "  subi %[i], 0xFF\n\t"
-        "L_%=: \n\t"
-        : [i] "+a" (i)
-        : [scale] "a" (scale)
-        : "r0", "r1");
-#else
-#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."
-#endif
-}
-
-
-LIB8STATIC void cleanup_R1()
-{
-#if CLEANUP_R1_AVRASM == 1
-    // Restore r1 to "0"; it's expected to always be that
-    asm volatile( "clr __zero_reg__  \n\t" : : : "r1" );
-#endif
-}
-
-
-// nscale8x3: scale three one byte values by a fourth one, which is treated as
-//         the numerator of a fraction whose demominator is 256
-//         In other words, it computes r,g,b * (scale / 256)
-//
-//         THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENTS IN PLACE
-
-LIB8STATIC void nscale8x3( uint8_t& r, uint8_t& g, uint8_t& b, fract8 scale)
-{
-#if SCALE8_C == 1
-    r = ((int)r * (int)(scale) ) >> 8;
-    g = ((int)g * (int)(scale) ) >> 8;
-    b = ((int)b * (int)(scale) ) >> 8;
-#elif SCALE8_AVRASM == 1
-    r = scale8_LEAVING_R1_DIRTY(r, scale);
-    g = scale8_LEAVING_R1_DIRTY(g, scale);
-    b = scale8_LEAVING_R1_DIRTY(b, scale);
-    cleanup_R1();
-#else
-#error "No implementation for nscale8x3 available."
-#endif
-}
-
-
-LIB8STATIC void nscale8x3_video( uint8_t& r, uint8_t& g, uint8_t& b, fract8 scale)
-{
-#if SCALE8_C == 1
-    uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
-    r = (r == 0) ? 0 : (((int)r * (int)(scale) ) >> 8) + nonzeroscale;
-    g = (g == 0) ? 0 : (((int)g * (int)(scale) ) >> 8) + nonzeroscale;
-    b = (b == 0) ? 0 : (((int)b * (int)(scale) ) >> 8) + nonzeroscale;
-#elif SCALE8_AVRASM == 1
-    nscale8_video_LEAVING_R1_DIRTY( r, scale);
-    nscale8_video_LEAVING_R1_DIRTY( g, scale);
-    nscale8_video_LEAVING_R1_DIRTY( b, scale);
-    cleanup_R1();
-#else
-#error "No implementation for nscale8x3 available."
-#endif
-}
 
-// nscale8x2: scale two one byte values by a third one, which is treated as
-//         the numerator of a fraction whose demominator is 256
-//         In other words, it computes i,j * (scale / 256)
-//
-//         THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENTS IN PLACE
 
-LIB8STATIC void nscale8x2( uint8_t& i, uint8_t& j, fract8 scale)
-{
-#if SCALE8_C == 1
-    i = ((int)i * (int)(scale) ) >> 8;
-    j = ((int)j * (int)(scale) ) >> 8;
-#elif SCALE8_AVRASM == 1
-    i = scale8_LEAVING_R1_DIRTY(i, scale);
-    j = scale8_LEAVING_R1_DIRTY(j, scale);
-    cleanup_R1();
-#else
-#error "No implementation for nscale8x2 available."
-#endif
-}
 
 
-LIB8STATIC void nscale8x2_video( uint8_t& i, uint8_t& j, fract8 scale)
-{
-#if SCALE8_C == 1
-    uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
-    i = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
-    j = (j == 0) ? 0 : (((int)j * (int)(scale) ) >> 8) + nonzeroscale;
-#elif SCALE8_AVRASM == 1
-    nscale8_video_LEAVING_R1_DIRTY( i, scale);
-    nscale8_video_LEAVING_R1_DIRTY( j, scale);
-    cleanup_R1();
-#else
-#error "No implementation for nscale8x2 available."
-#endif
-}
-
-
-// scale16by8: scale a 16-bit unsigned value by an 8-bit value,
-//         considered as numerator of a fraction whose denominator
-//         is 256. In other words, it computes i * (scale / 256)
-
-#if SCALE16BY8_C == 1
-LIB8STATIC uint16_t scale16by8( uint16_t i, fract8 scale )
-{
-    uint16_t result;
-    result = (i * scale) / 256;
-    return result;
-}
-#elif SCALE16BY8_AVRASM == 1
-LIB8STATIC uint16_t scale16by8( uint16_t i, fract8 scale )
-{
-    uint16_t result = 0;
-    asm volatile(
-         // result.A = HighByte(i.A x j )
-         "  mul %A[i], %[scale]                 \n\t"
-         "  mov %A[result], r1                  \n\t"
-         //"  clr %B[result]                      \n\t"
-
-         // result.A-B += i.B x j
-         "  mul %B[i], %[scale]                 \n\t"
-         "  add %A[result], r0                  \n\t"
-         "  adc %B[result], r1                  \n\t"
-
-         // cleanup r1
-         "  clr __zero_reg__                    \n\t"
-
-         : [result] "+r" (result)
-         : [i] "r" (i), [scale] "r" (scale)
-         : "r0", "r1"
-         );
-    return result;
-}
-#else
-#error "No implementation for scale16by8 available."
-#endif
-
-// scale16: scale a 16-bit unsigned value by a 16-bit value,
-//         considered as numerator of a fraction whose denominator
-//         is 65536. In other words, it computes i * (scale / 65536)
-
-#if SCALE16_C == 1
-LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
-{
-    uint16_t result;
-    result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536;
-    return result;
-}
-#elif SCALE16_AVRASM == 1
-LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
-{
-    uint32_t result = 0;
-    const uint8_t  zero = 0;
-    asm volatile(
-                 // result.A-B  = i.A x scale.A
-                 "  mul %A[i], %A[scale]                 \n\t"
-                 //  save results...
-                 // basic idea:
-                 //"  mov %A[result], r0                 \n\t"
-                 //"  mov %B[result], r1                 \n\t"
-                 // which can be written as...
-                 "  movw %A[result], r0                   \n\t"
-                 // We actually need to do anything with r0,
-                 // as result.A is never used again here, so we
-                 // could just move the high byte, but movw is
-                 // one clock cycle, just like mov, so might as
-                 // well, in case we want to use this code for
-                 // a generic 16x16 multiply somewhere.
-
-                 // result.C-D  = i.B x scale.B
-                 "  mul %B[i], %B[scale]                 \n\t"
-                 //"  mov %C[result], r0                 \n\t"
-                 //"  mov %D[result], r1                 \n\t"
-                 "  movw %C[result], r0                   \n\t"
-
-                 // result.B-D += i.B x scale.A
-                 "  mul %B[i], %A[scale]                 \n\t"
-
-                 "  add %B[result], r0                   \n\t"
-                 "  adc %C[result], r1                   \n\t"
-                 "  adc %D[result], %[zero]              \n\t"
-
-                 // result.B-D += i.A x scale.B
-                 "  mul %A[i], %B[scale]                 \n\t"
-
-                 "  add %B[result], r0                   \n\t"
-                 "  adc %C[result], r1                   \n\t"
-                 "  adc %D[result], %[zero]              \n\t"
-
-                 // cleanup r1
-                 "  clr r1                               \n\t"
-
-                 : [result] "+r" (result)
-                 : [i] "r" (i),
-                   [scale] "r" (scale),
-                   [zero] "r" (zero)
-                 : "r0", "r1"
-                 );
-    result = result >> 16;
-    return result;
-}
-#else
-#error "No implementation for scale16 available."
-#endif
-
-
-
-// mul8: 8x8 bit multiplication, with 8 bit result
-LIB8STATIC uint8_t mul8( uint8_t i, uint8_t j)
-{
-#if MUL8_C == 1
-    return ((int)i * (int)(j) ) & 0xFF;
-#elif MUL8_AVRASM == 1
-    asm volatile(
-         /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
-         "mul %0, %1          \n\t"
-         /* Extract the LOW 8-bits (r0) */
-         "mov %0, r0          \n\t"
-         /* Restore r1 to "0"; it's expected to always be that */
-         "clr __zero_reg__    \n\t"
-         : "+a" (i)
-         : "a"  (j)
-         : "r0", "r1");
-
-    return i;
-#else
-#error "No implementation for mul8 available."
-#endif
-}
-
-
-// mul8: saturating 8x8 bit multiplication, with 8 bit result
-LIB8STATIC uint8_t qmul8( uint8_t i, uint8_t j)
-{
-#if QMUL8_C == 1
-    int p = ((int)i * (int)(j) );
-    if( p > 255) p = 255;
-    return p;
-#elif QMUL8_AVRASM == 1
-    asm volatile(
-                 /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
-                 "  mul %0, %1          \n\t"
-                 /* If high byte of result is zero, all is well. */
-                 "  tst r1              \n\t"
-                 "  breq Lnospill_%=    \n\t"
-                 /* If high byte of result > 0, saturate low byte to 0xFF */
-                 "  ldi %0,0xFF         \n\t"
-                 "  rjmp Ldone_%=       \n\t"
-                 "Lnospill_%=:          \n\t"
-                 /* Extract the LOW 8-bits (r0) */
-                 "  mov %0, r0          \n\t"
-                 "Ldone_%=:             \n\t"
-                 /* Restore r1 to "0"; it's expected to always be that */
-                 "  clr __zero_reg__    \n\t"
-                 : "+a" (i)
-                 : "a"  (j)
-                 : "r0", "r1");
-
-    return i;
-#else
-#error "No implementation for qmul8 available."
-#endif
-}
-
-
-// abs8: take abs() of a signed 8-bit uint8_t
-LIB8STATIC int8_t abs8( int8_t i)
-{
-#if ABS8_C == 1
-    if( i < 0) i = -i;
-    return i;
-#elif ABS8_AVRASM == 1
-
-
-    asm volatile(
-         /* First, check the high bit, and prepare to skip if it's clear */
-         "sbrc %0, 7 \n"
-
-         /* Negate the value */
-         "neg %0     \n"
-
-         : "+r" (i) : "r" (i) );
-    return i;
-#else
-#error "No implementation for abs8 available."
-#endif
-}
 
 
 ///////////////////////////////////////////////////////////////////////
@@ -1103,355 +400,6 @@ LIB8STATIC sfract15 floatToSfract15( float f)
 
 
 ///////////////////////////////////////////////////////////////////////
-
-// Dimming and brightening functions
-//
-// The eye does not respond in a linear way to light.
-// High speed PWM'd LEDs at 50% duty cycle appear far
-// brighter then the 'half as bright' you might expect.
-//
-// If you want your midpoint brightness leve (128) to
-// appear half as bright as 'full' brightness (255), you
-// have to apply a 'dimming function'.
-//
-//
-
-LIB8STATIC uint8_t dim8_raw( uint8_t x)
-{
-    return scale8( x, x);
-}
-
-LIB8STATIC uint8_t dim8_video( uint8_t x)
-{
-    return scale8_video( x, x);
-}
-
-LIB8STATIC uint8_t dim8_lin( uint8_t x )
-{
-    if( x & 0x80 ) {
-        x = scale8( x, x);
-    } else {
-        x += 1;
-        x /= 2;
-    }
-    return x;
-}
-
-LIB8STATIC uint8_t brighten8_raw( uint8_t x)
-{
-    uint8_t ix = 255 - x;
-    return 255 - scale8( ix, ix);
-}
-
-LIB8STATIC uint8_t brighten8_video( uint8_t x)
-{
-    uint8_t ix = 255 - x;
-    return 255 - scale8_video( ix, ix);
-}
-
-LIB8STATIC uint8_t brighten8_lin( uint8_t x )
-{
-    uint8_t ix = 255 - x;
-    if( ix & 0x80 ) {
-        ix = scale8( ix, ix);
-    } else {
-        ix += 1;
-        ix /= 2;
-    }
-    return 255 - ix;
-}
-
-///////////////////////////////////////////////////////////////////////
-
-// A 16-bit PNRG good enough for LED animations
-
-// X(n+1) = (2053 * X(n)) + 13849)
-#define FASTLED_RAND16_2053  ((uint16_t)(2053))
-#define FASTLED_RAND16_13849 ((uint16_t)(13849))
-
-extern uint16_t rand16seed;// = RAND16_SEED;
-
-
-LIB8STATIC uint8_t random8()
-{
-    rand16seed = (rand16seed * FASTLED_RAND16_2053) + FASTLED_RAND16_13849;
-    // return the sum of the high and low bytes, for better
-    //  mixing and non-sequential correlation
-    return (uint8_t)(((uint8_t)(rand16seed & 0xFF)) +
-                     ((uint8_t)(rand16seed >> 8)));
-}
-
-LIB8STATIC uint16_t random16()
-{
-    rand16seed = (rand16seed * FASTLED_RAND16_2053) + FASTLED_RAND16_13849;
-    return rand16seed;
-}
-
-
-LIB8STATIC uint8_t random8(uint8_t lim)
-{
-    uint8_t r = random8();
-    r = scale8( r, lim);
-    return r;
-}
-
-LIB8STATIC uint8_t random8(uint8_t min, uint8_t lim)
-{
-    uint8_t delta = lim - min;
-    uint8_t r = random8(delta) + min;
-    return r;
-}
-
-LIB8STATIC uint16_t random16( uint16_t lim)
-{
-    uint16_t r = random16();
-    uint32_t p = (uint32_t)lim * (uint32_t)r;
-    r = p >> 16;
-    return r;
-}
-
-LIB8STATIC uint16_t random16( uint16_t min, uint16_t lim)
-{
-    uint16_t delta = lim - min;
-    uint16_t r = random16( delta) + min;
-    return r;
-}
-
-LIB8STATIC void random16_set_seed( uint16_t seed)
-{
-    rand16seed = seed;
-}
-
-LIB8STATIC uint16_t random16_get_seed()
-{
-    return rand16seed;
-}
-
-LIB8STATIC void random16_add_entropy( uint16_t entropy)
-{
-    rand16seed += entropy;
-}
-
-
-///////////////////////////////////////////////////////////////////////
-
-// sin16 & cos16:
-//        Fast 16-bit approximations of sin(x) & cos(x).
-//        Input angle is an unsigned int from 0-65535.
-//        Output is signed int from -32767 to 32767.
-//
-//        This approximation never varies more than 0.69%
-//        from the floating point value you'd get by doing
-//          float s = sin( x ) * 32767.0;
-//
-//        Don't use this approximation for calculating the
-//        trajectory of a rocket to Mars, but it's great
-//        for art projects and LED displays.
-//
-//        On Arduino/AVR, this approximation is more than
-//        10X faster than floating point sin(x) and cos(x)
-
-#if defined(__AVR__)
-#define sin16 sin16_avr
-#else
-#define sin16 sin16_C
-#endif
-
-LIB8STATIC int16_t sin16_avr( uint16_t theta )
-{
-    static const uint8_t data[] =
-    { 0,         0,         49, 0, 6393%256,   6393/256, 48, 0,
-      12539%256, 12539/256, 44, 0, 18204%256, 18204/256, 38, 0,
-      23170%256, 23170/256, 31, 0, 27245%256, 27245/256, 23, 0,
-      30273%256, 30273/256, 14, 0, 32137%256, 32137/256,  4 /*,0*/ };
-
-    uint16_t offset = (theta & 0x3FFF);
-
-    // AVR doesn't have a multi-bit shift instruction,
-    // so if we say "offset >>= 3", gcc makes a tiny loop.
-    // Inserting empty volatile statements between each
-    // bit shift forces gcc to unroll the loop.
-    offset >>= 1; // 0..8191
-    asm volatile("");
-    offset >>= 1; // 0..4095
-    asm volatile("");
-    offset >>= 1; // 0..2047
-
-    if( theta & 0x4000 ) offset = 2047 - offset;
-
-    uint8_t sectionX4;
-    sectionX4 = offset / 256;
-    sectionX4 *= 4;
-
-    uint8_t m;
-
-    union {
-        uint16_t b;
-        struct {
-            uint8_t blo;
-            uint8_t bhi;
-        };
-    } u;
-
-    //in effect u.b = blo + (256 * bhi);
-    u.blo = data[ sectionX4 ];
-    u.bhi = data[ sectionX4 + 1];
-    m     = data[ sectionX4 + 2];
-
-    uint8_t secoffset8 = (uint8_t)(offset) / 2;
-
-    uint16_t mx = m * secoffset8;
-
-    int16_t  y  = mx + u.b;
-    if( theta & 0x8000 ) y = -y;
-
-    return y;
-}
-
-LIB8STATIC int16_t sin16_C( uint16_t theta )
-{
-    static const uint16_t base[] =
-    { 0, 6393, 12539, 18204, 23170, 27245, 30273, 32137 };
-    static const uint8_t slope[] =
-    { 49, 48, 44, 38, 31, 23, 14, 4 };
-
-    uint16_t offset = (theta & 0x3FFF) >> 3; // 0..2047
-    if( theta & 0x4000 ) offset = 2047 - offset;
-
-    uint8_t section = offset / 256; // 0..7
-    uint16_t b   = base[section];
-    uint8_t  m   = slope[section];
-
-    uint8_t secoffset8 = (uint8_t)(offset) / 2;
-
-    uint16_t mx = m * secoffset8;
-    int16_t  y  = mx + b;
-
-    if( theta & 0x8000 ) y = -y;
-
-    return y;
-}
-
-LIB8STATIC int16_t cos16( uint16_t theta)
-{
-    return sin16( theta + 16384);
-}
-
-///////////////////////////////////////////////////////////////////////
-
-// sin8 & cos8
-//        Fast 8-bit approximations of sin(x) & cos(x).
-//        Input angle is an unsigned int from 0-255.
-//        Output is an unsigned int from 0 to 255.
-//
-//        This approximation can vary to to 2%
-//        from the floating point value you'd get by doing
-//          float s = (sin( x ) * 128.0) + 128;
-//
-//        Don't use this approximation for calculating the
-//        "real" trigonometric calculations, but it's great
-//        for art projects and LED displays.
-//
-//        On Arduino/AVR, this approximation is more than
-//        20X faster than floating point sin(x) and cos(x)
-
-#if defined(__AVR__) && !defined(LIB8_ATTINY)
-#define sin8 sin8_avr
-#else
-#define sin8 sin8_C
-#endif
-
-
-const uint8_t b_m16_interleave[] = { 0, 49, 49, 41, 90, 27, 117, 10 };
-
-LIB8STATIC uint8_t  sin8_avr( uint8_t theta)
-{
-    uint8_t offset = theta;
-
-    asm volatile(
-                 "sbrc %[theta],6         \n\t"
-                 "com  %[offset]           \n\t"
-                 : [theta] "+r" (theta), [offset] "+r" (offset)
-                 );
-
-    offset &= 0x3F; // 0..63
-
-    uint8_t secoffset  = offset & 0x0F; // 0..15
-    if( theta & 0x40) secoffset++;
-
-    uint8_t m16; uint8_t b;
-
-    uint8_t section = offset >> 4; // 0..3
-    uint8_t s2 = section * 2;
-
-    const uint8_t* p = b_m16_interleave;
-    p += s2;
-    b   = *p;
-    p++;
-    m16 = *p;
-
-    uint8_t mx;
-    uint8_t xr1;
-    asm volatile(
-                 "mul %[m16],%[secoffset]   \n\t"
-                 "mov %[mx],r0              \n\t"
-                 "mov %[xr1],r1             \n\t"
-                 "eor  r1, r1               \n\t"
-                 "swap %[mx]                \n\t"
-                 "andi %[mx],0x0F           \n\t"
-                 "swap %[xr1]               \n\t"
-                 "andi %[xr1], 0xF0         \n\t"
-                 "or   %[mx], %[xr1]        \n\t"
-                 : [mx] "=r" (mx), [xr1] "=r" (xr1)
-                 : [m16] "r" (m16), [secoffset] "r" (secoffset)
-                 );
-
-    int8_t y = mx + b;
-    if( theta & 0x80 ) y = -y;
-
-    y += 128;
-
-    return y;
-}
-
-
-LIB8STATIC uint8_t sin8_C( uint8_t theta)
-{
-    uint8_t offset = theta;
-    if( theta & 0x40 ) {
-        offset = (uint8_t)255 - offset;
-    }
-    offset &= 0x3F; // 0..63
-
-    uint8_t secoffset  = offset & 0x0F; // 0..15
-    if( theta & 0x40) secoffset++;
-
-    uint8_t section = offset >> 4; // 0..3
-    uint8_t s2 = section * 2;
-    const uint8_t* p = b_m16_interleave;
-    p += s2;
-    uint8_t b   =  *p;
-    p++;
-    uint8_t m16 =  *p;
-
-    uint8_t mx = (m16 * secoffset) >> 4;
-
-    int8_t y = mx + b;
-    if( theta & 0x80 ) y = -y;
-
-    y += 128;
-
-    return y;
-}
-
-
-LIB8STATIC uint8_t cos8( uint8_t theta)
-{
-    return sin8( theta + 64);
-}
-
-
-///////////////////////////////////////////////////////////////////////
 //
 // memmove8, memcpy8, and memset8:
 //   alternatives to memmove, memcpy, and memset that are
@@ -1770,39 +718,6 @@ LIB8STATIC uint8_t squarewave8( uint8_t in, uint8_t pulsewidth=128)
 
 
 
-// sqrt16: square root for 16-bit integers
-//         About three times faster and five times smaller
-//         than Arduino's general sqrt on AVR.
-LIB8STATIC uint8_t sqrt16(uint16_t x)
-{
-    if( x <= 1) {
-        return x;
-    }
-
-    uint8_t low = 1; // lower bound
-    uint8_t hi, mid;
-
-    if( x > 7904) {
-        hi = 255;
-    } else {
-        hi = (x >> 5) + 8; // initial estimate for upper bound
-    }
-
-    do {
-        mid = (low + hi) >> 1;
-        if ((uint16_t)(mid * mid) > x) {
-            hi = mid - 1;
-        } else {
-            if( mid == 255) {
-                return 255;
-            }
-            low = mid + 1;
-        }
-    } while (hi >= low);
-
-    return low - 1;
-}
-
 
 template<class T, int F, int I> class q {
   T i:I;
@@ -2136,5 +1051,6 @@ typedef CEveryNTimePeriods<uint8_t,hours8> CEveryNHours;
 #define EVERY_N_MILLISECONDS(N) EVERY_N_MILLIS(N)
 
 FASTLED_NAMESPACE_END
+///@}
 
 #endif
diff --git a/lib8tion/math8.h b/lib8tion/math8.h
new file mode 100644
index 00000000..9f1e9256
--- /dev/null
+++ b/lib8tion/math8.h
@@ -0,0 +1,348 @@
+#ifndef __INC_LIB8TION_MATH_H
+#define __INC_LIB8TION_MATH_H
+
+///@ingroup lib8tion
+
+///@defgroup Math Basic math operations
+///@{
+/// add one byte to another, saturating at 0xFF
+/// @param i - first byte to add
+/// @param j - second byte to add
+/// @returns the sum of i & j, capped at 0xFF
+LIB8STATIC uint8_t qadd8( uint8_t i, uint8_t j)
+{
+#if QADD8_C == 1
+    unsigned int t = i + j;
+    if( t > 255) t = 255;
+    return t;
+#elif QADD8_AVRASM == 1
+    asm volatile(
+         /* First, add j to i, conditioning the C flag */
+         "add %0, %1    \n\t"
+
+         /* Now test the C flag.
+           If C is clear, we branch around a load of 0xFF into i.
+           If C is set, we go ahead and load 0xFF into i.
+         */
+         "brcc L_%=     \n\t"
+         "ldi %0, 0xFF  \n\t"
+         "L_%=: "
+         : "+a" (i)
+         : "a"  (j) );
+    return i;
+#elif QADD8_ARM_DSP_ASM == 1
+    asm volatile( "uqadd8 %0, %0, %1" : "+r" (i) : "r" (j));
+    return i;
+#else
+#error "No implementation for qadd8 available."
+#endif
+}
+
+/// Add one byte to another, saturating at 0x7F
+/// @param i - first byte to add
+/// @param j - second byte to add
+/// @returns the sum of i & j, capped at 0x7F
+LIB8STATIC int8_t qadd7( int8_t i, int8_t j)
+{
+#if QADD7_C == 1
+    int16_t t = i + j;
+    if( t > 127) t = 127;
+    return t;
+#elif QADD7_AVRASM == 1
+    asm volatile(
+         /* First, add j to i, conditioning the V flag */
+         "add %0, %1    \n\t"
+
+         /* Now test the V flag.
+          If V is clear, we branch around a load of 0x7F into i.
+          If V is set, we go ahead and load 0x7F into i.
+          */
+         "brvc L_%=     \n\t"
+         "ldi %0, 0x7F  \n\t"
+         "L_%=: "
+         : "+a" (i)
+         : "a"  (j) );
+
+    return i;
+#elif QADD7_ARM_DSP_ASM == 1
+    asm volatile( "qadd8 %0, %0, %1" : "+r" (i) : "r" (j));
+    return i;
+#else
+#error "No implementation for qadd7 available."
+#endif
+}
+
+/// subtract one byte from another, saturating at 0x00
+/// @returns i - j with a floor of 0
+LIB8STATIC uint8_t qsub8( uint8_t i, uint8_t j)
+{
+#if QSUB8_C == 1
+    int t = i - j;
+    if( t < 0) t = 0;
+    return t;
+#elif QSUB8_AVRASM == 1
+
+    asm volatile(
+         /* First, subtract j from i, conditioning the C flag */
+         "sub %0, %1    \n\t"
+
+         /* Now test the C flag.
+          If C is clear, we branch around a load of 0x00 into i.
+          If C is set, we go ahead and load 0x00 into i.
+          */
+         "brcc L_%=     \n\t"
+         "ldi %0, 0x00  \n\t"
+         "L_%=: "
+         : "+a" (i)
+         : "a"  (j) );
+
+    return i;
+#else
+#error "No implementation for qsub8 available."
+#endif
+}
+
+/// add one byte to another, with one byte result
+LIB8STATIC uint8_t add8( uint8_t i, uint8_t j)
+{
+#if ADD8_C == 1
+    int t = i + j;
+    return t;
+#elif ADD8_AVRASM == 1
+    // Add j to i, period.
+    asm volatile( "add %0, %1" : "+a" (i) : "a" (j));
+    return i;
+#else
+#error "No implementation for add8 available."
+#endif
+}
+
+
+/// subtract one byte from another, 8-bit result
+LIB8STATIC uint8_t sub8( uint8_t i, uint8_t j)
+{
+#if SUB8_C == 1
+    int t = i - j;
+    return t;
+#elif SUB8_AVRASM == 1
+    // Subtract j from i, period.
+    asm volatile( "sub %0, %1" : "+a" (i) : "a" (j));
+    return i;
+#else
+#error "No implementation for sub8 available."
+#endif
+}
+
+/// Calculate an integer average of two unsigned
+///       8-bit integer values (uint8_t).
+///       Fractional results are rounded down, e.g. avg8(20,41) = 30
+LIB8STATIC uint8_t avg8( uint8_t i, uint8_t j)
+{
+#if AVG8_C == 1
+    return (i + j) >> 1;
+#elif AVG8_AVRASM == 1
+    asm volatile(
+         /* First, add j to i, 9th bit overflows into C flag */
+         "add %0, %1    \n\t"
+         /* Divide by two, moving C flag into high 8th bit */
+         "ror %0        \n\t"
+         : "+a" (i)
+         : "a"  (j) );
+    return i;
+#else
+#error "No implementation for avg8 available."
+#endif
+}
+
+
+/// Calculate an integer average of two signed 7-bit
+///       integers (int8_t)
+///       If the first argument is even, result is rounded down.
+///       If the first argument is odd, result is rounded up.
+LIB8STATIC int8_t avg7( int8_t i, int8_t j)
+{
+#if AVG7_C == 1
+    return ((i + j) >> 1) + (i & 0x1);
+#elif AVG7_AVRASM == 1
+    asm volatile(
+                 "asr %1        \n\t"
+                 "asr %0        \n\t"
+                 "adc %0, %1    \n\t"
+                 : "+a" (i)
+                 : "a"  (j) );
+    return i;
+#else
+#error "No implementation for avg7 available."
+#endif
+}
+
+///       Calculate the remainder of one unsigned 8-bit
+///       value divided by another, aka A % M.
+///       Implemented by repeated subtraction, which is
+///       very compact, and very fast if A is 'probably'
+///       less than M.  If A is a large multiple of M,
+///       the loop has to execute multiple times.  However,
+///       even in that case, the loop is only two
+///       instructions long on AVR, i.e., quick.
+LIB8STATIC uint8_t mod8( uint8_t a, uint8_t m)
+{
+#if defined(__AVR__)
+    asm volatile (
+                  "L_%=:  sub %[a],%[m]    \n\t"
+                  "       brcc L_%=        \n\t"
+                  "       add %[a],%[m]    \n\t"
+                  : [a] "+r" (a)
+                  : [m] "r"  (m)
+                  );
+#else
+    while( a >= m) a -= m;
+#endif
+    return a;
+}
+
+///          Add two numbers, and calculate the modulo
+///          of the sum and a third number, M.
+///          In other words, it returns (A+B) % M.
+///          It is designed as a compact mechanism for
+///          incrementing a 'mode' switch and wrapping
+///          around back to 'mode 0' when the switch
+///          goes past the end of the available range.
+///          e.g. if you have seven modes, this switches
+///          to the next one and wraps around if needed:
+///            mode = addmod8( mode, 1, 7);
+///          See 'mod8' for notes on performance.
+LIB8STATIC uint8_t addmod8( uint8_t a, uint8_t b, uint8_t m)
+{
+#if defined(__AVR__)
+    asm volatile (
+                  "       add %[a],%[b]    \n\t"
+                  "L_%=:  sub %[a],%[m]    \n\t"
+                  "       brcc L_%=        \n\t"
+                  "       add %[a],%[m]    \n\t"
+                  : [a] "+r" (a)
+                  : [b] "r"  (b), [m] "r" (m)
+                  );
+#else
+    a += b;
+    while( a >= m) a -= m;
+#endif
+    return a;
+}
+
+/// 8x8 bit multiplication, with 8 bit result
+LIB8STATIC uint8_t mul8( uint8_t i, uint8_t j)
+{
+#if MUL8_C == 1
+    return ((int)i * (int)(j) ) & 0xFF;
+#elif MUL8_AVRASM == 1
+    asm volatile(
+         /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
+         "mul %0, %1          \n\t"
+         /* Extract the LOW 8-bits (r0) */
+         "mov %0, r0          \n\t"
+         /* Restore r1 to "0"; it's expected to always be that */
+         "clr __zero_reg__    \n\t"
+         : "+a" (i)
+         : "a"  (j)
+         : "r0", "r1");
+
+    return i;
+#else
+#error "No implementation for mul8 available."
+#endif
+}
+
+
+/// saturating 8x8 bit multiplication, with 8 bit result
+/// @returns the product of i * j, capping at 0xFF
+LIB8STATIC uint8_t qmul8( uint8_t i, uint8_t j)
+{
+#if QMUL8_C == 1
+    int p = ((int)i * (int)(j) );
+    if( p > 255) p = 255;
+    return p;
+#elif QMUL8_AVRASM == 1
+    asm volatile(
+                 /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
+                 "  mul %0, %1          \n\t"
+                 /* If high byte of result is zero, all is well. */
+                 "  tst r1              \n\t"
+                 "  breq Lnospill_%=    \n\t"
+                 /* If high byte of result > 0, saturate low byte to 0xFF */
+                 "  ldi %0,0xFF         \n\t"
+                 "  rjmp Ldone_%=       \n\t"
+                 "Lnospill_%=:          \n\t"
+                 /* Extract the LOW 8-bits (r0) */
+                 "  mov %0, r0          \n\t"
+                 "Ldone_%=:             \n\t"
+                 /* Restore r1 to "0"; it's expected to always be that */
+                 "  clr __zero_reg__    \n\t"
+                 : "+a" (i)
+                 : "a"  (j)
+                 : "r0", "r1");
+
+    return i;
+#else
+#error "No implementation for qmul8 available."
+#endif
+}
+
+
+/// take abs() of a signed 8-bit integer (int8_t)
+LIB8STATIC int8_t abs8( int8_t i)
+{
+#if ABS8_C == 1
+    if( i < 0) i = -i;
+    return i;
+#elif ABS8_AVRASM == 1
+
+
+    asm volatile(
+         /* First, check the high bit, and prepare to skip if it's clear */
+         "sbrc %0, 7 \n"
+
+         /* Negate the value */
+         "neg %0     \n"
+
+         : "+r" (i) : "r" (i) );
+    return i;
+#else
+#error "No implementation for abs8 available."
+#endif
+}
+
+///         square root for 16-bit integers
+///         About three times faster and five times smaller
+///         than Arduino's general sqrt on AVR.
+LIB8STATIC uint8_t sqrt16(uint16_t x)
+{
+    if( x <= 1) {
+        return x;
+    }
+
+    uint8_t low = 1; // lower bound
+    uint8_t hi, mid;
+
+    if( x > 7904) {
+        hi = 255;
+    } else {
+        hi = (x >> 5) + 8; // initial estimate for upper bound
+    }
+
+    do {
+        mid = (low + hi) >> 1;
+        if ((uint16_t)(mid * mid) > x) {
+            hi = mid - 1;
+        } else {
+            if( mid == 255) {
+                return 255;
+            }
+            low = mid + 1;
+        }
+    } while (hi >= low);
+
+    return low - 1;
+}
+
+///@}
+#endif
diff --git a/lib8tion/random8.h b/lib8tion/random8.h
new file mode 100644
index 00000000..95ba6a5c
--- /dev/null
+++ b/lib8tion/random8.h
@@ -0,0 +1,92 @@
+#ifndef __INC_LIB8TION_RANDOM_H
+#define __INC_LIB8TION_RANDOM_H
+///@ingroup lib8tion
+
+///@defgroup Random Fast random number generators
+/// A 16-bit PRNG good enough for LED animations
+///@{
+
+// X(n+1) = (2053 * X(n)) + 13849
+#define FASTLED_RAND16_2053  ((uint16_t)(2053))
+#define FASTLED_RAND16_13849 ((uint16_t)(13849))
+
+/// random number seed
+extern uint16_t rand16seed;// = RAND16_SEED;
+
+/// Generate an 8-bit random number
+LIB8STATIC uint8_t random8()
+{
+    rand16seed = (rand16seed * FASTLED_RAND16_2053) + FASTLED_RAND16_13849;
+    // return the sum of the high and low bytes, for better
+    //  mixing and non-sequential correlation
+    return (uint8_t)(((uint8_t)(rand16seed & 0xFF)) +
+                     ((uint8_t)(rand16seed >> 8)));
+}
+
+/// Generate a 16 bit random number
+LIB8STATIC uint16_t random16()
+{
+    rand16seed = (rand16seed * FASTLED_RAND16_2053) + FASTLED_RAND16_13849;
+    return rand16seed;
+}
+
+/// Generate an 8-bit random number between 0 and lim
+/// @param lim the upper bound for the result
+LIB8STATIC uint8_t random8(uint8_t lim)
+{
+    uint8_t r = random8();
+    r = scale8( r, lim);
+    return r;
+}
+
+/// Generate an 8-bit random number in the given range
+/// @param min the lower bound for the random number
+/// @param lim the upper bound for the random number
+LIB8STATIC uint8_t random8(uint8_t min, uint8_t lim)
+{
+    uint8_t delta = lim - min;
+    uint8_t r = random8(delta) + min;
+    return r;
+}
+
+/// Generate a 16-bit random number between 0 and lim
+/// @param lim the upper bound for the result
+LIB8STATIC uint16_t random16( uint16_t lim)
+{
+    uint16_t r = random16();
+    uint32_t p = (uint32_t)lim * (uint32_t)r;
+    r = p >> 16;
+    return r;
+}
+
+/// Generate a 16-bit random number in the given range
+/// @param min the lower bound for the random number
+/// @param lim the upper bound for the random number
+LIB8STATIC uint16_t random16( uint16_t min, uint16_t lim)
+{
+    uint16_t delta = lim - min;
+    uint16_t r = random16( delta) + min;
+    return r;
+}
+
+/// Set the 16-bit seed used for the random number generator
+LIB8STATIC void random16_set_seed( uint16_t seed)
+{
+    rand16seed = seed;
+}
+
+/// Get the current seed value for the random number generator
+LIB8STATIC uint16_t random16_get_seed()
+{
+    return rand16seed;
+}
+
+/// Add entropy into the random number generator
+LIB8STATIC void random16_add_entropy( uint16_t entropy)
+{
+    rand16seed += entropy;
+}
+
+///@}
+
+#endif
diff --git a/lib8tion/scale8.h b/lib8tion/scale8.h
new file mode 100644
index 00000000..e9c95759
--- /dev/null
+++ b/lib8tion/scale8.h
@@ -0,0 +1,505 @@
+#ifndef __INC_LIB8TION_SCALE_H
+#define __INC_LIB8TION_SCALE_H
+
+///@ingroup lib8tion
+
+///@defgroup Scaling Scaling functions
+///  for scaling 8 and 16 bit values, as well as dimming and brightening them
+///@{
+
+///  scale one byte by a second one, which is treated as
+///  the numerator of a fraction whose denominator is 256
+///  In other words, it computes i * (scale / 256)
+///  4 clocks AVR with MUL, 2 clocks ARM
+LIB8STATIC uint8_t scale8( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+    return ((uint16_t)i * (uint16_t)(scale) ) >> 8;
+#elif SCALE8_AVRASM == 1
+#if defined(LIB8_ATTINY)
+    uint8_t work=0;
+    uint8_t cnt=0x80;
+    asm volatile(
+        "LOOP_%=:                             \n\t"
+        /*"  sbrc %[scale], 0             \n\t"
+        "  add %[work], %[i]            \n\t"
+        "  ror %[work]                  \n\t"
+        "  lsr %[scale]                 \n\t"
+        "  clc                          \n\t"*/
+        "  sbrc %[scale], 0             \n\t"
+        "  add %[work], %[i]            \n\t"
+        "  ror %[work]                  \n\t"
+        "  lsr %[scale]                 \n\t"
+        "  lsr %[cnt]                   \n\t"
+        "brcc LOOP_%="
+        : [work] "+r" (work), [cnt] "+r" (cnt)
+        : [scale] "r" (scale), [i] "r" (i)
+        :
+      );
+    return work;
+#else
+    asm volatile(
+         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
+         "mul %0, %1          \n\t"
+         /* Move the high 8-bits of the product (r1) back to i */
+         "mov %0, r1          \n\t"
+         /* Restore r1 to "0"; it's expected to always be that */
+         "clr __zero_reg__    \n\t"
+
+         : "+a" (i)      /* writes to i */
+         : "a"  (scale)  /* uses scale */
+         : "r0", "r1"    /* clobbers r0, r1 */ );
+
+    /* Return the result */
+    return i;
+#endif
+#else
+#error "No implementation for scale8 available."
+#endif
+}
+
+
+///  The "video" version of scale8 guarantees that the output will
+///  only be zero if one or both of the inputs are zero.  If both
+///  inputs are non-zero, the output is guaranteed to be non-zero.
+///  This makes for better 'video'/LED dimming, at the cost of
+///  several additional cycles.
+LIB8STATIC uint8_t scale8_video( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1 || defined(LIB8_ATTINY)
+    uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
+    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    // uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+    return j;
+#elif SCALE8_AVRASM == 1
+    uint8_t j=0;
+    asm volatile(
+        "  tst %[i]\n\t"
+        "  breq L_%=\n\t"
+        "  mul %[i], %[scale]\n\t"
+        "  mov %[j], r1\n\t"
+        "  clr __zero_reg__\n\t"
+        "  cpse %[scale], r1\n\t"
+        "  subi %[j], 0xFF\n\t"
+        "L_%=: \n\t"
+        : [j] "+a" (j)
+        : [i] "a" (i), [scale] "a" (scale)
+        : "r0", "r1");
+
+    return j;
+    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    // asm volatile(
+    //      "      tst %0           \n"
+    //      "      breq L_%=        \n"
+    //      "      mul %0, %1       \n"
+    //      "      mov %0, r1       \n"
+    //      "      add %0, %2       \n"
+    //      "      clr __zero_reg__ \n"
+    //      "L_%=:                  \n"
+
+    //      : "+a" (i)
+    //      : "a" (scale), "a" (nonzeroscale)
+    //      : "r0", "r1");
+
+    // // Return the result
+    // return i;
+#else
+#error "No implementation for scale8_video available."
+#endif
+}
+
+
+/// This version of scale8 does not clean up the R1 register on AVR
+/// If you are doing several 'scale8's in a row, use this, and
+/// then explicitly call cleanup_R1.
+LIB8STATIC uint8_t scale8_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+    return ((int)i * (int)(scale) ) >> 8;
+#elif SCALE8_AVRASM == 1
+    asm volatile(
+         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
+         "mul %0, %1    \n\t"
+         /* Move the high 8-bits of the product (r1) back to i */
+         "mov %0, r1    \n\t"
+         /* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF  */
+         /* "clr __zero_reg__    \n\t" */
+
+         : "+a" (i)      /* writes to i */
+         : "a"  (scale)  /* uses scale */
+         : "r0", "r1"    /* clobbers r0, r1 */ );
+
+    // Return the result
+    return i;
+#else
+#error "No implementation for scale8_LEAVING_R1_DIRTY available."
+#endif
+}
+
+/// In place modifying version of scale8, also this version of nscale8 does not
+/// clean up the R1 register on AVR
+/// If you are doing several 'scale8's in a row, use this, and
+/// then explicitly call cleanup_R1.
+
+LIB8STATIC void nscale8_LEAVING_R1_DIRTY( uint8_t& i, fract8 scale)
+{
+#if SCALE8_C == 1
+    i = ((int)i * (int)(scale) ) >> 8;
+#elif SCALE8_AVRASM == 1
+    asm volatile(
+         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
+         "mul %0, %1    \n\t"
+         /* Move the high 8-bits of the product (r1) back to i */
+         "mov %0, r1    \n\t"
+         /* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF */
+         /* "clr __zero_reg__    \n\t" */
+
+         : "+a" (i)      /* writes to i */
+         : "a"  (scale)  /* uses scale */
+         : "r0", "r1"    /* clobbers r0, r1 */ );
+#else
+#error "No implementation for nscale8_LEAVING_R1_DIRTY available."
+#endif
+}
+
+
+/// This version of scale8_video does not clean up the R1 register on AVR
+/// If you are doing several 'scale8_video's in a row, use this, and
+/// then explicitly call cleanup_R1.
+LIB8STATIC uint8_t scale8_video_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1 || defined(LIB8_ATTINY)
+    uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
+    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    // uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+    return j;
+#elif SCALE8_AVRASM == 1
+    uint8_t j=0;
+    asm volatile(
+        "  tst %[i]\n\t"
+        "  breq L_%=\n\t"
+        "  mul %[i], %[scale]\n\t"
+        "  mov %[j], r1\n\t"
+        "  breq L_%=\n\t"
+        "  subi %[j], 0xFF\n\t"
+        "L_%=: \n\t"
+        : [j] "+a" (j)
+        : [i] "a" (i), [scale] "a" (scale)
+        : "r0", "r1");
+
+    return j;
+    // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    // asm volatile(
+    //      "      tst %0           \n"
+    //      "      breq L_%=        \n"
+    //      "      mul %0, %1       \n"
+    //      "      mov %0, r1       \n"
+    //      "      add %0, %2       \n"
+    //      "      clr __zero_reg__ \n"
+    //      "L_%=:                  \n"
+
+    //      : "+a" (i)
+    //      : "a" (scale), "a" (nonzeroscale)
+    //      : "r0", "r1");
+
+    // // Return the result
+    // return i;
+#else
+#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."
+#endif
+}
+
+/// In place modifying version of scale8_video, also this version of nscale8_video
+/// does not clean up the R1 register on AVR
+/// If you are doing several 'scale8_video's in a row, use this, and
+/// then explicitly call cleanup_R1.
+LIB8STATIC void nscale8_video_LEAVING_R1_DIRTY( uint8_t & i, fract8 scale)
+{
+#if SCALE8_C == 1 || defined(LIB8_ATTINY)
+    i = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
+#elif SCALE8_AVRASM == 1
+    asm volatile(
+        "  tst %[i]\n\t"
+        "  breq L_%=\n\t"
+        "  mul %[i], %[scale]\n\t"
+        "  mov %[i], r1\n\t"
+        "  breq L_%=\n\t"
+        "  subi %[i], 0xFF\n\t"
+        "L_%=: \n\t"
+        : [i] "+a" (i)
+        : [scale] "a" (scale)
+        : "r0", "r1");
+#else
+#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."
+#endif
+}
+
+/// Clean up the r1 register after a series of *LEAVING_R1_DIRTY calls
+LIB8STATIC void cleanup_R1()
+{
+#if CLEANUP_R1_AVRASM == 1
+    // Restore r1 to "0"; it's expected to always be that
+    asm volatile( "clr __zero_reg__  \n\t" : : : "r1" );
+#endif
+}
+
+
+/// scale three one byte values by a fourth one, which is treated as
+///         the numerator of a fraction whose denominator is 256
+///         In other words, it computes r,g,b * (scale / 256)
+///
+///         THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENTS IN PLACE
+
+LIB8STATIC void nscale8x3( uint8_t& r, uint8_t& g, uint8_t& b, fract8 scale)
+{
+#if SCALE8_C == 1
+    r = ((int)r * (int)(scale) ) >> 8;
+    g = ((int)g * (int)(scale) ) >> 8;
+    b = ((int)b * (int)(scale) ) >> 8;
+#elif SCALE8_AVRASM == 1
+    r = scale8_LEAVING_R1_DIRTY(r, scale);
+    g = scale8_LEAVING_R1_DIRTY(g, scale);
+    b = scale8_LEAVING_R1_DIRTY(b, scale);
+    cleanup_R1();
+#else
+#error "No implementation for nscale8x3 available."
+#endif
+}
+
+/// scale three one byte values by a fourth one, which is treated as
+///         the numerator of a fraction whose denominator is 256
+///         In other words, it computes r,g,b * (scale / 256), ensuring
+/// that non-zero values passed in remain non zero, no matter how low the scale
+/// argument.
+///
+///         THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENTS IN PLACE
+LIB8STATIC void nscale8x3_video( uint8_t& r, uint8_t& g, uint8_t& b, fract8 scale)
+{
+#if SCALE8_C == 1
+    uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    r = (r == 0) ? 0 : (((int)r * (int)(scale) ) >> 8) + nonzeroscale;
+    g = (g == 0) ? 0 : (((int)g * (int)(scale) ) >> 8) + nonzeroscale;
+    b = (b == 0) ? 0 : (((int)b * (int)(scale) ) >> 8) + nonzeroscale;
+#elif SCALE8_AVRASM == 1
+    nscale8_video_LEAVING_R1_DIRTY( r, scale);
+    nscale8_video_LEAVING_R1_DIRTY( g, scale);
+    nscale8_video_LEAVING_R1_DIRTY( b, scale);
+    cleanup_R1();
+#else
+#error "No implementation for nscale8x3 available."
+#endif
+}
+
+///  scale two one byte values by a third one, which is treated as
+///         the numerator of a fraction whose denominator is 256
+///         In other words, it computes i,j * (scale / 256)
+///
+///         THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENTS IN PLACE
+
+LIB8STATIC void nscale8x2( uint8_t& i, uint8_t& j, fract8 scale)
+{
+#if SCALE8_C == 1
+    i = ((int)i * (int)(scale) ) >> 8;
+    j = ((int)j * (int)(scale) ) >> 8;
+#elif SCALE8_AVRASM == 1
+    i = scale8_LEAVING_R1_DIRTY(i, scale);
+    j = scale8_LEAVING_R1_DIRTY(j, scale);
+    cleanup_R1();
+#else
+#error "No implementation for nscale8x2 available."
+#endif
+}
+
+///  scale two one byte values by a third one, which is treated as
+///         the numerator of a fraction whose denominator is 256
+///         In other words, it computes i,j * (scale / 256), ensuring
+/// that non-zero values passed in remain non zero, no matter how low the scale
+/// argument.
+///
+///         THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENTS IN PLACE
+
+
+LIB8STATIC void nscale8x2_video( uint8_t& i, uint8_t& j, fract8 scale)
+{
+#if SCALE8_C == 1
+    uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    i = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+    j = (j == 0) ? 0 : (((int)j * (int)(scale) ) >> 8) + nonzeroscale;
+#elif SCALE8_AVRASM == 1
+    nscale8_video_LEAVING_R1_DIRTY( i, scale);
+    nscale8_video_LEAVING_R1_DIRTY( j, scale);
+    cleanup_R1();
+#else
+#error "No implementation for nscale8x2 available."
+#endif
+}
+
+
+/// scale a 16-bit unsigned value by an 8-bit value,
+///         considered as numerator of a fraction whose denominator
+///         is 256. In other words, it computes i * (scale / 256)
+
+LIB8STATIC uint16_t scale16by8( uint16_t i, fract8 scale )
+{
+#if SCALE16BY8_C == 1
+    uint16_t result;
+    result = (i * scale) / 256;
+    return result;
+#elif SCALE16BY8_AVRASM == 1
+    uint16_t result = 0;
+    asm volatile(
+         // result.A = HighByte(i.A x j )
+         "  mul %A[i], %[scale]                 \n\t"
+         "  mov %A[result], r1                  \n\t"
+         //"  clr %B[result]                      \n\t"
+
+         // result.A-B += i.B x j
+         "  mul %B[i], %[scale]                 \n\t"
+         "  add %A[result], r0                  \n\t"
+         "  adc %B[result], r1                  \n\t"
+
+         // cleanup r1
+         "  clr __zero_reg__                    \n\t"
+
+         : [result] "+r" (result)
+         : [i] "r" (i), [scale] "r" (scale)
+         : "r0", "r1"
+         );
+    return result;
+#else
+    #error "No implementation for scale16by8 available."
+#endif
+}
+
+/// scale a 16-bit unsigned value by a 16-bit value,
+///         considered as numerator of a fraction whose denominator
+///         is 65536. In other words, it computes i * (scale / 65536)
+
+LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
+{
+  #if SCALE16_C == 1
+    uint16_t result;
+    result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536;
+    return result;
+#elif SCALE16_AVRASM == 1
+    uint32_t result = 0;
+    const uint8_t  zero = 0;
+    asm volatile(
+                 // result.A-B  = i.A x scale.A
+                 "  mul %A[i], %A[scale]                 \n\t"
+                 //  save results...
+                 // basic idea:
+                 //"  mov %A[result], r0                 \n\t"
+                 //"  mov %B[result], r1                 \n\t"
+                 // which can be written as...
+                 "  movw %A[result], r0                   \n\t"
+                 // We don't actually need to do anything with r0,
+                 // as result.A is never used again here, so we
+                 // could just move the high byte, but movw is
+                 // one clock cycle, just like mov, so might as
+                 // well, in case we want to use this code for
+                 // a generic 16x16 multiply somewhere.
+
+                 // result.C-D  = i.B x scale.B
+                 "  mul %B[i], %B[scale]                 \n\t"
+                 //"  mov %C[result], r0                 \n\t"
+                 //"  mov %D[result], r1                 \n\t"
+                 "  movw %C[result], r0                   \n\t"
+
+                 // result.B-D += i.B x scale.A
+                 "  mul %B[i], %A[scale]                 \n\t"
+
+                 "  add %B[result], r0                   \n\t"
+                 "  adc %C[result], r1                   \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+
+                 // result.B-D += i.A x scale.B
+                 "  mul %A[i], %B[scale]                 \n\t"
+
+                 "  add %B[result], r0                   \n\t"
+                 "  adc %C[result], r1                   \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+
+                 // cleanup r1
+                 "  clr r1                               \n\t"
+
+                 : [result] "+r" (result)
+                 : [i] "r" (i),
+                   [scale] "r" (scale),
+                   [zero] "r" (zero)
+                 : "r0", "r1"
+                 );
+    result = result >> 16;
+    return result;
+#else
+    #error "No implementation for scale16 available."
+#endif
+}
+///@}
+
+///@defgroup Dimming Dimming and brightening functions
+///
+/// Dimming and brightening functions
+///
+/// The eye does not respond in a linear way to light.
+/// High speed PWM'd LEDs at 50% duty cycle appear far
+/// brighter than the 'half as bright' you might expect.
+///
+/// If you want your midpoint brightness level (128) to
+/// appear half as bright as 'full' brightness (255), you
+/// have to apply a 'dimming function'.
+///@{
+
+/// Adjust a scaling value for dimming
+LIB8STATIC uint8_t dim8_raw( uint8_t x)
+{
+    return scale8( x, x);
+}
+
+/// Adjust a scaling value for dimming for video (a non-zero value will never dim to zero)
+LIB8STATIC uint8_t dim8_video( uint8_t x)
+{
+    return scale8_video( x, x);
+}
+
+/// Linear version of the dimming function that halves for values < 128
+LIB8STATIC uint8_t dim8_lin( uint8_t x )
+{
+    if( x & 0x80 ) {
+        x = scale8( x, x);
+    } else {
+        x += 1;
+        x /= 2;
+    }
+    return x;
+}
+
+/// inverse of the dimming function, brighten a value
+LIB8STATIC uint8_t brighten8_raw( uint8_t x)
+{
+    uint8_t ix = 255 - x;
+    return 255 - scale8( ix, ix);
+}
+
+/// inverse of the dimming function, brighten a value
+LIB8STATIC uint8_t brighten8_video( uint8_t x)
+{
+    uint8_t ix = 255 - x;
+    return 255 - scale8_video( ix, ix);
+}
+
+/// inverse of the dimming function, brighten a value
+LIB8STATIC uint8_t brighten8_lin( uint8_t x )
+{
+    uint8_t ix = 255 - x;
+    if( ix & 0x80 ) {
+        ix = scale8( ix, ix);
+    } else {
+        ix += 1;
+        ix /= 2;
+    }
+    return 255 - ix;
+}
+
+///@}
+#endif
diff --git a/lib8tion/trig8.h b/lib8tion/trig8.h
new file mode 100644
index 00000000..f9e483ae
--- /dev/null
+++ b/lib8tion/trig8.h
@@ -0,0 +1,267 @@
+#ifndef __INC_LIB8TION_TRIG_H
+#define __INC_LIB8TION_TRIG_H
+
+///@ingroup lib8tion
+
+///@defgroup Trig Fast trig functions
+///@{
+
+// sin16 & cos16:
+//        Fast 16-bit approximations of sin(x) & cos(x).
+//        Input angle is an unsigned int from 0-65535.
+//        Output is signed int from -32767 to 32767.
+//
+//        This approximation never varies more than 0.69%
+//        from the floating point value you'd get by doing
+//          float s = sin( x ) * 32767.0;
+//
+//        Don't use this approximation for calculating the
+//        trajectory of a rocket to Mars, but it's great
+//        for art projects and LED displays.
+//
+//        On Arduino/AVR, this approximation is more than
+//        10X faster than floating point sin(x) and cos(x)
+// sin16 resolves to sin16_avr on AVR builds and to sin16_C everywhere else.
+#if defined(__AVR__)
+#define sin16 sin16_avr
+#else
+#define sin16 sin16_C
+#endif
+
+/// Fast 16-bit approximation of sin(x), AVR-oriented variant. This approximation never
+/// varies more than 0.69% from the floating point value you'd get by doing
+///
+///     float s = sin(x) * 32767.0;
+///
+/// @param theta input angle from 0-65535
+/// @returns sin of theta, value between -32767 to 32767.
+LIB8STATIC int16_t sin16_avr( uint16_t theta )
+{
+    static const uint8_t data[] =   // 4 bytes per section: base low, base high, slope, unused
+    { 0,         0,         49, 0, 6393%256,   6393/256, 48, 0,
+      12539%256, 12539/256, 44, 0, 18204%256, 18204/256, 38, 0,
+      23170%256, 23170/256, 31, 0, 27245%256, 27245/256, 23, 0,
+      30273%256, 30273/256, 14, 0, 32137%256, 32137/256,  4 /*,0*/ };
+    // Position within the quadrant: the low 14 bits of theta.
+    uint16_t offset = (theta & 0x3FFF);
+
+    // AVR doesn't have a multi-bit shift instruction,
+    // so if we say "offset >>= 3", gcc makes a tiny loop.
+    // Inserting empty volatile statements between each
+    // bit shift forces gcc to unroll the loop.
+    offset >>= 1; // 0..8191
+    asm volatile("");
+    offset >>= 1; // 0..4095
+    asm volatile("");
+    offset >>= 1; // 0..2047
+    // Bit 14 set: mirror the position so the curve is walked backwards.
+    if( theta & 0x4000 ) offset = 2047 - offset;
+    // Byte index into data: section (offset / 256, 0..7) times 4 bytes per section.
+    uint8_t sectionX4;
+    sectionX4 = offset / 256;
+    sectionX4 *= 4;
+
+    uint8_t m;   // slope of the selected section
+    // Byte overlay for assembling the 16-bit base; assumes blo is b's low-order byte.
+    union {
+        uint16_t b;
+        struct {
+            uint8_t blo;
+            uint8_t bhi;
+        };
+    } u;
+
+    //in effect u.b = blo + (256 * bhi);
+    u.blo = data[ sectionX4 ];
+    u.bhi = data[ sectionX4 + 1];
+    m     = data[ sectionX4 + 2];
+    // Low byte of offset is the position within the section; halved to 0..127.
+    uint8_t secoffset8 = (uint8_t)(offset) / 2;
+    // Linear interpolation within the section: base + slope * position.
+    uint16_t mx = m * secoffset8;
+
+    int16_t  y  = mx + u.b;
+    if( theta & 0x8000 ) y = -y;   // bit 15 set: second half of the cycle is negative
+
+    return y;
+}
+
+/// Portable C implementation of the fast 16-bit sine approximation.
+/// The result never varies more than 0.69% from the floating point value of
+///
+///     float s = sin(x) * 32767.0;
+///
+/// @param theta input angle from 0-65535
+/// @returns sin of theta, value between -32767 to 32767.
+LIB8STATIC int16_t sin16_C( uint16_t theta ) {
+    // Eight linear segments per quarter wave: starting value and slope of each.
+    static const uint16_t segment_base[]  = { 0, 6393, 12539, 18204, 23170, 27245, 30273, 32137 };
+    static const uint8_t  segment_slope[] = { 49, 48, 44, 38, 31, 23, 14, 4 };
+
+    const uint8_t mirrored = (theta & 0x4000) != 0;   // bit 14: quadrant is walked backwards
+    const uint8_t negative = (theta & 0x8000) != 0;   // bit 15: second half of the cycle
+
+    // Position inside the quadrant, 0..2047, reversed for mirrored quadrants.
+    uint16_t pos = (theta >> 3) & 0x07FF;
+    if( mirrored ) {
+        pos = 2047 - pos;
+    }
+
+    const uint8_t segment = pos >> 8;            // 0..7
+    const uint8_t step    = (pos & 0xFF) >> 1;   // 0..127 within the segment
+
+    // Linear interpolation inside the selected segment; at most 32645.
+    const uint16_t magnitude = segment_base[segment] + segment_slope[segment] * step;
+    const int16_t  value     = (int16_t)magnitude;
+
+    return negative ? (int16_t)-value : value;
+}
+
+
+/// Fast 16-bit approximation of cos(x), computed as sin16 of the angle advanced by
+/// a quarter cycle. Never varies more than 0.69% from the floating point value of
+///
+///     float s = cos(x) * 32767.0;
+///
+/// @param theta input angle from 0-65535
+/// @returns cos of theta, value between -32767 to 32767.
+LIB8STATIC int16_t cos16( uint16_t theta) {
+    const uint16_t quarter_ahead = theta + 16384;   // wraps modulo 65536
+    return sin16( quarter_ahead);
+}
+
+///////////////////////////////////////////////////////////////////////
+
+// sin8 & cos8
+//        Fast 8-bit approximations of sin(x) & cos(x).
+//        Input angle is an unsigned int from 0-255.
+//        Output is an unsigned int from 0 to 255.
+//
+//        This approximation can vary up to 2%
+//        from the floating point value you'd get by doing
+//          float s = (sin( x ) * 128.0) + 128;
+//
+//        Don't use this approximation for "real"
+//        trigonometric calculations, but it's great
+//        for art projects and LED displays.
+//
+//        On Arduino/AVR, this approximation is more than
+//        20X faster than floating point sin(x) and cos(x)
+// sin8 resolves to sin8_avr on AVR builds without LIB8_ATTINY, and to sin8_C otherwise.
+#if defined(__AVR__) && !defined(LIB8_ATTINY)
+#define sin8 sin8_avr
+#else
+#define sin8 sin8_C
+#endif
+
+/// Interleaved (base, slope*16) pairs, one pair per section of the sin8 quarter wave.
+static const uint8_t b_m16_interleave[] = { 0, 49, 49, 41, 90, 27, 117, 10 };
+
+/// Fast 8-bit approximation of sin(x), AVR inline-assembly version. This approximation
+/// never varies more than 2% from the floating point value you'd get by doing
+///
+///     float s = (sin(x) * 128.0) + 128;
+///
+/// @param theta input angle from 0-255
+/// @returns sin of theta, value between 0 and 255
+LIB8STATIC uint8_t  sin8_avr( uint8_t theta)
+{
+    uint8_t offset = theta;
+    // Odd quadrant (bit 6 of theta set): complement offset to mirror the position.
+    asm volatile(
+                 "sbrc %[theta],6         \n\t"
+                 "com  %[offset]           \n\t"
+                 : [theta] "+r" (theta), [offset] "+r" (offset)
+                 );
+
+    offset &= 0x3F; // 0..63
+    // Step within the section; mirrored quadrants use one step more (1..16).
+    uint8_t secoffset  = offset & 0x0F; // 0..15
+    if( theta & 0x40) secoffset++;
+
+    uint8_t m16; uint8_t b;
+
+    uint8_t section = offset >> 4; // 0..3
+    uint8_t s2 = section * 2;
+    // b_m16_interleave holds (base, slope*16) pairs, one pair per section.
+    const uint8_t* p = b_m16_interleave;
+    p += s2;
+    b   = *p;
+    p++;
+    m16 = *p;
+    // mx = (m16 * secoffset) >> 4: swap the nibbles of both product bytes and merge them.
+    uint8_t mx;
+    uint8_t xr1;
+    asm volatile(
+                 "mul %[m16],%[secoffset]   \n\t"
+                 "mov %[mx],r0              \n\t"
+                 "mov %[xr1],r1             \n\t"
+                 "eor  r1, r1               \n\t"   // mul wrote r1; avr-gcc expects it to be zero
+                 "swap %[mx]                \n\t"
+                 "andi %[mx],0x0F           \n\t"
+                 "swap %[xr1]               \n\t"
+                 "andi %[xr1], 0xF0         \n\t"
+                 "or   %[mx], %[xr1]        \n\t"
+                 : [mx] "=d" (mx), [xr1] "=d" (xr1)   // "d" (r16-r31): andi rejects r0-r15
+                 : [m16] "r" (m16), [secoffset] "r" (secoffset)
+                 );
+
+    int8_t y = mx + b;
+    if( theta & 0x80 ) y = -y;
+
+    y += 128;
+
+    return y;
+}
+
+
+/// Fast 8-bit approximation of sin(x), portable C version. This approximation
+/// never varies more than 2% from the floating point value you'd get by doing
+///
+///     float s = (sin(x) * 128.0) + 128;
+///
+/// @param theta input angle from 0-255
+/// @returns sin of theta, value between 0 and 255
+LIB8STATIC uint8_t sin8_C( uint8_t theta)
+{
+    // Position within the quadrant; odd quadrants (bit 6 set) are mirrored.
+    uint8_t offset = theta;
+    if( theta & 0x40 ) {
+        offset = (uint8_t)255 - offset;
+    }
+    offset &= 0x3F; // 0..63
+
+    // Step within the 16-wide section; mirrored quadrants use one step
+    // more (1..16 instead of 0..15).
+    uint8_t secoffset  = offset & 0x0F; // 0..15
+    if( theta & 0x40) secoffset++;
+
+    // Table holds (base, slope*16) pairs, one pair per section.
+    uint8_t section = offset >> 4; // 0..3
+    const uint8_t* p = b_m16_interleave + section * 2;
+    uint8_t b   = p[0];
+    uint8_t m16 = p[1];
+
+    uint8_t mx = (m16 * secoffset) >> 4;
+    // Magnitude is 0..127. Keep the signed value in an int and add the bias
+    // there: storing y + 128 back into an int8_t would rely on an
+    // implementation-defined out-of-range conversion.
+    int y = mx + b;
+    if( theta & 0x80 ) y = -y;
+    return (uint8_t)(y + 128);
+}
+
+/// Fast 8-bit approximation of cos(x), computed as sin8 of the angle advanced by
+/// a quarter cycle. Never varies more than 2% from the floating point value of
+///
+///     float s = (cos(x) * 128.0) + 128;
+///
+/// @param theta input angle from 0-255
+/// @returns cos of theta, value between 0 and 255
+LIB8STATIC uint8_t cos8( uint8_t theta) {
+    const uint8_t quarter_ahead = theta + 64;   // wraps modulo 256
+    return sin8( quarter_ahead);
+}
+
+///@}
+#endif