Merge branch 'FastSPI_LED2'

author: Daniel Garcia <danielgarcia@gmail.com> 2013-11-11 02:54:41 +0400
committer: Daniel Garcia <danielgarcia@gmail.com> 2013-11-11 02:54:41 +0400
commit: 6bcfa714588b12a72bdde36a1f0a43871fd5d567 (patch)
tree: 74c4fadde71b107f9a823928602141d673c0d9b5
parent: e325d5d3f934aed2b301c224352b41a1d07e3693 (diff)
parent: 59edcab79837185feeea2dfe6f46b2c4ad17b8d8 (diff)
22 files changed, 5636 insertions, 0 deletions
diff --git a/FastLED.cpp b/FastLED.cpp
new file mode 100644
index 00000000..85095210
--- /dev/null
+++ b/FastLED.cpp
@@ -0,0 +1,79 @@
+#include "FastSPI_LED2.h"
+
+
+CFastLED LEDS;						// the global CFastLED instance
+CFastLED & FastSPI_LED = LEDS;		// alias of LEDS
+CFastLED & FastSPI_LED2 = LEDS;		// alias of LEDS
+CFastLED & FastLED = LEDS;			// alias of LEDS
+
+uint32_t CRGB::Squant = ((uint32_t)((__TIME__[4]-'0') * 28))<<16 | ((uint32_t)((__TIME__[6]-'0')*50))<<8 | (uint32_t)((__TIME__[7]-'0')*28);	// built from three digits of the compile time; every term is widened before shifting because int is only 16 bits on AVR
+
+CFastLED::CFastLED() : m_nControllers(NUM_CONTROLLERS), m_nScale(255) {
+	// The initializer list sets the slot count and the default brightness (255).
+	// Zero the whole slot table so every entry starts with a NULL controller
+	// pointer, which addLeds/show/showColor treat as an unused slot.
+	memset8(m_Controllers, 0, sizeof(m_Controllers));
+}
+
+CLEDController *CFastLED::addLeds(CLEDController *pLed, 
+									   const struct CRGB *data, 
+									   int nLedsOrOffset, int nLedsIfOffset) { 
+	// Registers pLed as the driver for a window of the caller's LED array and initializes it.
+	//   pLed           controller to register; NULL (e.g. a failed allocation) is rejected
+	//   data           start of the caller's CRGB array
+	//   nLedsOrOffset  LED count, or the window's start offset when nLedsIfOffset > 0
+	//   nLedsIfOffset  LED count when an offset is given; 0 (or less) means "no offset"
+	// Returns pLed on success, NULL if pLed is NULL or every controller slot is in use.
+	if(pLed == NULL) {
+		return NULL;
+	}
+
+	const bool hasOffset = (nLedsIfOffset > 0);
+
+	// Claim the first free slot; a NULL pLedController marks a slot as unused.
+	for(int i = 0; i < m_nControllers; i++) { 
+		CControllerInfo & slot = m_Controllers[i];
+		if(slot.pLedController != NULL) { continue; }
+		slot.pLedController = pLed;
+		slot.pLedData = data;
+		slot.nOffset = hasOffset ? nLedsOrOffset : 0;
+		slot.nLeds = hasOffset ? nLedsIfOffset : nLedsOrOffset;
+		pLed->init();
+		return pLed;
+	}
+	return NULL;	// table full: pLed was not registered
+}
+
+void CFastLED::show(uint8_t scale) {
+	// Send every registered controller its window of LED data at the given brightness
+	// scale.  Slots are filled front to back, so the first empty one ends the walk.
+	for(int idx = 0; idx < m_nControllers; ++idx) {
+		const CControllerInfo & entry = m_Controllers[idx];
+		if(entry.pLedController == NULL) { return; }
+
+		entry.pLedController->show(entry.pLedData + entry.nOffset, entry.nLeds, scale);
+	}
+}
+
+void CFastLED::showColor(const struct CRGB & color, uint8_t scale) {
+	// Paint each registered controller's LEDs in one solid color at the given scale;
+	// the walk stops at the first unused slot.
+	for(int idx = 0; idx < m_nControllers; ++idx) {
+		CLEDController *pController = m_Controllers[idx].pLedController;
+		if(pController == NULL) { return; }
+		pController->showColor(color, m_Controllers[idx].nLeds, scale);
+	}
+}
+
+void CFastLED::clear(boolean includeLedData) { 
+	// Blank the strips (black, scale 0); if includeLedData, also zero the caller's CRGB buffers.
+	showColor(CRGB(0,0,0), 0);
+	if(!includeLedData) { return; }
+	// Each controller drives nLeds entries starting at pLedData + nOffset (the window show()
+	// transmits), so that exact window is zeroed rather than the first nLeds entries.
+	for(int i = 0; i < m_nControllers; i++) { 
+		const CControllerInfo & info = m_Controllers[i];
+		if(info.pLedData == NULL) { return; }	// first slot without data ends the walk
+		memset8((void*)(info.pLedData + info.nOffset), 0, sizeof(struct CRGB) * info.nLeds);
+	}
+}
diff --git a/FastLED.h b/FastLED.h
new file mode 100644
index 00000000..a2891fcd
--- /dev/null
+++ b/FastLED.h
@@ -0,0 +1,147 @@
+#ifndef __INC_FASTSPI_LED2_H
+#define __INC_FASTSPI_LED2_H
+
+#include "controller.h"
+#include "fastpin.h"
+#include "fastspi.h"
+#include "clockless.h"
+#include "lib8tion.h"
+#include "hsv2rgb.h"
+#include "chipsets.h"
+#include "dmx.h"
+
+enum ESPIChipsets {	// clocked (data pin + clock pin) chipsets selectable through addLeds<>()
+	LPD8806,			// instantiates LPD8806Controller
+	WS2801,				// instantiates WS2801Controller
+	SM16716				// instantiates SM16716Controller
+};
+
+enum EClocklessChipsets {	// single-data-pin chipsets selectable through addLeds<>()
+	DMX,			// DMXController; only handled when FASTSPI_USE_DMX_SIMPLE is defined
+	TM1809,			// TM1809Controller800Khz
+	TM1804,			// shares the TM1809 case in addLeds<CHIPSET, DATA_PIN>()
+	TM1803,			// TM1803Controller400Khz
+	WS2811,			// WS2811Controller800Khz
+	WS2812,			// shares the WS2811 case
+	WS2812B,		// shares the WS2811 case
+	WS2811_400,		// WS2811Controller400Khz
+	NEOPIXEL,		// shares the WS2811 case
+	UCS1903			// UCS1903Controller400Khz
+};
+
+#define NUM_CONTROLLERS 8
+
+/// Library front end: a fixed table of NUM_CONTROLLERS controller slots plus a global brightness.
+class CFastLED {
+	struct CControllerInfo { 
+		CLEDController *pLedController;	// NULL marks an unused slot
+		const struct CRGB *pLedData;	// start of the caller's LED array
+		int nLeds;						// number of LEDs this controller drives
+		int nOffset;					// index in pLedData of the first LED driven
+	};
+
+	CControllerInfo	m_Controllers[NUM_CONTROLLERS];
+	int m_nControllers;		// slot count (set to NUM_CONTROLLERS), not the number in use
+	uint8_t m_nScale;		// brightness used by the scale-less show()/showColor()
+
+public:
+	CFastLED();
+	/// Register pLed for nLedsOrOffset LEDs of data or, when nLedsIfOffset > 0, for nLedsIfOffset LEDs
+	/// starting at index nLedsOrOffset.  Returns pLed, or NULL when no controller slot is free.
+	CLEDController *addLeds(CLEDController *pLed, const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0);
+
+	/// Clocked chipsets: allocate the controller for CHIPSET on DATA_PIN/CLOCK_PIN and register it (3 overloads).
+	template<ESPIChipsets CHIPSET,  uint8_t DATA_PIN, uint8_t CLOCK_PIN > CLEDController *addLeds(const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0) { 
+		switch(CHIPSET) { 
+			case LPD8806: return addLeds(new LPD8806Controller<DATA_PIN, CLOCK_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+			case WS2801: return addLeds(new WS2801Controller<DATA_PIN, CLOCK_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+			case SM16716: return addLeds(new SM16716Controller<DATA_PIN, CLOCK_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+		}
+	}
+
+	template<ESPIChipsets CHIPSET,  uint8_t DATA_PIN, uint8_t CLOCK_PIN, EOrder RGB_ORDER > CLEDController *addLeds(const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0) { 
+		switch(CHIPSET) { 
+			case LPD8806: return addLeds(new LPD8806Controller<DATA_PIN, CLOCK_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+			case WS2801: return addLeds(new WS2801Controller<DATA_PIN, CLOCK_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+			case SM16716: return addLeds(new SM16716Controller<DATA_PIN, CLOCK_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+		}
+	}
+
+	template<ESPIChipsets CHIPSET,  uint8_t DATA_PIN, uint8_t CLOCK_PIN, EOrder RGB_ORDER, uint8_t SPI_DATA_RATE > CLEDController *addLeds(const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0) { 
+		switch(CHIPSET) { 
+			case LPD8806: return addLeds(new LPD8806Controller<DATA_PIN, CLOCK_PIN, RGB_ORDER, SPI_DATA_RATE>(), data, nLedsOrOffset, nLedsIfOffset);
+			case WS2801: return addLeds(new WS2801Controller<DATA_PIN, CLOCK_PIN, RGB_ORDER, SPI_DATA_RATE>(), data, nLedsOrOffset, nLedsIfOffset);
+			case SM16716: return addLeds(new SM16716Controller<DATA_PIN, CLOCK_PIN, RGB_ORDER, SPI_DATA_RATE>(), data, nLedsOrOffset, nLedsIfOffset);
+		}
+	}
+
+#ifdef SPI_DATA
+	/// Same three overloads on the SPI_DATA / SPI_CLOCK pins; available only when SPI_DATA is defined.
+	template<ESPIChipsets CHIPSET> CLEDController *addLeds(const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0) { 
+		return addLeds<CHIPSET, SPI_DATA, SPI_CLOCK, RGB>(data, nLedsOrOffset, nLedsIfOffset);
+	}	
+	template<ESPIChipsets CHIPSET, EOrder RGB_ORDER> CLEDController *addLeds(const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0) { 
+		return addLeds<CHIPSET, SPI_DATA, SPI_CLOCK, RGB_ORDER>(data, nLedsOrOffset, nLedsIfOffset);
+	}	
+	template<ESPIChipsets CHIPSET, EOrder RGB_ORDER, uint8_t SPI_DATA_RATE> CLEDController *addLeds(const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0) { 
+		return addLeds<CHIPSET, SPI_DATA, SPI_CLOCK, RGB_ORDER, SPI_DATA_RATE>(data, nLedsOrOffset, nLedsIfOffset);
+	}	
+#endif
+
+	/// Clockless chipsets: allocate the controller for CHIPSET on DATA_PIN (default byte order) and register it.
+	template<EClocklessChipsets CHIPSET, uint8_t DATA_PIN> CLEDController *addLeds(const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0) {
+		switch(CHIPSET) { 
+#ifdef FASTSPI_USE_DMX_SIMPLE
+			case DMX: return addLeds(new DMXController<DATA_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+#endif
+			case TM1804:
+			case TM1809: return addLeds(new TM1809Controller800Khz<DATA_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+			case TM1803: return addLeds(new TM1803Controller400Khz<DATA_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+			case UCS1903: return addLeds(new UCS1903Controller400Khz<DATA_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+			case WS2812: 
+			case WS2812B:
+			case NEOPIXEL:
+			case WS2811: return addLeds(new WS2811Controller800Khz<DATA_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+			case WS2811_400: return addLeds(new WS2811Controller400Khz<DATA_PIN>(), data, nLedsOrOffset, nLedsIfOffset);
+		}
+		return 0;	// no case compiled in for CHIPSET (e.g. DMX without FASTSPI_USE_DMX_SIMPLE): nothing registered
+	}
+
+	/// As above with an explicit RGB_ORDER; TM1804 shares the TM1809 controller here as well.
+	template<EClocklessChipsets CHIPSET, uint8_t DATA_PIN, EOrder RGB_ORDER> CLEDController *addLeds(const struct CRGB *data, int nLedsOrOffset, int nLedsIfOffset = 0) {
+		switch(CHIPSET) { 
+#ifdef FASTSPI_USE_DMX_SIMPLE
+			case DMX: return addLeds(new DMXController<DATA_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+#endif
+			case TM1804:
+			case TM1809: return addLeds(new TM1809Controller800Khz<DATA_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+			case TM1803: return addLeds(new TM1803Controller400Khz<DATA_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+			case UCS1903: return addLeds(new UCS1903Controller400Khz<DATA_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+			case WS2812: 
+			case WS2812B:
+			case NEOPIXEL:
+			case WS2811: return addLeds(new WS2811Controller800Khz<DATA_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+			case WS2811_400: return addLeds(new WS2811Controller400Khz<DATA_PIN, RGB_ORDER>(), data, nLedsOrOffset, nLedsIfOffset);
+		}
+		return 0;	// no case compiled in for CHIPSET (e.g. DMX without FASTSPI_USE_DMX_SIMPLE): nothing registered
+	}
+
+	void setBrightness(uint8_t scale) { m_nScale = scale; }	// sets the scale used by show() / showColor(color)
+	uint8_t getBrightness() { return m_nScale; }
+
+	/// Update all our controllers with the current led colors, using the passed in brightness
+	void show(uint8_t scale);
+
+	/// Update all our controllers with the current led colors
+	void show() { show(m_nScale); }
+	void clear(boolean includeLedData = true);	// shows black at scale 0; optionally zeroes the registered LED data too
+	void showColor(const struct CRGB & color, uint8_t scale);	// one solid color on every controller, at the given scale
+	void showColor(const struct CRGB & color) { showColor(color, m_nScale); }	// same, at the stored brightness
+};
+
+extern CFastLED & FastSPI_LED;		// reference declared for the CFastLED object
+extern CFastLED & FastSPI_LED2;		// reference declared for the CFastLED object
+extern CFastLED & FastLED;			// reference declared for the CFastLED object
+extern CFastLED LEDS;				// the CFastLED object itself
+
+#endif
diff --git a/chipsets.h b/chipsets.h
new file mode 100644
index 00000000..5688ed48
--- /dev/null
+++ b/chipsets.h
@@ -0,0 +1,262 @@
+#ifndef __INC_CHIPSETS_H
+#define __INC_CHIPSETS_H
+
+#include "pixeltypes.h"
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// LPD8806 controller class - takes data/clock/select pin values (N.B. should take an SPI definition?)
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <uint8_t DATA_PIN, uint8_t CLOCK_PIN, EOrder RGB_ORDER = RGB,  uint8_t SPI_SPEED = DATA_RATE_MHZ(24) >
+class LPD8806Controller : public CLEDController {
+	typedef SPIOutput<DATA_PIN, CLOCK_PIN, SPI_SPEED> SPI;
+	// Byte-adjustment policy that show() hands to writeBytes3 as a template argument.
+	class LPD8806_ADJUST {
+	public:
+		// LPD8806 spec wants the high bit of every rgb data byte sent out to be set, so each byte becomes (value >> 1) | 0x80.
+		__attribute__((always_inline)) inline static uint8_t adjust(register uint8_t data) { return (data>>1) | 0x80; }
+		__attribute__((always_inline)) inline static uint8_t adjust(register uint8_t data, register uint8_t scale) { return (scale8(data, scale)>>1) | 0x80; }
+		__attribute__((always_inline)) inline static void postBlock(int len) { 	// emits (len+63)>>6 zero bytes
+			SPI::writeBytesValueRaw(0, ((len+63)>>6));
+		}
+
+	};
+
+	SPI mSPI;
+	int mClearedLeds;	// largest nLeds handed to clearLine() so far; reset to 0 by init()
+
+	void checkClear(int nLeds) { 	// calls clearLine() only when nLeds exceeds mClearedLeds
+		if(nLeds > mClearedLeds) { 
+			clearLine(nLeds);
+			mClearedLeds = nLeds;
+		}
+	}
+	
+	void clearLine(int nLeds) { 	// writes (nLeds+63)>>6 zero bytes
+		int n = ((nLeds  + 63) >> 6);
+		mSPI.writeBytesValue(0, n);
+	}
+public:
+	LPD8806Controller()  {}
+	virtual void init() {
+		mSPI.init();
+		mClearedLeds = 0;
+	}
+	// Turn nLeds LEDs off: 3 bytes of 0x80 per LED, followed by zero bytes.
+	virtual void clearLeds(int nLeds) { 
+		mSPI.select();
+		mSPI.writeBytesValueRaw(0x80, nLeds * 3);	// 0x80 is a zero value with the required high bit set
+		mSPI.writeBytesValueRaw(0, ((nLeds*3+63)>>6));
+		mSPI.release();
+	}
+	// Send one color (scaled, reordered per RGB_ORDER, high bit forced on) to nLeds LEDs.
+	virtual void showColor(const struct CRGB & data, int nLeds, uint8_t scale = 255) {
+		mSPI.select();
+		uint8_t a = 0x80 | (scale8(data[RGB_BYTE0(RGB_ORDER)], scale) >> 1);
+		uint8_t b = 0x80 | (scale8(data[RGB_BYTE1(RGB_ORDER)], scale) >> 1);
+		uint8_t c = 0x80 | (scale8(data[RGB_BYTE2(RGB_ORDER)], scale) >> 1);
+		int iLeds = 0;
+
+		while(iLeds++ < nLeds) { 
+			mSPI.writeByte(a);
+			mSPI.writeByte(b);
+			mSPI.writeByte(c);
+		}
+
+		// latch in the world
+		mSPI.writeBytesValueRaw(0, ((nLeds*3+63)>>6));
+		mSPI.release();
+	}
+	// Send nLeds CRGB entries (nLeds * 3 bytes) through writeBytes3 with the LPD8806_ADJUST policy.
+	virtual void show(const struct CRGB *data, int nLeds, uint8_t scale = 255) {
+		mSPI.template writeBytes3<LPD8806_ADJUST, RGB_ORDER>((byte*)data, nLeds * 3, scale);
+	}
+
+#ifdef SUPPORT_ARGB
+	virtual void show(const struct CARGB *data, int nLeds, uint8_t scale) {	// 4 bytes per LED; runs checkClear() first
+		checkClear(nLeds);
+		mSPI.template writeBytes3<1, LPD8806_ADJUST, RGB_ORDER>((byte*)data, nLeds * 4, scale);
+	}
+#endif
+};
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// WS2801 definition - takes data/clock/select pin values (N.B. should take an SPI definition?)
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <uint8_t DATA_PIN, uint8_t CLOCK_PIN, EOrder RGB_ORDER = RGB, uint8_t SPI_SPEED = DATA_RATE_MHZ(1)>
+class WS2801Controller : public CLEDController {
+	typedef SPIOutput<DATA_PIN, CLOCK_PIN, SPI_SPEED> SPI;
+	SPI mSPI;
+	CMinWait<500>  mWaitDelay;	// wait() before and mark() after every write; presumably a minimum gap between frames -- confirm against CMinWait
+public:
+	WS2801Controller() {}
+
+	// Set up the SPI output and mark the wait timer.
+	virtual void init() { 
+		mSPI.init();
+	    mWaitDelay.mark();
+	}
+	// Send nLeds * 3 zero bytes.
+	virtual void clearLeds(int nLeds) { 
+		mWaitDelay.wait();
+		mSPI.writeBytesValue(0, nLeds*3);
+		mWaitDelay.mark();
+	}
+	// Send one color, scaled by scale and reordered per RGB_ORDER, to nLeds LEDs.
+	virtual void showColor(const struct CRGB & data, int nLeds, uint8_t scale = 255) {
+		mWaitDelay.wait();
+		mSPI.select();
+		uint8_t a = scale8(data[RGB_BYTE0(RGB_ORDER)], scale);
+		uint8_t b = scale8(data[RGB_BYTE1(RGB_ORDER)], scale);
+		uint8_t c = scale8(data[RGB_BYTE2(RGB_ORDER)], scale);
+		while(nLeds--) { 
+			mSPI.writeByte(a);
+			mSPI.writeByte(b);
+			mSPI.writeByte(c);
+		}
+		mSPI.waitFully();
+		mSPI.release();
+		mWaitDelay.mark();
+	}
+	// Send nLeds CRGB entries (nLeds * 3 bytes) from data; the default scale matches CLEDController::show.
+	virtual void show(const struct CRGB *data, int nLeds, uint8_t scale = 255) {
+		mWaitDelay.wait();
+		mSPI.template writeBytes3<0, RGB_ORDER>((byte*)data, nLeds * 3, scale);
+		mWaitDelay.mark();
+	}
+#ifdef SUPPORT_ARGB
+	// ARGB overload (4 bytes per LED).  Takes CARGB like the base class and the sibling controllers;
+	// declaring it with CRGB would redeclare the overload above.
+	virtual void show(const struct CARGB *data, int nLeds, uint8_t scale = 255) {
+		mWaitDelay.wait();
+		mSPI.template writeBytes3<1, RGB_ORDER>((byte*)data, nLeds * 4, scale);
+		mWaitDelay.mark();
+	}
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// SM16716 definition - takes data/clock/select pin values (N.B. should take an SPI definition?)
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <uint8_t DATA_PIN, uint8_t CLOCK_PIN, EOrder RGB_ORDER = RGB, uint8_t SPI_SPEED = DATA_RATE_MHZ(16)>
+class SM16716Controller : public CLEDController {
+	typedef SPIOutput<DATA_PIN, CLOCK_PIN, SPI_SPEED> SPI;
+	SPI mSPI;
+
+	void writeHeader() { 
+		// Write out 50 zeros to the spi line (6 blocks of 8 followed by two single bit writes)
+		mSPI.select();
+		mSPI.writeBytesValueRaw(0, 6);
+		mSPI.waitFully();
+		mSPI.template writeBit<0>(0);
+		mSPI.template writeBit<0>(0);
+		mSPI.release();
+	}
+
+public:
+	SM16716Controller() {}
+
+	virtual void init() { 
+		mSPI.init();
+	}
+	// Send a 1 bit followed by three zero bytes for each of nLeds LEDs, then the 50-zero header.
+	virtual void clearLeds(int nLeds) { 
+		mSPI.select();
+		while(nLeds--) { 
+			mSPI.template writeBit<0>(1);
+			mSPI.writeByte(0);
+			mSPI.writeByte(0);
+			mSPI.writeByte(0);
+		}
+		mSPI.waitFully();
+		mSPI.release();
+		writeHeader();
+	}
+	// Send one color (scaled, reordered per RGB_ORDER) to nLeds LEDs, each triplet preceded by a 1 bit.
+	virtual void showColor(const struct CRGB & data, int nLeds, uint8_t scale = 255) {
+		mSPI.select();
+		uint8_t a = scale8(data[RGB_BYTE0(RGB_ORDER)], scale);
+		uint8_t b = scale8(data[RGB_BYTE1(RGB_ORDER)], scale);
+		uint8_t c = scale8(data[RGB_BYTE2(RGB_ORDER)], scale);
+
+		while(nLeds--) { 
+			mSPI.template writeBit<0>(1);
+			mSPI.writeByte(a);
+			mSPI.writeByte(b);
+			mSPI.writeByte(c);
+		}
+		writeHeader();	// NOTE(review): writeHeader() selects and releases by itself, so the release below is a second release -- confirm that is harmless
+		mSPI.release();
+	}
+
+	virtual void show(const struct CRGB *data, int nLeds, uint8_t scale = 255) {
+		// Make sure the FLAG_START_BIT flag is set to ensure that an extra 1 bit is sent at the start
+		// of each triplet of bytes for rgb data
+		// writeHeader();
+		mSPI.template writeBytes3<FLAG_START_BIT, RGB_ORDER>((byte*)data, nLeds * 3, scale);
+		writeHeader();
+	}
+
+#ifdef SUPPORT_ARGB
+	virtual void show(const struct CARGB *data, int nLeds, uint8_t scale = 255) {	// 4 bytes per LED; here the 50 zero bits are written before the data
+		mSPI.writeBytesValue(0, 6);
+		mSPI.template writeBit<0>(0);
+		mSPI.template writeBit<0>(0);
+
+		// Make sure the FLAG_START_BIT flag is set to ensure that an extra 1 bit is sent at the start
+		// of each triplet of bytes for rgb data
+		mSPI.template writeBytes3<1 | FLAG_START_BIT, RGB_ORDER>((byte*)data, nLeds * 4, scale);
+	}
+#endif
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Clockless template instantiations
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// UCS1903 - 500ns, 1500ns, 500ns (passed as T1, T2, T3 of ClocklessController)
+template <uint8_t DATA_PIN, EOrder RGB_ORDER = RGB>
+class UCS1903Controller400Khz : public ClocklessController<DATA_PIN, NS(500), NS(1500), NS(500), RGB_ORDER> {};
+#if NO_TIME(500, 1500, 500) 
+#warning "No enough clock cycles available for the UCS1903"
+#endif
+
+// TM1809 - 350ns, 350ns, 550ns (passed as T1, T2, T3 of ClocklessController)
+template <uint8_t DATA_PIN, EOrder RGB_ORDER = RGB>
+class TM1809Controller800Khz : public ClocklessController<DATA_PIN, NS(350), NS(350), NS(550), RGB_ORDER> {};
+#if NO_TIME(350, 350, 550) 
+#warning "No enough clock cycles available for the TM1809"
+#endif
+
+// WS2811 (800khz) - 400ns, 400ns, 450ns (passed as T1, T2, T3 of ClocklessController)
+template <uint8_t DATA_PIN, EOrder RGB_ORDER = RGB>
+class WS2811Controller800Khz : public ClocklessController<DATA_PIN, NS(400), NS(400), NS(450), RGB_ORDER> {};
+#if NO_TIME(400, 400, 450) 
+#warning "No enough clock cycles available for the WS2811 (800khz)"
+#endif
+
+// WS2811@400khz - 800ns, 800ns, 900ns (passed as T1, T2, T3 of ClocklessController)
+template <uint8_t DATA_PIN, EOrder RGB_ORDER = RGB>
+class WS2811Controller400Khz : public ClocklessController<DATA_PIN, NS(800), NS(800), NS(900), RGB_ORDER> {};
+#if NO_TIME(800, 800, 900) 
+#warning "No enough clock cycles available for the WS2811 (400Khz)"
+#endif
+
+// TM1803 - 750ns, 750ns, 750ns (passed as T1, T2, T3 of ClocklessController)
+template <uint8_t DATA_PIN, EOrder RGB_ORDER = RGB>
+class TM1803Controller400Khz : public ClocklessController<DATA_PIN, NS(750), NS(750), NS(750), RGB_ORDER> {};
+#if NO_TIME(750, 750, 750) 
+#warning "No enough clock cycles available for the TM1803"
+#endif
+
+#endif
diff --git a/clockless.h b/clockless.h
new file mode 100644
index 00000000..238276ef
--- /dev/null
+++ b/clockless.h
@@ -0,0 +1,318 @@
+#ifndef __INC_CLOCKLESS_H
+#define __INC_CLOCKLESS_H
+
+#include "controller.h"
+#include "lib8tion.h"
+#include <avr/interrupt.h> // for cli/se definitions
+
+// Convert nanoseconds to CPU clocks (NS) and CPU clocks to microseconds (CLKS_TO_MICROS).
+// Each macro expands to one fully parenthesized expression so it composes with any neighbouring operator.
+#if F_CPU < 96000000
+#define NS(_NS) ((((_NS) * (F_CPU / 1000000L))) / 1000)
+#define CLKS_TO_MICROS(_CLKS) (((long)(_CLKS)) / (F_CPU / 1000000L))
+#else	// F_CPU >= 96MHz: both conversions are based on F_CPU / 2
+#define NS(_NS) ((((_NS) * (F_CPU / 2000000L))) / 1000)
+#define CLKS_TO_MICROS(_CLKS) (((long)(_CLKS)) / (F_CPU / 2000000L))
+#endif
+
+//  Macro for making sure there's enough time available: true when a phase is under 3 (A, B) or 6 (C) clocks
+#define NO_TIME(A, B, C) (NS(A) < 3 || NS(B) < 3 || NS(C) < 6)
+
+#if defined(__MK20DX128__)	// MS_COUNTER names the core's millisecond counter variable; show()/showColor() add to it
+   extern volatile uint32_t systick_millis_count;
+#  define MS_COUNTER systick_millis_count
+#else
+#  if defined(CORE_TEENSY)	// CORE_TEENSY builds use timer0_millis_count
+     extern volatile unsigned long timer0_millis_count;
+#    define MS_COUNTER timer0_millis_count
+#  else	// every other build uses timer0_millis
+     extern volatile unsigned long timer0_millis;
+#    define MS_COUNTER timer0_millis
+#  endif
+#endif
+
+// Scaling macro choice: scale B in place with scale8_LEAVING_R1_DIRTY, or (LIB8_ATTINY) only burn 3 cycles and leave B unscaled
+#if defined(LIB8_ATTINY)
+#  define INLINE_SCALE(B, SCALE) delaycycles<3>()
+#  warning "No hardware multiply, inline brightness scaling disabled"
+#else
+#   define INLINE_SCALE(B, SCALE) B = scale8_LEAVING_R1_DIRTY(B, SCALE)
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Base template for clockless controllers.  These controllers have 3 control points in their cycle for each bit.  The first point
+// is where the line is raised high.  The second point is where the line is dropped low for a zero.  The third point is where the 
+// line is dropped low for a one.  T1, T2, and T3 correspond to the timings for those three in clock cycles.
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <uint8_t DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int WAIT_TIME = 50>
+class ClocklessController : public CLEDController {
+	typedef typename FastPin<DATA_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPin<DATA_PIN>::port_t data_t;
+
+	data_t mPinMask;
+	data_ptr_t mPort;
+	CMinWait<WAIT_TIME> mWait;
+public:
+	virtual void init() { 	// make DATA_PIN an output and cache its mask and port
+		FastPin<DATA_PIN>::setOutput();
+		mPinMask = FastPin<DATA_PIN>::mask();
+		mPort = FastPin<DATA_PIN>::port();
+	}
+
+#if defined(__MK20DX128__)
+	// We don't use the bitSetFast methods for ARM.
+#else
+	template <int N>inline static void bitSetFast(register data_ptr_t port, register data_t hi, register data_t lo, register uint8_t b) { 
+		// First cycle
+		FastPin<DATA_PIN>::fastset(port, hi); 								// 1/2 clock cycle if using out
+		delaycycles<T1 - (_CYCLES(DATA_PIN) + 1)>();					// 1st cycle length minus 1/2 clock for out, 1 clock for sbrs
+		__asm__ __volatile__ ("sbrs %0, %1" :: "r" (b), "M" (N) :); 	// 1 clock for check (+1 if skipping, next op is also 1 clock)
+
+		// Second cycle
+		FastPin<DATA_PIN>::fastset(port, lo);								// 1/2 clock cycle if using out
+		delaycycles<T2 - _CYCLES(DATA_PIN)>(); 							// 2nd cycle length minus 1/2 clock for out
+
+		// Third cycle
+		FastPin<DATA_PIN>::fastset(port, lo);								// 1 clock cycle if using out
+		delaycycles<T3 - _CYCLES(DATA_PIN)>();							// 3rd cycle length minus 1 clock for out
+	}
+	
+	#define END_OF_BYTE
+	#define END_OF_LOOP 6 		// loop compare, jump, next uint8_t load
+	template <int N, int ADJ>inline static void bitSetLast(register data_ptr_t port, register data_t hi, register data_t lo, register uint8_t b) { 
+		// First cycle
+		FastPin<DATA_PIN>::fastset(port, hi); 							// 1 clock cycle if using out, 2 otherwise
+		delaycycles<T1 - (_CYCLES(DATA_PIN))>();					// 1st cycle length minus 1 clock for out, 1 clock for sbrs
+		__asm__ __volatile__ ("sbrs %0, %1" :: "r" (b), "M" (N) :); // 1 clock for check (+1 if skipping, next op is also 1 clock)
+
+		// Second cycle
+		FastPin<DATA_PIN>::fastset(port, lo);							// 1/2 clock cycle if using out
+		delaycycles<T2 - (_CYCLES(DATA_PIN))>(); 						// 2nd cycle length minus 1/2 clock for out
+
+		// Third cycle
+		FastPin<DATA_PIN>::fastset(port, lo);							// 1/2 clock cycle if using out
+		delaycycles<T3 - (_CYCLES(DATA_PIN) + ADJ)>();				// 3rd cycle length minus 7 clocks for out, loop compare, jump, next uint8_t load
+	}
+#endif
+
+	virtual void clearLeds(int nLeds) {	// shows black at scale 0 through showColor()
+		showColor(CRGB(0, 0, 0), nLeds, 0);
+	}
+
+	// set all the leds on the controller to a given color
+	virtual void showColor(const struct CRGB & data, int nLeds, uint8_t scale = 255) {
+		mWait.wait();
+		cli();
+
+		showRGBInternal<0, false>(nLeds, scale, (const byte*)&data);
+
+		// Adjust the timer: add the milliseconds spent between cli() and sei() to the millisecond counter
+		long microsTaken = CLKS_TO_MICROS((long)nLeds * 24 * (T1 + T2 + T3));
+		MS_COUNTER += (microsTaken / 1000);
+		sei();
+		mWait.mark();
+	}
+	// write nLeds CRGB entries from rgbdata out to the leds, scaled by scale
+	virtual void show(const struct CRGB *rgbdata, int nLeds, uint8_t scale = 255) { 
+		mWait.wait();
+		cli();
+
+		showRGBInternal<0, true>(nLeds, scale, (const byte*)rgbdata);
+
+		// Adjust the timer: add the milliseconds spent between cli() and sei() to the millisecond counter
+		long microsTaken = CLKS_TO_MICROS((long)nLeds * 24 * (T1 + T2 + T3));
+		MS_COUNTER += (microsTaken / 1000);
+		sei();
+		mWait.mark();
+	}
+
+#ifdef SUPPORT_ARGB
+	virtual void show(const struct CARGB *rgbdata, int nLeds, uint8_t scale = 255) { 
+		mWait.wait();
+		cli();
+
+		showRGBInternal<1, true>(nLeds, scale, (const byte*)rgbdata);
+
+		// Adjust the timer
+		long microsTaken = CLKS_TO_MICROS((long)nLeds * 24 * (T1 + T2 + T3));
+		MS_COUNTER += (microsTaken / 1000);
+		sei();
+		mWait.mark();
+	}
+#endif
+
+#if defined(__MK20DX128__)
+	inline static void write8Bits(register data_ptr_t port, register data_t hi, register data_t lo, register uint32_t & b)  __attribute__ ((always_inline)) {
+		// TODO: hand rig asm version of this method.  The timings are based on adjusting/studying GCC compiler output.  This
+		// will bite me in the ass at some point, I know it.
+		for(register uint32_t i = 7; i > 0; i--) { 
+			FastPin<DATA_PIN>::fastset(port, hi);
+			delaycycles<T1 - 5>(); // 5 cycles - 2 store, 1 and, 1 test, 1 if
+			if(b & 0x80) { FastPin<DATA_PIN>::fastset(port, hi); } else { FastPin<DATA_PIN>::fastset(port, lo); }
+			b <<= 1;
+			delaycycles<T2 - 2>(); // 2 cycles,  1 store/skip,  1 shift 
+			FastPin<DATA_PIN>::fastset(port, lo);
+			delaycycles<T3 - 5>(); // 3 cycles, 2 store, 1 sub, 1 branch backwards
+		}
+		// delay an extra cycle because falling out of the loop takes one less cycle than looping around
+		delaycycles<1>();
+
+		FastPin<DATA_PIN>::fastset(port, hi);
+		delaycycles<T1 - 6>();
+		if(b & 0x80) { FastPin<DATA_PIN>::fastset(port, hi); } else { FastPin<DATA_PIN>::fastset(port, lo); }
+		delaycycles<T2 - 2>(); // 4 cycles, 2 store, store/skip
+		FastPin<DATA_PIN>::fastset(port, lo);
+	}
+#endif
+
+	// This method is made static to force making register Y available to use for data on AVR - if the method is non-static, then 
+	// gcc will use register Y for the this pointer.
+	template<int SKIP, bool ADVANCE> static void showRGBInternal(register int nLeds, register uint8_t scale, register const byte *rgbdata) {
+		register byte *data = (byte*)rgbdata;
+		register data_t mask = FastPin<DATA_PIN>::mask();
+		register data_ptr_t port = FastPin<DATA_PIN>::port();
+		nLeds *= (3 + SKIP);
+		register uint8_t *end = data + nLeds; 
+		register data_t hi = *port | mask;
+		register data_t lo = *port & ~mask;
+		*port = lo;
+
+#if defined(__MK20DX128__)
+		register uint32_t b;
+		b = ((ADVANCE)?data:rgbdata)[SKIP + RGB_BYTE0(RGB_ORDER)];
+		b = scale8(b, scale);
+		while(data < end) { 
+			// Write first byte, read next byte
+			write8Bits(port, hi, lo, b);
+
+			b = ((ADVANCE)?data:rgbdata)[SKIP + RGB_BYTE1(RGB_ORDER)];
+			INLINE_SCALE(b, scale);
+			delaycycles<T3 - 5>(); // 1 store, 2 load, 1 mul, 1 shift, 
+
+			// Write second byte
+			write8Bits(port, hi, lo, b);
+
+			b = ((ADVANCE)?data:rgbdata)[SKIP + RGB_BYTE2(RGB_ORDER)];
+			INLINE_SCALE(b, scale);
+
+			data += 3 + SKIP;
+			if((RGB_ORDER & 0070) == 0) {
+				delaycycles<T3 - 6>(); // 1 store, 2 load, 1 mul, 1 shift,  1 adds if BRG or GRB
+			} else {
+				delaycycles<T3 - 5>(); // 1 store, 2 load, 1 mul, 1 shift, 
+			}
+
+			// Write third byte
+			write8Bits(port, hi, lo, b);
+
+			b = ((ADVANCE)?data:rgbdata)[SKIP + RGB_BYTE0(RGB_ORDER)];
+			INLINE_SCALE(b, scale);
+
+			delaycycles<T3 - 11>(); // 1 store, 2 load (with increment), 1 mul, 1 shift, 1 cmp, 1 branch backwards, 1 movim
+		};
+#else
+#if 0
+		register uint8_t b = *data++;
+		while(data <= end) { 
+			bitSetFast<7>(port, hi, lo, b);
+			bitSetFast<6>(port, hi, lo, b);
+			bitSetFast<5>(port, hi, lo, b);
+			bitSetFast<4>(port, hi, lo, b);
+			bitSetFast<3>(port, hi, lo, b);
+			// Leave an extra 2 clocks for the next byte load
+			bitSetLast<2, 2>(port, hi, lo, b);
+			register uint8_t next = *data++;
+			// Leave an extra 4 clocks for the scale
+			bitSetLast<1, 4>(port, hi, lo, b);
+			next = scale8(next, scale);
+			bitSetLast<0, END_OF_LOOP>(port, hi, lo, b);
+			b = next;
+		}
+#else
+		register uint8_t b;
+
+		if(ADVANCE) { 
+			b = data[SKIP + RGB_BYTE0(RGB_ORDER)];
+		} else { 
+			b = rgbdata[SKIP + RGB_BYTE0(RGB_ORDER)];
+		}
+		b = scale8_LEAVING_R1_DIRTY(b, scale);
+
+		register uint8_t c;
+		register uint8_t d;
+		while(data < end) { 
+			for(register byte x=5; x; x--) {
+				bitSetLast<7, 4>(port, hi, lo, b);
+				b <<= 1;
+			}
+			delaycycles<1>();
+			// Leave an extra 2 clocks for the next byte load
+			bitSetLast<7, 1>(port, hi, lo, b);
+			delaycycles<1>();
+
+			// Leave an extra 4 clocks for the scale
+			bitSetLast<6, 6>(port, hi, lo, b);
+			if(ADVANCE) { 
+				c = data[SKIP + RGB_BYTE1(RGB_ORDER)];
+			} else { 
+				c = rgbdata[SKIP + RGB_BYTE1(RGB_ORDER)];
+				delaycycles<1>();
+			}
+			INLINE_SCALE(c, scale);
+			bitSetLast<5, 1>(port, hi, lo, b);
+			
+			for(register byte x=5; x; x--) {
+				bitSetLast<7, 4>(port, hi, lo, c);
+				c <<= 1;
+			}
+			delaycycles<1>();
+			// Leave an extra 2 clocks for the next byte load
+			bitSetLast<7, 1>(port, hi, lo, c);
+			delaycycles<1>();
+			
+			// Leave an extra 4 clocks for the scale
+			bitSetLast<6, 6>(port, hi, lo, c);
+			if(ADVANCE) { 
+				d = data[SKIP + RGB_BYTE2(RGB_ORDER)];
+			} else { 
+				d = rgbdata[SKIP + RGB_BYTE2(RGB_ORDER)];
+				delaycycles<1>();
+			}
+			INLINE_SCALE(d, scale);
+			bitSetLast<5, 1>(port, hi, lo, c);
+			
+			for(register byte x=5; x; x--) {
+				bitSetLast<7, 4>(port, hi, lo, d);
+				d <<= 1;
+			}
+			delaycycles<1>();
+			// Leave an extra 2 clocks for the next byte load
+			bitSetLast<7, 2>(port, hi, lo, d);
+			data += (SKIP + 3);
+			// Leave an extra 4 clocks for the scale
+			bitSetLast<6, 6>(port, hi, lo, d);
+			if(ADVANCE) { 
+				b = data[SKIP + RGB_BYTE0(RGB_ORDER)];
+			} else { 
+				b = rgbdata[SKIP + RGB_BYTE0(RGB_ORDER)];
+				delaycycles<1>();
+			}
+			INLINE_SCALE(b, scale);
+			bitSetLast<5, 6>(port, hi, lo, d);
+		}
+		cleanup_R1();
+#endif
+#endif
+	}
+
+#ifdef SUPPORT_ARGB
+	virtual void showARGB(struct CARGB *data, int nLeds) { 
+		// TODO: IMPLEMENTME
+	}
+#endif
+};
+
+#endif
diff --git a/controller.h b/controller.h
new file mode 100644
index 00000000..2c703f00
--- /dev/null
+++ b/controller.h
@@ -0,0 +1,56 @@
+#ifndef __INC_CONTROLLER_H
+#define __INC_CONTROLLER_H
+
+#include <avr/io.h>
+#include "pixeltypes.h"
+
+
+#define RGB_BYTE0(X) (((X)>>6) & 0x3) 	// bits 6-7 of an EOrder value: index of the byte sent first
+#define RGB_BYTE1(X) (((X)>>3) & 0x3) 	// bits 3-4: index of the byte sent second
+#define RGB_BYTE2(X) (((X)) & 0x3)		// bits 0-1: index of the byte sent third
+
+// operator byte *(struct CRGB[] arr) { return (byte*)arr; }
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// 
+// LED Controller interface definition
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Base definition for an LED controller.  Pretty much the methods that every LED controller object will make available.  
+/// Note that the ARGB show() overload (SUPPORT_ARGB) is not implemented for all controllers yet.   Note also the methods for eventual checking
+/// of background writing of data (I'm looking at you, teensy 3.0 DMA controller!).  If you want to pass LED controllers around
+/// to methods, make them references to this type, keeps your code saner.
+class CLEDController { 
+public:
+	// initialize the LED controller
+	virtual void init() = 0;
+
+	// reset any internal state to a clean point (the base implementation just calls init())
+	virtual void reset() { init(); } 
+
+	// clear out/zero out the given number of leds.
+	virtual void clearLeds(int nLeds) = 0;
+
+	// set all the leds on the controller to a given color, scaled by scale
+	virtual void showColor(const struct CRGB & data, int nLeds, uint8_t scale = 255) = 0;
+
+	// note that the uint8_ts will be in the order that you want them sent out to the device. 
+	// nLeds is the number of RGB leds being written to
+	virtual void show(const struct CRGB *data, int nLeds, uint8_t scale = 255) = 0;
+
+#ifdef SUPPORT_ARGB
+	// as above, but every 4th uint8_t is assumed to be alpha channel data, and will be skipped
+	virtual void show(const struct CARGB *data, int nLeds, uint8_t scale = 255) = 0;
+#endif
+	
+	// is the controller ready to write data out (the base implementation always answers true)
+	virtual bool ready() { return true; }
+
+	// wait until the controller is ready to write data out (the base implementation returns immediately)
+	virtual void wait() { return; }
+
+};
+
+#endif
+\ No newline at end of file
diff --git a/delay.h b/delay.h
new file mode 100644
index 00000000..c29de694
--- /dev/null
+++ b/delay.h
@@ -0,0 +1,62 @@
+#ifndef __INC_DELAY_H
+#define __INC_DELAY_H
+
+////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Clock cycle counted delay loop
+//
+////////////////////////////////////////////////////////////////////////////////////////////
+
+// NOP is the single filler instruction the delay templates below are built from: a real `nop` when
+// compiling for ARM, `cp r0,r0` otherwise.  The macro text already ends in ';', so writing `NOP;`
+// expands to the asm statement followed by an empty statement.
+#if defined(__arm__) 
+# define NOP __asm__ __volatile__ ("nop\n");
+#else
+#  define NOP __asm__ __volatile__ ("cp r0,r0\n");
+#endif
+
+// predeclaration to not upset the compiler (_delaycycles_AVR below calls delaycycles<PAD>() before
+// the primary template is defined)
+template<int CYCLES> inline void delaycycles();
+
+// TODO: ARM version of _delaycycles_
+// worker template - this will nop for LOOP * 3 + PAD cycles total
+// LOOP is emitted as the immediate operand of the LDI (constraint "M"), so it has to be a compile time
+// constant that fits that constraint.  PAD is burned first through delaycycles<PAD>().
+template<int LOOP, int PAD> inline void _delaycycles_AVR() { 
+	delaycycles<PAD>();
+	// the loop below is 3 cycles * LOOP.  the LDI is one cycle,
+	// the DEC is 1 cycle, the BRNE is 2 cycles if looping back and
+	// 1 if not (the LDI balances out the BRNE being 1 cycle on exit)
+	// r16 is used as the loop counter and is declared as clobbered; the %= in the label gives each
+	// expansion of this asm block its own label name.
+	__asm__ __volatile__ ( 
+		"		LDI R16, %0\n"
+		"L_%=:  DEC R16\n"
+		"		BRNE L_%=\n"
+		: /* no outputs */ 
+		: "M" (LOOP) 
+		: "r16"
+		);
+}
+
+// usable definition
+// Everywhere except on __MK20DX128__ a request is split into CYCLES / 3 iterations of the counted loop
+// plus CYCLES % 3 padding cycles (see _delaycycles_AVR).  On __MK20DX128__ the template emits one NOP
+// and recurses on CYCLES-1 until it lands on one of the explicit specializations further down.
+// NOTE(review): NOP is selected on __arm__ while this switch tests __MK20DX128__, so any other ARM
+// target would be routed to the AVR assembly worker - confirm whether that is intended.
+#if !defined(__MK20DX128__)
+template<int CYCLES> __attribute__((always_inline)) inline void delaycycles() { 
+	_delaycycles_AVR<CYCLES / 3, CYCLES % 3>();	
+}
+#else
+template<int CYCLES> __attribute__((always_inline)) inline void delaycycles() { 
+	NOP; delaycycles<CYCLES-1>();
+}
+#endif
+
+// pre-instantiations for values small enough to not need the loop, as well as sanity holders
+// for some negative values.
+// Requests of -6..0 cycles expand to nothing; 1..5 expand to that many NOPs.  These specializations
+// are also what the recursive (CYCLES-1) definition above bottoms out on, and what the PAD argument
+// of _delaycycles_AVR resolves to.
+template<> __attribute__((always_inline)) inline void delaycycles<-6>() {}
+template<> __attribute__((always_inline)) inline void delaycycles<-5>() {}
+template<> __attribute__((always_inline)) inline void delaycycles<-4>() {}
+template<> __attribute__((always_inline)) inline void delaycycles<-3>() {}
+template<> __attribute__((always_inline)) inline void delaycycles<-2>() {}
+template<> __attribute__((always_inline)) inline void delaycycles<-1>() {}
+template<> __attribute__((always_inline)) inline void delaycycles<0>() {}
+template<> __attribute__((always_inline)) inline void delaycycles<1>() {NOP;}
+template<> __attribute__((always_inline)) inline void delaycycles<2>() {NOP;NOP;}
+template<> __attribute__((always_inline)) inline void delaycycles<3>() {NOP;NOP;NOP;}
+template<> __attribute__((always_inline)) inline void delaycycles<4>() {NOP;NOP;NOP;NOP;}
+template<> __attribute__((always_inline)) inline void delaycycles<5>() {NOP;NOP;NOP;NOP;NOP;}
+
+#endif
+\ No newline at end of file
diff --git a/dmx.h b/dmx.h
new file mode 100644
index 00000000..a7c1c1f0
--- /dev/null
+++ b/dmx.h
@@ -0,0 +1,115 @@
+#ifndef __INC_DMX_H
+#define __INC_DMX_H
+
+//#ifdef DmxSimple_H
+//#if USE_DMX_SIMPLE
+#ifdef FASTSPI_USE_DMX_SIMPLE
+#include<DmxSimple.h>
+// note - FASTSPI_USE_DMX_SIMPLE must be defined before FastSPI is included for this code to be enabled
+template <uint8_t DATA_PIN, EOrder RGB_ORDER = RGB> class DMXController : public CLEDController {
+public:
+	// point the DmxSimple driver at DATA_PIN
+	virtual void init() { DmxSimple.usePin(DATA_PIN); }
+
+	// resetting is nothing more than initializing again
+	virtual void reset() { init(); }
+
+	// write 0 to the first nLeds * 3 dmx channels (never more than DMX_SIZE channels)
+	virtual void clearLeds(int nLeds) {
+		const int nChannels = min(nLeds * 3, DMX_SIZE);
+		int channel = 1;
+		while(channel <= nChannels) {
+			DmxSimple.write(channel, 0);
+			channel++;
+		}
+	}
+
+	// send the same (scaled) color to every led, three channels per led in RGB_ORDER
+	virtual void showColor(const struct CRGB & data, int nLeds, uint8_t scale = 255) {
+		const int nShown = min(nLeds, DMX_SIZE / 3);
+		for(int led = 0; led < nShown; led++) {
+			// dmx channels are numbered from 1, three consecutive channels per led
+			const int base = (led * 3) + 1;
+			DmxSimple.write(base, scale8(data[RGB_BYTE0(RGB_ORDER)], scale));
+			DmxSimple.write(base + 1, scale8(data[RGB_BYTE1(RGB_ORDER)], scale));
+			DmxSimple.write(base + 2, scale8(data[RGB_BYTE2(RGB_ORDER)], scale));
+		}
+	}
+
+	// send nLeds leds worth of (scaled) data, three channels per led in RGB_ORDER.
+	// anything that would not fit in DMX_SIZE channels is dropped
+	virtual void show(const struct CRGB *data, int nLeds, uint8_t scale = 255) {
+		const int nShown = min(nLeds, DMX_SIZE / 3);
+		for(int led = 0; led < nShown; led++) {
+			const int base = (led * 3) + 1;
+			DmxSimple.write(base, scale8(data[led][RGB_BYTE0(RGB_ORDER)], scale));
+			DmxSimple.write(base + 1, scale8(data[led][RGB_BYTE1(RGB_ORDER)], scale));
+			DmxSimple.write(base + 2, scale8(data[led][RGB_BYTE2(RGB_ORDER)], scale));
+		}
+	}
+
+#ifdef SUPPORT_ARGB
+	// as above, but every 4th uint8_t is assumed to be alpha channel data, and will be skipped
+	virtual void show(const struct CARGB *data, int nLeds, uint8_t scale = 255) = 0;
+#endif
+
+	// writes are synchronous here, so the controller always reports ready
+	virtual bool ready() { return true; }
+
+	// nothing to wait for
+	virtual void wait() { return; }
+
+};
+
+#elif defined(DmxSerial_h)
+
+// DMXSerial flavour of the DMX controller.  Led data is written three channels per led, in RGB_ORDER,
+// starting at dmx channel 1.
+// NOTE(review): DMX_SIZE is not defined in this header; presumably it is expected from the DMX library
+// headers - confirm it is available when only DMXSerial is included.
+template <uint8_t DATA_PIN, EOrder RGB_ORDER = RGB> class DMXController : public CLEDController {
+public:
+	// initialize the LED controller
+	// NOTE(review): inside this class the name DMXController refers to the class itself, so this argument
+	// most likely does not resolve to the DMXSerial mode constant that was intended - confirm against
+	// the DMXSerial headers.
+	virtual void init() { DMXSerial.init(DMXController); }
+
+	// reset any internal state to a clean point
+	virtual void reset() { init(); } 
+
+	// clear out/zero out the given number of leds.
+	// (this used to call DmxSimple, which is not part of this branch, and started at channel 0)
+	virtual void clearLeds(int nLeds) {
+		int count = min(nLeds * 3, DMX_SIZE);
+		for(int iChannel = 1; iChannel <= count; iChannel++) { DMXSerial.write(iChannel, 0); }
+	}
+
+	// set all the leds on the controller to a given color
+	virtual void showColor(const struct CRGB & data, int nLeds, uint8_t scale = 255) {
+		int count = min(nLeds, DMX_SIZE / 3);
+		// dmx data channels are numbered from 1; starting at 0 shifted every value by one channel
+		int iChannel = 1;
+		for(int i = 0; i < count; i++) {
+			DMXSerial.write(iChannel++, scale8(data[RGB_BYTE0(RGB_ORDER)], scale));
+			DMXSerial.write(iChannel++, scale8(data[RGB_BYTE1(RGB_ORDER)], scale));
+			DMXSerial.write(iChannel++, scale8(data[RGB_BYTE2(RGB_ORDER)], scale));
+		}
+	}
+
+	// note that the uint8_ts will be in the order that you want them sent out to the device. 
+	// nLeds is the number of RGB leds being written to
+	virtual void show(const struct CRGB *data, int nLeds, uint8_t scale = 255) {
+		int count = min(nLeds, DMX_SIZE / 3);
+		// dmx data channels are numbered from 1; starting at 0 shifted every value by one channel
+		int iChannel = 1;
+		for(int i = 0; i < count; i++) {
+			DMXSerial.write(iChannel++, scale8(data[i][RGB_BYTE0(RGB_ORDER)], scale));
+			DMXSerial.write(iChannel++, scale8(data[i][RGB_BYTE1(RGB_ORDER)], scale));
+			DMXSerial.write(iChannel++, scale8(data[i][RGB_BYTE2(RGB_ORDER)], scale));
+		}
+
+	}
+
+#ifdef SUPPORT_ARGB
+	// as above, but every 4th uint8_t is assumed to be alpha channel data, and will be skipped
+	virtual void show(const struct CARGB *data, int nLeds, uint8_t scale = 255) = 0;
+#endif
+	
+	// is the controller ready to write data out
+	virtual bool ready() { return true; }
+
+	// wait until the controller is ready to write data out 
+	virtual void wait() { return; }
+	
+};
+
+#endif
+
+#endif
+\ No newline at end of file
diff --git a/examples/Fast2Dev/Fast2Dev.ino b/examples/Fast2Dev/Fast2Dev.ino
new file mode 100644
index 00000000..5090aaad
--- /dev/null
+++ b/examples/Fast2Dev/Fast2Dev.ino
@@ -0,0 +1,98 @@
+// Uncomment this line if you have any interrupts that are changing pins - this causes the library to be a little bit more cautious
+// #define FAST_SPI_INTERRUPTS_WRITE_PINS 1
+
+// Uncomment this line to force always using software, instead of hardware, SPI (why?)
+// #define FORCE_SOFTWARE_SPI 1
+
+// Uncomment this line if you want to talk to DMX controllers
+// #define FASTSPI_USE_DMX_SIMPLE 1
+
+#include "FastLED.h"
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// test code
+//
+//////////////////////////////////////////////////
+
+#define NUM_LEDS 150
+
+CRGB leds[NUM_LEDS];
+
+// One-time setup for the test sketch: pause, cap the global brightness, then register the `leds`
+// array with exactly one controller.  Swap which addLeds line is uncommented to match your strip.
+void setup() {
+	// sanity check delay - allows reprogramming if accidentally blowing power w/leds
+   	delay(2000);
+
+   	// For safety (to prevent too high of a power draw), the test case defaults to
+   	// setting brightness to 25% brightness
+   	LEDS.setBrightness(64);
+
+   	// LEDS.addLeds<WS2811, 13>(leds, NUM_LEDS);
+   	// LEDS.addLeds<TM1809, 13>(leds, NUM_LEDS);
+   	// LEDS.addLeds<UCS1903, 13>(leds, NUM_LEDS);
+   	// LEDS.addLeds<TM1803, 13>(leds, NUM_LEDS);
+
+   	// LEDS.addLeds<P9813>(leds, NUM_LEDS);
+   	
+   	LEDS.addLeds<LPD8806>(leds, NUM_LEDS);
+	// LEDS.addLeds<WS2801>(leds, NUM_LEDS);
+   	// LEDS.addLeds<SM16716>(leds, NUM_LEDS);
+
+   	// LEDS.addLeds<WS2811, 11>(leds, NUM_LEDS);
+
+	// Put ws2801 strip on the hardware SPI pins with a BGR ordering of rgb and limited to a 1Mhz data rate
+	// LEDS.addLeds<WS2801, 11, 13, BGR, DATA_RATE_MHZ(1)>(leds, NUM_LEDS);
+
+   	// LEDS.addLeds<LPD8806, 10, 11>(leds, NUM_LEDS);
+   	// LEDS.addLeds<WS2811, 13, BRG>(leds, NUM_LEDS);
+   	// LEDS.addLeds<LPD8806, BGR>(leds, NUM_LEDS);
+}
+
+// Test pattern.  Three passes; each pass walks a single lit pixel down the strip (red, then green,
+// then blue), then fades the whole strip up and down in red by changing the color, and up and down
+// in green by changing the brightness scale.
+void loop() {
+	for(int pass = 0; pass < 3; pass++) {
+		for(int pos = 0; pos < NUM_LEDS; pos++) {
+			// start from an all-black strip so only one pixel is lit
+			memset(leds, 0, sizeof(leds));
+
+			if(pass == 0) {
+				// You can access the rgb values by field r, g, b
+				leds[pos].r = 128;
+			} else if(pass == 1) {
+				// or by indexing into the led (r==0, g==1, b==2)
+				leds[pos][1] = 128;
+			} else {
+				// or by setting the rgb values for the pixel all at once
+				leds[pos] = CRGB(0, 0, 128);
+			}
+
+			// and now, show your led array!
+			LEDS.show();
+			delay(10);
+		}
+
+		// fade up - showColor sets every led in the strip to the same color
+		int level = 0;
+		while(level < 128) {
+			LEDS.showColor(CRGB(level, 0, 0));
+			delay(10);
+			level++;
+		}
+
+		// fade down, from 128 all the way to 0 inclusive
+		level = 128;
+		while(level >= 0) {
+			LEDS.showColor(CRGB(level, 0, 0));
+			delay(10);
+			level--;
+		}
+
+		// fade up again, this time by scaling the brightness of a fixed color
+		int brightness = 0;
+		while(brightness < 128) {
+			LEDS.showColor(CRGB(0, 128, 0), brightness);
+			delay(10);
+			brightness++;
+		}
+
+		// and back down by scaling the brightness (stops before reaching 0)
+		brightness = 128;
+		while(brightness > 0) {
+			LEDS.showColor(CRGB(0, 128, 0), brightness);
+			delay(10);
+			brightness--;
+		}
+	}
+}
diff --git a/examples/FirstLight/FirstLight.ino b/examples/FirstLight/FirstLight.ino
new file mode 100644
index 00000000..fcfbacbd
--- /dev/null
+++ b/examples/FirstLight/FirstLight.ino
@@ -0,0 +1,66 @@
+#define FORCE_SOFTWARE_SPI
+#define FORCE_SOFTWARE_PINS
+#include "FastLED.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////
+//
+// Move a white dot along the strip of leds.  This program simply shows how to configure the leds,
+// and then how to turn a single pixel white and then off, moving down the line of pixels.
+// 
+
+// How many leds are in the strip?
+#define NUM_LEDS 60
+
+// Data pin that led data will be written out over
+#define DATA_PIN 6
+
+// Clock pin only needed for SPI based chipsets when not using hardware SPI
+//#define CLOCK_PIN 8
+
+// This is an array of leds.  One item for each led in your strip.
+CRGB leds[NUM_LEDS];
+
+// This function sets up the leds and tells the controller about them.
+// Exactly one addLeds line should be active; it registers the `leds` array (NUM_LEDS entries) with
+// the chipset named in the template arguments.
+void setup() {
+	// sanity check delay - allows reprogramming if accidentally blowing power w/leds
+   	delay(2000);
+
+      // Uncomment one of the following lines for your leds arrangement.
+      // FastLED.addLeds<TM1803, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<TM1804, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<TM1809, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<WS2811, DATA_PIN, GRB>(leds+18, NUM_LEDS/3);
+      // FastLED.addLeds<WS2811, 8, RGB>(leds + 225, NUM_LEDS/4);
+      // FastLED.addLeds<WS2812, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<WS2812B, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<NEOPIXEL, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<WS2811_400, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<UCS1903, DATA_PIN, RGB>(leds, NUM_LEDS);
+
+      // FastLED.addLeds<WS2801, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<SM16716, RGB>(leds, NUM_LEDS);
+      FastLED.addLeds<LPD8806, RGB>(leds, NUM_LEDS);
+
+      // FastLED.addLeds<WS2801, DATA_PIN, CLOCK_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<SM16716, DATA_PIN, CLOCK_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<LPD8806, DATA_PIN, CLOCK_PIN, RGB>(leds, NUM_LEDS);
+}
+
+// Called over and over by the Arduino runtime.  Each call sweeps one white pixel from the first led
+// to the last: light it, push the frame out, pause, then switch it off again before moving on.
+void loop() {
+   int pos = 0;
+   while(pos < NUM_LEDS) {
+      // light the current led ...
+      leds[pos] = CRGB::White;
+
+      // ... and send the frame (every other led is black)
+      FastLED.show();
+
+      // hold it for a moment
+      delay(100);
+
+      // switch it back off so the next frame only shows the next led
+      leds[pos] = CRGB::Black;
+
+      pos = pos + 1;
+   }
+}
diff --git a/examples/RGBCalibrate/RGBCalibrate.ino b/examples/RGBCalibrate/RGBCalibrate.ino
new file mode 100644
index 00000000..55661052
--- /dev/null
+++ b/examples/RGBCalibrate/RGBCalibrate.ino
@@ -0,0 +1,66 @@
+#include "FastLED.h"
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// RGB Calibration code
+//
+// Use this sketch to determine what the RGB ordering for your chipset should be.  Steps for setting up to use:
+
+// * Uncomment the line in setup that corresponds to the LED chipset that you are using.  (Note that they
+//   all explicitly specify the RGB order as RGB)
+// * Define DATA_PIN to the pin that data is connected to.
+// * (Optional) if using software SPI for chipsets that are SPI based, define CLOCK_PIN to the clock pin
+// * Compile/upload/run the sketch 
+
+// You should see six leds on.  If the RGB ordering is correct, you should see 1 red led, 2 green 
+// leds, and 3 blue leds.  If you see different colors, the count of each color tells you what the 
+// position for that color in the rgb ordering should be.  So, for example, if you see 1 Blue, and 2
+// Red, and 3 Green leds then the rgb ordering should be BRG (Blue, Red, Green).  
+
+// You can then test this ordering by setting the RGB ordering in the addLeds line below to the new ordering
+// and it should come out correctly, 1 red, 2 green, and 3 blue.
+//
+//////////////////////////////////////////////////
+
+#define NUM_LEDS 6
+
+// Data pin that led data will be written out over
+#define DATA_PIN 7
+// Clock pin only needed for SPI based chipsets when not using hardware SPI
+//#define CLOCK_PIN 8
+
+CRGB leds[NUM_LEDS];
+
+// One-time setup for the calibration sketch.  As shipped every addLeds line is commented out, so no
+// controller is registered until the line matching your chipset is uncommented.
+void setup() {
+	// sanity check delay - allows reprogramming if accidentally blowing power w/leds
+   	delay(2000);
+
+      // Uncomment one of the following lines for your leds arrangement.
+      // FastLED.addLeds<TM1803, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<TM1804, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<TM1809, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<WS2811, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<WS2812, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<WS2812B, DATA_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<UCS1903, DATA_PIN, RGB>(leds, NUM_LEDS);
+
+      // FastLED.addLeds<WS2801, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<SM16716, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<LPD8806, RGB>(leds, NUM_LEDS);
+
+      // FastLED.addLeds<WS2801, DATA_PIN, CLOCK_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<SM16716, DATA_PIN, CLOCK_PIN, RGB>(leds, NUM_LEDS);
+      // FastLED.addLeds<LPD8806, DATA_PIN, CLOCK_PIN, RGB>(leds, NUM_LEDS);
+}
+
+// Paint the calibration pattern - one red, two green, three blue - and refresh it once a second.
+void loop() {
+   // led 0: red
+   leds[0] = CRGB::Red; 
+
+   // leds 1-2: green
+   for(int g = 1; g <= 2; g++) {
+      leds[g] = CRGB::Green;
+   }
+
+   // leds 3-5: blue
+   for(int b = 3; b <= 5; b++) {
+      leds[b] = CRGB::Blue;
+   }
+
+   FastLED.show();
+   delay(1000);
+}
diff --git a/fastpin.h b/fastpin.h
new file mode 100644
index 00000000..b6355fce
--- /dev/null
+++ b/fastpin.h
@@ -0,0 +1,424 @@
+#ifndef __INC_FASTPIN_H
+#define __INC_FASTPIN_H
+
+#include<avr/io.h>
+
+// Arduino.h needed for convinience functions digitalPinToPort/BitMask/portOutputRegister and the pinMode methods.
+#include<Arduino.h>
+
+#define NO_PIN 255 
+
+// Class to ensure that a minimum amount of time (WAIT microseconds) has passed since the last call to
+// mark() - wait() delays for the remainder if not enough time has passed yet.
+// NOTE(review): presumably meant for chipsets that need a minimum gap between writes - the original
+// comment was cut off mid-sentence, confirm the intended use.
+//
+// The elapsed time is computed with unsigned arithmetic so that it stays correct across the rollover
+// of micros().  With the previous signed math, a long pause since the last mark() produced a negative
+// difference, which was treated as "not enough time passed" and turned into a bogus delay.
+template<int WAIT> class CMinWait {
+	unsigned long mLastMicros;
+public:
+	CMinWait() { mLastMicros = 0; }
+
+	// block until at least WAIT microseconds have passed since the last mark()
+	void wait() { 
+		unsigned long elapsed = micros() - mLastMicros;
+		// the WAIT > 0 guard keeps a zero/negative WAIT a no-op, exactly as before
+		if(WAIT > 0 && elapsed < (unsigned long)WAIT) { 
+			delayMicroseconds((unsigned long)WAIT - elapsed);
+		}
+	}
+
+	// remember "now" as the reference point for the next wait()
+	void mark() { mLastMicros = micros(); }
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Pin access class - needs to tune for various platforms (naive fallback solution?)
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// _CYCLES(pin) evaluates to 2 for the pin numbers listed below and to 1 for every other pin.
+// NOTE(review): presumably the number of clock cycles a write to that pin's port takes - confirm.
+// The argument is parenthesized everywhere so that expressions can be passed safely.
+#if defined(__AVR_ATmega1280__) || defined(__AVR_ATmega2560__)
+#define _CYCLES(_PIN) ((((_PIN) >= 62 ) || ((_PIN)>=42 && (_PIN)<=49) || ((_PIN)>=14 && (_PIN) <=17) || ((_PIN)>=6 && (_PIN) <=9)) ? 2 : 1)
+#else
+#define _CYCLES(_PIN) (((_PIN) >= 24) ? 2 : 1)
+#endif
+
+// Abstract interface for anything that can be selected and released again (the Pin class below
+// implements it by driving its pin high and low).
+class Selectable {
+public:
+	// put the object into its selected state
+	virtual void select() = 0;
+	// take the object out of its selected state
+	virtual void release() = 0;
+	// report whether the object is currently selected
+	virtual bool isSelected() = 0;
+};
+
+// Runtime pin wrapper: the pin number is given to the constructor, which looks up and caches the
+// pin's bit mask and the address of its port output register using the Arduino helper macros.
+class Pin : public Selectable { 
+	uint8_t mPinMask;
+	uint8_t mPin;
+	volatile uint8_t *mPort;
+
+	// resolve mPin into its bit mask and port output register
+	void _init() { 
+		mPinMask = digitalPinToBitMask(mPin);
+		mPort = portOutputRegister(digitalPinToPort(mPin));
+	}
+public:
+	Pin(int pin) : mPin(pin) { _init(); }
+
+	typedef volatile uint8_t * port_ptr_t;
+	typedef uint8_t port_t;
+
+	inline void setOutput() { pinMode(mPin, OUTPUT); }
+	inline void setInput() { pinMode(mPin, INPUT); }
+
+	// set / clear this pin's bit with a read-modify-write of the cached port register
+	inline void hi() __attribute__ ((always_inline)) { *mPort |= mPinMask; } 
+	inline void lo() __attribute__ ((always_inline)) { *mPort &= ~mPinMask; }
+
+	// pulse the pin: high, then immediately low
+	inline void strobe() __attribute__ ((always_inline)) { hi(); lo(); }
+
+	// same as hi()/lo(), but through a port pointer supplied by the caller
+	inline void hi(register port_ptr_t port) __attribute__ ((always_inline)) { *port |= mPinMask; } 
+	inline void lo(register port_ptr_t port) __attribute__ ((always_inline)) { *port &= ~mPinMask; } 
+	// overwrite the whole port register with val (this affects every pin on the port)
+	inline void set(register port_t val) __attribute__ ((always_inline)) { *mPort = val; }
+
+	// plain store of val to the given port register
+	inline void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port  = val; }
+
+	// current port value with this pin's bit forced high / low - values suitable for set()/fastset()
+	port_t hival() __attribute__ ((always_inline)) { return *mPort | mPinMask;  }
+	port_t loval() __attribute__ ((always_inline)) { return *mPort & ~mPinMask; }
+	port_ptr_t  port() __attribute__ ((always_inline)) { return mPort; }
+	port_t mask() __attribute__ ((always_inline)) { return mPinMask; }
+
+	// Selectable implementation: selected means the pin is driven high
+	virtual void select() { hi(); }
+	virtual void release() { lo(); }
+	virtual bool isSelected() { return (*mPort & mPinMask) == mPinMask; }
+};
+
+// A Pin that is switched to OUTPUT mode as soon as it is constructed.
+class OutputPin : public Pin {
+public:
+	OutputPin(int pin) : Pin(pin) { setOutput(); }
+};
+
+// A Pin that is switched to INPUT mode as soon as it is constructed.
+class InputPin : public Pin {
+public:
+	InputPin(int pin) : Pin(pin) { setInput(); }
+};
+
+/// The simplest level of Pin class.  This relies on runtime functions during initialization to get the port/pin mask for the pin.  Most
+/// of the accesses involve references to these static globals that get set up.  This won't be the fastest set of pin operations, but it
+/// will provide pin level access on pretty much all arduino environments.  In addition, it includes some methods to help optimize access in
+/// various ways.  Namely, the versions of hi, lo, and fastset that take the port register as a passed in register variable (saving a global
+/// dereference), since these functions are aggressively inlined, that can help collapse out a lot of extraneous memory loads/dereferences.
+/// 
+/// In addition, if, while writing a bunch of data to a pin, you know no other pins will be getting written to, you can get/cache a value of
+/// the pin's port register and use that to do a full set to the register.  This results in one being able to simply do a store to the register,
+/// vs. the load, and/or, and store that would be done normally.
+///
+/// There are platform specific instantiations of this class that provide direct i/o register access to pins for much higher speed pin twiddling.
+///
+/// Note that these classes are all static functions.  So the proper usage is FastPin<13>::hi(); or such.  Instantiating objects is not recommended, 
+/// as passing Pin objects around will likely -not- have the effect you're expecting.
+///
+/// Note also that sPinMask and sPort are only filled in by _init(), which runs from setOutput() and setInput() - call one of those
+/// before using any of the other methods.
+template<uint8_t PIN> class FastPin { 
+	static uint8_t sPinMask;
+	static volatile uint8_t *sPort;
+	// look up the bit mask and port output register for PIN
+	static void _init() { 
+		sPinMask = digitalPinToBitMask(PIN);
+		sPort = portOutputRegister(digitalPinToPort(PIN));
+	}
+public:
+	typedef volatile uint8_t * port_ptr_t;
+	typedef uint8_t port_t;
+
+	inline static void setOutput() { _init(); pinMode(PIN, OUTPUT); }
+	inline static void setInput() { _init(); pinMode(PIN, INPUT); }
+
+	// set / clear the pin's bit with a read-modify-write of the cached port register
+	inline static void hi() __attribute__ ((always_inline)) { *sPort |= sPinMask; } 
+	inline static void lo() __attribute__ ((always_inline)) { *sPort &= ~sPinMask; }
+
+	// pulse the pin: high, then immediately low
+	inline static void strobe() __attribute__ ((always_inline)) { hi(); lo(); }
+
+	// same as hi()/lo(), but through a port pointer supplied by the caller
+	inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { *port |= sPinMask; } 
+	inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { *port &= ~sPinMask; } 
+	// overwrite the whole port register with val (this affects every pin on the port)
+	inline static void set(register port_t val) __attribute__ ((always_inline)) { *sPort = val; }
+
+	// plain store of val to the given port register
+	inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port  = val; }
+
+	// current port value with this pin's bit forced high / low - values suitable for set()/fastset()
+	static port_t hival() __attribute__ ((always_inline)) { return *sPort | sPinMask;  }
+	static port_t loval() __attribute__ ((always_inline)) { return *sPort & ~sPinMask; }
+	static port_ptr_t  port() __attribute__ ((always_inline)) { return sPort; }
+	static port_t mask() __attribute__ ((always_inline)) { return sPinMask; }
+};
+
+// storage for the per-pin static members declared above
+template<uint8_t PIN> uint8_t FastPin<PIN>::sPinMask;
+template<uint8_t PIN> volatile uint8_t *FastPin<PIN>::sPort;
+
+/// Class definition for a Pin where we know the port registers at compile time for said pin.  This allows us to make
+/// a lot of optimizations, as the inlined hi/lo methods will devolve to a single io register write/bitset.  
+/// _MASK is the pin's bit within its port; _PORT and _DDR are register accessor types whose static r() returns
+/// a reference to the register.  _PIN is accepted but not used by any of the methods below.
+template<uint8_t PIN, uint8_t _MASK, typename _PORT, typename _DDR, typename _PIN> class _AVRPIN { 
+public:
+	typedef volatile uint8_t * port_ptr_t;
+	typedef uint8_t port_t;
+	 
+	// direction is changed by setting / clearing the pin's bit in the DDR register
+	inline static void setOutput() { _DDR::r() |= _MASK; }
+	inline static void setInput() { _DDR::r() &= ~_MASK; }
+
+	inline static void hi() __attribute__ ((always_inline)) { _PORT::r() |= _MASK; }
+	inline static void lo() __attribute__ ((always_inline)) { _PORT::r() &= ~_MASK; }
+	// overwrite the whole port register with val (this affects every pin on the port)
+	inline static void set(register uint8_t val) __attribute__ ((always_inline)) { _PORT::r() = val; }
+
+	inline static void strobe() __attribute__ ((always_inline)) { hi(); lo(); }
+	
+	// the port argument is ignored by these three - the register is already known at compile time
+	inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi(); }
+	inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); }
+	inline static void fastset(register port_ptr_t port, register uint8_t val) __attribute__ ((always_inline)) { set(val); }
+
+	// current port value with this pin's bit forced high / low
+	inline static port_t hival() __attribute__ ((always_inline)) { return _PORT::r() | _MASK; }
+	inline static port_t loval() __attribute__ ((always_inline)) { return _PORT::r() & ~_MASK; }
+	inline static port_ptr_t port() __attribute__ ((always_inline)) { return &_PORT::r(); }
+	inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+};
+
+/// Template definition for teensy 3.0 style ARM pins, providing direct access to the various GPIO registers.  Note that this
+/// uses the full port GPIO registers.  In theory, in some way, bit-band register access -should- be faster, however I have found
+/// that something about the way gcc does register allocation results in the bit-band code being slower.  It will need more fine tuning.
+/// Each register parameter is an accessor type whose static r() returns a reference to the register.  Only _PDOR, _PSOR,
+/// _PCOR and _PTOR are touched by the methods below; _PDIR and _PDDR are accepted but unused for now.
+template<uint8_t PIN, uint32_t _MASK, typename _PDOR, typename _PSOR, typename _PCOR, typename _PTOR, typename _PDIR, typename _PDDR> class _ARMPIN { 
+public:
+	typedef volatile uint32_t * port_ptr_t;
+	typedef uint32_t port_t;
+
+	inline static void setOutput() { pinMode(PIN, OUTPUT); } // TODO: perform MUX config { _PDDR::r() |= _MASK; }
+	inline static void setInput() { pinMode(PIN, INPUT); } // TODO: perform MUX config { _PDDR::r() &= ~_MASK; }
+
+	// hi / lo / toggle are plain stores of the pin mask to the _PSOR / _PCOR / _PTOR registers
+	inline static void hi() __attribute__ ((always_inline)) { _PSOR::r() = _MASK; }
+	inline static void lo() __attribute__ ((always_inline)) { _PCOR::r() = _MASK; }
+	// overwrite the whole _PDOR register with val
+	inline static void set(register port_t val) __attribute__ ((always_inline)) { _PDOR::r() = val; }
+
+	// two toggles in a row - the pin ends in the state it started in
+	inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); }
+	
+	inline static void toggle() __attribute__ ((always_inline)) { _PTOR::r() = _MASK; }
+
+	// the port argument is ignored by hi/lo; fastset does store through it
+	inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi(); }
+	inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); }
+	inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port = val; }
+
+	// current _PDOR value with this pin's bit forced high / low
+	inline static port_t hival() __attribute__ ((always_inline)) { return _PDOR::r() | _MASK; }
+	inline static port_t loval() __attribute__ ((always_inline)) { return _PDOR::r() & ~_MASK; }
+	inline static port_ptr_t port() __attribute__ ((always_inline)) { return &_PDOR::r(); }
+	inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+};
+
+/// Template definition for teensy 3.0 style ARM pins using bit banding, providing direct access to the various GPIO registers.  GCC 
+/// does a poor job of optimizing around these accesses so they are not being used just yet.
+/// Here every access goes through the per-bit pointer returned by the register accessor's rx<_BIT>(), so the values
+/// exchanged with callers (hival/loval/mask and the val passed to set/fastset) are 1 and 0 rather than port-wide masks.
+template<uint8_t PIN, int _BIT, typename _PDOR, typename _PSOR, typename _PCOR, typename _PTOR, typename _PDIR, typename _PDDR> class _ARMPIN_BITBAND { 
+public:
+	typedef volatile uint32_t * port_ptr_t;
+	typedef uint32_t port_t;
+
+	inline static void setOutput() { pinMode(PIN, OUTPUT); } // TODO: perform MUX config { _PDDR::r() |= _MASK; }
+	inline static void setInput() { pinMode(PIN, INPUT); } // TODO: perform MUX config { _PDDR::r() &= ~_MASK; }
+
+	inline static void hi() __attribute__ ((always_inline)) { *_PDOR::template rx<_BIT>() = 1; }
+	inline static void lo() __attribute__ ((always_inline)) { *_PDOR::template rx<_BIT>() = 0; }
+	inline static void set(register port_t val) __attribute__ ((always_inline)) { *_PDOR::template rx<_BIT>() = val; }
+
+	// two toggles in a row - the pin ends in the state it started in
+	inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); }
+	
+	inline static void toggle() __attribute__ ((always_inline)) { *_PTOR::template rx<_BIT>() = 1; }
+
+	// these store through the pointer handed in by the caller (see port() below)
+	inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { *port = 1;  }
+	inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { *port = 0; }
+	inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port = val; }
+
+	inline static port_t hival() __attribute__ ((always_inline)) { return 1; }
+	inline static port_t loval() __attribute__ ((always_inline)) { return 0; }
+	// per-bit pointer for this pin's bit of _PDOR
+	inline static port_ptr_t port() __attribute__ ((always_inline)) { return _PDOR::template rx<_BIT>(); }
+	inline static port_t mask() __attribute__ ((always_inline)) { return 1; }
+};
+
+/// AVR definitions for pins.  Getting around  the fact that I can't pass GPIO register addresses in as template arguments by instead creating
+/// a custom type for each GPIO register with a single, static, aggressively inlined function that returns that specific GPIO register.  A similar
+/// trick is used a bit further below for the ARM GPIO registers (of which there are far more than on AVR!)
+///
+/// _R(T)      - names the accessor struct generated for register T
+/// _RD8(T)    - defines that struct; its static r() returns a reference to register T
+/// _IO(L)     - defines the accessor structs for DDR<L>, PORT<L> and PIN<L>
+/// _DEFPIN_AVR(PIN, MASK, L) - specializes FastPin<PIN> as an _AVRPIN on port L with bit mask MASK
+typedef volatile uint8_t & reg8_t;
+#define _R(T) struct __gen_struct_ ## T
+#define _RD8(T) struct __gen_struct_ ## T { static inline reg8_t r() { return T; }};
+#define _IO(L) _RD8(DDR ## L); _RD8(PORT ## L); _RD8(PIN ## L);
+#define _DEFPIN_AVR(PIN, MASK, L) template<> class FastPin<PIN> : public _AVRPIN<PIN, MASK, _R(PORT ## L), _R(DDR ## L), _R(PIN ## L)> {};
+
+// ARM definitions
+// GPIO_BITBAND_ADDR maps (register, bit) to a per-bit alias address: the register's offset from 0x40000000
+// times 32, plus 4 bytes per bit, rebased at 0x42000000.  GPIO_BITBAND_PTR is the same address as a pointer.
+#define GPIO_BITBAND_ADDR(reg, bit) (((uint32_t)&(reg) - 0x40000000) * 32 + (bit) * 4 + 0x42000000)
+#define GPIO_BITBAND_PTR(reg, bit) ((uint32_t *)GPIO_BITBAND_ADDR((reg), (bit)))
+
+typedef volatile uint32_t & reg32_t;
+typedef volatile uint32_t * ptr_reg32_t;
+
+// _RD32(T) defines the accessor struct for register T: r() returns a reference to the register itself,
+// rx<BIT>() returns the per-bit alias pointer for bit BIT of it.
+#define _RD32(T) struct __gen_struct_ ## T { static __attribute__((always_inline)) inline reg32_t r() { return T; } \
+	template<int BIT> static __attribute__((always_inline)) inline ptr_reg32_t rx() { return GPIO_BITBAND_PTR(T, BIT); } };
+// _IO32(L) defines the accessor structs for the six GPIO<L> registers used by the ARM pin templates.
+#define _IO32(L) _RD32(GPIO ## L ## _PDOR); _RD32(GPIO ## L ## _PSOR); _RD32(GPIO ## L ## _PCOR); _RD32(GPIO ## L ## _PTOR); _RD32(GPIO ## L ## _PDIR); _RD32(GPIO ## L ## _PDDR);
+
+// _DEFPIN_ARM(PIN, BIT, L) specializes FastPin<PIN> as an _ARMPIN on GPIO port L with mask 1 << BIT.
+#define _DEFPIN_ARM(PIN, BIT, L) template<> class FastPin<PIN> : public _ARMPIN<PIN, 1 << BIT, _R(GPIO ## L ## _PDOR), _R(GPIO ## L ## _PSOR), _R(GPIO ## L ## _PCOR), \
+																			_R(GPIO ## L ## _PTOR), _R(GPIO ## L ## _PDIR), _R(GPIO ## L ## _PDDR)> {}; 
+
+// Don't use bit band'd pins for now, the compiler generates far less efficient code around them
+// #define _DEFPIN_ARM(PIN, BIT, L) template<> class Pin<PIN> : public _ARMPIN_BITBAND<PIN, BIT, _R(GPIO ## L ## _PDOR), _R(GPIO ## L ## _PSOR), _R(GPIO ## L ## _PCOR),
+// 																			_R(GPIO ## L ## _PTOR), _R(GPIO ## L ## _PDIR), _R(GPIO ## L ## _PDDR)> {}; 
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Pin definitions for AVR and ARM.  If there are pin definitions supplied below for the platform being 
+// built on, then much higher speed access will be possible, namely with direct GPIO register accesses.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+#if defined(FORCE_SOFTWARE_PINS)
+// With FORCE_SOFTWARE_PINS defined none of the platform specific pin specializations below are compiled,
+// so every pin goes through the generic FastPin template above.
+#warning "Software pin support forced, pin access will be slightly slower.  See fastpin.h for info."
+#define NO_HARDWARE_PIN_SUPPORT
+
+#elif defined(__AVR_ATtiny85__) 
+_IO(B);
+
+_DEFPIN_AVR(0, 0x01, B); _DEFPIN_AVR(1, 0x02, B); _DEFPIN_AVR(2, 0x04, B); _DEFPIN_AVR(3, 0x08, B);
+_DEFPIN_AVR(4, 0x10, B); _DEFPIN_AVR(5, 0x20, B);
+
+#elif defined(__AVR_ATmega328P__) || defined(__AVR_ATmega168__)
+// Accelerated port definitions for arduino avrs
+_IO(D); _IO(B); _IO(C);
+_DEFPIN_AVR( 0, 0x01, D); _DEFPIN_AVR( 1, 0x02, D); _DEFPIN_AVR( 2, 0x04, D); _DEFPIN_AVR( 3, 0x08, D);
+_DEFPIN_AVR( 4, 0x10, D); _DEFPIN_AVR( 5, 0x20, D); _DEFPIN_AVR( 6, 0x40, D); _DEFPIN_AVR( 7, 0x80, D);
+_DEFPIN_AVR( 8, 0x01, B); _DEFPIN_AVR( 9, 0x02, B); _DEFPIN_AVR(10, 0x04, B); _DEFPIN_AVR(11, 0x08, B);
+_DEFPIN_AVR(12, 0x10, B); _DEFPIN_AVR(13, 0x20, B); _DEFPIN_AVR(14, 0x01, C); _DEFPIN_AVR(15, 0x02, C);
+_DEFPIN_AVR(16, 0x04, C); _DEFPIN_AVR(17, 0x08, C); _DEFPIN_AVR(18, 0x10, C); _DEFPIN_AVR(19, 0x20, C);
+
+#define SPI_DATA 11
+#define SPI_CLOCK 13
+#define SPI_SELECT 10
+#define AVR_HARDWARE_SPI
+
+#elif defined(__AVR_ATmega1280__) || defined(__AVR_ATmega2560__)
+// megas
+// Pins 0-69, spread over ports A-L.  Masks are written in decimal here (1, 2, 4 ... 128), one bit each.
+
+_IO(A); _IO(B); _IO(C); _IO(D); _IO(E); _IO(F); _IO(G); _IO(H); _IO(J); _IO(K); _IO(L);
+
+_DEFPIN_AVR(0, 1, E); _DEFPIN_AVR(1, 2, E); _DEFPIN_AVR(2, 16, E); _DEFPIN_AVR(3, 32, E); 
+_DEFPIN_AVR(4, 32, G); _DEFPIN_AVR(5, 8, E); _DEFPIN_AVR(6, 8, H); _DEFPIN_AVR(7, 16, H); 
+_DEFPIN_AVR(8, 32, H); _DEFPIN_AVR(9, 64, H); _DEFPIN_AVR(10, 16, B); _DEFPIN_AVR(11, 32, B); 
+_DEFPIN_AVR(12, 64, B); _DEFPIN_AVR(13, 128, B); _DEFPIN_AVR(14, 2, J); _DEFPIN_AVR(15, 1, J); 
+_DEFPIN_AVR(16, 2, H); _DEFPIN_AVR(17, 1, H); _DEFPIN_AVR(18, 8, D); _DEFPIN_AVR(19, 4, D); 
+_DEFPIN_AVR(20, 2, D); _DEFPIN_AVR(21, 1, D); _DEFPIN_AVR(22, 1, A); _DEFPIN_AVR(23, 2, A); 
+_DEFPIN_AVR(24, 4, A); _DEFPIN_AVR(25, 8, A); _DEFPIN_AVR(26, 16, A); _DEFPIN_AVR(27, 32, A); 
+_DEFPIN_AVR(28, 64, A); _DEFPIN_AVR(29, 128, A); _DEFPIN_AVR(30, 128, C); _DEFPIN_AVR(31, 64, C); 
+_DEFPIN_AVR(32, 32, C); _DEFPIN_AVR(33, 16, C); _DEFPIN_AVR(34, 8, C); _DEFPIN_AVR(35, 4, C); 
+_DEFPIN_AVR(36, 2, C); _DEFPIN_AVR(37, 1, C); _DEFPIN_AVR(38, 128, D); _DEFPIN_AVR(39, 4, G); 
+_DEFPIN_AVR(40, 2, G); _DEFPIN_AVR(41, 1, G); _DEFPIN_AVR(42, 128, L); _DEFPIN_AVR(43, 64, L); 
+_DEFPIN_AVR(44, 32, L); _DEFPIN_AVR(45, 16, L); _DEFPIN_AVR(46, 8, L); _DEFPIN_AVR(47, 4, L); 
+_DEFPIN_AVR(48, 2, L); _DEFPIN_AVR(49, 1, L); _DEFPIN_AVR(50, 8, B); _DEFPIN_AVR(51, 4, B); 
+_DEFPIN_AVR(52, 2, B); _DEFPIN_AVR(53, 1, B); _DEFPIN_AVR(54, 1, F); _DEFPIN_AVR(55, 2, F); 
+_DEFPIN_AVR(56, 4, F); _DEFPIN_AVR(57, 8, F); _DEFPIN_AVR(58, 16, F); _DEFPIN_AVR(59, 32, F); 
+_DEFPIN_AVR(60, 64, F); _DEFPIN_AVR(61, 128, F); _DEFPIN_AVR(62, 1, K); _DEFPIN_AVR(63, 2, K); 
+_DEFPIN_AVR(64, 4, K); _DEFPIN_AVR(65, 8, K); _DEFPIN_AVR(66, 16, K); _DEFPIN_AVR(67, 32, K); 
+_DEFPIN_AVR(68, 64, K); _DEFPIN_AVR(69, 128, K); 
+
+// Hardware SPI lines: per the table above these are port B bits 2, 1 and 0 (pins 51, 52, 53).
+#define SPI_DATA 51
+#define SPI_CLOCK 52
+#define SPI_SELECT 53
+#define AVR_HARDWARE_SPI
+
+// Leonardo, teensy, blinkm
+#elif defined(__AVR_ATmega32U4__) && defined(CORE_TEENSY)
+
+// teensy defs
+// Teensy 2.0 (ATmega32U4 with the Teensy core): pins 0-23 on ports B, C, D and F.
+// Masks are single-bit port masks written in decimal.
+_IO(B); _IO(C); _IO(D); _IO(E); _IO(F); 
+
+_DEFPIN_AVR(0, 1, B); _DEFPIN_AVR(1, 2, B); _DEFPIN_AVR(2, 4, B); _DEFPIN_AVR(3, 8, B); 
+_DEFPIN_AVR(4, 128, B); _DEFPIN_AVR(5, 1, D); _DEFPIN_AVR(6, 2, D); _DEFPIN_AVR(7, 4, D); 
+_DEFPIN_AVR(8, 8, D); _DEFPIN_AVR(9, 64, C); _DEFPIN_AVR(10, 128, C); _DEFPIN_AVR(11, 64, D); 
+_DEFPIN_AVR(12, 128, D); _DEFPIN_AVR(13, 16, B); _DEFPIN_AVR(14, 32, B); _DEFPIN_AVR(15, 64, B); 
+_DEFPIN_AVR(16, 128, F); _DEFPIN_AVR(17, 64, F); _DEFPIN_AVR(18, 32, F); _DEFPIN_AVR(19, 16, F); 
+_DEFPIN_AVR(20, 2, F); _DEFPIN_AVR(21, 1, F); _DEFPIN_AVR(22, 16, D); _DEFPIN_AVR(23, 32, D); 
+
+// Hardware SPI lines on the ATmega32U4 are PB2 (MOSI), PB1 (SCLK) and PB0 (SS), i.e. pins 2, 1
+// and 0 in the table above.  SPI_SELECT must name the SS pin: the AVR hardware SPI init drives
+// it as an output so the peripheral stays in master mode.  (It used to be 3, which is PB3/MISO.)
+#define SPI_DATA 2
+#define SPI_CLOCK 1
+#define SPI_SELECT 0
+#define AVR_HARDWARE_SPI
+
+#elif defined(__AVR_AT90USB646__) || defined(__AVR_AT90USB1286__)
+// teensy++ 2 defs
+// Pins 0-45 on ports A-F; masks are single-bit port masks written in decimal.
+
+_IO(A); _IO(B); _IO(C); _IO(D); _IO(E); _IO(F); 
+
+_DEFPIN_AVR(0, 1, D); _DEFPIN_AVR(1, 2, D); _DEFPIN_AVR(2, 4, D); _DEFPIN_AVR(3, 8, D); 
+_DEFPIN_AVR(4, 16, D); _DEFPIN_AVR(5, 32, D); _DEFPIN_AVR(6, 64, D); _DEFPIN_AVR(7, 128, D); 
+_DEFPIN_AVR(8, 1, E); _DEFPIN_AVR(9, 2, E); _DEFPIN_AVR(10, 1, C); _DEFPIN_AVR(11, 2, C); 
+_DEFPIN_AVR(12, 4, C); _DEFPIN_AVR(13, 8, C); _DEFPIN_AVR(14, 16, C); _DEFPIN_AVR(15, 32, C); 
+_DEFPIN_AVR(16, 64, C); _DEFPIN_AVR(17, 128, C); _DEFPIN_AVR(18, 64, E); _DEFPIN_AVR(19, 128, E); 
+_DEFPIN_AVR(20, 1, B); _DEFPIN_AVR(21, 2, B); _DEFPIN_AVR(22, 4, B); _DEFPIN_AVR(23, 8, B); 
+_DEFPIN_AVR(24, 16, B); _DEFPIN_AVR(25, 32, B); _DEFPIN_AVR(26, 64, B); _DEFPIN_AVR(27, 128, B); 
+_DEFPIN_AVR(28, 1, A); _DEFPIN_AVR(29, 2, A); _DEFPIN_AVR(30, 4, A); _DEFPIN_AVR(31, 8, A); 
+_DEFPIN_AVR(32, 16, A); _DEFPIN_AVR(33, 32, A); _DEFPIN_AVR(34, 64, A); _DEFPIN_AVR(35, 128, A); 
+_DEFPIN_AVR(36, 16, E); _DEFPIN_AVR(37, 32, E); _DEFPIN_AVR(38, 1, F); _DEFPIN_AVR(39, 2, F); 
+_DEFPIN_AVR(40, 4, F); _DEFPIN_AVR(41, 8, F); _DEFPIN_AVR(42, 16, F); _DEFPIN_AVR(43, 32, F); 
+_DEFPIN_AVR(44, 64, F); _DEFPIN_AVR(45, 128, F); 
+
+// Hardware SPI lines: per the table above these are port B bits 2, 1 and 0 (pins 22, 21, 20).
+#define SPI_DATA 22
+#define SPI_CLOCK 21
+#define SPI_SELECT 20
+#define AVR_HARDWARE_SPI
+
+#elif defined(__AVR_ATmega32U4__)
+
+// leonardo defs
+// ATmega32U4 without the Teensy core: pins 0-23 on ports B, C, D, E and F.
+// Masks are single-bit port masks written in decimal.
+_IO(B); _IO(C); _IO(D); _IO(E); _IO(F); 
+
+_DEFPIN_AVR(0, 4, D); _DEFPIN_AVR(1, 8, D); _DEFPIN_AVR(2, 2, D); _DEFPIN_AVR(3, 1, D); 
+_DEFPIN_AVR(4, 16, D); _DEFPIN_AVR(5, 64, C); _DEFPIN_AVR(6, 128, D); _DEFPIN_AVR(7, 64, E); 
+_DEFPIN_AVR(8, 16, B); _DEFPIN_AVR(9, 32, B); _DEFPIN_AVR(10, 64, B); _DEFPIN_AVR(11, 128, B); 
+_DEFPIN_AVR(12, 64, D); _DEFPIN_AVR(13, 128, C); _DEFPIN_AVR(14, 8, B); _DEFPIN_AVR(15, 2, B); 
+_DEFPIN_AVR(16, 4, B); _DEFPIN_AVR(17, 1, B); _DEFPIN_AVR(18, 128, F); _DEFPIN_AVR(19, 64, F); 
+// Pin 23 is port F bit 0, so its mask is 1 (a mask of 0 would select no bit at all).
+_DEFPIN_AVR(20, 32, F); _DEFPIN_AVR(21, 16, F); _DEFPIN_AVR(22, 2, F); _DEFPIN_AVR(23, 1, F); 
+
+// Hardware SPI lines: port B bits 2 and 1 (pins 16 and 15).  No SPI_SELECT is defined for this board.
+#define SPI_DATA 16
+#define SPI_CLOCK 15
+#define AVR_HARDWARE_SPI
+
+#elif defined(__MK20DX128__) && defined(CORE_TEENSY)
+
+// Teensy 3 (MK20DX128): pins 0-33 on 32-bit ports A-E.
+// NOTE(review): unlike the AVR tables, the second argument of _DEFPIN_ARM looks like a bit
+// number (values up to 19) rather than a mask -- confirm against the macro definition.
+_IO32(A); _IO32(B); _IO32(C); _IO32(D); _IO32(E);
+
+_DEFPIN_ARM(0, 16, B); _DEFPIN_ARM(1, 17, B); _DEFPIN_ARM(2, 0, D); _DEFPIN_ARM(3, 12, A);
+_DEFPIN_ARM(4, 13, A); _DEFPIN_ARM(5, 7, D); _DEFPIN_ARM(6, 4, D); _DEFPIN_ARM(7, 2, D);
+_DEFPIN_ARM(8, 3, D); _DEFPIN_ARM(9, 3, C); _DEFPIN_ARM(10, 4, C); _DEFPIN_ARM(11, 6, C);
+_DEFPIN_ARM(12, 7, C); _DEFPIN_ARM(13, 5, C); _DEFPIN_ARM(14, 1, D); _DEFPIN_ARM(15, 0, C);
+_DEFPIN_ARM(16, 0, B); _DEFPIN_ARM(17, 1, B); _DEFPIN_ARM(18, 3, B); _DEFPIN_ARM(19, 2, B);
+_DEFPIN_ARM(20, 5, D); _DEFPIN_ARM(21, 6, D); _DEFPIN_ARM(22, 1, C); _DEFPIN_ARM(23, 2, C);
+_DEFPIN_ARM(24, 5, A); _DEFPIN_ARM(25, 19, B); _DEFPIN_ARM(26, 1, E); _DEFPIN_ARM(27, 9, C);
+_DEFPIN_ARM(28, 8, C); _DEFPIN_ARM(29, 10, C); _DEFPIN_ARM(30, 11, C); _DEFPIN_ARM(31, 0, E);
+_DEFPIN_ARM(32, 18, B); _DEFPIN_ARM(33, 4, A);
+
+// Hardware SPI lines (pins 11 and 13); fastspi.h maps this pair onto ARMHardwareSPIOutput.
+#define SPI_DATA 11
+#define SPI_CLOCK 13
+#define ARM_HARDWARE_SPI
+
+#elif defined(__SAM3X8E__)
+
+// Arduino Due (SAM3X8E): pins 0-78 on ports A-D.
+// NOTE(review): the second argument of _DEFPIN_DUE looks like a bit number (values up to 30),
+// not a mask -- confirm against the macro definition.
+// No SPI_DATA/SPI_CLOCK are defined in this branch, so fastspi.h falls back to bit-banged SPI.
+DUE_IO32(A);
+DUE_IO32(B);
+DUE_IO32(C);
+DUE_IO32(D);
+
+_DEFPIN_DUE(0, 8, A); _DEFPIN_DUE(1, 9, A); _DEFPIN_DUE(2, 25, B); _DEFPIN_DUE(3, 28, C);
+_DEFPIN_DUE(4, 26, C); _DEFPIN_DUE(5, 25, C); _DEFPIN_DUE(6, 24, C); _DEFPIN_DUE(7, 23, C);
+_DEFPIN_DUE(8, 22, C); _DEFPIN_DUE(9, 21, C); _DEFPIN_DUE(10, 29, C); _DEFPIN_DUE(11, 7, D);
+_DEFPIN_DUE(12, 8, D); _DEFPIN_DUE(13, 27, B); _DEFPIN_DUE(14, 4, D); _DEFPIN_DUE(15, 5, D);
+_DEFPIN_DUE(16, 13, A); _DEFPIN_DUE(17, 12, A); _DEFPIN_DUE(18, 11, A); _DEFPIN_DUE(19, 10, A);
+_DEFPIN_DUE(20, 12, B); _DEFPIN_DUE(21, 13, B); _DEFPIN_DUE(22, 26, B); _DEFPIN_DUE(23, 14, A);
+_DEFPIN_DUE(24, 15, A); _DEFPIN_DUE(25, 0, D); _DEFPIN_DUE(26, 1, D); _DEFPIN_DUE(27, 2, D);
+_DEFPIN_DUE(28, 3, D); _DEFPIN_DUE(29, 6, D); _DEFPIN_DUE(30, 9, D); _DEFPIN_DUE(31, 7, A);
+_DEFPIN_DUE(32, 10, D); _DEFPIN_DUE(33, 1, C); _DEFPIN_DUE(34, 2, C); _DEFPIN_DUE(35, 3, C);
+_DEFPIN_DUE(36, 4, C); _DEFPIN_DUE(37, 5, C); _DEFPIN_DUE(38, 6, C); _DEFPIN_DUE(39, 7, C);
+_DEFPIN_DUE(40, 8, C); _DEFPIN_DUE(41, 9, C); _DEFPIN_DUE(42, 19, A); _DEFPIN_DUE(43, 20, A);
+_DEFPIN_DUE(44, 19, C); _DEFPIN_DUE(45, 18, C); _DEFPIN_DUE(46, 17, C); _DEFPIN_DUE(47, 16, C);
+_DEFPIN_DUE(48, 15, C); _DEFPIN_DUE(49, 14, C); _DEFPIN_DUE(50, 13, C); _DEFPIN_DUE(51, 12, C);
+_DEFPIN_DUE(52, 21, B); _DEFPIN_DUE(53, 14, B); _DEFPIN_DUE(54, 16, A); _DEFPIN_DUE(55, 24, A);
+_DEFPIN_DUE(56, 23, A); _DEFPIN_DUE(57, 22, A); _DEFPIN_DUE(58, 6, A); _DEFPIN_DUE(59, 4, A);
+_DEFPIN_DUE(60, 3, A); _DEFPIN_DUE(61, 2, A); _DEFPIN_DUE(62, 17, B); _DEFPIN_DUE(63, 18, B);
+_DEFPIN_DUE(64, 19, B); _DEFPIN_DUE(65, 20, B); _DEFPIN_DUE(66, 15, B); _DEFPIN_DUE(67, 16, B);
+_DEFPIN_DUE(68, 1, A); _DEFPIN_DUE(69, 0, A); _DEFPIN_DUE(70, 17, A); _DEFPIN_DUE(71, 18, A);
+_DEFPIN_DUE(72, 30, C); _DEFPIN_DUE(73, 21, A); _DEFPIN_DUE(74, 25, A); _DEFPIN_DUE(75, 26, A);
+_DEFPIN_DUE(76, 27, A); _DEFPIN_DUE(77, 28, A); _DEFPIN_DUE(78, 23, B);
+
+#else
+
+// Unknown board: no compile-time pin table is available, so flag that to the rest of the
+// library and tell the user why pin access takes the slower path.
+#warning "No pin/port mappings found, pin access will be slightly slower.  See fastpin.h for info."
+#define NO_HARDWARE_PIN_SUPPORT
+
+#endif
+
+#endif
diff --git a/fastspi.h b/fastspi.h
new file mode 100644
index 00000000..00747137
--- /dev/null
+++ b/fastspi.h
@@ -0,0 +1,91 @@
+#ifndef __INC_FASTSPI_H
+#define __INC_FASTSPI_H
+
+#include "controller.h"
+#include "lib8tion.h"
+#include "delay.h"
+
+// Some helper macros for getting at mis-ordered byte values
+//
+// These only make sense inside a template that has SKIP and RGB_ORDER in scope (the
+// writeBytes3 family): they expand at the point of use and pick those names up from there.
+//   SPI_B0/B1/B2 - index of the 1st/2nd/3rd byte to send within the current pixel, i.e. the
+//                  RGB_BYTEn(RGB_ORDER) position shifted past the skipped leading bytes
+//   SPI_ADVANCE  - distance in bytes from one pixel to the next (3 colour bytes + skipped bytes)
+// The skip count is the low six bits of SKIP (MASK_SKIP_BITS); the top bit is FLAG_START_BIT.
+#define SPI_B0 (RGB_BYTE0(RGB_ORDER) + (MASK_SKIP_BITS & SKIP))
+#define SPI_B1 (RGB_BYTE1(RGB_ORDER) + (MASK_SKIP_BITS & SKIP))
+#define SPI_B2 (RGB_BYTE2(RGB_ORDER) + (MASK_SKIP_BITS & SKIP))
+#define SPI_ADVANCE (3 + (MASK_SKIP_BITS & SKIP))
+
+/// Some of the SPI controllers will need to perform a transform on each byte before doing
+/// anything with it.  Creating a class of this form and passing it in as a template parameter to
+/// writeBytes/writeBytes3 below will ensure that the body of this method will get called on every
+/// byte worked on.  Recommendation: make the adjust method aggressively inlined.
+///
+/// TODO: Convenience macro for building these
+class DATA_NOP { 
+public:
+	/// Per-byte hook, unscaled form: hands the byte back unchanged.
+	static __attribute__((always_inline)) inline uint8_t adjust(register uint8_t data) {
+		return data;
+	}
+
+	/// Per-byte hook, scaled form: the only transform applied is scale8(data, scale).
+	static __attribute__((always_inline)) inline uint8_t adjust(register uint8_t data, register uint8_t scale) {
+		const uint8_t scaled = scale8(data, scale);
+		return scaled;
+	}
+
+	/// Per-block hook, called by the writers once a block of len bytes has been queued.  No-op here.
+	static __attribute__((always_inline)) inline void postBlock(int len) {
+		(void)len;
+	}
+};
+
+// Layout of the SKIP template parameter taken by writeBytes3:
+//   bit 7     (FLAG_START_BIT) - a start bit is sent ahead of every pixel
+//   bits 0..5 (MASK_SKIP_BITS) - number of bytes to skip at the head of every pixel
+#define FLAG_START_BIT 0x80
+#define MASK_SKIP_BITS 0x3F
+
+// Clock speed dividers 
+// Each constant is simply the divider value itself, for use as the SPI clock divider template argument.
+#define SPEED_DIV_2 2
+#define SPEED_DIV_4 4
+#define SPEED_DIV_8 8
+#define SPEED_DIV_16 16
+#define SPEED_DIV_32 32
+#define SPEED_DIV_64 64
+#define SPEED_DIV_128 128
+
+// Data-rate helpers: turn a wanted SPI rate into a clock divider relative to F_CPU.
+//   MAX_DATA_RATE    - divider 0; the hardware drivers treat anything below their smallest
+//                      supported divider as "as fast as possible"
+//   DATA_RATE_MHZ(X) - divider for X MHz  (integer division, rounds down)
+//   DATA_RATE_KHZ(X) - divider for X kHz  (integer division, rounds down)
+// X is parenthesised so that expression arguments such as DATA_RATE_MHZ(2 + 2) divide by the
+// whole expression instead of just its first term.
+#define MAX_DATA_RATE 0
+#define DATA_RATE_MHZ(X) ((F_CPU / 1000000L) / (X))
+#define DATA_RATE_KHZ(X) ((F_CPU / 1000L) / (X))
+
+// Include the various specific SPI implementations
+#include "fastspi_bitbang.h"
+#include "fastspi_arm.h"
+#include "fastspi_avr.h"
+#include "fastspi_dma.h"
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// External SPI template definition with partial instantiation(s) to map to hardware SPI ports on platforms/builds where the pin
+// mappings are known at compile time.
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Generic case: any (data, clock) pin pair gets the bit-banged implementation.
+template<uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class SPIOutput : public AVRSoftwareSPIOutput<_DATA_PIN, _CLOCK_PIN, _SPI_CLOCK_DIVIDER> {};
+
+// Always the bit-banged implementation, even on the hardware SPI pins.
+template<uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class SoftwareSPIOutput : public AVRSoftwareSPIOutput<_DATA_PIN, _CLOCK_PIN, _SPI_CLOCK_DIVIDER> {};
+
+// Unless FORCE_SOFTWARE_SPI is defined, partially specialise SPIOutput for exactly the
+// (SPI_DATA, SPI_CLOCK) pin pair so that pair -- and only that pair -- uses the SPI peripheral.
+#ifndef FORCE_SOFTWARE_SPI
+#if defined(SPI_DATA) && defined(SPI_CLOCK)
+
+#if defined(__MK20DX128__) && defined(CORE_TEENSY)
+
+// Teensy 3: native ARM SPI0 driver
+template<uint8_t SPI_SPEED>
+class SPIOutput<SPI_DATA, SPI_CLOCK, SPI_SPEED> : public ARMHardwareSPIOutput<SPI_DATA, SPI_CLOCK, SPI_SPEED> {};
+
+#else
+
+// Every other board that defines SPI_DATA/SPI_CLOCK: AVR SPCR/SPSR/SPDR driver
+template<uint8_t SPI_SPEED>
+class SPIOutput<SPI_DATA, SPI_CLOCK, SPI_SPEED> : public AVRHardwareSPIOutput<SPI_DATA, SPI_CLOCK, SPI_SPEED> {};
+
+#endif
+
+#else
+#warning "No hardware SPI pins defined.  All SPI access will default to bitbanged output"
+
+#endif
+
+// Disabled: USART-in-SPI-mode specialisation (the USART driver is not finished).
+// #if defined(USART_DATA) && defined(USART_CLOCK)
+// template<uint8_t SPI_SPEED>
+// class AVRSPIOutput<USART_DATA, USART_CLOCK, SPI_SPEED> : public AVRUSARTSPIOutput<USART_DATA, USART_CLOCK, SPI_SPEED> {};
+// #endif
+
+#else
+#warning "Forcing software SPI - no hardware SPI for you!"
+#endif 
+
+#endif
diff --git a/fastspi_arm.h b/fastspi_arm.h
new file mode 100644
index 00000000..e9c38343
--- /dev/null
+++ b/fastspi_arm.h
@@ -0,0 +1,386 @@
+#ifndef __INC_FASTSPI_ARM_H
+#define __INC_FASTSPI_ARM_H
+
+
+#if defined(__MK20DX128__) && defined(CORE_TEENSY)
+
+// Compatibility aliases: if the core headers in use do not provide the unprefixed SPI_PUSHR_*
+// names, map them onto the SPI0_PUSHR_* spellings so the code below can use one set of names.
+#ifndef SPI_PUSHR_CONT
+#define SPI_PUSHR_CONT SPI0_PUSHR_CONT
+#define SPI_PUSHR_CTAS(X) SPI0_PUSHR_CTAS(X)
+#define SPI_PUSHR_EOQ SPI0_PUSHR_EOQ
+#define SPI_PUSHR_CTCNT SPI0_PUSHR_CTCNT
+#define SPI_PUSHR_PCS(X) SPI0_PUSHR_PCS(X)
+#endif
+
+// Compile-time search for the index of the highest set bit of VAL, scanning down from BIT.
+// A VAL of 0 yields 0 -- the same answer as a VAL whose only set bit is bit 0 -- so callers
+// that care about the difference have to test VAL themselves.
+template<int VAL, int BIT> class BitWork { 
+	public: 
+		static __attribute__((always_inline)) inline int highestBit() {
+			if((VAL & (1 << BIT)) != 0) { return BIT; }
+			return BitWork<VAL, BIT-1>::highestBit();
+		} 
+};
+// Recursion terminator: nothing above bit 0 was set.
+template<int VAL> class BitWork<VAL, 0> { 
+	public: 
+		static __attribute__((always_inline)) inline int highestBit() { return 0; } 
+};
+
+#define MAX(A, B) (( (A) > (B) ) ? (A) : (B))
+
+#define USE_CONT 0
+
+// Translate a clock divider (VAL) into the three SPI clock settings, returned through the
+// reference parameters:
+//   preScalar - prescaler selector 0..3, standing for a prescale of 2, 3, 5 or 7
+//   scalar    - baud rate scaler selector
+//   dbl       - 1 to turn on clock doubling, 0 otherwise
+// A handful of dividers are table-driven; everything else picks the largest
+// prescale * power-of-two product that does not exceed VAL.
+template <int VAL> void getScalars(uint32_t & preScalar, uint32_t & scalar, uint32_t & dbl) {
+	switch(VAL) {
+		// Dividers that need the clock doubler
+		case 0: case 1: case 2:
+			preScalar = 0; scalar = 0; dbl = 1; return;
+		case 3:
+			preScalar = 1; scalar = 0; dbl = 1; return;
+		case 5:
+			preScalar = 2; scalar = 0; dbl = 1; return;
+		case 7:
+			preScalar = 3; scalar = 0; dbl = 1; return;
+
+		// Dividers built on the scaler value 6, which is not a power of two and so is
+		// never found by the generic search below
+		case 9:
+			preScalar = 1; scalar = 2; dbl = 1; return;
+		case 18: case 19:
+			preScalar = 1; scalar = 2; dbl = 0; return;
+		case 15:
+			preScalar = 2; scalar = 2; dbl = 1; return;
+		case 30: case 31:
+			preScalar = 2; scalar = 2; dbl = 0; return;
+		case 21: case 22: case 23:
+			preScalar = 3; scalar = 2; dbl = 1; return;
+		case 42: case 43: case 44: case 45: case 46: case 47:
+			preScalar = 3; scalar = 2; dbl = 0; return;
+
+		default:
+			break;
+	}
+
+	// For each prescale p in {2,3,5,7}: the largest power of two (as an exponent) that fits in VAL/p
+	const int exp2 = BitWork<VAL/2, 15>::highestBit();
+	const int exp3 = BitWork<VAL/3, 15>::highestBit();
+	const int exp5 = BitWork<VAL/5, 15>::highestBit();
+	const int exp7 = BitWork<VAL/7, 15>::highestBit();
+
+	// ... and the divider each of those combinations reaches (0 = prescale alone already exceeds VAL)
+	const int reach2 = 2 * (1 << exp2);
+	const int reach3 = (VAL/3) > 0 ? 3 * (1 << exp3) : 0;
+	const int reach5 = (VAL/5) > 0 ? 5 * (1 << exp5) : 0;
+	const int reach7 = (VAL/7) > 0 ? 7 * (1 << exp7) : 0;
+
+	// Keep the combination that reaches furthest; on a tie the smaller prescale wins
+	int furthest = reach2;
+	uint32_t chosenPre = 0;
+	int chosenExp = exp2;
+	if(reach3 > furthest) { furthest = reach3; chosenPre = 1; chosenExp = exp3; }
+	if(reach5 > furthest) { furthest = reach5; chosenPre = 2; chosenExp = exp5; }
+	if(reach7 > furthest) { furthest = reach7; chosenPre = 3; chosenExp = exp7; }
+
+	preScalar = chosenPre;
+	scalar = chosenExp;
+
+	// Exponent 0 is expressed through the doubler; exponents 1 and 2 sit one selector lower
+	dbl = 0;
+	if(scalar == 0) { dbl = 1; }
+	else if(scalar < 3) { scalar--; }
+}
+
+
+/// Hardware SPI output for the Teensy 3 (MK20DX128), driving the SPI0 peripheral directly.
+///
+/// Template parameters:
+///   _DATA_PIN, _CLOCK_PIN - pins switched to outputs by init()
+///   _SPI_CLOCK_DIVIDER    - clock divider, translated into PBR/BR/DBR settings by getScalars
+///
+/// CTAR0 is kept set up for 8-bit frames and CTAR1 for 16-bit frames (see setSPIRate), so each
+/// push can pick its frame size through the CTAS field of PUSHR.
+///
+/// NOTE(review): enable_pins()/disable_pins() are hard-wired to core pins 11/12/13 regardless
+/// of the template pin arguments.
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class ARMHardwareSPIOutput { 
+	Selectable *m_pSelect;
+
+	// Borrowed from the teensy3 SPSR emulation code
+	// Route pins 11/12/13 to the SPI peripheral (pin mux alternative 2).
+	static inline void enable_pins(void) __attribute__((always_inline)) {
+		//serial_print("enable_pins\n");
+		CORE_PIN11_CONFIG = PORT_PCR_DSE | PORT_PCR_MUX(2);
+		CORE_PIN12_CONFIG = PORT_PCR_MUX(2);
+		CORE_PIN13_CONFIG = PORT_PCR_DSE | PORT_PCR_MUX(2);
+	}
+
+	// Borrowed from the teensy3 SPSR emulation code
+	// Hand pins 11/12/13 back to plain GPIO (pin mux alternative 1).
+	static inline void disable_pins(void) __attribute__((always_inline)) {
+		//serial_print("disable_pins\n");
+		CORE_PIN11_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1);
+		CORE_PIN12_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1);
+		CORE_PIN13_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1);
+	}
+
+public:
+	/// Output with no chip-select object; select()/release() then do nothing.
+	ARMHardwareSPIOutput() { m_pSelect = NULL; }
+	/// Output whose select()/release() forward to pSelect.
+	ARMHardwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+	/// Replace the chip-select object (NULL disables select/release).
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	/// Write ctar into SPI0_CTAR0 if it differs from the current value.  When the module is
+	/// running it is disabled and halted around the write and MCR is restored afterwards.
+	static inline void update_ctar0(uint32_t ctar) __attribute__((always_inline)) {
+		if (SPI0_CTAR0 == ctar) return;
+		uint32_t mcr = SPI0_MCR;
+		if (mcr & SPI_MCR_MDIS) {
+			SPI0_CTAR0 = ctar;
+		} else {
+			SPI0_MCR = mcr | SPI_MCR_MDIS | SPI_MCR_HALT;
+			SPI0_CTAR0 = ctar;
+			SPI0_MCR = mcr;
+		}
+	}
+
+	/// Same as update_ctar0, for SPI0_CTAR1.
+	static inline void update_ctar1(uint32_t ctar) __attribute__((always_inline)) {
+		if (SPI0_CTAR1 == ctar) return;
+		uint32_t mcr = SPI0_MCR;
+		if (mcr & SPI_MCR_MDIS) {
+			SPI0_CTAR1 = ctar;
+		} else {
+			SPI0_MCR = mcr | SPI_MCR_MDIS | SPI_MCR_HALT;
+			SPI0_CTAR1 = ctar;
+			SPI0_MCR = mcr;
+		}
+	}
+
+	/// Change the CTAR1 frame size to `bits` bits, leaving every other CTAR1 field alone.
+	static inline void set_ctar1_bits(int bits) { 
+		uint32_t ctar = SPI0_CTAR1;
+
+		// clear the FMSZ field (and only that field), then install the new size; FMSZ holds bits-1
+		ctar &= ~SPI_CTAR_FMSZ(0x0F);
+		ctar |= SPI_CTAR_FMSZ((bits-1) & 0x0F);
+
+		update_ctar1(ctar);
+	}
+
+	/// Change the CTAR0 frame size to `bits` bits, leaving every other CTAR0 field alone.
+	static inline void set_ctar0_bits(int bits) { 
+		uint32_t ctar = SPI0_CTAR0;
+
+		// clear the FMSZ field (and only that field), then install the new size; FMSZ holds bits-1
+		ctar &= ~SPI_CTAR_FMSZ(0x0F);
+		ctar |= SPI_CTAR_FMSZ((bits-1) & 0x0F);
+
+		update_ctar0(ctar);
+	}
+
+
+	/// Program both CTAR registers for _SPI_CLOCK_DIVIDER: CTAR0 with 8-bit frames, CTAR1 with
+	/// 16-bit frames, identical clock settings in both.
+	void setSPIRate() { 
+		// Configure CTAR0, defaulting to 8 bits and CTAR1, defaulting to 16 bits
+	 	uint32_t _PBR = 0;
+	 	uint32_t _BR = 0;
+	 	uint32_t _CSSCK = 0;
+	 	uint32_t _DBR = 0;
+
+	 	// Earlier hand-written mapping, kept for reference; getScalars supersedes it:
+	 	// if(_SPI_CLOCK_DIVIDER >= 256) 		{ _PBR = 0; _BR = _CSSCK = 7; _DBR = 0; } // osc/256
+	 	// else if(_SPI_CLOCK_DIVIDER >= 128) 	{ _PBR = 0; _BR = _CSSCK = 6; _DBR = 0; } // osc/128
+	 	// else if(_SPI_CLOCK_DIVIDER >= 64) 	{ _PBR = 0; _BR = _CSSCK = 5; _DBR = 0; } // osc/64
+	 	// else if(_SPI_CLOCK_DIVIDER >= 32) 	{ _PBR = 0; _BR = _CSSCK = 4; _DBR = 0; } // osc/32
+	 	// else if(_SPI_CLOCK_DIVIDER >= 16) 	{ _PBR = 0; _BR = _CSSCK = 3; _DBR = 0; } // osc/16
+	 	// else if(_SPI_CLOCK_DIVIDER >= 8) 	{ _PBR = 0; _BR = _CSSCK = 1; _DBR = 0; } // osc/8
+	 	// else if(_SPI_CLOCK_DIVIDER >= 7) 	{ _PBR = 3; _BR = _CSSCK = 0; _DBR = 1; } // osc/7
+	 	// else if(_SPI_CLOCK_DIVIDER >= 5) 	{ _PBR = 2; _BR = _CSSCK = 0; _DBR = 1; } // osc/5
+	 	// else if(_SPI_CLOCK_DIVIDER >= 4) 	{ _PBR = 0; _BR = _CSSCK = 0; _DBR = 0; } // osc/4
+	 	// else if(_SPI_CLOCK_DIVIDER >= 3) 	{ _PBR = 1; _BR = _CSSCK = 0; _DBR = 1; } // osc/3
+	 	// else                                { _PBR = 0; _BR = _CSSCK = 0; _DBR = 1; } // osc/2
+
+	 	getScalars<_SPI_CLOCK_DIVIDER>(_PBR, _BR, _DBR);
+	 	_CSSCK = _BR;
+
+	 	uint32_t ctar0 = SPI_CTAR_FMSZ(7) | SPI_CTAR_PBR(_PBR) | SPI_CTAR_BR(_BR) | SPI_CTAR_CSSCK(_CSSCK);
+	 	uint32_t ctar1 = SPI_CTAR_FMSZ(15) | SPI_CTAR_PBR(_PBR) | SPI_CTAR_BR(_BR) | SPI_CTAR_CSSCK(_CSSCK);
+
+#if USE_CONT == 1
+	 	ctar0 |= SPI_CTAR_CPHA | SPI_CTAR_CPOL;
+	 	ctar1 |= SPI_CTAR_CPHA | SPI_CTAR_CPOL;
+#endif
+
+	 	if(_DBR) { 
+	 		ctar0 |= SPI_CTAR_DBR;
+	 		ctar1 |= SPI_CTAR_DBR;
+	 	}
+
+		update_ctar0(ctar0);
+		update_ctar1(ctar1);
+	}
+	
+	/// One-time setup: pins to output, SPI0 clock gate on, clock rate programmed, module put in
+	/// master mode and started, pins routed to the peripheral.
+	void init() {
+		// set the pins to output
+		FastPin<_DATA_PIN>::setOutput();
+		FastPin<_CLOCK_PIN>::setOutput();
+		release();
+
+		// Enable SPI0 clock
+		uint32_t sim6 = SIM_SCGC6;
+		if (!(sim6 & SIM_SCGC6_SPI0)) {
+			//serial_print("init1\n");
+			SIM_SCGC6 = sim6 | SIM_SCGC6_SPI0;
+			SPI0_CTAR0 = SPI_CTAR_FMSZ(7) | SPI_CTAR_PBR(1) | SPI_CTAR_BR(1);
+		}
+
+		setSPIRate();
+
+		// Configure SPI as the master and enable 
+		SPI0_MCR |= SPI_MCR_MSTR; // | SPI_MCR_CONT_SCKE);
+		SPI0_MCR &= ~(SPI_MCR_MDIS | SPI_MCR_HALT);
+
+		enable_pins();
+	}
+
+	/// Block until the transmit FIFO counter (SR bits 12-15) reads zero and the transfer-complete
+	/// flag is set, then write the TCF and EOQF flags back to the status register.
+	static void waitFully() __attribute__((always_inline)) { 
+		while( (SPI0_SR & 0xF000) > 0); 
+		while (!(SPI0_SR & SPI_SR_TCF)); 
+		SPI0_SR |= (SPI_SR_TCF | SPI_SR_EOQF); 
+	}
+
+	/// True while SR bit 14 is set, i.e. while the FIFO counter field is at 4 or above.
+	static bool needwait() __attribute__((always_inline)) { return (SPI0_SR & 0x4000); }
+	/// Spin until needwait() would return false.
+	static void wait() __attribute__((always_inline)) { while( (SPI0_SR & 0x4000) );  }
+	/// Spin until the FIFO counter field has dropped below 2.
+	static void wait1() __attribute__((always_inline)) { while( (SPI0_SR & 0xF000) >= 0x2000);  }
+	
+	enum ECont { CONT, NOCONT };
+	enum EWait { PRE, POST, NONE };
+	enum ELast { NOTLAST, LAST };
+
+	// CM/WM: the continuation and wait modes the bulk writers below instantiate Write<> with
+#if USE_CONT == 1
+	#define CM CONT
+#else
+	#define CM NOCONT
+#endif
+	#define WM PRE
+
+	/// Push helper parameterised at compile time on: whether the CONT flag is set in the push,
+	/// whether to wait for FIFO room before (PRE), after (POST) or never (NONE), and whether
+	/// the push carries the end-of-queue flag (LAST).
+	template<ECont CONT_STATE, EWait WAIT_STATE, ELast LAST_STATE> class Write { 
+	public:
+		/// Push one 16-bit frame (CTAR1).
+		static void writeWord(uint16_t w) __attribute__((always_inline)) { 
+			if(WAIT_STATE == PRE) { wait(); }
+			SPI0_PUSHR = ((LAST_STATE == LAST) ? SPI_PUSHR_EOQ : 0) |
+			             ((CONT_STATE == CONT) ? SPI_PUSHR_CONT : 0) | 
+			             SPI_PUSHR_CTAS(1) | (w & 0xFFFF);
+			if(WAIT_STATE == POST) { wait(); }
+		}
+
+		/// Push one 8-bit frame (CTAR0).
+		static void writeByte(uint8_t b) __attribute__((always_inline)) { 
+			if(WAIT_STATE == PRE) { wait(); }
+			SPI0_PUSHR = ((LAST_STATE == LAST) ? SPI_PUSHR_EOQ : 0) |
+			             ((CONT_STATE == CONT) ? SPI_PUSHR_CONT : 0) | 
+			             SPI_PUSHR_CTAS(0) | (b & 0xFF);
+			if(WAIT_STATE == POST) { wait(); }
+		}
+	};
+
+	// Plain push helpers.  Naming: Word = CTAR1 frame, Byte = CTAR0 frame, Cont = CONT flag set,
+	// NoWait = push immediately, PostWait = push then wait, no suffix = wait then push.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { wait(); SPI0_PUSHR = SPI_PUSHR_CTAS(1) | (w & 0xFFFF); }
+	static void writeWordNoWait(uint16_t w) __attribute__((always_inline)) { SPI0_PUSHR = SPI_PUSHR_CTAS(1) | (w & 0xFFFF); }
+
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); SPI0_PUSHR = SPI_PUSHR_CTAS(0) | (b & 0xFF); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { SPI0_PUSHR = SPI_PUSHR_CTAS(0) | (b & 0xFF); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { SPI0_PUSHR = SPI_PUSHR_CTAS(0) | (b & 0xFF); }
+
+	static void writeWordCont(uint16_t w) __attribute__((always_inline)) { wait(); SPI0_PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(1) | (w & 0xFFFF); }
+	static void writeWordContNoWait(uint16_t w) __attribute__((always_inline)) { SPI0_PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(1) | (w & 0xFFFF); }
+
+	static void writeByteCont(uint8_t b) __attribute__((always_inline)) { wait(); SPI0_PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(0) | (b & 0xFF); }
+	static void writeByteContPostWait(uint8_t b) __attribute__((always_inline)) { SPI0_PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(0) | (b & 0xFF); wait(); }
+	static void writeByteContNoWait(uint8_t b) __attribute__((always_inline)) { SPI0_PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(0) | (b & 0xFF); }
+
+	// not the most efficient mechanism in the world - but should be enough for sm16716 and friends
+	/// Send bit BIT of b as a frame of its own, by temporarily shrinking the CTAR1 frame size.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) { 
+		uint32_t ctar1_save = SPI0_CTAR1;
+
+		// Clear out the FMSZ bits and request the smallest frame (FMSZ = 0, i.e. one bit)
+		// NOTE(review): confirm the peripheral accepts FMSZ = 0, and that restoring CTAR1
+		// right after the push cannot cut the frame short.
+		uint32_t ctar1 = (ctar1_save & (~SPI_CTAR_FMSZ(15))) | SPI_CTAR_FMSZ(0);
+		update_ctar1(ctar1);
+
+		writeWord( (b & (1 << BIT)) != 0);
+
+		update_ctar1(ctar1_save);
+	}
+
+	/// Assert / deassert the chip-select object, if one was supplied.
+	void inline select() __attribute__((always_inline)) { if(m_pSelect != NULL) { m_pSelect->select(); } } 
+	void inline release() __attribute__((always_inline)) { if(m_pSelect != NULL) { m_pSelect->release(); } } 
+
+	/// Queue value len times.  No rate setup, select/release or final wait: the caller owns those.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) { Write<CM, WM, NOTLAST>::writeByte(value); }
+	}
+
+	/// Send value len times as one complete, selected transaction.
+	void writeBytesValue(uint8_t value, int len) { 
+		setSPIRate();
+		select();
+		while(len--) { 
+			writeByte(value);
+		}
+		waitFully();
+		release();
+	}
+	
+	// Write a block of n uint8_ts out 
+	/// Every byte goes through D::adjust first; D::postBlock(len) runs once the block is queued.
+	template <class D> void writeBytes(register uint8_t *data, int len) { 
+		setSPIRate();
+		uint8_t *end = data + len;
+		select();
+		while(data != end) { 
+			writeByte(D::adjust(*data++));
+		}
+		D::postBlock(len);
+		waitFully();
+		release();	
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// write a block of uint8_ts out in groups of three.  len is the total number of uint8_ts to write out.  The template
+	// parameters indicate how many uint8_ts to skip at the beginning and/or end of each grouping
+	/// SKIP packs the per-pixel skip count (low bits) and FLAG_START_BIT; D transforms every byte
+	/// (with scale); RGB_ORDER selects the byte order through the SPI_B0/B1/B2 macros.
+	/// The loops compare data against data + len for equality, so len has to be a whole number
+	/// of pixels (a multiple of SPI_ADVANCE).  The SPI rate is not reprogrammed here.
+	template <uint8_t SKIP, class D, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		// setSPIRate();
+		uint8_t *end = data + len;
+		select();
+		if((SKIP & FLAG_START_BIT) == 0) {
+			// No start bit: write out as many 16-bit blocks as we can.  Two pixels are six
+			// bytes, i.e. exactly three 16-bit frames.
+			uint8_t *first_end = end - (len % (SPI_ADVANCE * 2));
+			
+			while(data != first_end) {
+				if(WM == NONE) { wait1(); }
+				Write<CM, WM, NOTLAST>::writeWord(D::adjust(data[SPI_B0], scale) << 8 | D::adjust(data[SPI_B1], scale));
+				Write<CM, WM, NOTLAST>::writeWord(D::adjust(data[SPI_B2], scale) << 8 | D::adjust(data[SPI_ADVANCE + SPI_B0], scale));
+				Write<CM, WM, NOTLAST>::writeWord(D::adjust(data[SPI_ADVANCE + SPI_B1], scale) << 8 | D::adjust(data[SPI_ADVANCE + SPI_B2], scale));
+				data += (SPI_ADVANCE + SPI_ADVANCE);
+			}
+
+			if(data != end) { 
+				if(WM == NONE) { wait1(); }
+				// write out the rest as alternating 16/8-bit blocks (likely to be just one)
+				Write<CM, WM, NOTLAST>::writeWord(D::adjust(data[SPI_B0], scale) << 8 | D::adjust(data[SPI_B1], scale));
+				Write<CM, WM, NOTLAST>::writeByte(D::adjust(data[SPI_B2], scale));
+			}
+
+			D::postBlock(len);
+			waitFully();
+		} else {
+			// Start bit wanted: the first byte of every pixel goes out as a 9-bit frame whose
+			// top bit is the start bit; the other two bytes go out as ordinary 8-bit frames.
+			uint32_t ctar1_save = SPI0_CTAR1;
+
+			// Clear out the FMSZ bits, reset them for 9 bits transferred for the start bit
+			uint32_t ctar1 = (ctar1_save & (~SPI_CTAR_FMSZ(15))) | SPI_CTAR_FMSZ(8);
+			update_ctar1(ctar1);
+
+			while(data != end) { 
+				writeWord( 0x100 | D::adjust(data[SPI_B0], scale));
+				writeByte(D::adjust(data[SPI_B1], scale));
+				writeByte(D::adjust(data[SPI_B2], scale));
+				data += SPI_ADVANCE;
+			}
+			D::postBlock(len);
+			waitFully();
+
+			// restore ctar1
+			update_ctar1(ctar1_save);
+		}
+		release();
+	}
+
+
+	// Convenience overloads filling in the defaults: SKIP = 0, D = DATA_NOP, RGB_ORDER = RGB
+	template <uint8_t SKIP, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<SKIP, DATA_NOP, RGB_ORDER>(data, len, scale); 
+	}
+	template <class D, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, D, RGB_ORDER>(data, len, scale); 
+	}
+	template <EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, DATA_NOP, RGB_ORDER>(data, len, scale); 
+	}
+	void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, DATA_NOP, RGB>(data, len, scale); 
+	}
+};
+#endif
+
+#endif
diff --git a/fastspi_avr.h b/fastspi_avr.h
new file mode 100644
index 00000000..af116cab
--- /dev/null
+++ b/fastspi_avr.h
@@ -0,0 +1,314 @@
+#ifndef __INC_FASTSPI_AVR_H
+#define __INC_FASTSPI_AVR_H
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Hardware SPI support using USART registers and friends
+//
+// TODO: Complete/test implementation - right now this doesn't work
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// uno/mini/duemilanove
+#if defined(AVR_HARDWARE_SPI)
+#if defined(UBRR0)
+/// SPI output driven through USART0 switched into master SPI mode.
+///
+/// The section header above marks this implementation as incomplete and not working yet.
+/// Note that _SPI_CLOCK_DIVIDER is not consulted: init() programs a fixed baud rate.
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class AVRUSARTSPIOutput { 
+	Selectable *m_pSelect;
+
+public:
+	/// Output with no chip-select object; select() then does nothing and release() only waits.
+	AVRUSARTSPIOutput() { m_pSelect = NULL; }
+	/// Output whose select()/release() forward to pSelect.
+	AVRUSARTSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+	/// Replace the chip-select object (NULL disables it).
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	/// Put USART0 into master SPI mode with transmitter and receiver enabled.
+	void init() { 
+		// the baud rate register is held at zero while the mode is being switched
+		UBRR0 = 0;
+		UCSR0A = 1<<TXC0;
+
+		FastPin<_CLOCK_PIN>::setOutput();
+		FastPin<_DATA_PIN>::setOutput();
+
+		UCSR0C = _BV (UMSEL00) | _BV (UMSEL01);  // Master SPI mode
+		UCSR0B = _BV (TXEN0) | _BV (RXEN0);  // transmit enable and receive enable
+
+		// must be done last, see page 206
+		UBRR0 = 3;  // 2 Mhz clock rate
+	}
+
+	static void stop() { 
+		// TODO: stop the uart spi output
+	}
+
+	/// Spin until the transmit data register is empty (UDRE0 set).
+	static void wait() __attribute__((always_inline)) { while(!(UCSR0A & (1<<UDRE0))); }
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+	
+	// Byte writers: NoWait = write immediately, PostWait = write then wait, plain = wait then write
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { UDR0 = b;}
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { UDR0 = b; wait(); }
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); UDR0 = b; }
+
+	/// Send a 16-bit value, high byte first.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+	
+	/// Clock out bit BIT of b by hand: set the data pin to that bit, then pulse the clock pin.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) { 
+		// bitwise test of the one requested bit (a logical && would be true for any non-zero b)
+		if(b & (1 << BIT)) { 
+			FastPin<_DATA_PIN>::hi();
+		} else { 
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+	}
+
+	/// Assert the chip-select object, if one was supplied.
+	void select() { if(m_pSelect != NULL) { m_pSelect->select(); } }
+	/// Wait for the transmit-complete flag, then deassert the chip-select object, if any.
+	void release() { 
+		// wait for all transmissions to finish
+		while ((UCSR0A & (1 <<TXC0)) == 0) {}
+		if(m_pSelect != NULL) { m_pSelect->release(); }
+	}
+
+	/// Send value len times.  No select/release: the caller owns the transaction.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) { writeByte(value); }
+	}
+	
+	/// Send value len times as one complete, selected transaction.
+	void writeBytesValue(uint8_t value, int len) { 
+		select();
+		while(len--) { 
+			writeByte(value);
+		}
+		release();
+	}
+	
+	// Write a block of n uint8_ts out 
+	/// Every byte goes through D::adjust first; D::postBlock(len) runs once the block is sent.
+	template <class D> void writeBytes(register uint8_t *data, int len) { 
+		uint8_t *end = data + len;
+		select();
+		while(data != end) { 
+#if defined(__MK20DX128__) 
+			writeByte(D::adjust(*data++));
+#else
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+#endif
+		}
+		D::postBlock(len);
+		release();	
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// write a block of uint8_ts out in groups of three.  len is the total number of uint8_ts to write out.  The template
+	// parameters indicate how many uint8_ts to skip at the beginning and/or end of each grouping
+	/// The loop compares data against data + len for equality, so len has to be a whole number
+	/// of pixels (a multiple of SPI_ADVANCE).
+	template <uint8_t SKIP, class D, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		uint8_t *end = data + len;
+		select();
+		while(data != end) { 
+			writeByte(D::adjust(data[SPI_B0], scale));
+			writeByte(D::adjust(data[SPI_B1], scale));
+			writeByte(D::adjust(data[SPI_B2], scale));
+			data += SPI_ADVANCE;
+		}
+		D::postBlock(len);
+		release();
+	}
+
+	// Convenience overloads filling in the defaults: SKIP = 0, D = DATA_NOP, RGB_ORDER = RGB
+	template <uint8_t SKIP, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<SKIP, DATA_NOP, RGB_ORDER>(data, len, scale); 
+	}
+	template <class D, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, D, RGB_ORDER>(data, len, scale); 
+	}
+	template <EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, DATA_NOP, RGB_ORDER>(data, len, scale); 
+	}
+	void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, DATA_NOP, RGB>(data, len, scale); 
+	}
+
+};
+
+#endif
+
+#if defined(SPSR)
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Hardware SPI support using SPDR registers and friends
+//
+// Technically speaking, this uses the AVR SPI registers.  This will work on the Teensy 3.0 because Paul made a set of compatibility
+// classes that map the AVR SPI registers to ARM's; however, this caps the performance of output.
+//
+// TODO: implement ARMHardwareSPIOutput
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class AVRHardwareSPIOutput { 
+	Selectable *m_pSelect;
+	bool mWait;
+public:
+	/// Output with no chip-select object attached.
+	AVRHardwareSPIOutput() { m_pSelect = NULL; mWait = false;}
+	/// Output using pSelect as its chip-select object.  mWait is initialised here as well, so
+	/// it starts out false whichever constructor is used.
+	AVRHardwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; mWait = false; }
+	/// Replace the chip-select object.
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	/// Program the SPI clock bits for _SPI_CLOCK_DIVIDER.  The divider is rounded down to the
+	/// nearest step the chain below knows (128, 64, 32, 16, 8, 4, and "faster than 4"), and each
+	/// step is a combination of the SPR1/SPR0 bits in SPCR plus the SPI2X bit in SPSR.
+	void setSPIRate() { 
+		SPCR &= ~ ( (1<<SPR1) | (1<<SPR0) ); 	// clear out the prescalar bits
+
+	    // whether the double-speed bit should end up set
+	    bool b2x = false;
+
+	    if(_SPI_CLOCK_DIVIDER >= 128) { SPCR |= (1<<SPR1); SPCR |= (1<<SPR0); }
+	    else if(_SPI_CLOCK_DIVIDER >= 64) { SPCR |= (1<<SPR1);}
+	    else if(_SPI_CLOCK_DIVIDER >= 32) { SPCR |= (1<<SPR1); b2x = true;  }
+	    else if(_SPI_CLOCK_DIVIDER >= 16) { SPCR |= (1<<SPR0); } 
+	    else if(_SPI_CLOCK_DIVIDER >= 8) { SPCR |= (1<<SPR0); b2x = true; }
+	    else if(_SPI_CLOCK_DIVIDER >= 4) { /* do nothing - default rate */ }
+	    else { b2x = true; }
+
+	    // SPI2X is written either way, so a stale double-speed setting is always cleared
+	    if(b2x) { SPSR |= (1<<SPI2X); }
+	    else { SPSR &= ~ (1<<SPI2X); }
+	}
+	
+	void init() {
+		volatile uint8_t clr;
+
+		// set the pins to output
+		FastPin<_DATA_PIN>::setOutput();
+		FastPin<_CLOCK_PIN>::setOutput();
+#ifdef SPI_SELECT
+		// Make sure the slave select line is set to output, or arduino will block us
+		FastPin<SPI_SELECT>::setOutput();
+		FastPin<SPI_SELECT>::lo();
+#endif
+		release();
+
+		SPCR |= ((1<<SPE) | (1<<MSTR) ); 		// enable SPI as master
+		SPCR &= ~ ( (1<<SPR1) | (1<<SPR0) ); 	// clear out the prescalar bits
+
+		clr = SPSR; // clear SPI status register 
+		clr = SPDR; // clear SPI data register
+		clr; 
+
+	    bool b2x = false;
+
+	    if(_SPI_CLOCK_DIVIDER >= 128) { SPCR |= (1<<SPR1); SPCR |= (1<<SPR0); }
+	    else if(_SPI_CLOCK_DIVIDER >= 64) { SPCR |= (1<<SPR1);}
+	    else if(_SPI_CLOCK_DIVIDER >= 32) { SPCR |= (1<<SPR1); b2x = true;  }
+	    else if(_SPI_CLOCK_DIVIDER >= 16) { SPCR |= (1<<SPR0); } 
+	    else if(_SPI_CLOCK_DIVIDER >= 8) { SPCR |= (1<<SPR0); b2x = true; }
+	    else if(_SPI_CLOCK_DIVIDER >= 4) { /* do nothing - default rate */ }
+	    else { b2x = true; }
+
+	    if(b2x) { SPSR |= (1<<SPI2X); }
+	    else { SPSR &= ~ (1<<SPI2X); }
+
+	    SPDR=0;
+	    shouldWait(false);
+	}
+
+	static bool shouldWait(bool wait = false) __attribute__((always_inline)) { 
+		static bool sWait=false; 
+		if(sWait) { sWait = wait; return true; } else { sWait = wait; return false; } 
+		// return true;
+	}
+	static void wait() __attribute__((always_inline)) { if(shouldWait()) { while(!(SPSR & (1<<SPIF))); } }
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); SPDR=b;  shouldWait(true); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { SPDR=b; shouldWait(true); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { SPDR=b; shouldWait(true); }
+
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) { 
+		SPCR &= ~(1 << SPE);
+		if(b & (1 << BIT)) { 
+			FastPin<_DATA_PIN>::hi();
+		} else { 
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+		SPCR |= 1 << SPE;
+		shouldWait(false);
+	}
+
+	void select() { if(m_pSelect != NULL) { m_pSelect->select(); } } // FastPin<_SELECT_PIN>::hi(); }
+	void release() { if(m_pSelect != NULL) { m_pSelect->release(); } } // FastPin<_SELECT_PIN>::lo(); }
+
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) { writeByte(value); }
+	}
+
+	void writeBytesValue(uint8_t value, int len) { 
+		//setSPIRate();
+		select();
+		while(len--) { 
+			writeByte(value);
+		}
+		release();
+	}
+	
+	// Write a block of n uint8_ts out 
+	template <class D> void writeBytes(register uint8_t *data, int len) { 
+		//setSPIRate();
+		uint8_t *end = data + len;
+		select();
+		while(data != end) { 
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+		}
+		release();	
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// write a block of uint8_ts out in groups of three.  len is the total number of uint8_ts to write out.  The template
+	// parameters indicate how many uint8_ts to skip at the beginning and/or end of each grouping
+	template <uint8_t SKIP, class D, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		//setSPIRate();
+		uint8_t *end = data + len;
+		select();
+		while(data != end) { 
+			if(SKIP & FLAG_START_BIT) { 
+				writeBit<0>(1);
+			}
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			if(false && _SPI_CLOCK_DIVIDER == 0) { 
+				writeByteNoWait(D::adjust(data[SPI_B0], scale)); delaycycles<13>();
+				writeByteNoWait(D::adjust(data[SPI_B1], scale)); delaycycles<13>();
+				writeByteNoWait(D::adjust(data[SPI_B2], scale)); delaycycles<9>();
+			} else if(SKIP & FLAG_START_BIT) { 
+				writeBytePostWait(D::adjust(data[SPI_B0], scale));
+				writeBytePostWait(D::adjust(data[SPI_B1], scale));
+				writeBytePostWait(D::adjust(data[SPI_B2], scale));
+			} else { 
+				writeByte(D::adjust(data[SPI_B0], scale));
+				writeByte(D::adjust(data[SPI_B1], scale));
+				writeByte(D::adjust(data[SPI_B2], scale));
+			}
+
+			data += SPI_ADVANCE;
+		}
+		D::postBlock(len);
+		release();
+	}
+
+	template <uint8_t SKIP, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<SKIP, DATA_NOP, RGB_ORDER>(data, len, scale); 
+	}
+	template <class D, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, D, RGB_ORDER>(data, len, scale); 
+	}
+	template <EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, DATA_NOP, RGB_ORDER>(data, len, scale); 
+	}
+	void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, DATA_NOP, RGB>(data, len, scale); 
+	}
+
+};
+#endif
+
+#else
+// #define FORCE_SOFTWARE_SPI
+#endif
+
+#endif
+\ No newline at end of file
diff --git a/fastspi_bitbang.h b/fastspi_bitbang.h
new file mode 100644
index 00000000..f9c1a218
--- /dev/null
+++ b/fastspi_bitbang.h
@@ -0,0 +1,368 @@
+#ifndef __INC_FASTSPI_BITBANG_H
+#define __INC_FASTSPI_BITBANG_H
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Software SPI (aka bit-banging) support - with aggressive optimizations for when the clock and data pin are on the same port
+//
+// TODO: Replace the select pin definition with a set of pins, to allow using mux hardware for routing in the future
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <uint8_t DATA_PIN, uint8_t CLOCK_PIN, uint8_t SPI_SPEED>
+class AVRSoftwareSPIOutput { 
+	// The data types for pointers to the pin port - typedef'd here from the Pin definition because on avr these
+	// are pointers to 8 bit values, while on arm they are 32 bit
+	typedef typename FastPin<DATA_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPin<CLOCK_PIN>::port_ptr_t clock_ptr_t;
+
+	// The data type for what's at a pin's port - typedef'd here from the Pin definition because on avr the ports
+	// are 8 bits wide while on arm they are 32.
+	typedef typename FastPin<DATA_PIN>::port_t data_t;
+	typedef typename FastPin<CLOCK_PIN>::port_t clock_t;
+	Selectable 	*m_pSelect;
+
+public:
+	AVRSoftwareSPIOutput() { m_pSelect = NULL; }
+	AVRSoftwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	void init() {
+		// set the pins to output and make sure the select is released (which apparently means hi?  This is a bit
+		// confusing to me)
+		FastPin<DATA_PIN>::setOutput();
+		FastPin<CLOCK_PIN>::setOutput();
+		release();
+	}
+
+	// stop the SPI output.  Pretty much a NOP with software, as there's no registers to kick
+	static void stop() { }
+
+	// wait until the SPI subsystem is ready for more data to write.  A NOP when bitbanging
+	static void wait() __attribute__((always_inline)) { }
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+	// wait() is empty for this class, so the NoWait/PostWait writers below only forward to writeByte
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { writeByte(b); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { writeByte(b); wait(); }
+	// write a 16 bit word as two bytes, high byte first
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+	
+	// naive writeByte implementation, simply calls writeBit on the 8 bits in the byte, bit 7 first.
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { 
+		writeBit<7>(b);
+		writeBit<6>(b);
+		writeBit<5>(b);
+		writeBit<4>(b);
+		writeBit<3>(b);
+		writeBit<2>(b);
+		writeBit<1>(b);
+		writeBit<0>(b);
+	}
+
+private:	
+	// writeByte implementation with data/clock registers passed in.
+	static void writeByte(uint8_t b, clock_ptr_t clockpin, data_ptr_t datapin) __attribute__((always_inline)) { 
+		writeBit<7>(b, clockpin, datapin);
+		writeBit<6>(b, clockpin, datapin);
+		writeBit<5>(b, clockpin, datapin);
+		writeBit<4>(b, clockpin, datapin);
+		writeBit<3>(b, clockpin, datapin);
+		writeBit<2>(b, clockpin, datapin);
+		writeBit<1>(b, clockpin, datapin);
+		writeBit<0>(b, clockpin, datapin);
+	}
+
+	// writeByte implementation with the data register passed in and prebaked values for data hi w/clock hi and
+	// low and data lo w/clock hi and lo.  This is to be used when clock and data are on the same GPIO register, 
+	// can get close to getting a bit out the door in 2 clock cycles!
+	static void writeByte(uint8_t b, data_ptr_t datapin, 
+						  data_t hival, data_t loval, 
+						  clock_t hiclock, clock_t loclock) __attribute__((always_inline, hot)) { 
+		writeBit<7>(b, datapin, hival, loval, hiclock, loclock);
+		writeBit<6>(b, datapin, hival, loval, hiclock, loclock);
+		writeBit<5>(b, datapin, hival, loval, hiclock, loclock);
+		writeBit<4>(b, datapin, hival, loval, hiclock, loclock);
+		writeBit<3>(b, datapin, hival, loval, hiclock, loclock);
+		writeBit<2>(b, datapin, hival, loval, hiclock, loclock);
+		writeBit<1>(b, datapin, hival, loval, hiclock, loclock);
+		writeBit<0>(b, datapin, hival, loval, hiclock, loclock);
+	}
+
+	// writeByte implementation with not just registers passed in, but pre-baked values for said registers for
+	// data hi/lo and clock hi/lo values.  Note: weird things will happen if this method is called in cases where
+	// the data and clock pins are on the same port!  Don't do that!
+	static void writeByte(uint8_t b, clock_ptr_t clockpin, data_ptr_t datapin, 
+						  data_t hival, data_t loval, 
+						  clock_t hiclock, clock_t loclock) __attribute__((always_inline)) { 
+		writeBit<7>(b, clockpin, datapin, hival, loval, hiclock, loclock);
+		writeBit<6>(b, clockpin, datapin, hival, loval, hiclock, loclock);
+		writeBit<5>(b, clockpin, datapin, hival, loval, hiclock, loclock);
+		writeBit<4>(b, clockpin, datapin, hival, loval, hiclock, loclock);
+		writeBit<3>(b, clockpin, datapin, hival, loval, hiclock, loclock);
+		writeBit<2>(b, clockpin, datapin, hival, loval, hiclock, loclock);
+		writeBit<1>(b, clockpin, datapin, hival, loval, hiclock, loclock);
+		writeBit<0>(b, clockpin, datapin, hival, loval, hiclock, loclock);
+	}
+	// SPI_DELAY (defined just below) expands to a delaycycles<> call sized from SPI_SPEED and carries its own ';'
+public:
+	#define SPI_DELAY delaycycles< (SPI_SPEED-2) / 2>();
+
+	// write the BIT'th bit out via spi, setting the data pin then strobing the clock
+	template <uint8_t BIT> __attribute__((always_inline, hot)) inline static void writeBit(uint8_t b) { 
+		if(b & (1 << BIT)) { 
+			FastPin<DATA_PIN>::hi();
+			if(SPI_SPEED < 3) { 
+				FastPin<CLOCK_PIN>::strobe();
+			} else { 
+				FastPin<CLOCK_PIN>::hi(); SPI_DELAY;
+				FastPin<CLOCK_PIN>::lo(); SPI_DELAY;
+			}
+		} else { 
+			FastPin<DATA_PIN>::lo();
+			if(SPI_SPEED < 3) { 
+				FastPin<CLOCK_PIN>::strobe();
+			} else { 
+				FastPin<CLOCK_PIN>::hi(); SPI_DELAY;
+				FastPin<CLOCK_PIN>::lo(); SPI_DELAY;
+			}
+		}
+	}
+	
+private:
+	// write the BIT'th bit out via spi, setting the data pin then strobing the clock, using the passed in pin registers to accelerate access if needed
+	template <uint8_t BIT> __attribute__((always_inline)) inline static void writeBit(uint8_t b, clock_ptr_t clockpin, data_ptr_t datapin) { 
+		if(b & (1 << BIT)) { 
+			FastPin<DATA_PIN>::hi(datapin);
+			FastPin<CLOCK_PIN>::hi(clockpin); SPI_DELAY;
+			FastPin<CLOCK_PIN>::lo(clockpin); SPI_DELAY;
+		} else { 
+			FastPin<DATA_PIN>::lo(datapin);
+			FastPin<CLOCK_PIN>::hi(clockpin); SPI_DELAY;
+			FastPin<CLOCK_PIN>::lo(clockpin); SPI_DELAY;
+		}
+
+	}
+
+	// the version of write to use when clock and data are on separate pins with precomputed values for setting
+	// the clock and data pins
+	template <uint8_t BIT> __attribute__((always_inline)) inline static void writeBit(uint8_t b, clock_ptr_t clockpin, data_ptr_t datapin, 
+													data_t hival, data_t loval, clock_t hiclock, clock_t loclock) { 
+		// // only need to explicitly set clock hi if clock and data are on different ports
+		if(b & (1 << BIT)) { 
+			FastPin<DATA_PIN>::fastset(datapin, hival);
+			FastPin<CLOCK_PIN>::fastset(clockpin, hiclock); SPI_DELAY;
+			FastPin<CLOCK_PIN>::fastset(clockpin, loclock); SPI_DELAY;
+		} else { 
+			// NOP;
+			FastPin<DATA_PIN>::fastset(datapin, loval);
+			FastPin<CLOCK_PIN>::fastset(clockpin, hiclock); SPI_DELAY;
+			FastPin<CLOCK_PIN>::fastset(clockpin, loclock); SPI_DELAY;
+		}
+	}
+
+	// the version of write to use when clock and data are on the same port with precomputed values for the various
+	// combinations.  Each bit is three writes to the shared port: clock lo, clock hi, clock lo, with the data level held.
+	template <uint8_t BIT> __attribute__((always_inline)) inline static void writeBit(uint8_t b, data_ptr_t clockdatapin, 
+													data_t datahiclockhi, data_t dataloclockhi, 
+													data_t datahiclocklo, data_t dataloclocklo) { 
+#if 0
+		writeBit<BIT>(b);
+#else
+		if(b & (1 << BIT)) { 
+			FastPin<DATA_PIN>::fastset(clockdatapin, datahiclocklo); SPI_DELAY;
+			FastPin<DATA_PIN>::fastset(clockdatapin, datahiclockhi); SPI_DELAY;
+			FastPin<DATA_PIN>::fastset(clockdatapin, datahiclocklo); SPI_DELAY;
+		} else { 
+			// NOP;
+			FastPin<DATA_PIN>::fastset(clockdatapin, dataloclocklo); SPI_DELAY;
+			FastPin<DATA_PIN>::fastset(clockdatapin, dataloclockhi); SPI_DELAY;
+			FastPin<DATA_PIN>::fastset(clockdatapin, dataloclocklo); SPI_DELAY;
+		}
+#endif
+	}
+public:
+
+	// select the SPI output (TODO: research whether this really means hi or lo.  Alt TODO: move select responsibility out of the SPI classes
+	// entirely, make it up to the caller to remember to lock/select the line?)
+	void select() { if(m_pSelect != NULL) { m_pSelect->select(); } } // FastPin<SELECT_PIN>::hi(); }
+
+	// release the SPI line
+	void release() { if(m_pSelect != NULL) { m_pSelect->release(); } } // FastPin<SELECT_PIN>::lo(); }
+
+	// Write out len bytes of the given value out over SPI.  Useful for quickly flushing, say, a line of 0's down the line.
+	void writeBytesValue(uint8_t value, int len) { 
+		select();
+		writeBytesValueRaw(value, len);
+		release();
+	}
+	// Same as writeBytesValue, minus the select()/release() bracketing
+	static void writeBytesValueRaw(uint8_t value, int len) {
+#ifdef FAST_SPI_INTERRUPTS_WRITE_PINS
+		// TODO: Weird things may happen if software bitbanging SPI output and other pins on the output registers are being twiddled.  Need
+		// to allow specifying whether or not exclusive i/o access is allowed during this process, and if i/o access is not allowed fall
+		// back to the degenerative code below
+		while(len--) { 
+			writeByte(value);
+		}
+#else
+		register data_ptr_t datapin = FastPin<DATA_PIN>::port();
+
+		if(FastPin<DATA_PIN>::port() != FastPin<CLOCK_PIN>::port()) {
+			// If data and clock are on different ports, then writing a bit will consist of writing the value for
+			// the bit (hi or low) to the data pin port, and then two writes to the clock port to strobe the clock line
+			register clock_ptr_t clockpin = FastPin<CLOCK_PIN>::port();
+			register data_t datahi = FastPin<DATA_PIN>::hival();
+			register data_t datalo = FastPin<DATA_PIN>::loval();
+			register clock_t clockhi = FastPin<CLOCK_PIN>::hival();
+			register clock_t clocklo = FastPin<CLOCK_PIN>::loval();
+			while(len--) { 
+				writeByte(value, clockpin, datapin, datahi, datalo, clockhi, clocklo);
+			}
+
+		} else {
+			// If data and clock are on the same port then we can combine setting the data and clock pins 
+			register data_t datahi_clockhi = FastPin<DATA_PIN>::hival() | FastPin<CLOCK_PIN>::mask();
+			register data_t datalo_clockhi = FastPin<DATA_PIN>::loval() | FastPin<CLOCK_PIN>::mask();
+			register data_t datahi_clocklo = FastPin<DATA_PIN>::hival() & ~FastPin<CLOCK_PIN>::mask();
+			register data_t datalo_clocklo = FastPin<DATA_PIN>::loval() & ~FastPin<CLOCK_PIN>::mask();
+
+			while(len--) { 
+				writeByte(value, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
+			}
+		}
+#endif
+	}
+
+	// write a block of len uint8_ts out.  Need to type this better so that explicit casts into the call aren't required.
+	// note that this template version takes a class parameter for a per-byte modifier to the data. 
+	template <class D> void writeBytes(register uint8_t *data, int len) { 
+		select();
+#ifdef FAST_SPI_INTERRUPTS_WRITE_PINS
+		uint8_t *end = data + len;
+		while(data != end) { 
+			writeByte(D::adjust(*data++));
+		}
+#else
+		register clock_ptr_t clockpin = FastPin<CLOCK_PIN>::port();
+		register data_ptr_t datapin = FastPin<DATA_PIN>::port();
+
+		if(FastPin<DATA_PIN>::port() != FastPin<CLOCK_PIN>::port()) {
+			// If data and clock are on different ports, then writing a bit will consist of writing the value for
+			// the bit (hi or low) to the data pin port, and then two writes to the clock port to strobe the clock line
+			register data_t datahi = FastPin<DATA_PIN>::hival();
+			register data_t datalo = FastPin<DATA_PIN>::loval();
+			register clock_t clockhi = FastPin<CLOCK_PIN>::hival();
+			register clock_t clocklo = FastPin<CLOCK_PIN>::loval();
+			uint8_t *end = data + len;
+
+			while(data != end) { 
+				writeByte(D::adjust(*data++), clockpin, datapin, datahi, datalo, clockhi, clocklo);
+			}
+
+		} else {
+			// FastPin<CLOCK_PIN>::hi();
+			// If data and clock are on the same port then we can combine setting the data and clock pins 
+			register data_t datahi_clockhi = FastPin<DATA_PIN>::hival() | FastPin<CLOCK_PIN>::mask();
+			register data_t datalo_clockhi = FastPin<DATA_PIN>::loval() | FastPin<CLOCK_PIN>::mask();
+			register data_t datahi_clocklo = FastPin<DATA_PIN>::hival() & ~FastPin<CLOCK_PIN>::mask();
+			register data_t datalo_clocklo = FastPin<DATA_PIN>::loval() & ~FastPin<CLOCK_PIN>::mask();
+			
+			uint8_t *end = data + len;
+
+			while(data != end) { 
+				writeByte(D::adjust(*data++), datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
+			}
+			// FastPin<CLOCK_PIN>::lo();
+		}
+#endif
+		D::postBlock(len);
+		release();	
+	}
+
+	// default version of writing a block of data out to the SPI port, with no data modifications being made
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+
+	// write a block of uint8_ts out in groups of three.  len is the total number of uint8_ts to write out.  SKIP is
+	// tested against FLAG_START_BIT: when set, a single 1 bit is clocked out ahead of every group.  D is a class
+	// specifying a per byte data modification (adjust) plus a postBlock hook called once after the data is written.
+	template <uint8_t SKIP, class D, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		select();
+
+#ifdef FAST_SPI_INTERRUPTS_WRITE_PINS
+		// If interrupts or other things may be generating output while we're working on things, then we need
+		// to use this block
+		uint8_t *end = data + len;
+		while(data != end) { 
+			if(SKIP & FLAG_START_BIT) { 
+				writeBit<0>(1);
+			}
+			writeByte(D::adjust(data[SPI_B0], scale));
+			writeByte(D::adjust(data[SPI_B1], scale));
+			writeByte(D::adjust(data[SPI_B2], scale));
+			data += SPI_ADVANCE;
+		}
+#else
+		// If we can guarantee that no one else will be writing data while we are running (namely, changing the values of the PORT/PDOR pins)
+		// then we can use a bunch of optimizations in here
+		register data_ptr_t datapin = FastPin<DATA_PIN>::port();
+
+		if(FastPin<DATA_PIN>::port() != FastPin<CLOCK_PIN>::port()) {
+			register clock_ptr_t clockpin = FastPin<CLOCK_PIN>::port();
+			// If data and clock are on different ports, then writing a bit will consist of writing the value for
+			// the bit (hi or low) to the data pin port, and then two writes to the clock port to strobe the clock line
+			register data_t datahi = FastPin<DATA_PIN>::hival();
+			register data_t datalo = FastPin<DATA_PIN>::loval();
+			register clock_t clockhi = FastPin<CLOCK_PIN>::hival();
+			register clock_t clocklo = FastPin<CLOCK_PIN>::loval();
+			uint8_t *end = data + len;
+
+			while(data != end) { 
+				if(SKIP & FLAG_START_BIT) { 
+					writeBit<0>(1, clockpin, datapin, datahi, datalo, clockhi, clocklo);
+				}
+				writeByte(D::adjust(data[SPI_B0], scale), clockpin, datapin, datahi, datalo, clockhi, clocklo);
+				writeByte(D::adjust(data[SPI_B1], scale), clockpin, datapin, datahi, datalo, clockhi, clocklo);
+				writeByte(D::adjust(data[SPI_B2], scale), clockpin, datapin, datahi, datalo, clockhi, clocklo);
+				data += SPI_ADVANCE;
+			}
+
+		} else {
+			// If data and clock are on the same port then we can combine setting the data and clock pins 
+			register data_t datahi_clockhi = FastPin<DATA_PIN>::hival() | FastPin<CLOCK_PIN>::mask();
+			register data_t datalo_clockhi = FastPin<DATA_PIN>::loval() | FastPin<CLOCK_PIN>::mask();
+			register data_t datahi_clocklo = FastPin<DATA_PIN>::hival() & ~FastPin<CLOCK_PIN>::mask();
+			register data_t datalo_clocklo = FastPin<DATA_PIN>::loval() & ~FastPin<CLOCK_PIN>::mask();
+			
+			uint8_t *end = data + len;
+
+			while(data != end) { 
+				if(SKIP & FLAG_START_BIT) { 
+					writeBit<0>(1, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
+				}
+				writeByte(D::adjust(data[SPI_B0], scale), datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
+				writeByte(D::adjust(data[SPI_B1], scale), datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
+				writeByte(D::adjust(data[SPI_B2], scale), datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
+				data += SPI_ADVANCE;
+			}
+		}	
+#endif
+		D::postBlock(len);
+		release();
+	}
+	// convenience overloads of writeBytes3 that fill in SKIP = 0, D = DATA_NOP and/or RGB_ORDER = RGB
+	template <uint8_t SKIP, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<SKIP, DATA_NOP, RGB_ORDER>(data, len, scale); 
+	}
+	template <class D, EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, D, RGB_ORDER>(data, len, scale); 
+	}
+	template <EOrder RGB_ORDER> void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, DATA_NOP, RGB_ORDER>(data, len, scale); 
+	}
+	void writeBytes3(register uint8_t *data, int len, register uint8_t scale) { 
+		writeBytes3<0, DATA_NOP, RGB>(data, len, scale); 
+	}
+};
+
+#endif
diff --git a/fastspi_dma.h b/fastspi_dma.h
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/fastspi_dma.h
diff --git a/hsv2rgb.cpp b/hsv2rgb.cpp
new file mode 100644
index 00000000..b0abdab9
--- /dev/null
+++ b/hsv2rgb.cpp
@@ -0,0 +1,495 @@
+#include <stdint.h>
+
+#include "lib8tion.h"
+#include "hsv2rgb.h"
+
+// Functions to convert HSV colors to RGB colors.
+//
+//  They basically fall into two groups: spectra, and rainbows.
+//  Spectra and rainbows are not the same thing.  Wikipedia has a good
+//  illustration here
+//   http://upload.wikimedia.org/wikipedia/commons/f/f6/Prism_compare_rainbow_01.png
+//  from this article
+//   http://en.wikipedia.org/wiki/Rainbow#Number_of_colours_in_spectrum_or_rainbow
+//  that shows a 'spectrum' and a 'rainbow' side by side.  Among other
+//  differences, you'll see that a 'rainbow' has much more yellow than
+//  a plain spectrum.  "Classic" LED color washes are spectrum based, and
+//  usually show very little yellow.
+//
+//  Wikipedia's page on HSV color space, with pseudocode for conversion
+//  to RGB color space
+//   http://en.wikipedia.org/wiki/HSL_and_HSV
+//  Note that their conversion algorithm, which is (naturally) very popular
+//  is in the "maximum brightness at any given hue" style, vs the "uniform
+//  brightness for all hues" style.
+//
+//  You can't have both; either purple is the same brightness as red, e.g
+//    red = #FF0000 and purple = #800080 -> same "total light" output
+//  OR purple is 'as bright as it can be', e.g.
+//    red = #FF0000 and purple = #FF00FF -> purple is much brighter than red.
+//  The colorspace conversions here try to keep the apparent brightness
+//  constant even as the hue varies.
+//
+//  Adafruit's "Wheel" function, discussed here
+//   http://forums.adafruit.com/viewtopic.php?f=47&t=22483
+//  is also of the "constant apparent brightness" variety.
+//
+//  TODO: provide the 'maximum brightness no matter what' variation.
+//
+//  See also some good, clear Arduino C code from Kasper Kamperman
+//   http://www.kasperkamperman.com/blog/arduino/arduino-programming-hsb-to-rgb/
+//  which in turn was based on Windows C code from "nico80"
+//   http://www.codeproject.com/Articles/9207/An-HSB-RGBA-colour-picker
+
+
+
+
+
+void hsv2rgb_raw_C (const struct CHSV & hsv, struct CRGB & rgb);
+void hsv2rgb_raw_avr(const struct CHSV & hsv, struct CRGB & rgb);
+
+// Raw HSV -> RGB entry point; the implementation is chosen at compile time.
+// AVR builds (unless LIB8_ATTINY is defined) forward to the inline-assembly
+// converter, every other build forwards to the portable C converter.
+void hsv2rgb_raw(const struct CHSV & hsv, struct CRGB & rgb)
+{
+#if defined(__AVR__) && !defined( LIB8_ATTINY )
+    hsv2rgb_raw_avr( hsv, rgb);
+#else
+    hsv2rgb_raw_C( hsv, rgb);
+#endif
+}
+
+
+
+#define APPLY_DIMMING(X) (X)
+#define HSV_SECTION_6 (0x20)
+#define HSV_SECTION_3 (0x40)
+
+void hsv2rgb_raw_C (const struct CHSV & hsv, struct CRGB & rgb)
+{
+    // Portable C conversion of hue, saturation and value ( HSV/HSB )
+    // to RGB.
+    //   hsv: colour to convert; hue, sat and val are read
+    //   rgb: receives the result; r, g and b are written
+    // The hue range is cut into sections of HSV_SECTION_3 (0x40)
+    // steps.  Inside a section one channel ramps down, one ramps up
+    // and the remaining one stays at the brightness floor.
+    
+    // Run value and inverted saturation through the dimming hook,
+    // which exists to make the output more visually linear
+    // (APPLY_DIMMING currently passes its argument through unchanged).
+    const uint8_t val = APPLY_DIMMING( hsv.val);
+    const uint8_t sat = hsv.sat;
+    const uint8_t inverse_sat = APPLY_DIMMING( 255 - sat);
+    
+    // floor_level: the minimum that R, G and B are all set to.
+    // (the product is computed as int, so it does not wrap at 8 bits)
+    const uint8_t floor_level = (val * inverse_sat) / 256;
+    
+    // amplitude: the most that gets stacked on top of floor_level
+    // to produce the requested hue.
+    const uint8_t amplitude = val - floor_level;
+    
+    // Locate the hue: which section it falls in, and the distance
+    // into that section.  Only sections 0, 1 and 2 get a branch of
+    // their own further down, so a hue of 0xC0 or more (section 3)
+    // is treated like section 2.
+    const uint8_t section = hsv.hue / HSV_SECTION_3;
+    const uint8_t ramp_up = hsv.hue % HSV_SECTION_3;          // 0..63
+    const uint8_t ramp_down = (HSV_SECTION_3 - 1) - ramp_up;  // 63..0
+    
+    // In principle the ramps would now be stretched to 0..255 by
+    // multiplying them by four, then multiplied by the amplitude,
+    // and the product divided by 256.  That is the same as leaving
+    // the ramps at 0..63 and dividing the product by 64:
+    //   ((ramp * 4) * amplitude) / 256
+    // versus
+    //   ((ramp    ) * amplitude) /  64
+    // Which form is cheaper depends on the processor architecture.
+    // On 8-bit AVR the "/ 256" is just a one-cycle register move,
+    // while the "/ 64" might be a multicycle shift, so there the
+    // times-four form is faster.  On ARM both divisions take one
+    // cycle each, so skipping the times-four is faster.
+    // Moral of the story: trust your profiler, not your instincts.
+    //
+    // AVR has an assembly version elsewhere, so this function
+    // assumes an architecture where any number of bit shifts costs
+    // roughly the same, and drops the redundant multiply at the
+    // source level.
+    
+    // Scale each ramp by the amplitude ...
+    const uint8_t scaled_up   = (ramp_up   * amplitude) / (256 / 4);
+    const uint8_t scaled_down = (ramp_down * amplitude) / (256 / 4);
+    
+    // ... and lift both onto the floor.
+    const uint8_t rising  = scaled_up   + floor_level;
+    const uint8_t falling = scaled_down + floor_level;
+    
+    
+    // Hand the three levels to the channels according to the section.
+    switch( section ) {
+        case 0:
+            // section 0: 0x00..0x3F
+            rgb.r = falling;
+            rgb.g = rising;
+            rgb.b = floor_level;
+            break;
+            
+        case 1:
+            // section 1: 0x40..0x7F
+            rgb.r = floor_level;
+            rgb.g = falling;
+            rgb.b = rising;
+            break;
+            
+        default:
+            // section 2: 0x80..0xBF (and anything above)
+            rgb.r = rising;
+            rgb.g = floor_level;
+            rgb.b = falling;
+            break;
+    }
+}
+
+
+
+#if defined(__AVR__) && !defined( LIB8_ATTINY )
+void hsv2rgb_raw_avr(const struct CHSV & hsv, struct CRGB & rgb)
+{
+    uint8_t hue, saturation, value;
+    // AVR flavour of the raw HSV -> RGB conversion; the multiplies are done with inline 'mul' assembly.
+    hue =        hsv.hue;
+    saturation = hsv.sat;
+    value =      hsv.val;
+    
+    // Saturation more useful the other way around
+    saturation = 255 - saturation;
+    uint8_t invsat = APPLY_DIMMING( saturation );
+    
+    // Apply dimming curves
+    value = APPLY_DIMMING( value );
+    
+    // The brightness floor is minimum number that all of
+    // R, G, and B will be set to, which is value * invsat
+    uint8_t brightness_floor;
+    // 'mul' puts the 16 bit product in r1:r0; copying r1 keeps the high byte, i.e. the product / 256
+    asm volatile(
+                 "mul %[value], %[invsat]            \n"
+                 "mov %[brightness_floor], r1        \n"
+                 : [brightness_floor] "=r" (brightness_floor)
+                 : [value] "r" (value),
+                 [invsat] "r" (invsat)
+                 : "r0", "r1"
+                 );
+    
+    // The color amplitude is the maximum amount of R, G, and B
+    // that will be added on top of the brightness_floor to
+    // create the specific hue desired.
+    uint8_t color_amplitude = value - brightness_floor;
+    
+    // Figure how far we are offset into the section of the
+    // color wheel that we're in
+    uint8_t offset = hsv.hue & (HSV_SECTION_3 - 1);  // 0..63
+    uint8_t rampup = offset * 4; // 0..252
+    
+    // rampdown is not computed in C: the asm below inverts rampup with 'com' (255 - rampup) instead
+    // compute color-amplitude-scaled-down versions of rampup and rampdown
+    uint8_t rampup_amp_adj;
+    uint8_t rampdown_amp_adj;
+    // two multiplies by color_amplitude; the high byte (r1) of each product is kept
+    asm volatile(
+                 "mul %[rampup], %[color_amplitude]       \n"
+                 "mov %[rampup_amp_adj], r1               \n"
+                 "com %[rampup]                           \n"
+                 "mul %[rampup], %[color_amplitude]       \n"
+                 "mov %[rampdown_amp_adj], r1             \n"
+                 : [rampup_amp_adj] "=&r" (rampup_amp_adj),
+                 [rampdown_amp_adj] "=&r" (rampdown_amp_adj),
+                 [rampup] "+r" (rampup)
+                 : [color_amplitude] "r" (color_amplitude)
+                 : "r0", "r1"
+                 );
+    
+    
+    // add brightness_floor offset to everything
+    uint8_t rampup_adj_with_floor   = rampup_amp_adj   + brightness_floor;
+    uint8_t rampdown_adj_with_floor = rampdown_amp_adj + brightness_floor;
+    
+    
+    // keep gcc from using "X" as the index register for storing
+    // results back in the return structure.  AVR's X register can't
+    // do "std X+q, rnn", but the Y and Z registers can.
+    // if the pointer to 'rgb' is in X, gcc will add all kinds of crazy
+    // extra instructions.  Simply killing X here seems to help it
+    // try Y or Z first.
+    asm volatile(  ""  :  :  : "r26", "r27" );
+    
+    
+    if( hue & 0x80 ) {
+        // section 2: 0x80..0xBF
+        rgb.r = rampup_adj_with_floor;
+        rgb.g = brightness_floor;
+        rgb.b = rampdown_adj_with_floor;
+    } else {
+        if( hue & 0x40) {
+            // section 1: 0x40..0x7F
+            rgb.r = brightness_floor;
+            rgb.g = rampdown_adj_with_floor;
+            rgb.b = rampup_adj_with_floor;
+        } else {
+            // section 0: 0x00..0x3F
+            rgb.r = rampdown_adj_with_floor;
+            rgb.g = rampup_adj_with_floor;
+            rgb.b = brightness_floor;
+        }
+    }
+    // r0/r1 were overwritten by the mul instructions; cleanup_R1() is defined elsewhere - presumably it restores r1 (TODO confirm)
+    cleanup_R1();
+}
+// End of AVR asm implementation
+
+#endif
+
+// Convert one HSV color to RGB using the "spectrum" mapping.
+// The hue is first scaled by 192/256 and the conversion itself is
+// then delegated to hsv2rgb_raw.
+void hsv2rgb_spectrum( const CHSV& hsv, CRGB& rgb)
+{
+    // Work on a private copy so the caller's color is left untouched.
+    CHSV rescaled(hsv);
+    const uint8_t compressedHue = scale8( rescaled.hue, 192);
+    rescaled.hue = compressedHue;
+    hsv2rgb_raw(rescaled, rgb);
+}
+
+
+// Sometimes the compiler will do clever things to reduce
+// code size that result in a net slowdown, if it thinks that
+// a variable is not used in a certain location.
+// This macro does its best to convince the compiler that
+// the variable is used in this location, to help control
+// code motion and de-duplication that would result in a slowdown.
+#define FORCE_REFERENCE(var)  asm volatile( "" : : "r" (var) )
+
+
+// Named 8-bit channel levels used by hsv2rgb_rainbow:
+// full scale (255), about two thirds (171) and about one third (85).
+#define K255 255
+#define K171 171
+#define K85  85
+
+// hsv2rgb_rainbow (single color): convert 'hsv' into 'rgb'.
+// The top three bits of the hue select one of eight 32-step sections
+// and the low five bits give the position inside that section; the
+// r/g/b ramps of each section are built from K255/K171/K85 and 'third'.
+// Saturation and then value are applied to the result afterwards.
+void hsv2rgb_rainbow( const CHSV& hsv, CRGB& rgb)
+{
+    // Yellow has a higher inherent brightness than
+    // any other color; 'pure' yellow is perceived to
+    // be 93% as bright as white.  In order to make
+    // yellow appear the correct relative brightness,
+    // it has to be rendered brighter than all other
+    // colors.
+    // Level Y1 is a moderate boost, the default.
+    // Level Y2 is a strong boost.
+    const uint8_t Y1 = 1;
+    const uint8_t Y2 = 0;
+
+    // G2: Whether to divide all greens by two.
+    // Depends GREATLY on your particular LEDs
+    const uint8_t G2 = 0;
+    
+    // Gscale: what to scale green down by.
+    // Depends GREATLY on your particular LEDs
+    const uint8_t Gscale = 0;
+
+    
+    uint8_t hue = hsv.hue;
+    uint8_t sat = hsv.sat;
+    uint8_t val = hsv.val;
+        
+    // position inside the current 32-step section
+    uint8_t offset = hue & 0x1F; // 0..31
+    
+    // offset8 = offset * 8
+    // NOTE(review): the empty asm statements between the shifts
+    // presumably keep the compiler from fusing them - confirm.
+    uint8_t offset8 = offset;
+    {
+        offset8 <<= 1;
+        asm volatile("");
+        offset8 <<= 1;
+        asm volatile("");
+        offset8 <<= 1;
+    }
+    
+    // third = offset8 * 85 / 256   (256 / 3 is 85 in integer math)
+    uint8_t third = scale8( offset8, (256 / 3));
+        
+    uint8_t r, g, b;
+    
+    // Three-level decision tree on hue bits 7, 6 and 5.
+    if( ! (hue & 0x80) ) {
+        // 0XX
+        if( ! (hue & 0x40) ) {
+            // 00X
+            //section 0-1
+            if( ! (hue & 0x20) ) {
+                // 000
+                //case 0: // R -> O
+                r = K255 - third;
+                g = third;
+                b = 0;
+                FORCE_REFERENCE(b);
+            } else {
+                // 001
+                //case 1: // O -> Y
+                if( Y1 ) {
+                    r = K171;
+                    g = K85 + third ;
+                    b = 0;
+                    FORCE_REFERENCE(b);
+                }
+                if( Y2 ) {
+                    r = K171 + third;
+                    uint8_t twothirds = (third << 1);
+                    g = K85 + twothirds;
+                    b = 0;
+                    FORCE_REFERENCE(b);
+                }
+            }
+        } else {
+            //01X
+            // section 2-3
+            if( !  (hue & 0x20) ) {
+                // 010
+                //case 2: // Y -> G
+                if( Y1 ) {
+                    uint8_t twothirds = (third << 1);
+                    r = K171 - twothirds;
+                    g = K171 + third;
+                    b = 0;
+                    FORCE_REFERENCE(b);
+                }
+                if( Y2 ) {
+                    r = K255 - offset8;
+                    g = K255;
+                    b = 0;
+                    FORCE_REFERENCE(b);
+                }
+            } else {
+                // 011
+                // case 3: // G -> A
+                r = 0;
+                FORCE_REFERENCE(r);
+                g = K255 - third;
+                b = third;
+            }
+        }
+    } else {
+        // section 4-7
+        // 1XX
+        if( ! (hue & 0x40) ) {
+            // 10X
+            if( ! ( hue & 0x20) ) {
+                // 100
+                //case 4: // A -> B
+                r = 0;
+                FORCE_REFERENCE(r);
+                uint8_t twothirds = (third << 1);
+                g = K171 - twothirds;
+                b = K85  + twothirds;
+
+            } else {
+                // 101
+                //case 5: // B -> P
+                r = third;
+                g = 0;
+                FORCE_REFERENCE(g);
+                b = K255 - third;
+
+            }
+        } else {
+            if( !  (hue & 0x20)  ) {
+                // 110
+                //case 6: // P -- K
+                r = K85 + third;
+                g = 0;
+                FORCE_REFERENCE(g);
+                b = K171 - third;
+
+            } else {
+                // 111
+                //case 7: // K -> R
+                r = K171 + third;
+                g = 0;
+                FORCE_REFERENCE(g);
+                b = K85 - third;
+
+            }
+        }
+    }
+    
+    // This is one of the good places to scale the green down,
+    // although the client can scale green down as well.
+    // (Both G2 and Gscale are 0 above, so neither statement takes effect.)
+    if( G2 ) g = g >> 1;
+    if( Gscale ) g = scale8_video_LEAVING_R1_DIRTY( g, Gscale);
+    
+    // Scale down colors if we're desaturated at all
+    // and add the brightness_floor to r, g, and b.
+    if( sat != 255 ) {
+
+        nscale8x3_video( r, g, b, sat);
+
+        // floor = (255 - sat)^2 / 256
+        uint8_t desat = 255 - sat;
+        desat = scale8( desat, desat);
+        
+        uint8_t brightness_floor = desat;
+        r += brightness_floor;
+        g += brightness_floor;
+        b += brightness_floor;
+    }
+
+    // Now scale everything down if we're at value < 255.
+    if( val != 255 ) {
+        
+        // val is first scaled by itself, then applied to all channels.
+        // NOTE(review): the first call leaves R1 dirty on AVR; presumably
+        // nscale8x3_video restores it before returning - confirm.
+        val = scale8_video_LEAVING_R1_DIRTY( val, val);
+        nscale8x3_video( r, g, b, val);
+    }
+    
+    // Here we have the old AVR "missing std X+n" problem again
+    // It turns out that fixing it winds up costing more than
+    // not fixing it.
+    // To paraphrase Dr Bronner, profile! profile! profile!
+    //asm volatile(  ""  :  :  : "r26", "r27" );
+    //asm volatile (" movw r30, r26 \n" : : : "r30", "r31");
+    rgb.r = r;
+    rgb.g = g;
+    rgb.b = b;
+}
+
+
+// Array form of hsv2rgb_raw: converts the first 'numLeds' entries of
+// 'phsv' into the matching entries of 'prgb', one element at a time.
+// Nothing is converted when 'numLeds' is zero or negative.
+void hsv2rgb_raw(const struct CHSV * phsv, struct CRGB * prgb, int numLeds) {
+    const struct CHSV * in = phsv;
+    struct CRGB * out = prgb;
+    for(int remaining = numLeds; remaining > 0; --remaining) {
+        hsv2rgb_raw(*in, *out);
+        ++in;
+        ++out;
+    }
+}
+
+// Array form of hsv2rgb_rainbow: converts the first 'numLeds' entries
+// of 'phsv' into the matching entries of 'prgb', one element at a time.
+// Nothing is converted when 'numLeds' is zero or negative.
+void hsv2rgb_rainbow( const struct CHSV* phsv, struct CRGB * prgb, int numLeds) {
+    const struct CHSV * in = phsv;
+    struct CRGB * out = prgb;
+    for(int remaining = numLeds; remaining > 0; --remaining) {
+        hsv2rgb_rainbow(*in, *out);
+        ++in;
+        ++out;
+    }
+}
+
+// Array form of hsv2rgb_spectrum: converts the first 'numLeds' entries
+// of 'phsv' into the matching entries of 'prgb', one element at a time.
+// Nothing is converted when 'numLeds' is zero or negative.
+void hsv2rgb_spectrum( const struct CHSV* phsv, struct CRGB * prgb, int numLeds) {
+    const struct CHSV * in = phsv;
+    struct CRGB * out = prgb;
+    for(int remaining = numLeds; remaining > 0; --remaining) {
+        hsv2rgb_spectrum(*in, *out);
+        ++in;
+        ++out;
+    }
+}
+
+// fill_solid: assign 'color' to each of the 'numToFill' LEDs starting
+// at 'pFirstLED'.  Nothing is written when 'numToFill' is zero or
+// negative.
+void fill_solid( struct CRGB * pFirstLED, int numToFill,
+                const struct CRGB& color)
+{
+    struct CRGB * led = pFirstLED;
+    for( int remaining = numToFill; remaining > 0; --remaining) {
+        *led = color;
+        ++led;
+    }
+}
+
+// fill_rainbow: write 'numToFill' colors starting at 'pFirstLED', each
+// produced by hsv2rgb_rainbow at saturation 255 and value 255.  The hue
+// starts at 'initialhue' and 'deltahue' is added to it after every LED.
+void fill_rainbow( struct CRGB * pFirstLED, int numToFill,
+                  uint8_t initialhue,
+                  uint8_t deltahue )
+{
+    CHSV color;
+    color.sat = 255;
+    color.val = 255;
+    color.hue = initialhue;
+
+    struct CRGB * led = pFirstLED;
+    for( int remaining = numToFill; remaining > 0; --remaining) {
+        hsv2rgb_rainbow( color, *led);
+        ++led;
+        color.hue += deltahue;
+    }
+}
diff --git a/hsv2rgb.h b/hsv2rgb.h
new file mode 100644
index 00000000..d30e9aef
--- /dev/null
+++ b/hsv2rgb.h
@@ -0,0 +1,59 @@
+#ifndef __INC_HSV2RGB_H
+#define __INC_HSV2RGB_H
+
+#include "pixeltypes.h"
+
+
+// hsv2rgb_rainbow - convert a hue, saturation, and value to RGB
+//                   using a visually balanced rainbow (vs a straight
+//                   mathematical spectrum).
+//                   This 'rainbow' yields better yellow and orange
+//                   than a straight 'spectrum'.
+//
+//                   NOTE: here hue is 0-255, not just 0-191
+
+void hsv2rgb_rainbow( const struct CHSV& hsv, struct CRGB& rgb);
+void hsv2rgb_rainbow( const struct CHSV* phsv, struct CRGB * prgb, int numLeds);
+#define HUE_MAX_RAINBOW 255
+
+
+// hsv2rgb_spectrum - convert a hue, saturation, and value to RGB
+//                    using a mathematically straight spectrum (vs
+//                    a visually balanced rainbow).
+//                    This 'spectrum' will have more green & blue
+//                    than a 'rainbow', and less yellow and orange.
+//
+//                    NOTE: here hue is 0-255, not just 0-191
+
+void hsv2rgb_spectrum( const struct CHSV& hsv, struct CRGB& rgb);
+void hsv2rgb_spectrum( const struct CHSV* phsv, struct CRGB * prgb, int numLeds);
+#define HUE_MAX_SPECTRUM 255
+
+
+// hsv2rgb_raw - convert hue, saturation, and value to RGB.
+//               This 'spectrum' conversion will be more green & blue
+//               than a real 'rainbow', and the hue is specified just
+//               in the range 0-191.  Together, these result in a
+//               slightly faster conversion speed, at the expense of
+//               color balance.
+//
+//               NOTE: Hue is 0-191 only!
+//               Saturation & value are 0-255 each.
+//
+
+void hsv2rgb_raw(const struct CHSV& hsv, struct CRGB & rgb);
+void hsv2rgb_raw(const struct CHSV* phsv, struct CRGB * prgb, int numLeds);
+#define HUE_MAX 191
+
+
+// fill_solid -   fill a range of LEDs with a solid color
+void fill_solid( struct CRGB * pFirstLED, int numToFill,
+                 const struct CRGB& color);
+
+// fill_rainbow - fill a range of LEDs with a rainbow of colors, at
+//                full saturation and full value (brightness)
+void fill_rainbow( struct CRGB * pFirstLED, int numToFill,
+                   uint8_t initialhue,
+                   uint8_t deltahue = 5);
+
+#endif
diff --git a/lib8tion.cpp b/lib8tion.cpp
new file mode 100644
index 00000000..224d44f2
--- /dev/null
+++ b/lib8tion.cpp
@@ -0,0 +1,242 @@
+#include <stdint.h>
+
+// Initial seed value, and the global seed variable that holds it.
+// NOTE(review): presumably read and updated by the random8/random16
+// helpers described in lib8tion.h - confirm against that header.
+#define RAND16_SEED  1337
+uint16_t rand16seed = RAND16_SEED;
+
+
+// memset8, memcpy8, memmove8:
+//  optimized avr replacements for the standard "C" library
+//  routines memset, memcpy, and memmove.
+//
+//  There are two techniques that make these routines
+//  faster than the standard avr-libc routines.
+//  First, the loops are unrolled 2X, meaning that
+//  the average loop overhead is cut in half.
+//  And second, the compare-and-branch at the bottom
+//  of each loop decrements the low byte of the
+//  counter, and if the carry is clear, it branches
+//  back up immediately.  Only if the low byte math
+//  causes carry do we bother to decrement the high
+//  byte and check that result for carry as well.
+//  Results for a 100-byte buffer are 20-40% faster
+//  than standard avr-libc, at a cost of a few extra
+//  bytes of code.
+
+#if defined(__AVR__)
+extern "C" {
+// memset8: store 'val' into 'num' consecutive bytes starting at 'ptr'
+// and return 'ptr'.  AVR only.
+// The store loop is unrolled 2X; an odd count enters at the second
+// store, so the count that remains is always even.
+//__attribute__ ((noinline))
+void * memset8 ( void * ptr, uint8_t val, uint16_t num )
+{
+    asm volatile(
+         "  movw r26, %[ptr]        \n\t"
+         "  sbrs %A[num], 0         \n\t"
+         "  rjmp Lseteven_%=        \n\t"
+         "  rjmp Lsetodd_%=         \n\t"
+         "Lsetloop_%=:              \n\t"
+         "  st X+, %[val]           \n\t"
+         "Lsetodd_%=:               \n\t"
+         "  st X+, %[val]           \n\t"
+         "Lseteven_%=:              \n\t"
+         "  subi %A[num], 2         \n\t"
+         "  brcc Lsetloop_%=        \n\t"
+         "  sbci %B[num], 0         \n\t"
+         "  brcc Lsetloop_%=        \n\t"
+         /* subi/sbci only accept r16..r31, hence "d" rather than "r" */
+         : [num] "+d" (num)
+         : [ptr]  "r" (ptr),
+           [val]  "r" (val)
+         /* X (r26:r27) is overwritten by the movw above, so it must be
+            declared; otherwise an operand could be placed in it */
+         : "memory", "r26", "r27"
+         );
+    return ptr;
+}
+
+
+
+// memcpy8: copy 'num' bytes from 'src' to 'dst', lowest address first,
+// and return 'dst'.  AVR only.
+// The copy loop is unrolled 2X; an odd count enters at the second
+// load/store pair, so the count that remains is always even.
+//__attribute__ ((noinline))
+void * memcpy8 ( void * dst, void* src, uint16_t num )
+{
+    asm volatile(
+         "  movw r30, %[src]        \n\t"
+         "  movw r26, %[dst]        \n\t"
+         "  sbrs %A[num], 0         \n\t"
+         "  rjmp Lcpyeven_%=        \n\t"
+         "  rjmp Lcpyodd_%=         \n\t"
+         "Lcpyloop_%=:              \n\t"
+         "  ld __tmp_reg__, Z+      \n\t"
+         "  st X+, __tmp_reg__      \n\t"
+         "Lcpyodd_%=:               \n\t"
+         "  ld __tmp_reg__, Z+      \n\t"
+         "  st X+, __tmp_reg__      \n\t"
+         "Lcpyeven_%=:              \n\t"
+         "  subi %A[num], 2         \n\t"
+         "  brcc Lcpyloop_%=        \n\t"
+         "  sbci %B[num], 0         \n\t"
+         "  brcc Lcpyloop_%=        \n\t"
+         /* subi/sbci only accept r16..r31, hence "d" rather than "r" */
+         : [num] "+d" (num)
+         : [src] "r" (src),
+           [dst] "r" (dst)
+         /* X (r26:r27) and Z (r30:r31) are overwritten by the movw
+            instructions above; declaring them also keeps 'dst' out of
+            Z, where the first movw would destroy it */
+         : "memory", "r26", "r27", "r30", "r31"
+         );
+    return dst;
+}
+
+// memmove8: copy 'num' bytes from 'src' to 'dst', where the two ranges
+// are allowed to overlap, and return 'dst'.  AVR only.
+//__attribute__ ((noinline))
+void * memmove8 ( void * dst, void* src, uint16_t num )
+{
+    if( src > dst) {
+        // Source lies above the destination: a forward copy never
+        // overwrites a source byte before it has been read.
+        return memcpy8( dst, src, num);
+    } else {
+        // Source lies at or below the destination: copy backward,
+        // starting one past the last byte of each range.
+        char * dstend = (char*)dst + num;
+        char * srcend = (char*)src + num;
+        asm volatile(
+             "  movw r30, %[src]        \n\t"
+             "  movw r26, %[dst]        \n\t"
+             "  sbrs %A[num], 0         \n\t"
+             "  rjmp Lmoveven_%=        \n\t"
+             "  rjmp Lmovodd_%=         \n\t"
+             "Lmovloop_%=:              \n\t"
+             "  ld __tmp_reg__, -Z      \n\t"
+             "  st -X, __tmp_reg__      \n\t"
+             "Lmovodd_%=:               \n\t"
+             "  ld __tmp_reg__, -Z      \n\t"
+             "  st -X, __tmp_reg__      \n\t"
+             "Lmoveven_%=:              \n\t"
+             "  subi %A[num], 2         \n\t"
+             "  brcc Lmovloop_%=        \n\t"
+             "  sbci %B[num], 0         \n\t"
+             "  brcc Lmovloop_%=        \n\t"
+             /* subi/sbci only accept r16..r31, hence "d" */
+             : [num] "+d" (num)
+             : [src] "r" (srcend),
+               [dst] "r" (dstend)
+             /* X and Z are overwritten by the movw instructions above */
+             : "memory", "r26", "r27", "r30", "r31"
+             );
+        // 'dst' was not advanced, so this is the start of the
+        // destination range.
+        return dst;
+    }
+}
+
+
+} /* end extern "C" */
+
+#endif /* AVR */
+
+#if 0
+// TEST / VERIFICATION CODE ONLY BELOW THIS POINT
+#include <Arduino.h>
+#include "lib8tion.h"
+
+// Print one line of the form "abs(<i>) = <abs8(i)> " on Serial.
+void test1abs( int8_t i)
+{
+    Serial.print("abs(");
+    Serial.print(i);
+    Serial.print(") = ");
+    const int8_t magnitude = abs8(i);
+    Serial.print(magnitude);
+    Serial.println(" ");
+}
+
+// Wait five seconds, print abs8() for every int8_t value from -128 to
+// 127 inclusive via test1abs, then spin forever.
+void testabs()
+{
+    delay(5000);
+    // Use a wider counter so that the last value, 127, is included:
+    // an int8_t counter cannot express "q <= 127" without wrapping.
+    for( int16_t q = -128; q <= 127; q++) {
+        test1abs((int8_t)q);
+    }
+    for(;;){};
+}
+
+
+// Wait five seconds, print a table of mul8(r, c) for r and c in 0..20
+// on Serial, then spin forever.
+void testmul8()
+{
+    delay(5000);
+
+    Serial.println("mul8:");
+    for( byte row = 0; row <= 20; row++) {
+        Serial.print(row);
+        Serial.print(" : ");
+        for( byte col = 0; col <= 20; col++) {
+            const byte product = mul8( row, col);
+            Serial.print(product);
+            Serial.print(' ');
+        }
+        Serial.println(' ');
+    }
+    Serial.println("done.");
+    while( true) {
+    }
+}
+
+
+// Wait five seconds, print a table of scale8(r, c) for r and c in
+// 0..240 step 10, then a table of scale8_video(r, c) for r and c in
+// 0..100 step 4, then spin forever.
+void testscale8()
+{
+    delay(5000);
+
+    Serial.println("scale8:");
+    for( byte row = 0; row <= 240; row += 10) {
+        Serial.print(row);
+        Serial.print(" : ");
+        for( byte col = 0; col <= 240; col += 10) {
+            const byte scaled = scale8( row, col);
+            Serial.print(scaled);
+            Serial.print(' ');
+        }
+        Serial.println(' ');
+    }
+
+    Serial.println(' ');
+    Serial.println("scale8_video:");
+
+    for( byte row = 0; row <= 100; row += 4) {
+        Serial.print(row);
+        Serial.print(" : ");
+        for( byte col = 0; col <= 100; col += 4) {
+            const byte scaled = scale8_video( row, col);
+            Serial.print(scaled);
+            Serial.print(' ');
+        }
+        Serial.println(' ');
+    }
+
+    Serial.println("done.");
+    while( true) {
+    }
+}
+
+
+
+// Wait five seconds, print a table of qadd8(r, c) for r and c in
+// 0..240 step 10 on Serial, then spin forever.
+void testqadd8()
+{
+    delay(5000);
+
+    for( byte row = 0; row <= 240; row += 10) {
+        Serial.print(row);
+        Serial.print(" : ");
+        for( byte col = 0; col <= 240; col += 10) {
+            const byte sum = qadd8( row, col);
+            Serial.print(sum);
+            Serial.print(' ');
+        }
+        Serial.println(' ');
+    }
+    Serial.println("done.");
+    while( true) {
+    }
+}
+
+// Wait five seconds, then ten times: draw r, g, b and a scale from
+// random8(), print them, apply nscale8x3_video and print the scaled
+// r, g, b.  Spins forever afterwards.
+void testnscale8x3()
+{
+    delay(5000);
+
+    for( byte round = 0; round < 10; round++) {
+        // the four draws stay in this order: r, g, b, scale
+        byte r = random8();
+        byte g = random8();
+        byte b = random8();
+        byte sc = random8();
+
+        Serial.print("nscale8x3_video( ");
+        Serial.print(r);
+        Serial.print(", ");
+        Serial.print(g);
+        Serial.print(", ");
+        Serial.print(b);
+        Serial.print(", ");
+        Serial.print(sc);
+        Serial.print(") = [ ");
+
+        nscale8x3_video( r, g, b, sc);
+
+        Serial.print(r);
+        Serial.print(", ");
+        Serial.print(g);
+        Serial.print(", ");
+        Serial.print(b);
+        Serial.print("]");
+
+        Serial.println(' ');
+    }
+    Serial.println("done.");
+    while( true) {
+    }
+}
+
+#endif
diff --git a/lib8tion.h b/lib8tion.h
new file mode 100644
index 00000000..5fb812cb
--- /dev/null
+++ b/lib8tion.h
@@ -0,0 +1,1272 @@
+#ifndef __INC_LIB8TION_H
+#define __INC_LIB8TION_H
+
+/*
+ 
+ Fast, efficient 8-bit math functions specifically
+ designed for high-performance LED programming.
+ 
+ Because of the AVR(Arduino) and ARM assembly language
+ implementations provided, using these functions often
+ results in smaller and faster code than the equivalent
+ program using plain "C" arithmetic and logic.
+ 
+ 
+ Included are:
+ 
+ 
+ - Saturating unsigned 8-bit add and subtract.
+   Instead of wrapping around if an overflow occurs,
+   these routines just 'clamp' the output at a maximum
+   of 255, or a minimum of 0.  Useful for adding pixel 
+   values.  E.g., qadd8( 200, 100) = 255.
+ 
+     qadd8( i, j) == MIN( (i + j), 0xFF )
+     qsub8( i, j) == MAX( (i - j), 0 )
+ 
+ - Saturating signed 8-bit ("7-bit") add.
+     qadd7( i, j) == MIN( (i + j), 0x7F)
+ 
+ 
+ - Scaling (down) of unsigned 8- and 16- bit values.
+   Scaledown value is specified in 1/256ths.
+     scale8( i, sc) == (i * sc) / 256
+     scale16by8( i, sc) == (i * sc) / 256
+ 
+   Example: scaling a 0-255 value down into a
+   range from 0-99:
+     downscaled = scale8( originalnumber, 100);
+
+   A special version of scale8 is provided for scaling
+   LED brightness values, to make sure that they don't
+   accidentally scale down to total black at low
+   dimming levels, since that would look wrong:
+     scale8_video( i, sc) = ((i * sc) / 256) +? 1
+ 
+   Example: reducing an LED brightness by a
+   dimming factor:
+     new_bright = scale8_video( orig_bright, dimming);
+ 
+ 
+ - Fast 8- and 16- bit unsigned random numbers.
+   Significantly faster than Arduino random(), but 
+   also somewhat less random.  You can add entropy.
+     random8()       == random from 0..255
+     random8( n)     == random from 0..(N-1)
+     random8( n, m)  == random from N..(M-1)
+ 
+     random16()      == random from 0..65535
+     random16( n)    == random from 0..(N-1)
+     random16( n, m) == random from N..(M-1)
+   
+     random16_set_seed( k)    ==  seed = k
+     random16_add_entropy( k) ==  seed += k
+
+ 
+ - Absolute value of a signed 8-bit value.
+     abs8( i)     == abs( i)
+
+
+ - 8-bit math operations which return 8-bit values.
+   These are provided mostly for completeness,
+   not particularly for performance.
+     mul8( i, j)  == (i * j) & 0xFF
+     add8( i, j)  == (i + j) & 0xFF
+     sub8( i, j)  == (i - j) & 0xFF
+
+ 
+ - Fast 16-bit approximations of sin and cos.
+   Input angle is a uint16_t from 0-65535.
+   Output is a signed int16_t from -32767 to 32767.
+      sin16( x)  == sin( (x/32768.0) * pi) * 32767
+      cos16( x)  == cos( (x/32768.0) * pi) * 32767
+   Accurate to more than 99% in all cases.
+ 
+ 
+ - Dimming and brightening functions for 8-bit
+   light values.
+      dim8_video( x)  == scale8_video( x, x)
+      dim8_raw( x)    == scale8( x, x)
+      brighten8_video( x) == 255 - dim8_video( 255 - x)
+      brighten8_raw( x) == 255 - dim8_raw( 255 - x)
+   The dimming functions in particular are suitable
+   for making LED light output appear more 'linear'.
+
+
+ - Fast 8-bit "easing in/out" function.
+     ease8InOutCubic(x) == 3(x^2) - 2(x^3)
+     ease8InOutApprox(x) == 
+       faster, rougher, approximation of cubic easing
+     
+
+ - Linear interpolation between two values, with the
+   fraction between them expressed as an 8- or 16-bit
+   fixed point fraction (fract8 or fract16).
+     lerp8by8(   fromU8, toU8, fract8 )
+     lerp16by8(  fromU16, toU16, fract8 )
+     lerp15by8(  fromS16, toS16, fract8 )
+       == from + (( to - from ) * fract8) / 256)
+     lerp16by16( fromU16, toU16, fract16 )
+       == from + (( to - from ) * fract16) / 65536)
+ 
+ - Optimized memmove, memcpy, and memset, that are
+   faster than standard avr-libc 1.8.
+      memmove8( dest, src,  bytecount)
+      memcpy8(  dest, src,  bytecount)
+      memset8(  buf, value, bytecount)
+ 
+
+Lib8tion is pronounced like 'libation': lie-BAY-shun
+
+*/
+ 
+ 
+
+#include <stdint.h>
+
+#define LIB8STATIC __attribute__ ((unused)) static
+
+
+#if defined(__AVR_ATtiny24__) || defined(__AVR_ATtiny44__) || defined(__AVR_ATtiny84__) || defined(__AVR_ATtiny25__) || defined(__AVR_ATtiny45__) || defined(__AVR_ATtiny85__)
+#define LIB8_ATTINY 1
+#endif
+
+
+#if defined(__arm__)
+
+#if defined(__MK20DX128__)
+// Can use Cortex M4 DSP instructions
+#define QADD8_C 0
+#define QADD7_C 0
+#define QADD8_ARM_DSP_ASM 1
+#define QADD7_ARM_DSP_ASM 1
+#else
+// Generic ARM
+#define QADD8_C 1
+#define QADD7_C 1
+#endif
+
+#define QSUB8_C 1
+#define SCALE8_C 1
+#define SCALE16BY8_C 1
+#define SCALE16_C 1
+#define ABS8_C 1
+#define MUL8_C 1
+#define QMUL8_C 1
+#define ADD8_C 1
+#define SUB8_C 1
+#define EASE8_C 1
+
+
+#elif defined(__AVR__)
+
+// AVR ATmega and friends Arduino
+
+#define QADD8_C 0
+#define QADD7_C 0
+#define QSUB8_C 0
+#define ABS8_C 0
+#define ADD8_C 0
+#define SUB8_C 0
+
+#define QADD8_AVRASM 1
+#define QADD7_AVRASM 1
+#define QSUB8_AVRASM 1
+#define ABS8_AVRASM 1
+#define ADD8_AVRASM 1
+#define SUB8_AVRASM 1
+
+// Note: these require hardware MUL instruction
+//       -- sorry, ATtiny!
+#if !defined(LIB8_ATTINY)
+#define SCALE8_C 0
+#define SCALE16BY8_C 0
+#define SCALE16_C 0
+#define MUL8_C 0
+#define QMUL8_C 0
+#define EASE8_C 0
+#define SCALE8_AVRASM 1
+#define SCALE16BY8_AVRASM 1
+#define SCALE16_AVRASM 1
+#define MUL8_AVRASM 1
+#define QMUL8_AVRASM 1
+#define EASE8_AVRASM 1
+#define CLEANUP_R1_AVRASM 1
+#else
+// On ATtiny, we just use C implementations
+#define SCALE8_C 1
+#define SCALE16BY8_C 1
+#define SCALE16_C 1
+#define MUL8_C 1
+#define QMUL8_C 1
+#define EASE8_C 1
+#define SCALE8_AVRASM 0
+#define SCALE16BY8_AVRASM 0
+#define SCALE16_AVRASM 0
+#define MUL8_AVRASM 0
+#define QMUL8_AVRASM 0
+#define EASE8_AVRASM 0
+#endif
+
+#else
+
+// unspecified architecture, so
+// no ASM, everything in C
+// (QMUL8_C is set here too, as it is in the ARM and ATtiny branches.)
+#define QADD8_C 1
+#define QADD7_C 1
+#define QSUB8_C 1
+#define SCALE8_C 1
+#define SCALE16BY8_C 1
+#define SCALE16_C 1
+#define ABS8_C 1
+#define MUL8_C 1
+#define QMUL8_C 1
+#define ADD8_C 1
+#define SUB8_C 1
+#define EASE8_C 1
+
+#endif
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// typedefs for fixed-point fractional types.
+//
+// sfract7 should be interpreted as signed 128ths.
+// fract8 should be interpreted as unsigned 256ths.
+// sfract15 should be interpreted as signed 32768ths.
+// fract16 should be interpreted as unsigned 65536ths.
+//
+// Example: if a fract8 has the value "64", that should be interpreted
+//          as 64/256ths, or one-quarter.
+//
+//
+//  fract8   range is 0 to 0.99609375
+//                 in steps of 0.00390625
+//
+//  sfract7  range is -0.9921875 to 0.9921875
+//                 in steps of 0.0078125
+//
+//  fract16  range is 0 to 0.99998474121
+//                 in steps of 0.00001525878
+//
+//  sfract15 range is -0.99996948242 to 0.99996948242
+//                 in steps of 0.00003051757
+//
+
+typedef uint8_t   fract8;   // ANSI: unsigned short _Fract
+typedef int8_t    sfract7;  // ANSI: signed   short _Fract
+typedef uint16_t  fract16;  // ANSI: unsigned       _Fract
+typedef int16_t   sfract15; // ANSI: signed         _Fract
+
+
+// accumXY types should be interpreted as X bits of integer,
+//         and Y bits of fraction.
+//         E.g., accum88 has 8 bits of int, 8 bits of fraction
+// NOTE(review): saccum114 is declared as a 32-bit type although its
+// name suggests 1 + 14 bits plus sign - confirm the intended width.
+
+typedef uint16_t  accum88;  // ANSI: unsigned short _Accum
+typedef int16_t   saccum78; // ANSI: signed   short _Accum
+typedef uint32_t  accum1616;// ANSI: unsigned       _Accum
+typedef int32_t   saccum1516;//ANSI: signed         _Accum
+typedef uint16_t  accum124; // no direct ANSI counterpart
+typedef int32_t   saccum114;// no direct ANSI counterpart
+
+
+// typedef for IEEE754 "binary32" float type internals
+//
+// Overlays a float 'f' with its raw 32-bit pattern 'i' and with three
+// anonymous bit-field views of the same 32 bits (widths in bits):
+//   mantissa:23, exponent:8, signbit:1
+//   mant7:7, mant16:16, exp_:8, sb_:1
+//   mant_lo8:8, mant_hi16_exp_lo1:16, sb_exphi7:8
+// NOTE(review): bit-field allocation order is implementation-defined;
+// this presumably relies on fields being allocated from the least
+// significant bit upward - confirm for each target compiler.
+
+typedef union {
+    uint32_t i;
+    float    f;
+    struct {
+        uint32_t mantissa: 23;
+        uint32_t exponent:  8;
+        uint32_t signbit:   1;
+    };
+    struct {
+        uint32_t mant7 :  7;
+        uint32_t mant16: 16;
+        uint32_t exp_  :  8;
+        uint32_t sb_   :  1;
+    };
+    struct {
+        uint32_t mant_lo8 : 8;
+        uint32_t mant_hi16_exp_lo1 : 16;
+        uint32_t sb_exphi7 : 8;
+    };
+} IEEE754binary32_t;
+
+
+
+///////////////////////////////////////////////////////////////////////
+
+// qadd8: unsigned 8-bit add that clamps the result at 0xFF instead of
+//        wrapping around.
+LIB8STATIC uint8_t qadd8( uint8_t i, uint8_t j)
+{
+#if QADD8_C == 1
+    // Sum in a wider type so that a result above 255 can be detected.
+    const int sum = i + j;
+    return (sum > 255) ? 255 : sum;
+#elif QADD8_AVRASM == 1
+    asm volatile(
+         /* i += j; the C flag records an 8-bit overflow */
+         "add %0, %1    \n\t"
+
+         /* C clear: keep the sum and skip the load.
+            C set:   replace the sum with 0xFF. */
+         "brcc L_%=     \n\t"
+         "ldi %0, 0xFF  \n\t"
+         "L_%=: "
+         : "+a" (i)
+         : "a"  (j) );
+    return i;
+#elif QADD8_ARM_DSP_ASM == 1
+    asm volatile( "uqadd8 %0, %0, %1" : "+r" (i) : "r" (j));
+    return i;
+#else
+#error "No implementation for qadd8 available."
+#endif
+}
+
+
+// qadd7: add one signed byte to another, saturating at 0x7F on
+//        positive overflow and at -0x80 on negative overflow.
+LIB8STATIC int8_t qadd7( int8_t i, int8_t j)
+{
+#if QADD7_C == 1
+    // Sum in a wider type so that both overflow directions are visible.
+    int16_t t = i + j;
+    if( t > 127) t = 127;
+    else if( t < -128) t = -128;
+    return t;
+#elif QADD7_AVRASM == 1
+    asm volatile(
+         /* First, add j to i, conditioning the V and C flags */
+         "add %0, %1    \n\t"
+         
+         /* Now test the V flag.
+          If V is clear, there was no signed overflow: keep the sum.
+          If V is set, we go ahead and load 0x7F into i.
+          */
+         "brvc L_%=     \n\t"
+         "ldi %0, 0x7F  \n\t"
+
+         /* ldi leaves the flags alone, so C still comes from the add.
+          Signed overflow with C set means both operands were
+          negative: saturate at 0x80 (-128) instead.
+          */
+         "brcc L_%=     \n\t"
+         "ldi %0, 0x80  \n\t"
+         "L_%=: "
+         : "+a" (i)
+         : "a"  (j) );
+
+    return i;
+#elif QADD7_ARM_DSP_ASM == 1
+    asm volatile( "qadd8 %0, %0, %1" : "+r" (i) : "r" (j));
+    return i;
+#else
+#error "No implementation for qadd7 available."
+#endif
+}
+
+// qsub8: unsigned 8-bit subtract that clamps the result at 0x00
+//        instead of wrapping around.
+LIB8STATIC uint8_t qsub8( uint8_t i, uint8_t j)
+{
+#if QSUB8_C == 1
+    // Subtract in a wider type so that a negative result can be detected.
+    const int diff = i - j;
+    return (diff < 0) ? 0 : diff;
+#elif QSUB8_AVRASM == 1
+
+    asm volatile(
+         /* i -= j; the C flag records a borrow */
+         "sub %0, %1    \n\t"
+         
+         /* C clear: keep the difference and skip the load.
+            C set:   replace the difference with 0x00. */
+         "brcc L_%=     \n\t"
+         "ldi %0, 0x00  \n\t"
+         "L_%=: "
+         : "+a" (i)
+         : "a"  (j) );
+    
+    return i;
+#else
+#error "No implementation for qsub8 available."
+#endif
+}
+
+// add8: plain 8-bit add; the sum is truncated to one byte.
+LIB8STATIC uint8_t add8( uint8_t i, uint8_t j)
+{
+#if ADD8_C == 1
+    // The int sum is narrowed to uint8_t by the return.
+    return i + j;
+#elif ADD8_AVRASM == 1
+    // A single 'add' instruction; no saturation.
+    asm volatile( "add %0, %1" : "+a" (i) : "a" (j));
+    return i;
+#else
+#error "No implementation for add8 available."
+#endif
+}
+
+
+// sub8: plain 8-bit subtract; the difference is truncated to one byte.
+LIB8STATIC uint8_t sub8( uint8_t i, uint8_t j)
+{
+#if SUB8_C == 1
+    // The int difference is narrowed to uint8_t by the return.
+    return i - j;
+#elif SUB8_AVRASM == 1
+    // A single 'sub' instruction; no saturation.
+    asm volatile( "sub %0, %1" : "+a" (i) : "a" (j));
+    return i;
+#else
+#error "No implementation for sub8 available."
+#endif
+}
+
+
+// scale8: multiply 'i' by the fraction scale/256 and return the
+//         result, i.e. the high byte of the 16-bit product i * scale.
+//         4 clocks AVR, 2 clocks ARM
+LIB8STATIC uint8_t scale8( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+    const int product = (int)i * (int)(scale);
+    return product >> 8;
+#elif SCALE8_AVRASM == 1
+    asm volatile(
+         /* r1:r0 = i * scale (16-bit product) */
+         "mul %0, %1          \n\t"
+         /* i = high byte of the product */
+         "mov %0, r1          \n\t"
+         /* put r1 back to zero, as the rest of the code expects */
+         "clr __zero_reg__    \n\t"
+         
+         : "+a" (i)      /* read and written */
+         : "a"  (scale)  /* read only */
+         : "r0", "r1"    /* overwritten by mul */ );
+
+    return i;
+#else
+#error "No implementation for scale8 available."
+#endif
+}
+
+
+//  scale8_video: like scale8, except that the result is zero only when
+//  'i' is zero or 'scale' is zero.  When both inputs are non-zero the
+//  result is non-zero as well, which makes for better 'video'/LED
+//  dimming at the cost of several additional cycles.
+LIB8STATIC uint8_t scale8_video( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+    if( i == 0) {
+        return 0;
+    }
+    // +1 unless the scale itself is zero
+    const uint8_t bump = (scale != 0) ? 1 : 0;
+    return (((int)i * (int)(scale) ) >> 8) + bump;
+#elif SCALE8_AVRASM == 1
+    
+    // +1 unless the scale itself is zero
+    uint8_t bump = (scale != 0) ? 1 : 0;
+    asm volatile(
+         /* i == 0: result stays 0, skip everything */
+         "      tst %0           \n"
+         "      breq L_%=        \n"
+         /* i = high byte of (i * scale), plus the bump */
+         "      mul %0, %1       \n"
+         "      mov %0, r1       \n"
+         "      add %0, %2       \n"
+         /* put r1 back to zero */
+         "      clr __zero_reg__ \n"
+         "L_%=:                  \n"
+         
+         : "+a" (i)
+         : "a" (scale), "a" (bump)
+         : "r0", "r1");
+    
+    return i;
+#else
+#error "No implementation for scale8_video available."
+#endif
+}
+
+
+// scale8_LEAVING_R1_DIRTY: same result as scale8, but on AVR the R1
+// register is not zeroed afterwards.  Intended for several scalings in
+// a row, followed by one explicit call to cleanup_R1.
+LIB8STATIC uint8_t scale8_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+    const int product = (int)i * (int)(scale);
+    return product >> 8;
+#elif SCALE8_AVRASM == 1
+    asm volatile(
+         /* r1:r0 = i * scale (16-bit product) */
+         "mul %0, %1    \n\t"
+         /* i = high byte of the product */
+         "mov %0, r1    \n\t"
+         /* r1 is deliberately NOT cleared here; the caller has to do it */
+         /* "clr __zero_reg__    \n\t" */
+         
+         : "+a" (i)      /* read and written */
+         : "a"  (scale)  /* read only */
+         : "r0", "r1"    /* overwritten by mul */ );
+    
+    return i;
+#else
+#error "No implementation for scale8_LEAVING_R1_DIRTY available."
+#endif
+}
+
+// nscale8_LEAVING_R1_DIRTY: in-place form of scale8_LEAVING_R1_DIRTY.
+//   THE ARGUMENT 'i' IS ALWAYS OVERWRITTEN WITH THE SCALED VALUE.
+//   On AVR the R1 register is not zeroed afterwards.
+
+LIB8STATIC void nscale8_LEAVING_R1_DIRTY( uint8_t& i, fract8 scale)
+{
+#if SCALE8_C == 1
+    const int product = (int)i * (int)(scale);
+    i = product >> 8;
+#elif SCALE8_AVRASM == 1
+    asm volatile(
+         /* r1:r0 = i * scale (16-bit product) */
+         "mul %0, %1    \n\t"
+         /* i = high byte of the product */
+         "mov %0, r1    \n\t"
+         /* r1 is deliberately NOT cleared here; the caller has to do it */
+         /* "clr __zero_reg__    \n\t" */
+         
+         : "+a" (i)      /* read and written */
+         : "a"  (scale)  /* read only */
+         : "r0", "r1"    /* overwritten by mul */ );
+#else
+#error "No implementation for nscale8_LEAVING_R1_DIRTY available."
+#endif
+}
+
+
+
+LIB8STATIC uint8_t scale8_video_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+    uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+    return j;
+#elif SCALE8_AVRASM == 1
+    
+    uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    asm volatile(
+         "      tst %0          \n"
+         "      breq L_%=       \n"
+         "      mul %0, %1      \n"
+         "      mov %0, r1      \n"
+         "      add %0, %2      \n"
+         /* R1 IS LEFT DIRTY, YOU MUST ZERO IT OUT YOURSELF */
+         "L_%=:                 \n"
+         
+         : "+a" (i)
+         : "a" (scale), "a" (nonzeroscale)
+         : "r0", "r1");
+    
+    // Return the result
+    return i;
+#else
+#error "No implementation for scale8_video available."
+#endif
+}
+
+
+
+// cleanup_R1: zero the r1 register (__zero_reg__) after one or more of the
+//         *_LEAVING_R1_DIRTY functions have been used.  Compiles to
+//         nothing unless CLEANUP_R1_AVRASM is 1.
+LIB8STATIC void cleanup_R1()
+{
+#if CLEANUP_R1_AVRASM == 1
+    // Restore r1 to "0"; it's expected to always be that
+    asm volatile( "clr __zero_reg__  \n\t" : : : "r1" );
+#endif
+}
+
+
+// nscale8x3: scale three one-byte values by a fourth one, which is treated
+//         as the numerator of a fraction whose denominator is 256.
+//         In other words, it computes r,g,b * (scale / 256)
+//
+//         THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENTS IN PLACE
+
+LIB8STATIC void nscale8x3( uint8_t& r, uint8_t& g, uint8_t& b, fract8 scale)
+{
+#if SCALE8_C == 1
+    // Portable path: keep the high byte of each 8x8 product.
+    const int factor = (int)(scale);
+    r = ((int)r * factor) >> 8;
+    g = ((int)g * factor) >> 8;
+    b = ((int)b * factor) >> 8;
+#elif SCALE8_AVRASM == 1
+    // Scale each channel in place and zero r1 once at the end,
+    // rather than after every multiply.
+    nscale8_LEAVING_R1_DIRTY( r, scale);
+    nscale8_LEAVING_R1_DIRTY( g, scale);
+    nscale8_LEAVING_R1_DIRTY( b, scale);
+    cleanup_R1();
+#else
+#error "No implementation for nscale8x3 available."
+#endif
+}
+
+
+LIB8STATIC void nscale8x3_video( uint8_t& r, uint8_t& g, uint8_t& b, fract8 scale)
+{
+#if SCALE8_C == 1
+    uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    r = (r == 0) ? 0 : (((int)r * (int)(scale) ) >> 8) + nonzeroscale;
+    g = (g == 0) ? 0 : (((int)g * (int)(scale) ) >> 8) + nonzeroscale;
+    b = (b == 0) ? 0 : (((int)b * (int)(scale) ) >> 8) + nonzeroscale;
+#elif SCALE8_AVRASM == 1
+    r = scale8_video_LEAVING_R1_DIRTY( r, scale);
+    g = scale8_video_LEAVING_R1_DIRTY( g, scale);
+    b = scale8_video_LEAVING_R1_DIRTY( b, scale);
+    cleanup_R1();
+#else
+#error "No implementation for nscale8x3 available."
+#endif
+}
+
+// nscale8x2: scale two one-byte values by a third one, which is treated
+//         as the numerator of a fraction whose denominator is 256.
+//         In other words, it computes i,j * (scale / 256)
+//
+//         THIS FUNCTION ALWAYS MODIFIES ITS ARGUMENTS IN PLACE
+
+LIB8STATIC void nscale8x2( uint8_t& i, uint8_t& j, fract8 scale)
+{
+#if SCALE8_C == 1
+    // Portable path: keep the high byte of each 8x8 product.
+    const int factor = (int)(scale);
+    i = ((int)i * factor) >> 8;
+    j = ((int)j * factor) >> 8;
+#elif SCALE8_AVRASM == 1
+    // Scale both values in place and zero r1 once at the end.
+    nscale8_LEAVING_R1_DIRTY( i, scale);
+    nscale8_LEAVING_R1_DIRTY( j, scale);
+    cleanup_R1();
+#else
+#error "No implementation for nscale8x2 available."
+#endif
+}
+
+
+LIB8STATIC void nscale8x2_video( uint8_t& i, uint8_t& j, fract8 scale)
+{
+#if SCALE8_C == 1
+    uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+    i = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+    j = (j == 0) ? 0 : (((int)j * (int)(scale) ) >> 8) + nonzeroscale;
+#elif SCALE8_AVRASM == 1
+    i = scale8_video_LEAVING_R1_DIRTY( i, scale);
+    j = scale8_video_LEAVING_R1_DIRTY( j, scale);
+    cleanup_R1();
+#else
+#error "No implementation for nscale8x2 available."
+#endif
+}
+
+
+// scale16by8: scale a 16-bit unsigned value by an 8-bit value,
+//         considered as numerator of a fraction whose denominator
+//         is 256. In other words, it computes i * (scale / 256)
+
+#if SCALE16BY8_C == 1
+LIB8STATIC uint16_t scale16by8( uint16_t i, fract8 scale )
+{
+    // The product needs up to 24 bits.  Widen explicitly so it is not
+    // truncated on targets where 'int' is only 16 bits wide.
+    uint32_t product = (uint32_t)(i) * (uint32_t)(scale);
+    return (uint16_t)(product >> 8);
+}
+#elif SCALE16BY8_AVRASM == 1
+LIB8STATIC uint16_t scale16by8( uint16_t i, fract8 scale )
+{
+    uint16_t result;
+    asm volatile(
+         // result.A = HighByte(i.A x j )
+         "  mul %A[i], %[scale]                 \n\t"
+         "  mov %A[result], r1                  \n\t"
+         "  clr %B[result]                      \n\t"
+         
+         // result.A-B += i.B x j
+         "  mul %B[i], %[scale]                 \n\t"
+         "  add %A[result], r0                  \n\t"
+         "  adc %B[result], r1                  \n\t"
+         
+         // cleanup r1
+         "  clr __zero_reg__                    \n\t"
+         
+         // 'result' is written before the last reads of 'i' and 'scale',
+         // so it is marked early-clobber (&) to keep the compiler from
+         // allocating it to the same registers as either input.
+         : [result] "=&r" (result)
+         : [i] "r" (i), [scale] "r" (scale)
+         : "r0", "r1"
+         );
+    return result;
+}
+#else
+#error "No implementation for scale16by8 available."
+#endif
+
+// scale16: scale a 16-bit unsigned value by a 16-bit value,
+//         considered as numerator of a fraction whose denominator
+//         is 65536. In other words, it computes i * (scale / 65536)
+
+#if SCALE16_C == 1
+LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
+{
+    uint16_t result;
+    // 32-bit product; dividing by 65536 keeps its upper 16 bits.
+    result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536;
+    return result;
+}
+#elif SCALE16_AVRASM == 1
+LIB8STATIC
+uint16_t scale16( uint16_t i, fract16 scale )
+{
+    // Builds the full 32-bit product from four 8x8 partial products,
+    // then returns its upper 16 bits.
+    uint32_t result = 0;
+    const uint8_t  zero = 0;
+    asm volatile(
+                 // result.A-B  = i.A x scale.A
+                 "  mul %A[i], %A[scale]                 \n\t"
+                 //  save results...
+                 // basic idea:
+                 //"  mov %A[result], r0                 \n\t"
+                 //"  mov %B[result], r1                 \n\t"
+                 // which can be written as...
+                 "  movw %A[result], r0                   \n\t"
+                 // We don't actually need to do anything with r0,
+                 // as result.A is never used again here, so we
+                 // could just move the high byte, but movw is
+                 // one clock cycle, just like mov, so might as
+                 // well, in case we want to use this code for
+                 // a generic 16x16 multiply somewhere.
+                 
+                 // result.C-D  = i.B x scale.B
+                 "  mul %B[i], %B[scale]                 \n\t"
+                 //"  mov %C[result], r0                 \n\t"
+                 //"  mov %D[result], r1                 \n\t"
+                 "  movw %C[result], r0                   \n\t"
+
+                 // result.B-D += i.B x scale.A
+                 "  mul %B[i], %A[scale]                 \n\t"
+                 
+                 "  add %B[result], r0                   \n\t"
+                 "  adc %C[result], r1                   \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+                 
+                 // result.B-D += i.A x scale.B
+                 "  mul %A[i], %B[scale]                 \n\t"
+                 
+                 "  add %B[result], r0                   \n\t"
+                 "  adc %C[result], r1                   \n\t"
+                 "  adc %D[result], %[zero]              \n\t"
+                                  
+                 // cleanup r1
+                 "  clr r1                               \n\t"
+                 
+                 : [result] "+r" (result)
+                 : [i] "r" (i),
+                   [scale] "r" (scale),
+                   [zero] "r" (zero)
+                 : "r0", "r1"
+                 );
+    // Keep only the upper 16 bits of the 32-bit product.
+    result = result >> 16;
+    return result;
+}
+#else
+#error "No implementation for scale16 available."
+#endif
+
+
+
+// mul8: 8x8 bit multiplication, with 8 bit result
+//       Only the low 8 bits of the product are returned; anything
+//       above that is discarded (no saturation).
+LIB8STATIC uint8_t mul8( uint8_t i, uint8_t j)
+{
+#if MUL8_C == 1
+    // Mask off everything but the low byte of the product.
+    return ((int)i * (int)(j) ) & 0xFF;
+#elif MUL8_AVRASM == 1
+    asm volatile(
+         /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
+         "mul %0, %1          \n\t"
+         /* Extract the LOW 8-bits (r0) */
+         "mov %0, r0          \n\t"
+         /* Restore r1 to "0"; it's expected to always be that */
+         "clr __zero_reg__    \n\t"
+         : "+a" (i)
+         : "a"  (j)
+         : "r0", "r1");
+    
+    return i;
+#else
+#error "No implementation for mul8 available."
+#endif
+}
+
+
+// qmul8: saturating 8x8 bit multiplication, with 8 bit result
+//        Returns i * j, or 255 if the product does not fit in 8 bits.
+LIB8STATIC uint8_t qmul8( uint8_t i, uint8_t j)
+{
+#if QMUL8_C == 1
+    // Multiply as unsigned: the largest product (255 * 255 = 65025) fits
+    // in 16 unsigned bits, but overflows a signed 16-bit 'int', which
+    // would defeat the saturation test below on such targets.
+    unsigned int p = (unsigned int)i * (unsigned int)j;
+    if( p > 255) p = 255;
+    return p;
+#elif QMUL8_AVRASM == 1
+    asm volatile(
+                 /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
+                 "  mul %0, %1          \n\t"
+                 /* If high byte of result is zero, all is well. */
+                 "  tst r1              \n\t"
+                 "  breq Lnospill_%=    \n\t"
+                 /* If high byte of result > 0, saturate low byte to 0xFF */
+                 "  ldi %0,0xFF         \n\t"
+                 "  rjmp Ldone_%=       \n\t"
+                 "Lnospill_%=:          \n\t"
+                 /* Extract the LOW 8-bits (r0) */
+                 "  mov %0, r0          \n\t"
+                 "Ldone_%=:             \n\t"
+                 /* Restore r1 to "0"; it's expected to always be that */
+                 "  clr __zero_reg__    \n\t"
+                 : "+a" (i)
+                 : "a"  (j)
+                 : "r0", "r1");
+    
+    return i;
+#else
+#error "No implementation for qmul8 available."
+#endif
+}
+
+
+// abs8: take abs() of a signed 8-bit value
+//       Negative inputs are negated; non-negative inputs are returned
+//       unchanged.
+LIB8STATIC int8_t abs8( int8_t i)
+{
+#if ABS8_C == 1
+    if( i < 0) i = -i;
+    return i;
+#elif ABS8_AVRASM == 1
+    
+    
+    asm volatile(
+         /* First, check the high bit, and prepare to skip if it's clear */
+         "sbrc %0, 7 \n"
+         
+         /* Negate the value */
+         "neg %0     \n"
+         
+         : "+r" (i) : "r" (i) );
+    return i;
+#else
+#error "No implementation for abs8 available."
+#endif
+}
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// float-to-fixed and fixed-to-float conversions
+//
+// Note that anything involving a 'float' on AVR will be slower.
+
+// floatToSfract15: conversion from IEEE754 float in the range (-1,1)
+//                  to 16-bit fixed point.  Note that the extremes of
+//                  one and negative one are NOT representable.  The
+//                  representable range is basically the open interval (-1, 1).
+//
+// sfract15ToFloat: conversion from sfract15 fixed point to
+//                  IEEE754 32-bit float.
+
+LIB8STATIC
+float sfract15ToFloat( sfract15 y)
+{
+    // Divide by 32768 (2^15) to turn the fixed-point value into a fraction.
+    const double fullScale = 32768.0;
+    return y / fullScale;
+}
+
+LIB8STATIC
+sfract15 floatToSfract15( float f)
+{
+    // Multiply by 32768 (2^15); the result is converted to sfract15
+    // on return.
+    const double fullScale = 32768.0;
+    return f * fullScale;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////
+
+// Dimming and brightening functions
+//
+// The eye does not respond in a linear way to light.
+// High speed PWM'd LEDs at 50% duty cycle appear far
+// brighter than the 'half as bright' you might expect.
+//
+// If you want your midpoint brightness level (128) to
+// appear half as bright as 'full' brightness (255), you
+// have to apply a 'dimming function'.
+//
+// 
+
+// dim8_raw: dimming curve; scales x by itself, i.e. scale8( x, x).
+LIB8STATIC uint8_t dim8_raw( uint8_t x)
+{
+    const uint8_t dimmed = scale8( x, x);
+    return dimmed;
+}
+
+// dim8_video: same curve as dim8_raw, but computed with scale8_video.
+LIB8STATIC uint8_t dim8_video( uint8_t x)
+{
+    const uint8_t dimmed = scale8_video( x, x);
+    return dimmed;
+}
+
+// brighten8_raw: brightening curve; applies the dim8_raw curve to the
+//         inverted input (255 - x) and inverts the result again.
+LIB8STATIC uint8_t brighten8_raw( uint8_t x)
+{
+    const uint8_t inverted = 255 - x;
+    const uint8_t dimmed   = scale8( inverted, inverted);
+    return 255 - dimmed;
+}
+
+// brighten8_video: same curve as brighten8_raw, but computed with
+//         scale8_video.
+LIB8STATIC uint8_t brighten8_video( uint8_t x)
+{
+    const uint8_t inverted = 255 - x;
+    const uint8_t dimmed   = scale8_video( inverted, inverted);
+    return 255 - dimmed;
+}
+
+///////////////////////////////////////////////////////////////////////
+
+// A 16-bit PRNG good enough for LED animations
+
+// X(n+1) = (2053 * X(n)) + 13849
+#define RAND16_2053  2053
+#define RAND16_13849 13849
+
+extern uint16_t rand16seed;// = RAND16_SEED;
+
+
+// random8: advance the shared 16-bit generator one step and return the
+//         low 8 bits of the new state.
+LIB8STATIC uint8_t random8()
+{
+    const uint16_t next = (rand16seed * RAND16_2053) + RAND16_13849;
+    rand16seed = next;
+    return next;
+}
+
+// random16: advance the shared 16-bit generator one step and return the
+//         new state.
+LIB8STATIC uint16_t random16()
+{
+    const uint16_t next = (rand16seed * RAND16_2053) + RAND16_13849;
+    rand16seed = next;
+    return next;
+}
+
+
+// random8(lim): a random8() value scaled by lim/256 via scale8.
+LIB8STATIC uint8_t random8(uint8_t lim)
+{
+    const uint8_t raw = random8();
+    return scale8( raw, lim);
+}
+
+// random8(min, lim): random8( lim - min) shifted up by min.
+LIB8STATIC uint8_t random8(uint8_t min, uint8_t lim)
+{
+    const uint8_t span = lim - min;
+    const uint8_t pick = random8( span);
+    return pick + min;
+}
+
+// random16(lim): a random16() value scaled by lim/65536, taken as the
+//         upper 16 bits of the 32-bit product.
+LIB8STATIC uint16_t random16( uint16_t lim)
+{
+    const uint16_t raw = random16();
+    const uint32_t product = (uint32_t)lim * (uint32_t)raw;
+    return product >> 16;
+}
+
+// random16(min, lim): random16( lim - min) shifted up by min.
+LIB8STATIC uint16_t random16( uint16_t min, uint16_t lim)
+{
+    const uint16_t span = lim - min;
+    const uint16_t pick = random16( span);
+    return pick + min;
+}
+
+// random16_set_seed: overwrite the shared generator state (rand16seed)
+//         used by random8() and random16().
+LIB8STATIC void random16_set_seed( uint16_t seed)
+{
+    rand16seed = seed;
+}
+
+// random16_get_seed: return the current generator state (rand16seed)
+//         without advancing it.
+LIB8STATIC uint16_t random16_get_seed()
+{
+    return rand16seed;
+}
+
+// random16_add_entropy: add 'entropy' to the generator state,
+//         wrapping at 16 bits.
+LIB8STATIC void random16_add_entropy( uint16_t entropy)
+{
+    rand16seed = rand16seed + entropy;
+}
+
+
+///////////////////////////////////////////////////////////////////////
+
+// sin16 & cos16:
+//        Fast 16-bit approximations of sin(x) & cos(x).
+//        Input angle is an unsigned int from 0-65535.
+//        Output is signed int from -32767 to 32767.
+//
+//        This approximation never varies more than 0.69%
+//        from the floating point value you'd get by doing
+//          float s = sin( x ) * 32767.0;
+//
+//        Don't use this approximation for calculating the
+//        trajectory of a rocket to Mars, but it's great
+//        for art projects and LED displays.
+//
+//        On Arduino/AVR, this approximation is more than
+//        10X faster than floating point sin(x) and cos(x)
+
+// 'sin16' is a macro that selects the implementation: sin16_avr when
+// compiling for AVR, sin16_C otherwise.
+#if defined(__AVR__)
+#define sin16 sin16_avr
+#else
+#define sin16 sin16_C
+#endif
+
+// sin16_avr: piecewise-linear sine approximation.  The quarter wave is
+//         split into 8 sections; each section has a 16-bit base value
+//         and an 8-bit slope.  Bit 14 of theta mirrors the offset within
+//         the quarter wave and bit 15 negates the result.
+LIB8STATIC int16_t sin16_avr( uint16_t theta )
+{
+    // Four bytes per section: base low byte, base high byte, slope, and
+    // a padding zero.  The final padding byte is omitted because it is
+    // never read.
+    static const uint8_t data[] =
+    { 0,         0,         49, 0, 6393%256,   6393/256, 48, 0,
+      12539%256, 12539/256, 44, 0, 18204%256, 18204/256, 38, 0,
+      23170%256, 23170/256, 31, 0, 27245%256, 27245/256, 23, 0,
+      30273%256, 30273/256, 14, 0, 32137%256, 32137/256,  4 /*,0*/ };
+    
+    // Position within the current quarter wave (low 14 bits of theta).
+    uint16_t offset = (theta & 0x3FFF);
+    
+    // AVR doesn't have a multi-bit shift instruction,
+    // so if we say "offset >>= 3", gcc makes a tiny loop.
+    // Inserting empty volatile statements between each
+    // bit shift forces gcc to unroll the loop.
+    offset >>= 1; // 0..8191
+    asm volatile("");
+    offset >>= 1; // 0..4095
+    asm volatile("");
+    offset >>= 1; // 0..2047
+
+    // In the second and fourth quarters, walk the table backwards.
+    if( theta & 0x4000 ) offset = 2047 - offset;
+    
+    // Section index (0..7) times the 4-byte stride of the table.
+    uint8_t sectionX4;
+    sectionX4 = offset / 256;
+    sectionX4 *= 4;
+    
+    uint8_t m;
+    
+    // Assembles the 16-bit base from its two table bytes.
+    // NOTE(review): assumes the low byte is stored first (little-endian).
+    union {
+        uint16_t b;
+        struct {
+            uint8_t blo;
+            uint8_t bhi;
+        };
+    } u;
+    
+    //in effect u.b = blo + (256 * bhi);
+    u.blo = data[ sectionX4 ];
+    u.bhi = data[ sectionX4 + 1];
+    m     = data[ sectionX4 + 2];
+    
+    // Offset within the section: low byte of 'offset', halved (0..127).
+    uint8_t secoffset8 = (uint8_t)(offset) / 2;
+    
+    uint16_t mx = m * secoffset8;
+    
+    // y = base + slope * offset-in-section
+    int16_t  y  = mx + u.b;
+    // Second half of the wave is the negated first half.
+    if( theta & 0x8000 ) y = -y;
+    
+    return y;
+}
+
+// sin16_C: piecewise-linear sine approximation.  The quarter wave is
+//         split into 8 sections, each with a base value and a slope.
+//         Bit 14 of theta mirrors the offset within the quarter wave
+//         and bit 15 negates the result.
+LIB8STATIC int16_t sin16_C( uint16_t theta )
+{
+    static const uint16_t sectionBase[] =
+    { 0, 6393, 12539, 18204, 23170, 27245, 30273, 32137 };
+    static const uint8_t sectionSlope[] =
+    { 49, 48, 44, 38, 31, 23, 14, 4 };
+    
+    const bool mirrored = (theta & 0x4000) != 0;
+    const bool negative = (theta & 0x8000) != 0;
+    
+    // Position within the quarter wave, 0..2047
+    uint16_t offset = (theta & 0x3FFF) >> 3;
+    if( mirrored ) {
+        offset = 2047 - offset;
+    }
+    
+    // Which of the 8 sections (0..7), and how far into it (0..127)
+    const uint8_t section    = offset / 256;
+    const uint8_t secoffset8 = (uint8_t)(offset) / 2;
+    
+    // y = base + slope * offset-in-section
+    const uint16_t rise = sectionSlope[section] * secoffset8;
+    const int16_t  y    = rise + sectionBase[section];
+    
+    return negative ? -y : y;
+}
+
+// cos16: cosine as sin16 of the angle advanced by 16384
+//         (one quarter of the 0-65535 range).
+LIB8STATIC int16_t cos16( uint16_t theta)
+{
+    const uint16_t shifted = theta + 16384;
+    return sin16( shifted);
+}
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// memmove8, memcpy8, and memset8:
+//   alternatives to memmove, memcpy, and memset that are
+//   faster on AVR than standard avr-libc 1.8
+
+#if defined(__AVR__)
+// On AVR these are declared here with C linkage; the definitions are
+// not in this header.  memcpy8 and memset8 are marked noinline.
+extern "C" {
+void * memmove8( void * dst, const void * src, uint16_t num );
+void * memcpy8 ( void * dst, const void * src, uint16_t num )  __attribute__ ((noinline));
+void * memset8 ( void * ptr, uint8_t value, uint16_t num ) __attribute__ ((noinline)) ;
+}
+#else
+// on non-AVR platforms, these names just call standard libc.
+#define memmove8 memmove
+#define memcpy8 memcpy
+#define memset8 memset
+#endif
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// linear interpolation, such as could be used for Perlin noise, etc.
+//
+
+// linear interpolation between two unsigned 8-bit values,
+// with 8-bit fraction
+//
+// Returns a value 'frac'/256 of the way from a to b.  The b > a and
+// b <= a cases are handled separately so that the difference is
+// always a non-negative 8-bit value; computing (b - a) directly in a
+// uint8_t wraps around when b < a and produces a wrong result.
+LIB8STATIC uint8_t lerp8by8( uint8_t a, uint8_t b, fract8 frac)
+{
+    uint8_t result;
+    if( b > a) {
+        // moving upward from a toward b
+        uint8_t delta = b - a;
+        uint8_t scaled = scale8( delta, frac);
+        result = a + scaled;
+    } else {
+        // moving downward from a toward b
+        uint8_t delta = a - b;
+        uint8_t scaled = scale8( delta, frac);
+        result = a - scaled;
+    }
+    return result;
+}
+
+// linear interpolation between two unsigned 16-bit values,
+// with 16-bit fraction
+//
+// Returns a value 'frac'/65536 of the way from a to b.  The b > a and
+// b <= a cases are handled separately so that the difference is
+// always a non-negative 16-bit value; computing (b - a) directly in a
+// uint16_t wraps around when b < a and produces a wrong result.
+LIB8STATIC uint16_t lerp16by16( uint16_t a, uint16_t b, fract16 frac)
+{
+    uint16_t result;
+    if( b > a) {
+        // moving upward from a toward b
+        uint16_t delta = b - a;
+        uint32_t prod = (uint32_t)delta * (uint32_t)frac;
+        uint16_t scaled = prod >> 16;
+        result = a + scaled;
+    } else {
+        // moving downward from a toward b
+        uint16_t delta = a - b;
+        uint32_t prod = (uint32_t)delta * (uint32_t)frac;
+        uint16_t scaled = prod >> 16;
+        result = a - scaled;
+    }
+    return result;
+}
+
+
+// A note on the structure of lerp16by8 (and lerp15by8) :
+// The cases for b>a and b<=a are handled separately for
+// speed: without knowing the relative order of a and b,
+// the value (a-b) might be a signed 17-bit value, which
+// would have to be stored in a 32-bit signed int and
+// processed as such.  To avoid that, we separate the
+// two cases, and are able to do all the math with 16-bit
+// unsigned values, which is much faster and smaller on AVR.
+
+// linear interpolation between two unsigned 16-bit values,
+// with 8-bit fraction
+//
+// Returns a value 'frac'/256 of the way from a to b.  The rising and
+// falling cases are kept apart so the difference always fits in an
+// unsigned 16-bit value.
+LIB8STATIC uint16_t lerp16by8( uint16_t a, uint16_t b, fract8 frac)
+{
+    if( b > a) {
+        // rising: add the scaled distance to a
+        const uint16_t step = scale16by8( b - a, frac);
+        return a + step;
+    }
+    
+    // falling (or equal): subtract the scaled distance from a
+    const uint16_t step = scale16by8( a - b, frac);
+    return a - step;
+}
+
+// linear interpolation between two signed 15-bit values,
+// with 8-bit fraction
+//
+// Returns a value 'frac'/256 of the way from a to b.  The rising and
+// falling cases are kept apart so the difference is always handled
+// as an unsigned 16-bit value.
+LIB8STATIC int16_t lerp15by8( int16_t a, int16_t b, fract8 frac)
+{
+    if( b > a) {
+        // rising: add the scaled distance to a
+        const uint16_t step = scale16by8( b - a, frac);
+        return a + step;
+    }
+    
+    // falling (or equal): subtract the scaled distance from a
+    const uint16_t step = scale16by8( a - b, frac);
+    return a - step;
+}
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// easing functions; see http://easings.net
+//
+
+// ease8InOutCubic: 8-bit cubic ease-in / ease-out function
+//                 Computes 3*i^2 - 2*i^3 with i treated as a fraction
+//                 of 256, clamped to 255.
+//                 Takes around 18 cycles on AVR
+LIB8STATIC fract8 ease8InOutCubic( fract8 i)
+{
+    uint8_t ii  = scale8_LEAVING_R1_DIRTY(  i, i);
+    uint8_t iii = scale8_LEAVING_R1_DIRTY( ii, i);
+    
+    // Zero r1 explicitly before any compiler-generated code runs.
+    // Whether the arithmetic below happens to restore it depends on
+    // how the compiler implements the constant multiplies, so it is
+    // not relied upon.
+    cleanup_R1();
+    
+    uint16_t r1 = (3 * (uint16_t)(ii)) - ( 2 * (uint16_t)(iii));
+    
+    uint8_t result = r1;
+    
+    // if we got "256", return 255:
+    if( r1 & 0x100 ) {
+        result = 255;
+    }
+    return result;
+}
+
+// ease8InOutApprox: fast, rough 8-bit ease-in/ease-out function
+//                   shaped approximately like 'ease8InOutCubic',
+//                   it's never off by more than a couple of percent
+//                   from the actual cubic S-curve, and it executes
+//                   more than twice as fast.  Use when the cycles
+//                   are more important than visual smoothness.
+//                   Asm version takes around 7 cycles on AVR.
+
+#if EASE8_C == 1
+// Piecewise-linear S-curve: slope 0.5 for the first and last 64 input
+// values, slope 1.5 for the 128 values in the middle.
+LIB8STATIC fract8 ease8InOutApprox( fract8 i)
+{
+    if( i < 64) {
+        // start with slope 0.5
+        i /= 2;
+    } else if( i > (255 - 64)) {
+        // end with slope 0.5
+        i = 255 - i;
+        i /= 2;
+        i = 255 - i;
+    } else {
+        // in the middle, use slope 192/128 = 1.5
+        i -= 64;
+        i += (i / 2);
+        i += 32;
+    }
+    
+    return i;
+}
+
+#elif EASE8_AVRASM == 1
+// Same piecewise-linear curve as the C version, in AVR assembly.
+LIB8STATIC uint8_t ease8InOutApprox( fract8 i)
+{
+    // takes around 7 cycles on AVR
+    asm volatile (
+        // After subtracting 64, the middle range becomes 0..127; the
+        // start and end ranges both become >= 128 and take the branch.
+        "  subi %[i], 64         \n\t"
+        "  cpi  %[i], 128        \n\t"
+        "  brcc Lshift_%=        \n\t"
+
+        // middle case
+        // i += i/2, then subtracting 224 adds 32 (mod 256)
+        "  mov __tmp_reg__, %[i] \n\t"
+        "  lsr __tmp_reg__       \n\t"
+        "  add %[i], __tmp_reg__ \n\t"
+        "  subi %[i], 224        \n\t"
+        "  rjmp Ldone_%=         \n\t"
+
+        // start or end case
+        // halve, then subtract 96 (mod 256) to undo the earlier offset
+        "Lshift_%=:              \n\t"
+        "  lsr %[i]              \n\t"
+        "  subi %[i], 96         \n\t"
+
+        "Ldone_%=:               \n\t"
+                  
+        : [i] "+a" (i)
+        : 
+        : "r0", "r1"
+        );
+    return i;
+}
+#else
+#error "No implementation for ease8 available."
+#endif
+
+
+
+
+
+#endif
diff --git a/pixeltypes.h b/pixeltypes.h
new file mode 100644
index 00000000..6d3f67f2
--- /dev/null
+++ b/pixeltypes.h
@@ -0,0 +1,659 @@
+#ifndef __INC_PIXELS_H
+#define __INC_PIXELS_H
+
+#include <stdint.h>
+#include "lib8tion.h"
+
+struct CRGB;
+struct CHSV;
+
+// Forward declaration of hsv2rgb_rainbow here,
+// to avoid circular dependencies.
+extern void hsv2rgb_rainbow( const CHSV& hsv, CRGB& rgb);
+
+
+// CHSV: a three-byte hue / saturation / value color.
+// Each component can be reached under several names (e.g. hue / h),
+// and all three can be reached as the byte array raw[3], because the
+// members are overlaid with anonymous unions.
+struct CHSV {
+    union {
+		struct {
+		    union {
+		        uint8_t hue;
+		        uint8_t h; };
+		    union {
+		        uint8_t saturation;
+		        uint8_t sat;
+		        uint8_t s; };
+		    union {
+		        uint8_t value;
+		        uint8_t val;
+		        uint8_t v; };
+		};
+		uint8_t raw[3];
+	};
+
+    // default values are UNINITIALIZED
+    inline CHSV() __attribute__((always_inline))
+    {
+    }
+
+    // allow construction from H, S, V
+    inline CHSV( uint8_t ih, uint8_t is, uint8_t iv) __attribute__((always_inline))
+        : h(ih), s(is), v(iv)
+    {
+    }
+
+    // allow copy construction
+    inline CHSV(const CHSV& rhs) __attribute__((always_inline))
+    {
+        h = rhs.h;
+        s = rhs.s;
+        v = rhs.v;
+    }
+
+    // allow assignment from one HSV struct to another
+    inline CHSV& operator= (const CHSV& rhs) __attribute__((always_inline))
+    {
+        h = rhs.h;
+        s = rhs.s;
+        v = rhs.v;
+        return *this;
+    }
+
+    // allow assignment from H, S, and V; returns *this
+    inline CHSV& setHSV(uint8_t ih, uint8_t is, uint8_t iv) __attribute__((always_inline))
+    {
+        h = ih;
+        s = is;
+        v = iv;
+        return *this;
+    }
+};
+
+
+struct CRGB {
+	union {
+		struct {
+            union {
+                uint8_t r;
+                uint8_t red;
+            };
+            union {
+                uint8_t g;
+                uint8_t green;
+            };
+            union {
+                uint8_t b;
+                uint8_t blue;
+            };
+        };
+		uint8_t raw[3];
+	};
+
+	inline uint8_t& operator[] (uint8_t x) __attribute__((always_inline))
+    {
+        return raw[x];
+    }
+
+    inline const uint8_t& operator[] (uint8_t x) const __attribute__((always_inline))
+    {
+        return raw[x];
+    }
+
+    // default values are UNINITIALIZED
+	inline CRGB() __attribute__((always_inline))
+    {
+    }
+    
+    // allow construction from R, G, B
+    inline CRGB( uint8_t ir, uint8_t ig, uint8_t ib)  __attribute__((always_inline))
+        : r(ir), g(ig), b(ib)
+    {
+    }
+    
+    // allow construction from 32-bit (really 24-bit) bit 0xRRGGBB color code
+    inline CRGB( uint32_t colorcode)  __attribute__((always_inline))
+    : r((colorcode >> 16) & 0xFF), g((colorcode >> 8) & 0xFF), b((colorcode >> 0) & 0xFF)
+    {
+    }
+    
+    // allow copy construction
+	inline CRGB(const CRGB& rhs) __attribute__((always_inline))
+    {
+        r = rhs.r;
+        g = rhs.g;
+        b = rhs.b;
+    }
+    
+    // allow construction from HSV color
+	inline CRGB(const CHSV& rhs) __attribute__((always_inline))
+    {
+        hsv2rgb_rainbow( rhs, *this);
+    }
+
+    // allow assignment from one RGB struct to another
+	inline CRGB& operator= (const CRGB& rhs) __attribute__((always_inline))
+    {
+        r = rhs.r;
+        g = rhs.g;
+        b = rhs.b;
+        return *this;
+    }    
+
+    // allow assignment from 32-bit (really 24-bit) 0xRRGGBB color code
+	inline CRGB& operator= (const uint32_t colorcode) __attribute__((always_inline))
+    {
+        r = (colorcode >> 16) & 0xFF;
+        g = (colorcode >>  8) & 0xFF;
+        b = (colorcode >>  0) & 0xFF;
+        return *this;
+    }
+    
+    // allow assignment from R, G, and B
+	inline CRGB& setRGB (uint8_t nr, uint8_t ng, uint8_t nb) __attribute__((always_inline))
+    {
+        r = nr;
+        g = ng;
+        b = nb;
+        return *this;
+    }
+    
+    // allow assignment from H, S, and V
+	inline CRGB& setHSV (uint8_t hue, uint8_t sat, uint8_t val) __attribute__((always_inline))
+    {
+        hsv2rgb_rainbow( CHSV(hue, sat, val), *this);
+        return *this;
+    }
+    
+    // allow assignment from just a Hue, saturation and value automatically at max.
+	inline CRGB& setHue (uint8_t hue) __attribute__((always_inline))
+    {
+        hsv2rgb_rainbow( CHSV(hue, 255, 255), *this);
+        return *this;
+    }
+    
+    // allow assignment from HSV color
+	inline CRGB& operator= (const CHSV& rhs) __attribute__((always_inline))
+    {
+        hsv2rgb_rainbow( rhs, *this);
+        return *this;
+    }
+    
+    // allow assignment from 32-bit (really 24-bit) 0xRRGGBB color code
+	inline CRGB& setColorCode (uint32_t colorcode) __attribute__((always_inline))
+    {
+        r = (colorcode >> 16) & 0xFF;
+        g = (colorcode >>  8) & 0xFF;
+        b = (colorcode >>  0) & 0xFF;
+        return *this;
+    }
+    
+
+    // add one RGB to another, saturating at 0xFF for each channel
+    inline CRGB& operator+= (const CRGB& rhs )
+    {
+        r = qadd8( r, rhs.r);
+        g = qadd8( g, rhs.g);
+        b = qadd8( b, rhs.b);
+        return *this;
+    }
+    
+    // add a constant to each channel, saturating at 0xFF
+    // this is NOT an operator+= overload because the compiler
+    // can't usefully decide when it's being passed a 32-bit
+    // constant (e.g. CRGB::Red) and an 8-bit one (CRGB::Blue)
+    inline CRGB& addToRGB (uint8_t d )
+    {
+        r = qadd8( r, d);
+        g = qadd8( g, d);
+        b = qadd8( b, d);
+        return *this;
+    }
+    
+    // subtract one RGB from another, saturating at 0x00 for each channel
+    inline CRGB& operator-= (const CRGB& rhs )
+    {
+        r = qsub8( r, rhs.r);
+        g = qsub8( g, rhs.g);
+        b = qsub8( b, rhs.b);
+        return *this;
+    }
+    
+    // subtract a constant from each channel, saturating at 0x00
+    // this is NOT an operator-= overload because the compiler
+    // can't usefully decide when it's being passed a 32-bit
+    // constant (e.g. CRGB::Red) and an 8-bit one (CRGB::Blue)
+    inline CRGB& subtractFromRGB(uint8_t d )
+    {
+        r = qsub8( r, d);
+        g = qsub8( g, d);
+        b = qsub8( b, d);
+        return *this;
+    }
+    
+    // subtract a constant of '1' from each channel, saturating at 0x00
+    inline CRGB& operator-- ()  __attribute__((always_inline))
+    {
+        subtractFromRGB(1);
+        return *this;
+    }
+    
+    // subtract a constant of '1' from each channel, saturating at 0x00
+    inline CRGB operator-- (int DUMMY_ARG)  __attribute__((always_inline))
+    {
+        CRGB retval(*this);
+        --(*this);
+        return retval;
+    }
+
+    // add a constant of '1' to each channel, saturating at 0xFF
+    inline CRGB& operator++ ()  __attribute__((always_inline))
+    {
+        addToRGB(1);
+        return *this;
+    }
+    
+    // add a constant of '1' to each channel, saturating at 0xFF
+    inline CRGB operator++ (int DUMMY_ARG)  __attribute__((always_inline))
+    {
+        CRGB retval(*this);
+        ++(*this);
+        return retval;
+    }
+
+    // divide each of the channels by a constant
+    inline CRGB& operator/= (uint8_t d )
+    {
+        r /= d;
+        g /= d;
+        b /= d;
+        return *this;
+    }
+        
+    // multiply each of the channels by a constant,
+    // saturating each channel at 0xFF
+    inline CRGB& operator*= (uint8_t d )
+    {
+        r = qmul8( r, d);
+        g = qmul8( g, d);
+        b = qmul8( b, d);
+        return *this;
+    }
+
+    // scale down a RGB to N 256ths of it's current brightness, using
+    // 'video' dimming rules, which means that unless the scale factor is ZERO
+    // each channel is guaranteed NOT to dim down to zero.  If it's already
+    // nonzero, it'll stay nonzero, even if that means the hue shifts a little
+    // at low brightness levels.
+    inline CRGB& nscale8_video (uint8_t scaledown )
+    {
+        nscale8x3_video( r, g, b, scaledown);
+        return *this;
+    }
+    
+    // %= is a synonym for nscale8_video.  Think of it as scaling down
+    // by "a percentage"
+    inline CRGB& operator%= (uint8_t scaledown )
+    {
+        nscale8x3_video( r, g, b, scaledown);
+        return *this;
+    }
+
+    // fadeLightBy is a synonym for nscale8_video( ..., 255-fadefactor)
+    inline CRGB& fadeLightBy (uint8_t fadefactor )
+    {
+        nscale8x3_video( r, g, b, 255 - fadefactor);
+        return *this;
+    }
+    
+    // scale down a RGB to N 256ths of its current brightness, using
+    // 'plain math' dimming rules, which means that low light levels
+    // may dim all the way to 100% black.
+    inline CRGB& nscale8 (uint8_t scaledown )
+    {
+        nscale8x3( r, g, b, scaledown);
+        return *this;
+    }
+
+    // fadeToBlackBy is a synonym for nscale8( ..., 255-fadefactor)
+    inline CRGB& fadeToBlackBy (uint8_t fadefactor )
+    {
+        nscale8x3( r, g, b, 255 - fadefactor);
+        return *this;
+    }
+    
+    // "or" operator brings each channel up to the higher of the two values
+    inline CRGB& operator|= (const CRGB& rhs )
+    {
+        if( rhs.r > r) r = rhs.r;
+        if( rhs.g > g) g = rhs.g;
+        if( rhs.b > b) b = rhs.b;
+        return *this;
+    }
+    inline CRGB& operator|= (uint8_t d )
+    {
+        if( d > r) r = d;
+        if( d > g) g = d;
+        if( d > b) b = d;
+        return *this;
+    }
+    
+    // "and" operator brings each channel down to the lower of the two values
+    inline CRGB& operator&= (const CRGB& rhs )
+    {
+        if( rhs.r < r) r = rhs.r;
+        if( rhs.g < g) g = rhs.g;
+        if( rhs.b < b) b = rhs.b;
+        return *this;
+    }
+    inline CRGB& operator&= (uint8_t d )
+    {
+        if( d < r) r = d;
+        if( d < g) g = d;
+        if( d < b) b = d;
+        return *this;
+    }
+    
+    // this allows testing a CRGB for zero-ness
+    inline operator bool() const __attribute__((always_inline))
+    {
+        return r || g || b;
+    }
+    
+    // invert each channel
+    inline CRGB operator- ()
+    {
+        CRGB retval;
+        retval.r = 255 - r;
+        retval.g = 255 - g;
+        retval.b = 255 - b;
+        return retval;
+    }
+    
+    
+    inline uint8_t getLuma ( )  {
+        //Y' = 0.2126 R' + 0.7152 G' + 0.0722 B'
+        //     54            183       18 (!)
+        
+        uint8_t luma = scale8_LEAVING_R1_DIRTY( r, 54) + \
+        scale8_LEAVING_R1_DIRTY( g, 183) + \
+        scale8_LEAVING_R1_DIRTY( b, 18);
+        cleanup_R1();
+        return luma;
+    }
+    
+    inline uint8_t getAverageLight( )  {
+        const uint8_t eightysix = 86;
+        uint8_t avg = scale8_LEAVING_R1_DIRTY( r, eightysix) + \
+        scale8_LEAVING_R1_DIRTY( g, eightysix) + \
+        scale8_LEAVING_R1_DIRTY( b, eightysix);
+        cleanup_R1();
+        return avg;
+    }
+
+    // maximizeBrightness: scale all three channels by the same factor so
+    // that the brightest channel becomes 'limit' (default 255).
+    // A fully black pixel is left unchanged.
+    inline void maximizeBrightness( uint8_t limit = 255 )  {
+        uint8_t max = red;
+        if( green > max) max = green;
+        if( blue > max) max = blue;
+        
+        // stop divide-by-zero when the color is black
+        if( max == 0) return;
+        
+        // factor is limit/max as a fixed-point value with 8 fraction bits
+        uint16_t factor = ((uint16_t)(limit) * 256) / max;
+        red =   (red   * factor) / 256;
+        green = (green * factor) / 256;
+        blue =  (blue  * factor) / 256;
+    }
+    
+    // Named color constants. Each value is a 24-bit code laid out as 0xRRGGBB
+    // (red in the top byte, blue in the bottom byte; compare Red=0xFF0000,
+    // Lime=0x00FF00 and Blue=0x0000FF below).
+    // Some names are aliases for the same value: Aqua/Cyan and Fuchsia/Magenta.
+    typedef enum {
+        AliceBlue=0xF0F8FF,
+        Amethyst=0x9966CC,
+        AntiqueWhite=0xFAEBD7,
+        Aqua=0x00FFFF,
+        Aquamarine=0x7FFFD4,
+        Azure=0xF0FFFF,
+        Beige=0xF5F5DC,
+        Bisque=0xFFE4C4,
+        Black=0x000000,
+        BlanchedAlmond=0xFFEBCD,
+        Blue=0x0000FF,
+        BlueViolet=0x8A2BE2,
+        Brown=0xA52A2A,
+        BurlyWood=0xDEB887,
+        CadetBlue=0x5F9EA0,
+        Chartreuse=0x7FFF00,
+        Chocolate=0xD2691E,
+        Coral=0xFF7F50,
+        CornflowerBlue=0x6495ED,
+        Cornsilk=0xFFF8DC,
+        Crimson=0xDC143C,
+        Cyan=0x00FFFF,
+        DarkBlue=0x00008B,
+        DarkCyan=0x008B8B,
+        DarkGoldenrod=0xB8860B,
+        DarkGray=0xA9A9A9,
+        DarkGreen=0x006400,
+        DarkKhaki=0xBDB76B,
+        DarkMagenta=0x8B008B,
+        DarkOliveGreen=0x556B2F,
+        DarkOrange=0xFF8C00,
+        DarkOrchid=0x9932CC,
+        DarkRed=0x8B0000,
+        DarkSalmon=0xE9967A,
+        DarkSeaGreen=0x8FBC8F,
+        DarkSlateBlue=0x483D8B,
+        DarkSlateGray=0x2F4F4F,
+        DarkTurquoise=0x00CED1,
+        DarkViolet=0x9400D3,
+        DeepPink=0xFF1493,
+        DeepSkyBlue=0x00BFFF,
+        DimGray=0x696969,
+        DodgerBlue=0x1E90FF,
+        FireBrick=0xB22222,
+        FloralWhite=0xFFFAF0,
+        ForestGreen=0x228B22,
+        Fuchsia=0xFF00FF,
+        Gainsboro=0xDCDCDC,
+        GhostWhite=0xF8F8FF,
+        Gold=0xFFD700,
+        Goldenrod=0xDAA520,
+        Gray=0x808080,
+        Green=0x008000,
+        GreenYellow=0xADFF2F,
+        Honeydew=0xF0FFF0,
+        HotPink=0xFF69B4,
+        IndianRed=0xCD5C5C,
+        Indigo=0x4B0082,
+        Ivory=0xFFFFF0,
+        Khaki=0xF0E68C,
+        Lavender=0xE6E6FA,
+        LavenderBlush=0xFFF0F5,
+        LawnGreen=0x7CFC00,
+        LemonChiffon=0xFFFACD,
+        LightBlue=0xADD8E6,
+        LightCoral=0xF08080,
+        LightCyan=0xE0FFFF,
+        LightGoldenrodYellow=0xFAFAD2,
+        LightGreen=0x90EE90,
+        LightGrey=0xD3D3D3,
+        LightPink=0xFFB6C1,
+        LightSalmon=0xFFA07A,
+        LightSeaGreen=0x20B2AA,
+        LightSkyBlue=0x87CEFA,
+        LightSlateGray=0x778899,
+        LightSteelBlue=0xB0C4DE,
+        LightYellow=0xFFFFE0,
+        Lime=0x00FF00,
+        LimeGreen=0x32CD32,
+        Linen=0xFAF0E6,
+        Magenta=0xFF00FF,
+        Maroon=0x800000,
+        MediumAquamarine=0x66CDAA,
+        MediumBlue=0x0000CD,
+        MediumOrchid=0xBA55D3,
+        MediumPurple=0x9370DB,
+        MediumSeaGreen=0x3CB371,
+        MediumSlateBlue=0x7B68EE,
+        MediumSpringGreen=0x00FA9A,
+        MediumTurquoise=0x48D1CC,
+        MediumVioletRed=0xC71585,
+        MidnightBlue=0x191970,
+        MintCream=0xF5FFFA,
+        MistyRose=0xFFE4E1,
+        Moccasin=0xFFE4B5,
+        NavajoWhite=0xFFDEAD,
+        Navy=0x000080,
+        OldLace=0xFDF5E6,
+        Olive=0x808000,
+        OliveDrab=0x6B8E23,
+        Orange=0xFFA500,
+        OrangeRed=0xFF4500,
+        Orchid=0xDA70D6,
+        PaleGoldenrod=0xEEE8AA,
+        PaleGreen=0x98FB98,
+        PaleTurquoise=0xAFEEEE,
+        PaleVioletRed=0xDB7093,
+        PapayaWhip=0xFFEFD5,
+        PeachPuff=0xFFDAB9,
+        Peru=0xCD853F,
+        Pink=0xFFC0CB,
+        Plaid=0xCC5533,
+        Plum=0xDDA0DD,
+        PowderBlue=0xB0E0E6,
+        Purple=0x800080,
+        Red=0xFF0000,
+        RosyBrown=0xBC8F8F,
+        RoyalBlue=0x4169E1,
+        SaddleBrown=0x8B4513,
+        Salmon=0xFA8072,
+        SandyBrown=0xF4A460,
+        SeaGreen=0x2E8B57,
+        Seashell=0xFFF5EE,
+        Sienna=0xA0522D,
+        Silver=0xC0C0C0,
+        SkyBlue=0x87CEEB,
+        SlateBlue=0x6A5ACD,
+        SlateGray=0x708090,
+        Snow=0xFFFAFA,
+        SpringGreen=0x00FF7F,
+        SteelBlue=0x4682B4,
+        Tan=0xD2B48C,
+        Teal=0x008080,
+        Thistle=0xD8BFD8,
+        Tomato=0xFF6347,
+        Turquoise=0x40E0D0,
+        Violet=0xEE82EE,
+        Wheat=0xF5DEB3,
+        White=0xFFFFFF,
+        WhiteSmoke=0xF5F5F5,
+        Yellow=0xFFFF00,
+        YellowGreen=0x9ACD32
+    } HTMLColorCode;
+    // Class-wide value shared by all CRGB objects. Its single definition lives
+    // in FastLED.cpp, where it is initialized from digits of the compile-time
+    // __TIME__ string.
+    static uint32_t Squant;
+};
+
+
+// Equality: two colors are equal only when all three channels match exactly.
+inline __attribute__((always_inline)) bool operator== (const CRGB& lhs, const CRGB& rhs)
+{
+    if( lhs.r != rhs.r) return false;
+    if( lhs.g != rhs.g) return false;
+    return lhs.b == rhs.b;
+}
+
+// Inequality: the exact negation of operator== (any channel differs).
+inline __attribute__((always_inline)) bool operator!= (const CRGB& lhs, const CRGB& rhs)
+{
+    if( lhs == rhs) return false;
+    return true;
+}
+
+// Less-than by total light: compares the sum of the three channels of each
+// color, not the individual channels. Two different colors with the same
+// channel sum are therefore neither < nor > each other.
+inline __attribute__((always_inline)) bool operator< (const CRGB& lhs, const CRGB& rhs)
+{
+    const uint16_t leftTotal  = lhs.r + lhs.g + lhs.b;
+    const uint16_t rightTotal = rhs.r + rhs.g + rhs.b;
+    return leftTotal < rightTotal;
+}
+
+// Greater-than by total light: true when the channel sum of lhs exceeds the
+// channel sum of rhs (individual channels are not compared).
+inline __attribute__((always_inline)) bool operator> (const CRGB& lhs, const CRGB& rhs)
+{
+    const uint16_t leftTotal  = lhs.r + lhs.g + lhs.b;
+    const uint16_t rightTotal = rhs.r + rhs.g + rhs.b;
+    return rightTotal < leftTotal;
+}
+
+// Greater-or-equal by total light: true when the channel sum of lhs is not
+// smaller than the channel sum of rhs.
+inline __attribute__((always_inline)) bool operator>= (const CRGB& lhs, const CRGB& rhs)
+{
+    const uint16_t leftTotal  = lhs.r + lhs.g + lhs.b;
+    const uint16_t rightTotal = rhs.r + rhs.g + rhs.b;
+    return !(leftTotal < rightTotal);
+}
+
+// Less-or-equal by total light: true when the channel sum of lhs is not
+// larger than the channel sum of rhs.
+inline __attribute__((always_inline)) bool operator<= (const CRGB& lhs, const CRGB& rhs)
+{
+    const uint16_t leftTotal  = lhs.r + lhs.g + lhs.b;
+    const uint16_t rightTotal = rhs.r + rhs.g + rhs.b;
+    return !(leftTotal > rightTotal);
+}
+
+
+// Add two colors channel by channel and return the result as a new CRGB.
+// Each channel pair is combined with qadd8.
+// NOTE(review): qadd8 is presumably a saturating 8-bit add - confirm.
+__attribute__((always_inline))
+inline CRGB operator+( const CRGB& p1, const CRGB& p2)
+{
+    CRGB sum( qadd8( p1.r, p2.r),
+              qadd8( p1.g, p2.g),
+              qadd8( p1.b, p2.b));
+    return sum;
+}
+
+// Subtract p2 from p1 channel by channel and return the result as a new CRGB.
+// Each channel pair is combined with qsub8.
+// NOTE(review): qsub8 is presumably a saturating 8-bit subtract - confirm.
+__attribute__((always_inline))
+inline CRGB operator-( const CRGB& p1, const CRGB& p2)
+{
+    CRGB difference( qsub8( p1.r, p2.r),
+                     qsub8( p1.g, p2.g),
+                     qsub8( p1.b, p2.b));
+    return difference;
+}
+
+// Multiply every channel of p1 by the scalar d and return the result as a
+// new CRGB. Each channel is multiplied with qmul8.
+// NOTE(review): qmul8 is presumably a saturating 8-bit multiply - confirm.
+__attribute__((always_inline))
+inline CRGB operator*( const CRGB& p1, uint8_t d)
+{
+    CRGB product( qmul8( p1.r, d),
+                  qmul8( p1.g, d),
+                  qmul8( p1.b, d));
+    return product;
+}
+
+// Divide every channel of p1 by the scalar d (integer division) and return
+// the result as a new CRGB.
+// d must be non-zero: there is no guard here, and integer division by zero
+// is undefined behavior.
+__attribute__((always_inline))
+inline CRGB operator/( const CRGB& p1, uint8_t d)
+{
+    CRGB quotient( p1.r/d, p1.g/d, p1.b/d);
+    return quotient;
+}
+
+    
+// "and" of two colors: a new CRGB holding, for each channel, the smaller of
+// the two inputs (per-channel minimum, not a bitwise AND).
+__attribute__((always_inline))
+inline CRGB operator&( const CRGB& p1, const CRGB& p2)
+{
+    CRGB lowest( p1);
+    // CRGB::operator&= lowers each channel to the smaller of the two values
+    lowest &= p2;
+    return lowest;
+}
+    
+// "or" of two colors: a new CRGB holding, for each channel, the larger of
+// the two inputs (per-channel maximum, not a bitwise OR).
+__attribute__((always_inline))
+inline CRGB operator|( const CRGB& p1, const CRGB& p2)
+{
+    CRGB highest( p1);
+    // CRGB::operator|= raises each channel to the larger of the two values
+    highest |= p2;
+    return highest;
+}
+
+// Return a copy of p1 scaled by d via CRGB::nscale8_video; p1 itself is not
+// modified.
+// NOTE(review): presumably d is a brightness fraction in 1/256ths - confirm
+// against nscale8_video.
+__attribute__((always_inline))
+inline CRGB operator%( const CRGB& p1, uint8_t d)
+{
+    CRGB scaled( p1);
+    scaled.nscale8_video( d);
+    return scaled;
+}
+
+
+
+// Define RGB orderings
+// Each value is an octal literal (note the leading 0), so every digit
+// occupies its own 3-bit field. The three digits are a permutation of 0, 1
+// and 2 that follows the letters of the name with R=0, G=1, B=2
+// (e.g. GRB = 1,0,2).
+// NOTE(review): presumably each digit is the index of the channel sent in
+// that position, extracted by shift/mask elsewhere - confirm.
+enum EOrder {
+	RGB=0012,
+	RBG=0021,
+	GRB=0102,
+	GBR=0120,
+	BRG=0201,
+	BGR=0210
+};
+
+
+#endif
diff --git a/preview_changes.txt b/preview_changes.txt
new file mode 100644
index 00000000..9b1a8831
--- /dev/null
+++ b/preview_changes.txt
@@ -0,0 +1,57 @@
+Release Candidate 5
+* Gemma and Trinket: supported except for global "setBrightness"
+
+Release Candidate 4
+* Added NEOPIXEL as a synonym for WS2811
+* Fix WS2811/WS2812B timings, bring it in line to exactly 1.25us/bit.
+* Fix handling of constant color definitions (damn you, gcc!)
+
+Release Candidate 3
+* Fixed bug when Clock and Data were on the same port
+* Added ability to set pixel color directly from HSV
+* Added ability to retrieve current random16 seed
+
+Release Candidate 2 
+* mostly bug fixes
+* Fix SPI macro definitions for latest teensy3 software update
+* Teensy 2 compilation fix
+* hsv2rgb_rainbow performance fix
+
+Release Candidate 1
+* New unified/simplified API for adding/using controllers
+* flesh out clockless chip support
+* add hsv (spectrum and rainbow style colors)
+* high speed memory management operations
+* library for interpolation/easing functions
+* various api changes, addition of clear and showColor functions
+* scale value applied to all show methods 
+* bug fixes for SM16716
+* performance improvements, lpd8806 exceeds 22Mbit now
+* hardware def fixes
+* allow alternate rgb color orderings
+* high speed math methods
+* rich CRGB structure
+
+Preview 3
+* True hardware SPI support for teensy (up to 20Mbit output!)
+* Minor bug fixes/tweaks
+
+Preview 2
+* Rename pin class to FastPin
+* Replace latch with select, more accurate description of what it does
+* Enforce intra-frame timing for ws2801s
+* SM16716 support
+* Add #define FAST_SPI_INTERRUPTS_WRITE_PINS to make sure world is ok w/interrupts and SPI
+* Add #define FORCE_SOFTWARE_SPI for those times when you absolutely don't want to use hardware SPI,
+  even if you're using the hardware SPI pins
+* Add pin definitions for the arduino megas - should fix ws2811 support
+* Add pin definitions for the leonardo - should fix spi support and pin mappings
+* Add warnings when pin definitions are missing
+* Added google+ community for fastspi users - https://plus.google.com/communities/109127054924227823508
+* Add pin definitions for Teensy++ 2.0
+
+
+Preview 1
+* Initial release
+
+
author	Daniel Garcia <danielgarcia@gmail.com>	2013-11-11 02:54:41 +0400
committer	Daniel Garcia <danielgarcia@gmail.com>	2013-11-11 02:54:41 +0400
commit	6bcfa714588b12a72bdde36a1f0a43871fd5d567 (patch)
tree	74c4fadde71b107f9a823928602141d673c0d9b5
parent	e325d5d3f934aed2b301c224352b41a1d07e3693 (diff)
parent	59edcab79837185feeea2dfe6f46b2c4ad17b8d8 (diff)