The moon. The mother fucking moon. Working dithering, RGB adjust, inline scale8_video on an 8Mhz, hardware-mul-less -trinket-

author: Daniel Garcia <danielgarcia@gmail.com> 2014-02-19 08:37:09 +0400
committer: Daniel Garcia <danielgarcia@gmail.com> 2014-02-19 08:37:09 +0400
commit: 4ff103d94afcc33afc389dcd31537f15b98ae64a (patch)
tree: b5231757b3032ac6de990fa4db36b987b105cd2c /clockless_trinket.h
parent: 902bf544a44ad0ec5bd35c6b462c0f4b2e7da5c9 (diff)
1 files changed, 72 insertions, 39 deletions
diff --git a/clockless_trinket.h b/clockless_trinket.h
index 45eea2de..52831296 100644
--- a/clockless_trinket.h
+++ b/clockless_trinket.h
@@ -11,6 +11,10 @@
 // Scaling macro choice
 #ifndef TRINKET_SCALE
 #define TRINKET_SCALE 1
+// whether or not to use dithering
+#define DITHER 1
+// whether or not to enable scale_video adjustments
+#define SCALE_VIDEO 1
 #endif
 
 // Variations on the functions in delay.h - w/a loop var passed in to preserve registers across calls by the optimizer/compiler
@@ -87,11 +91,13 @@ public:
 	}
 
 #define USE_ASM_MACROS
-				
+	
+// The variables that our various asm statemetns use.  The same block of variables needs to be declared for
+// all the asm blocks because GCC is pretty stupid and it would clobber variables happily or optimize code away too aggressively			
 #define ASM_VARS : /* write variables */				\
-				[b0] "+r" (b0),							\
-				[b1] "+r" (b1),							\
-				[b2] "+r" (b2),							\
+				[b0] "+a" (b0),							\
+				[b1] "+a" (b1),							\
+				[b2] "+a" (b2),							\
 				[count] "+x" (count),					\
 				[scale_base] "+a" (scale_base),			\
 				[data] "+z" (data),						\
@@ -106,7 +112,10 @@ public:
 				[d0] "r" (d0),							\
 				[d1] "r" (d1),							\
 				[d2] "r" (d2),							\
-				[PORT] "M" (FastPin<DATA_PIN>::port()-0x20),		\
+				[e0] "r" (e0),							\
+				[e1] "r" (e1),							\
+				[e2] "r" (e2),							\
+				[PORT] "M" (FastPin<DATA_PIN>::port()-0x20),			\
 				[O0] "M" (RGB_BYTE0(RGB_ORDER)),		\
 				[O1] "M" (RGB_BYTE1(RGB_ORDER)),		\
 				[O2] "M" (RGB_BYTE2(RGB_ORDER))		\
@@ -114,27 +123,31 @@ public:
 
 
 // 1 cycle, write hi to the port
-#define HI1 asm __volatile__("out 0x02, %[hi]" ASM_VARS );
+#define HI1 asm __volatile__("out %[PORT], %[hi]" ASM_VARS );
 // 1 cycle, write lo to the port
-#define LO1 asm __volatile__("out 0x02, %[lo]" ASM_VARS );
+#define LO1 asm __volatile__("out %[PORT], %[lo]" ASM_VARS );
 // 2 cycles, sbrs on flipping the lne to lo if we're pushing out a 0
 #define QLO2(B, N) asm __volatile__("sbrs %[" #B "], " #N ASM_VARS ); LO1;
 // load a byte from ram into the given var with the given offset
 #define LD2(B,O) asm __volatile__("ldd %[" #B "], Z + %[" #O "]" ASM_VARS );
 // 3 cycles - load a byte from ram into the scaling scratch space with the given offset, clear the target var
 #define LDSCL3(B,O) asm __volatile__("ldd %[scale_base], Z + %[" #O "]\n\tclr %[" #B "]" ASM_VARS );
-// 2 cycles - increment the data pointer
-#define IDATA2 asm __volatile__("add %A[data], %A[ADV]\n\tadc %B[data], %B[ADV]"  ASM_VARS );
-// 2 cycles - decrement the counter
-#define DCOUNT2 asm __volatile__("sbiw %[count], 1" ASM_VARS );
-// 2 cycles - jump to the beginning of the loop
-#define JMPLOOP2 asm __volatile__("rjmp 1b" ASM_VARS );
-// 2 cycles - jump out of the loop
-#define BRLOOP1 asm __volatile__("breq 2f" ASM_VARS );
 // 2 cycles - perform one step of the scaling (if a given bit is set in scale, add scale-base to the scratch space)
 #define SCALE02(B, N) asm __volatile__("sbrc %[s0], " #N "\n\tadd %[" #B "], %[scale_base]" ASM_VARS );
 #define SCALE12(B, N) asm __volatile__("sbrc %[s1], " #N "\n\tadd %[" #B "], %[scale_base]" ASM_VARS );
 #define SCALE22(B, N) asm __volatile__("sbrc %[s2], " #N "\n\tadd %[" #B "], %[scale_base]" ASM_VARS );
+
+// apply dithering value  before we do anything with scale_base
+#define PRESCALE4(D) if(DITHER) { asm __volatile__("cpse %[scale_base], __zero_reg__\n\t add %[scale_base],%[" #D "]\n\tbrcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\t" ASM_VARS); } \
+				      else { _dc<4>(loopvar); }
+
+// Do a (rough) approximation of the nscale8_video jump
+#if (SCALE_VIDEO == 1) 
+#define VIDADJ2(B) asm __volatile__("cpse %[scale_base], __zero_reg__\n\tsubi %[" #B "], 0xFF\n\t" ASM_VARS);
+#else
+#define VIDADJ2(B) asm __volatile__("rjmp .+0" ASM_VARS);
+#endif
+
 // 1 cycle - rotate right, pulling in from carry
 #define ROR1(B) asm __volatile__("ror %[" #B "]" ASM_VARS );
 // 1 cycle, clear the carry bit
@@ -149,18 +162,32 @@ public:
 #define SCROR04(B, N) SCALE02(B,N) ROR1(B) CLC1
 #define SCROR14(B, N) SCALE12(B,N) ROR1(B) CLC1
 #define SCROR24(B, N) SCALE22(B,N) ROR1(B) CLC1
+
+/////////////////////////////////////////////////////////////////////////////////////
+// Loop life cycle
+
+// #define ADJUST_DITHER  d0 += DADVANCE; d1 += DADVANCE; d2 += DADVANCE; d0 &= e0; d1 &= e1; d2 &= d2; 
+#define ADJDITHER2(D, E) D += DADVANCE; D &= E;
+// #define xstr(a) str(a)
+// #define str(a) #a
+// #define ADJDITHER2(D,E) asm __volatile__("subi %[" #D "], " xstr(DUSE) "\n\tand %[" #D "], %[" #E "]\n\t" ASM_VARS);
+
 // define the beginning of the loop
-#define LOOP asm __volatile__("1:" ASM_VARS ); d0 += DADVANCE; d1 += DADVANCE; d2 += DADVANCE; d0 &= e0; d1 &= e1; d2 &= d2; 
+#define LOOP asm __volatile__("1:" ASM_VARS );
+// define the end of the loop
 #define DONE asm __volatile__("2:" ASM_VARS );
-// delay time
+
+// 2 cycles - increment the data pointer
+#define IDATA2 asm __volatile__("add %A[data], %A[ADV]\n\tadc %B[data], %B[ADV]"  ASM_VARS );
+// 2 cycles - decrement the counter
+#define DCOUNT2 asm __volatile__("sbiw %[count], 1" ASM_VARS );
+// 2 cycles - jump to the beginning of the loop
+#define JMPLOOP2 asm __volatile__("rjmp 1b" ASM_VARS );
+// 2 cycles - jump out of the loop
+#define BRLOOP1 asm __volatile__("breq 2f" ASM_VARS );
+
 #define DADVANCE 3
-#define DITHER 1
-#define PRESCALE0_4() if(DITHER) { asm __volatile__("cpse %[scale_base], r1\n\t add %[scale_base],%[d0]\n\tbrcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\t" ASM_VARS); } \
-				      else { _dc<4>(loopvar); }
-#define PRESCALE1_4() if(DITHER) { asm __volatile__("cpse %[scale_base], r1\n\t add %[scale_base],%[d1]\n\tbrcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\t" ASM_VARS); } \
-				      else { _dc<4>(loopvar); }
-#define PRESCALE2_4() if(DITHER) { asm __volatile__("cpse %[scale_base], r1\n\t add %[scale_base],%[d2]\n\tbrcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\t" ASM_VARS); } \
-				      else { _dc<4>(loopvar); }
+#define DUSE (0xFF - (DADVANCE-1))
 
 	// This method is made static to force making register Y available to use for data on AVR - if the method is non-static, then 
 	// gcc will use register Y for the this pointer.
@@ -173,8 +200,8 @@ public:
 		data_t lo = *port & ~mask;
 		*port = lo;
 
-		uint8_t d0, d1, d2;
-		uint8_t e0, e1, e2;
+		register uint8_t d0, d1, d2;
+		register uint8_t e0, e1, e2;
 		uint8_t s0, s1, s2;
 		uint8_t b0, b1, b2;
 		static uint8_t d[3] = {0,0,0};
@@ -182,6 +209,7 @@ public:
 		uint16_t count = nLeds;
 		uint8_t scale_base = 0;
 		uint16_t advanceBy = advance ? (skip+3) : 0;
+		// uint8_t dadv = DADVANCE;
 
 		// initialize the scales
 		s0 = scale.raw[B0];
@@ -190,24 +218,29 @@ public:
 
 		// initialize the e & d values
 		uint8_t S;
-		S = s0; e0 = 0xFF; while(s0 >>= 1) { e0 >>= 1; }
+		S = s0; e0 = 0xFF; while(S >>= 1) { e0 >>= 1; }
 		d0 = d[0] & e0;
-		S = s1; e1 = 0xFF; while(s1 >>= 1) { e1 >>= 1; }
+		S = s1; e1 = 0xFF; while(S >>= 1) { e1 >>= 1; }
 		d1 = d[1] & e1;
-		S = s2; e2 = 0xFF; while(s2 >>= 1) { e2 >>= 1; }
+		S = s2; e2 = 0xFF; while(S >>= 1) { e2 >>= 1; }
 		d2 = d[2] & e0;
 
 		b0 = data[RGB_BYTE0(RGB_ORDER)];
 		if(DITHER && b0) { b0 = qadd8(b0, d0); }
-		b0 = scale8(b0, scale);
+		b0 = scale8_video(b0, s0);
 		b1 = 0;
 		b2 = 0;
 		register uint8_t loopvar=0;
 
 		{
 			{
-				/* asm */
-				LOOP
+				// Loop beginning, does some stuff that's outside of the pixel write cycle, namely incrementing d0-2 and masking off
+				// by the E values (see the definition )
+				LOOP; 
+				ADJDITHER2(d0,e0)
+				ADJDITHER2(d1,e1) 
+				ADJDITHER2(d2,e2) 
+				VIDADJ2(b0);
 				// Sum of the clock counts across each row should be 10 for 8Mhz, WS2811
 				// The values in the D1/D2/D3 indicate how many cycles the previous column takes
 				// to allow things to line back up.
@@ -219,23 +252,23 @@ public:
 #if TRINKET_SCALE
 				// Inline scaling - RGB ordering
 				HI1 D1(1) QLO2(b0, 7) LDSCL3(b1,O1) 	D2(3)	LO1					D3(0)	
-				HI1	D1(1) QLO2(b0, 6) PRESCALE1_4()		D2(4)	LO1	SCALE12(b1,0)	D3(2)		
+				HI1	D1(1) QLO2(b0, 6) PRESCALE4(d1)		D2(4)	LO1	SCALE12(b1,0)	D3(2)		
 				HI1 D1(1) QLO2(b0, 5) RORSC14(b1,1) 	D2(4)	LO1 ROR1(b1) CLC1	D3(2)
 				HI1 D1(1) QLO2(b0, 4) SCROR14(b1,2)		D2(4)	LO1 SCALE12(b1,3)	D3(2)			
 				HI1 D1(1) QLO2(b0, 3) RORSC14(b1,4) 	D2(4)	LO1 ROR1(b1) CLC1	D3(2)			
 				HI1 D1(1) QLO2(b0, 2) SCROR14(b1,5) 	D2(4)	LO1 SCALE12(b1,6)	D3(2)			
 				HI1 D1(1) QLO2(b0, 1) RORSC14(b1,7) 	D2(4)	LO1 ROR1(b1) CLC1	D3(2)		
-				HI1 D1(1) QLO2(b0, 0) 				 	D2(0)	LO1 				D3(0)			
-				HI1 D1(1) QLO2(b1, 7) LDSCL3(b2,O1) 	D2(3)	LO1					D3(0)	
-				HI1	D1(1) QLO2(b1, 6) PRESCALE2_4()		D2(4)	LO1	SCALE22(b2,0)	D3(2)		
+				HI1 D1(1) QLO2(b0, 0) 				 	D2(0)	LO1 VIDADJ2(b1)		D3(2)			
+				HI1 D1(1) QLO2(b1, 7) LDSCL3(b2,O2) 	D2(3)	LO1					D3(0)	
+				HI1	D1(1) QLO2(b1, 6) PRESCALE4(d2)		D2(4)	LO1	SCALE22(b2,0)	D3(2)		
 				HI1 D1(1) QLO2(b1, 5) RORSC24(b2,1) 	D2(4)	LO1 ROR1(b2) CLC1	D3(2)
 				HI1 D1(1) QLO2(b1, 4) SCROR24(b2,2)		D2(4)	LO1 SCALE22(b2,3)	D3(2)	
 				HI1 D1(1) QLO2(b1, 3) RORSC24(b2,4) 	D2(4)	LO1 ROR1(b2) CLC1	D3(2)	
 				HI1 D1(1) QLO2(b1, 2) SCROR24(b2,5) 	D2(4)	LO1 SCALE22(b2,6)	D3(2)	
 				HI1 D1(1) QLO2(b1, 1) RORSC24(b2,7) 	D2(4)	LO1 ROR1(b2) CLC1	D3(2)
-				HI1 D1(1) QLO2(b1, 0) IDATA2 			D2(2) 	LO1 				D3(0)
-				HI1 D1(1) QLO2(b2, 7) LDSCL3(b2,O1) 	D2(3)	LO1					D3(0)	
-				HI1	D1(1) QLO2(b2, 6) PRESCALE0_4()		D2(4)	LO1	SCALE22(b0,0)	D3(2)		
+				HI1 D1(1) QLO2(b1, 0) IDATA2 			D2(2) 	LO1 VIDADJ2(b2)		D3(0)
+				HI1 D1(1) QLO2(b2, 7) LDSCL3(b0,O0) 	D2(3)	LO1					D3(0)	
+				HI1	D1(1) QLO2(b2, 6) PRESCALE4(d0)		D2(4)	LO1	SCALE22(b0,0)	D3(2)		
 				HI1 D1(1) QLO2(b2, 5) RORSC04(b0,1) 	D2(4)	LO1 ROR1(b0) CLC1	D3(2)
 				HI1 D1(1) QLO2(b2, 4) SCROR04(b0,2)		D2(4)	LO1 SCALE02(b0,3)	D3(2)	
 				HI1 D1(1) QLO2(b2, 3) RORSC04(b0,4) 	D2(4)	LO1 ROR1(b0) CLC1	D3(2)
author	Daniel Garcia <danielgarcia@gmail.com>	2014-02-19 08:37:09 +0400
committer	Daniel Garcia <danielgarcia@gmail.com>	2014-02-19 08:37:09 +0400
commit	4ff103d94afcc33afc389dcd31537f15b98ae64a (patch)
tree	b5231757b3032ac6de990fa4db36b987b105cd2c /clockless_trinket.h
parent	902bf544a44ad0ec5bd35c6b462c0f4b2e7da5c9 (diff)