cygwin.com/git/newlib-cygwin.git
Diffstat (limited to 'newlib/libc/machine/h8300/memcpy.S')

 newlib/libc/machine/h8300/memcpy.S | 99 ++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+), 0 deletions(-)
diff --git a/newlib/libc/machine/h8300/memcpy.S b/newlib/libc/machine/h8300/memcpy.S
index 305e865df..6af5a9922 100644
--- a/newlib/libc/machine/h8300/memcpy.S
+++ b/newlib/libc/machine/h8300/memcpy.S
@@ -2,6 +2,104 @@
#include "defines.h"
+#ifdef __H8300SX__
+
+ .global _memcpy
+_memcpy:
+ stm.l er4-er6,@-er7
+
+ ; Set up source and destination pointers for movmd.
+ mov.l er0,er6
+ mov.l er1,er5
+
+ ; See whether the copy is long enough to use the movmd.l code.
+ ; Although the code can handle anything longer than 6 bytes,
+ ; it can be more expensive than movmd.b for small moves.
+ ; It's better to use a higher threshold to account for this.
+ ;
+ ; Note that the exact overhead of the movmd.l checks depends on
+ ; the alignments of the length and pointers. They are faster when
+ ; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
+ ; are 0. This threshold is a compromise between the various cases.
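+ ; (LEN() is presumably the defines.h macro that selects the
+ ; right-sized length register: r2 with 16-bit size_t in normal
+ ; mode, er2 in advanced mode.)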
+ cmp #16,LEN(r2)
+ blo simple
+
+ ; movmd.l only works for even addresses. If one of the addresses
+ ; is odd and the other is not, fall back on a simple move.
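+ ; (bld loads bit 0 of r5l into the carry flag; bxor then XORs carry
+ ; with bit 0 of r6l, so carry ends up set iff the two addresses
+ ; have different parities.)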
+ bld #0,r5l
+ bxor #0,r6l
+ bcs simple
+
+ ; Make the addresses even.
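+ ; (The post-increment operands advance er5 and er6 together, keeping
+ ; both pointers set up for the movmd instructions below.)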
+ bld #0,r5l
+ bcc word_aligned
+ mov.b @er5+,@er6+
+ sub #1,LEN(r2)
+
+word_aligned:
+ ; See if copying one word would make the first operand longword
+ ; aligned. Although this is only really worthwhile if it aligns
+ ; the second operand as well, it's no worse if it doesn't, so it
+ ; hardly seems worth the overhead of a "band" check.
+ bld #1,r6l
+ bcc fast_copy
+ mov.w @er5+,@er6+
+ sub #2,LEN(r2)
+
+fast_copy:
+ ; Set (e)r4 to the number of longwords to copy.
+ mov LEN(r2),LEN(r4)
+ shlr #2,LEN(r4)
+
+#ifdef __NORMAL_MODE__
+ ; 16-bit pointers and size_t values: one movmd.l is enough. This
+ ; code is never reached with r4 == 0, since the length here is
+ ; still at least 13.
+ movmd.l
+ and.w #3,r2
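+ ; Fall through: the remaining 0-3 bytes are handled by the simple
+ ; code below.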
+simple:
+ mov.w r2,r4
+ beq quit
+ movmd.b
+quit:
+ rts/l er4-er6
+#else
+ ; Skip the first iteration if the number of longwords is divisible
+ ; by 0x10000.
+ mov.w r4,r4
+ beq fast_loop_next
+
+ ; This loop copies r4 (!= 0) longwords the first time round and 65536
+ ; longwords on each iteration after that.
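+ ; (movmd.l copies r4 longwords from @er5+ to @er6+ and leaves r4 at
+ ; zero, so the count of zero on later iterations means a full 65536;
+ ; e4 is the upper 16-bit half of er4.)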
+fast_loop:
+ movmd.l
+fast_loop_next:
+ sub.w #1,e4
+ bhs fast_loop
+
+ ; Mop up any left-over bytes. We could just branch to the simple
+ ; code after the "and", but the version below is quicker and only
+ ; takes 10 more bytes.
+ and.w #3,r2
+ beq quit
+ mov.w r2,r4
+ movmd.b
+quit:
+ rts/l er4-er6
+
+simple:
+ ; Simple bytewise copy. We need to handle all lengths, including zero.
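+ ; The loop uses the same 16-bit wrap-around trick as fast_loop: r4
+ ; holds the low 16 bits of the byte count and e2 the high 16 bits,
+ ; so copies of 64k bytes or more still work.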
+ mov.w r2,r4
+ beq simple_loop_next
+simple_loop:
+ movmd.b
+simple_loop_next:
+ sub.w #1,e2
+ bhs simple_loop
+ rts/l er4-er6
+#endif
+
+#else
+
.global _memcpy
_memcpy:
; MOVP @(2/4,r7),A0P ; dst
@@ -48,3 +146,4 @@ byteloop:
; return with A0 pointing to dst
quit: rts
+#endif
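
For readers who do not speak H8 assembly, below is a minimal C sketch of
the copy strategy the __H8300SX__ fast path implements: fall back to a
bytewise copy for short or parity-mismatched buffers, make the pointers
even, longword-align the destination, bulk-copy in longwords, then mop
up the 0-3 leftover bytes. The name sketch_memcpy and the byte-at-a-time
inner loops are illustrative only: on the real hardware the bulk and
bytewise loops are single movmd.l/movmd.b instructions, and the 16-bit
counter wrap-around trick is omitted because C's size_t covers the full
length directly.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

void *sketch_memcpy(void *dst0, const void *src0, size_t len)
{
    unsigned char *dst = dst0;
    const unsigned char *src = src0;

    /* Short copies, and copies whose pointers differ in parity,
       go straight to the bytewise loop ("simple" in the assembly). */
    if (len >= 16 && (((uintptr_t)dst ^ (uintptr_t)src) & 1) == 0) {
        /* Make both addresses even (their parities match here). */
        if ((uintptr_t)dst & 1) {
            *dst++ = *src++;
            len -= 1;
        }
        /* Copy one word if that longword-aligns the destination. */
        if ((uintptr_t)dst & 2) {
            *dst++ = *src++;
            *dst++ = *src++;
            len -= 2;
        }
        /* Bulk copy: each iteration stands in for one movmd.l step.
           Byte moves keep the sketch portable; movmd.l itself does
           real 4-byte transfers from any even address. */
        for (size_t n = len >> 2; n > 0; n--) {
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = *src++;
        }
        len &= 3;               /* 0-3 bytes left over */
    }
    /* Bytewise copy of whatever remains (movmd.b in the assembly). */
    while (len--)
        *dst++ = *src++;
    return dst0;
}

/* Quick self-check over assorted lengths and pointer parities. */
int main(void)
{
    unsigned char src[64], dst[64];
    for (int i = 0; i < 64; i++)
        src[i] = (unsigned char)i;
    for (size_t off = 0; off < 4; off++)
        for (size_t n = 0; n + off + 4 <= sizeof dst; n++) {
            memset(dst, 0xee, sizeof dst);
            sketch_memcpy(dst + off, src + 4, n);
            assert(memcmp(dst + off, src + 4, n) == 0);
        }
    puts("ok");
    return 0;
}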