Diffstat (limited to 'newlib/libc/machine/arm/memcpy-armv7m.S')
-rw-r--r-- | newlib/libc/machine/arm/memcpy-armv7m.S | 321
1 files changed, 321 insertions, 0 deletions
diff --git a/newlib/libc/machine/arm/memcpy-armv7m.S b/newlib/libc/machine/arm/memcpy-armv7m.S
new file mode 100644
index 000000000..8a70c7d73
--- /dev/null
+++ b/newlib/libc/machine/arm/memcpy-armv7m.S
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
+   unaligned access.
+
+   If compiled with GCC, this file should be enclosed within the following
+   pre-processing check:
+   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
+
+   Prototype: void *memcpy (void *dst, const void *src, size_t count);
+
+   The job is done in 5 steps:
+   Step 1: Align the src/dst pointers; if both cannot be aligned, fall
+           back to a misaligned copy.
+   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
+   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
+   Step 4: Copy word by word.
+   Step 5: Copy byte by byte.
+
+   Tunable options:
+     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes.  Defaults to 64.
+     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes.  Defaults to 16.
+ */
+#ifndef __OPT_BIG_BLOCK_SIZE
+#define __OPT_BIG_BLOCK_SIZE (4 * 16)
+#endif
+
+#ifndef __OPT_MID_BLOCK_SIZE
+#define __OPT_MID_BLOCK_SIZE (4 * 4)
+#endif
+
+#if __OPT_BIG_BLOCK_SIZE == 16
+#define BEGIN_UNROLL_BIG_BLOCK \
+        .irp offset, 0,4,8,12
+#elif __OPT_BIG_BLOCK_SIZE == 32
+#define BEGIN_UNROLL_BIG_BLOCK \
+        .irp offset, 0,4,8,12,16,20,24,28
+#elif __OPT_BIG_BLOCK_SIZE == 64
+#define BEGIN_UNROLL_BIG_BLOCK \
+        .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
+#else
+#error "Illegal __OPT_BIG_BLOCK_SIZE"
+#endif
+
+#if __OPT_MID_BLOCK_SIZE == 8
+#define BEGIN_UNROLL_MID_BLOCK \
+        .irp offset, 0,4
+#elif __OPT_MID_BLOCK_SIZE == 16
+#define BEGIN_UNROLL_MID_BLOCK \
+        .irp offset, 0,4,8,12
+#else
+#error "Illegal __OPT_MID_BLOCK_SIZE"
+#endif
+
+#define END_UNROLL .endr
+
+        .syntax unified
+        .text
+        .align  2
+        .global memcpy
+        .thumb
+        .thumb_func
+        .type   memcpy, %function
+memcpy:
+        @ r0: dst
+        @ r1: src
+        @ r2: len
+#ifdef __ARM_FEATURE_UNALIGNED
+        /* When unaligned access is supported, ip is not otherwise used in
+           the function body, so dst can be saved there instead of on the
+           stack.  */
+        mov     ip, r0
+#else
+        push    {r0}
+#endif
+        orr     r3, r1, r0
+        ands    r3, r3, #3
+        bne     .Lmisaligned_copy
+
+.Lbig_block:
+        subs    r2, __OPT_BIG_BLOCK_SIZE
+        blo     .Lmid_block
+
+        /* Kernel loop for big block copy */
+        .align 2
+.Lbig_block_loop:
+        BEGIN_UNROLL_BIG_BLOCK
+#ifdef __ARM_ARCH_7EM__
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        END_UNROLL
+#else /* __ARM_ARCH_7M__ */
+        ldr     r3, [r1, \offset]
+        str     r3, [r0, \offset]
+        END_UNROLL
+        adds    r0, __OPT_BIG_BLOCK_SIZE
+        adds    r1, __OPT_BIG_BLOCK_SIZE
+#endif
+        subs    r2, __OPT_BIG_BLOCK_SIZE
+        bhs     .Lbig_block_loop
+
+.Lmid_block:
+        adds    r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
+        blo     .Lcopy_word_by_word
+
+        /* Kernel loop for mid-block copy */
+        .align 2
+.Lmid_block_loop:
+        BEGIN_UNROLL_MID_BLOCK
+#ifdef __ARM_ARCH_7EM__
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        END_UNROLL
+#else /* __ARM_ARCH_7M__ */
+        ldr     r3, [r1, \offset]
+        str     r3, [r0, \offset]
+        END_UNROLL
+        adds    r0, __OPT_MID_BLOCK_SIZE
+        adds    r1, __OPT_MID_BLOCK_SIZE
+#endif
+        subs    r2, __OPT_MID_BLOCK_SIZE
+        bhs     .Lmid_block_loop
+
+.Lcopy_word_by_word:
+        adds    r2, __OPT_MID_BLOCK_SIZE - 4
+        blo     .Lcopy_less_than_4
+
+        /* Kernel loop for small block copy */
+        .align 2
+.Lcopy_word_by_word_loop:
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        subs    r2, #4
+        bhs     .Lcopy_word_by_word_loop
+
+.Lcopy_less_than_4:
+        adds    r2, #4
+        beq     .Ldone
+
+        lsls    r2, r2, #31
+        itt     ne
+        ldrbne  r3, [r1], #1
+        strbne  r3, [r0], #1
+
+        bcc     .Ldone
+#ifdef __ARM_FEATURE_UNALIGNED
+        ldrh    r3, [r1]
+        strh    r3, [r0]
+#else
+        ldrb    r3, [r1]
+        strb    r3, [r0]
+        ldrb    r3, [r1, #1]
+        strb    r3, [r0, #1]
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+.Ldone:
+#ifdef __ARM_FEATURE_UNALIGNED
+        mov     r0, ip
+#else
+        pop     {r0}
+#endif
+        bx      lr
+
+        .align 2
+.Lmisaligned_copy:
+#ifdef __ARM_FEATURE_UNALIGNED
+        /* Define label Ldst_aligned as an alias for Lbig_block: once the
+           destination has been aligned, control goes straight to the
+           aligned copy.  */
+#define Ldst_aligned Lbig_block
+
+        /* Copy word by word using LDR when alignment can be handled in
+           hardware, i.e., SCTLR.A is clear, so LDR and STR support
+           unaligned access.  */
+
+        cmp     r2, #8
+        blo     .Lbyte_copy
+
+        /* If src is aligned, just go to the big block loop.  */
+        lsls    r3, r1, #30
+        beq     .Ldst_aligned
+#else
+        /* If len < 12, the misalignment adjustment has more overhead than
+           a plain byte-to-byte copy.  Also, len must be >= 8 for the code
+           below to work correctly.  */
+        cmp     r2, #12
+        blo     .Lbyte_copy
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+        /* Align dst only, without trying to align src: handling an aligned
+           src with a misaligned dst costs more than the other way round.
+           In the worst case (src initially aligned) up to 3 additional
+           bytes are copied byte-by-byte, which is acceptable.  */
+
+        ands    r3, r0, #3
+        beq     .Ldst_aligned
+
+        rsb     r3, #4
+        subs    r2, r3
+
+        lsls    r3, r3, #31
+        itt     ne
+        ldrbne  r3, [r1], #1
+        strbne  r3, [r0], #1
+
+        bcc     .Ldst_aligned
+
+#ifdef __ARM_FEATURE_UNALIGNED
+        ldrh    r3, [r1], #2
+        strh    r3, [r0], #2
+        b       .Ldst_aligned
+#else
+        ldrb    r3, [r1], #1
+        strb    r3, [r0], #1
+        ldrb    r3, [r1], #1
+        strb    r3, [r0], #1
+        /* Now dst is aligned.  */
+.Ldst_aligned:
+        /* If r1 is aligned too, then r0 and r1 had the same misalignment
+           and both are aligned now.  Go to the aligned copy.  */
+        ands    r3, r1, #3
+        beq     .Lbig_block
+
+        /* dst is aligned, but src isn't.  Misaligned copy.  */
+
+        push    {r4, r5}
+        subs    r2, #4
+
+        /* Move r1 back by the misaligned bytes so that it is aligned.
+           Since r1 must be restored to the unaligned address after the
+           loop, keep the offset in ip and subtract it from r1 afterwards.  */
+        subs    r1, r3
+        rsb     ip, r3, #4
+
+        /* Pre-load one word.  */
+        ldr     r4, [r1], #4
+
+        cmp     r3, #2
+        beq     .Lmisaligned_copy_2_2
+        cmp     r3, #3
+        beq     .Lmisaligned_copy_3_1
+
+        .macro mis_src_copy shift
+1:
+        lsrs    r4, r4, \shift
+        ldr     r3, [r1], #4
+        lsls    r5, r3, 32-\shift
+        orr     r4, r4, r5
+        str     r4, [r0], #4
+        mov     r4, r3
+        subs    r2, #4
+        bhs     1b
+        .endm
+
+.Lmisaligned_copy_1_3:
+        mis_src_copy shift=8
+        b       .Lsrc_misaligned_tail
+
+.Lmisaligned_copy_3_1:
+        mis_src_copy shift=24
+        b       .Lsrc_misaligned_tail
+
+.Lmisaligned_copy_2_2:
+        /* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
+        mis_src_copy shift=16
+
+.Lsrc_misaligned_tail:
+        adds    r2, #4
+        subs    r1, ip
+        pop     {r4, r5}
+
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+.Lbyte_copy:
+        subs    r2, #4
+        blo     .Lcopy_less_than_4
+
+.Lbyte_copy_loop:
+        subs    r2, #1
+        ldrb    r3, [r1], #1
+        strb    r3, [r0], #1
+        bhs     .Lbyte_copy_loop
+
+        ldrb    r3, [r1]
+        strb    r3, [r0]
+        ldrb    r3, [r1, #1]
+        strb    r3, [r0, #1]
+        ldrb    r3, [r1, #2]
+        strb    r3, [r0, #2]
+
+#ifdef __ARM_FEATURE_UNALIGNED
+        mov     r0, ip
+#else
+        pop     {r0}
+#endif
+        bx      lr
+
+        .size   memcpy, .-memcpy
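
Notes on the implementation (illustrative sketches, not part of the commit).

The five-step strategy described in the header comment maps onto straightforward C. The sketch below is a rough equivalent (staged_memcpy is a hypothetical name); it assumes step 1 has already succeeded, i.e. both pointers are word-aligned, and uses the default block sizes. Where C needs inner for loops, the assembly unrolls them at assembly time with BEGIN_UNROLL_BIG_BLOCK / BEGIN_UNROLL_MID_BLOCK.

    #include <stddef.h>
    #include <stdint.h>

    #define BIG_BLOCK_SIZE 64  /* mirrors the __OPT_BIG_BLOCK_SIZE default */
    #define MID_BLOCK_SIZE 16  /* mirrors the __OPT_MID_BLOCK_SIZE default */

    /* Assumes dst and src are already word-aligned (step 1 done).  */
    void *staged_memcpy(void *dst, const void *src, size_t len)
    {
        uint8_t *d = dst;
        const uint8_t *s = src;

        /* Step 2: big blocks (the assembly unrolls this inner loop 16x).  */
        while (len >= BIG_BLOCK_SIZE) {
            for (size_t i = 0; i < BIG_BLOCK_SIZE; i += 4)
                *(uint32_t *)(d + i) = *(const uint32_t *)(s + i);
            d += BIG_BLOCK_SIZE;
            s += BIG_BLOCK_SIZE;
            len -= BIG_BLOCK_SIZE;
        }

        /* Step 3: mid blocks (unrolled 4x in the assembly).  */
        while (len >= MID_BLOCK_SIZE) {
            for (size_t i = 0; i < MID_BLOCK_SIZE; i += 4)
                *(uint32_t *)(d + i) = *(const uint32_t *)(s + i);
            d += MID_BLOCK_SIZE;
            s += MID_BLOCK_SIZE;
            len -= MID_BLOCK_SIZE;
        }

        /* Step 4: word by word.  */
        for (; len >= 4; len -= 4) {
            *(uint32_t *)d = *(const uint32_t *)s;
            d += 4;
            s += 4;
        }

        /* Step 5: remaining 0-3 bytes.  */
        while (len--)
            *d++ = *s++;

        return dst;
    }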
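The prologue at .Lmisaligned_copy aligns dst by copying the 1-3 leading bytes, with rsb r3, #4 computing how many. A rough C equivalent, using the hypothetical helper name align_dst:

    #include <stddef.h>
    #include <stdint.h>

    /* Copy the 0-3 bytes needed to make dst word-aligned, leaving src's
       alignment alone, and return how many bytes were consumed.  Mirrors
       the "rsb r3, #4" prologue (which only runs when dst is misaligned,
       so there it is always 1-3 bytes).  */
    static size_t align_dst(uint8_t **d, const uint8_t **s)
    {
        size_t head = (4 - ((uintptr_t)*d & 3)) & 3;
        for (size_t i = 0; i < head; i++)
            *(*d)++ = *(*s)++;
        return head;
    }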
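The 0-3 byte tail at .Lcopy_less_than_4 avoids a loop by testing the two low bits of the remaining length, and it gets both tests from the single lsls r2, r2, #31: bit 0 of len ends up in the shift result (tested with ne), bit 1 in the carry flag (tested with bcc). In C terms (copy_tail is a hypothetical name):

    #include <stddef.h>
    #include <stdint.h>

    /* Copy the final len (0..3) bytes without a loop.  */
    static void copy_tail(uint8_t *d, const uint8_t *s, size_t len)
    {
        if (len & 1)       /* the ldrbne/strbne pair under "itt ne" */
            *d++ = *s++;
        if (len & 2) {     /* reached when "bcc .Ldone" is not taken */
            d[0] = s[0];   /* one ldrh/strh with __ARM_FEATURE_UNALIGNED, */
            d[1] = s[1];   /* two byte copies otherwise */
        }
    }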
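The mis_src_copy macro is the classic shift-and-or technique for a word-aligned dst with a misaligned src: round src down to an aligned address, read whole words, and stitch each pair of neighbouring words together. A little-endian C sketch (misaligned_words is a hypothetical name; mis is the src misalignment of 1-3 bytes, so the shift is 8, 16 or 24, matching the three mis_src_copy instantiations):

    #include <stddef.h>
    #include <stdint.h>

    /* dst is word-aligned; src is mis bytes past a word boundary.  Like
       the assembly (which does "subs r1, r3" first), this reads from src
       rounded *down*, so it touches up to 3 bytes before src.  */
    static void misaligned_words(uint32_t *dst, const uint8_t *src,
                                 size_t nwords, unsigned mis)
    {
        const uint32_t *w = (const uint32_t *)(src - mis); /* aligned down */
        unsigned shift = 8 * mis;        /* 8, 16 or 24 */
        uint32_t cur = *w++;             /* the "pre-load one word" */

        while (nwords--) {
            uint32_t next = *w++;        /* ldr r3, [r1], #4 */
            /* lsrs/lsls/orr: high bytes of cur joined with low bytes of
               next (little-endian, as on Cortex-M).  */
            *dst++ = (cur >> shift) | (next << (32 - shift));
            cur = next;
        }
    }

For the 2-byte misalignment case the comment in the source notes that a single ldr plus shifts is still faster than two ldrh loads, which is why all three cases share this one macro.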