cygwin.com/git/newlib-cygwin.git
Diffstat (limited to 'newlib/libc/machine/arm/memcpy-armv7m.S')
-rw-r--r--  newlib/libc/machine/arm/memcpy-armv7m.S  321
1 file changed, 321 insertions(+), 0 deletions(-)
diff --git a/newlib/libc/machine/arm/memcpy-armv7m.S b/newlib/libc/machine/arm/memcpy-armv7m.S
new file mode 100644
index 000000000..8a70c7d73
--- /dev/null
+++ b/newlib/libc/machine/arm/memcpy-armv7m.S
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This memcpy routine is optimised for Cortex-M3/M4 cores with or without
+   support for unaligned access.
+
+   If compiled with GCC, this file should be enclosed within the following
+   pre-processing check:
+   if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
+
+   Prototype: void *memcpy (void *dst, const void *src, size_t count);
+
+   The job is done in 5 steps:
+   Step 1: Align the src/dst pointers; copy misaligned if both cannot be aligned.
+   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
+   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
+   Step 4: Copy word by word.
+   Step 5: Copy byte by byte.
+
+   Tunable options:
+     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes.  Defaults to 64.
+     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes.  Defaults to 16.
+ */
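+
+/* Illustrative sketch of the guard described above (not part of this file's
+   logic; shown only as an example of how a build might wrap it):
+
+     #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
+     ... contents of this file ...
+     #endif
+ */
+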
+#ifndef __OPT_BIG_BLOCK_SIZE
+#define __OPT_BIG_BLOCK_SIZE (4 * 16)
+#endif
+
+#ifndef __OPT_MID_BLOCK_SIZE
+#define __OPT_MID_BLOCK_SIZE (4 * 4)
+#endif
+
+#if __OPT_BIG_BLOCK_SIZE == 16
+#define BEGIN_UNROLL_BIG_BLOCK \
+ .irp offset, 0,4,8,12
+#elif __OPT_BIG_BLOCK_SIZE == 32
+#define BEGIN_UNROLL_BIG_BLOCK \
+ .irp offset, 0,4,8,12,16,20,24,28
+#elif __OPT_BIG_BLOCK_SIZE == 64
+#define BEGIN_UNROLL_BIG_BLOCK \
+ .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
+#else
+#error "Illegal __OPT_BIG_BLOCK_SIZE"
+#endif
+
+#if __OPT_MID_BLOCK_SIZE == 8
+#define BEGIN_UNROLL_MID_BLOCK \
+ .irp offset, 0,4
+#elif __OPT_MID_BLOCK_SIZE == 16
+#define BEGIN_UNROLL_MID_BLOCK \
+ .irp offset, 0,4,8,12
+#else
+#error "Illegal __OPT_MID_BLOCK_SIZE"
+#endif
+
+#define END_UNROLL .endr
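+
+/* BEGIN_UNROLL_*_BLOCK expands to a gas ".irp offset, ..." directive and
+   END_UNROLL to ".endr": the statements between them are assembled once per
+   listed value with \offset substituted.  For example, on __ARM_ARCH_7M__
+   with __OPT_MID_BLOCK_SIZE == 16 the mid-block body expands to:
+
+     ldr r3, [r1, 0]
+     str r3, [r0, 0]
+     ldr r3, [r1, 4]
+     str r3, [r0, 4]
+
+   and so on for offsets 8 and 12.  */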
+
+ .syntax unified
+ .text
+ .align 2
+ .global memcpy
+ .thumb
+ .thumb_func
+ .type memcpy, %function
+memcpy:
+ @ r0: dst
+ @ r1: src
+ @ r2: len
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* When unaligned access is supported, ip is not used elsewhere in the
+	   function body, so stash the original dst there for the return value.  */
+ mov ip, r0
+#else
+ push {r0}
+#endif
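+	/* If (dst | src) has either low bit set, at least one pointer is not
+	   word aligned; take the misaligned path.  */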
+ orr r3, r1, r0
+ ands r3, r3, #3
+ bne .Lmisaligned_copy
+
+.Lbig_block:
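+	/* r2 counts the bytes left to copy, kept biased by the block size so
+	   that a borrow (blo/bhs on the subs) tells whether a whole block
+	   still remains.  */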
+ subs r2, __OPT_BIG_BLOCK_SIZE
+ blo .Lmid_block
+
+ /* Kernel loop for big block copy */
+ .align 2
+.Lbig_block_loop:
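+	/* The unrolled body differs by core: __ARM_ARCH_7EM__ uses post-indexed
+	   ldr/str, while __ARM_ARCH_7M__ uses immediate offsets and advances the
+	   pointers once per block.  */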
+ BEGIN_UNROLL_BIG_BLOCK
+#ifdef __ARM_ARCH_7EM__
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+ END_UNROLL
+#else /* __ARM_ARCH_7M__ */
+ ldr r3, [r1, \offset]
+ str r3, [r0, \offset]
+ END_UNROLL
+ adds r0, __OPT_BIG_BLOCK_SIZE
+ adds r1, __OPT_BIG_BLOCK_SIZE
+#endif
+ subs r2, __OPT_BIG_BLOCK_SIZE
+ bhs .Lbig_block_loop
+
+.Lmid_block:
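+	/* On entry r2 holds (remaining - __OPT_BIG_BLOCK_SIZE); rebase it to
+	   (remaining - __OPT_MID_BLOCK_SIZE) so the same borrow test works for
+	   the smaller block.  */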
+ adds r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
+ blo .Lcopy_word_by_word
+
+ /* Kernel loop for mid-block copy */
+ .align 2
+.Lmid_block_loop:
+ BEGIN_UNROLL_MID_BLOCK
+#ifdef __ARM_ARCH_7EM__
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+ END_UNROLL
+#else /* __ARM_ARCH_7M__ */
+ ldr r3, [r1, \offset]
+ str r3, [r0, \offset]
+ END_UNROLL
+ adds r0, __OPT_MID_BLOCK_SIZE
+ adds r1, __OPT_MID_BLOCK_SIZE
+#endif
+ subs r2, __OPT_MID_BLOCK_SIZE
+ bhs .Lmid_block_loop
+
+.Lcopy_word_by_word:
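+	/* Rebase r2 once more so it holds (remaining - 4) for the word loop.  */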
+ adds r2, __OPT_MID_BLOCK_SIZE - 4
+ blo .Lcopy_less_than_4
+
+ /* Kernel loop for small block copy */
+ .align 2
+.Lcopy_word_by_word_loop:
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+ subs r2, #4
+ bhs .Lcopy_word_by_word_loop
+
+.Lcopy_less_than_4:
+ adds r2, #4
+ beq .Ldone
+
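+	/* r2 holds the remaining 1 to 3 bytes.  The shift below moves bit 0
+	   into the flags and bit 1 into the carry: copy one byte if the count
+	   is odd, then two more bytes if bit 1 was set.  */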
+ lsls r2, r2, #31
+ itt ne
+ ldrbne r3, [r1], #1
+ strbne r3, [r0], #1
+
+ bcc .Ldone
+#ifdef __ARM_FEATURE_UNALIGNED
+ ldrh r3, [r1]
+ strh r3, [r0]
+#else
+ ldrb r3, [r1]
+ strb r3, [r0]
+ ldrb r3, [r1, #1]
+ strb r3, [r0, #1]
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+.Ldone:
+#ifdef __ARM_FEATURE_UNALIGNED
+ mov r0, ip
+#else
+ pop {r0}
+#endif
+ bx lr
+
+ .align 2
+.Lmisaligned_copy:
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* Define the label Ldst_aligned as an alias for Lbig_block: once the
+	   destination has been adjusted to a word-aligned address, jump
+	   straight into the aligned copy.  */
+#define Ldst_aligned Lbig_block
+
+	/* Use word-sized LDR/STR even on misaligned addresses, relying on the
+	   hardware's unaligned-access support (unaligned accesses are assumed
+	   not to be configured to trap).  */
+
+ cmp r2, #8
+ blo .Lbyte_copy
+
+ /* if src is aligned, just go to the big block loop. */
+ lsls r3, r1, #30
+ beq .Ldst_aligned
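+	/* src is not word aligned.  Fall through to align dst below; the word
+	   loads in the copy loops then rely on hardware unaligned access to
+	   handle the remaining src misalignment.  */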
+#else
+	/* If len < 12, the misalignment adjustment costs more than a plain
+	   byte-by-byte copy.  len must also be >= 8 to guarantee that the
+	   code afterwards works correctly.  */
+ cmp r2, #12
+ blo .Lbyte_copy
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+	/* Align dst only; do not try to align src.  Handling an aligned src
+	   with a misaligned dst needs more overhead than the other way round.
+	   The worst case is when src starts out aligned: up to 4 additional
+	   bytes are copied one at a time, which is acceptable.  */
+
+ ands r3, r0, #3
+ beq .Ldst_aligned
+
+ rsb r3, #4
+ subs r2, r3
+
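+	/* r3 = 4 - (dst & 3) is the number of head bytes needed to align dst.
+	   The shift below copies one byte if r3 is odd, then a further two
+	   bytes if bit 1 of r3 was set (left in the carry flag).  */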
+ lsls r3, r3, #31
+ itt ne
+ ldrbne r3, [r1], #1
+ strbne r3, [r0], #1
+
+ bcc .Ldst_aligned
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ ldrh r3, [r1], #2
+ strh r3, [r0], #2
+ b .Ldst_aligned
+#else
+ ldrb r3, [r1], #1
+ strb r3, [r0], #1
+ ldrb r3, [r1], #1
+ strb r3, [r0], #1
+	/* dst is now aligned.  */
+.Ldst_aligned:
+	/* If r1 is aligned too, then r0 and r1 had the same misalignment and
+	   both are now word aligned; go to the aligned copy.  */
+ ands r3, r1, #3
+ beq .Lbig_block
+
+ /* dst is aligned, but src isn't. Misaligned copy. */
+
+ push {r4, r5}
+ subs r2, #4
+
+	/* Move r1 back by the misaligned bytes so that it is word aligned.
+	   Since r1 must be restored to its unaligned address after the loop,
+	   keep the adjustment in ip and subtract it from r1 afterwards.  */
+ subs r1, r3
+ rsb ip, r3, #4
+
+	/* Pre-load one word.  */
+ ldr r4, [r1], #4
+
+ cmp r3, #2
+ beq .Lmisaligned_copy_2_2
+ cmp r3, #3
+ beq .Lmisaligned_copy_3_1
+
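+	/* mis_src_copy: r1 has been rounded down to a word boundary and one
+	   word pre-loaded into r4.  Assuming the usual little-endian
+	   configuration, the wanted source bytes sit in the upper part of r4;
+	   each iteration shifts them down, ORs in the low bytes of the next
+	   word and stores one aligned word to dst.  The shift is 8 times the
+	   original src misalignment (r3 == 1 falls through to shift=8 below).  */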
+ .macro mis_src_copy shift
+1:
+ lsrs r4, r4, \shift
+ ldr r3, [r1], #4
+ lsls r5, r3, 32-\shift
+ orr r4, r4, r5
+ str r4, [r0], #4
+ mov r4, r3
+ subs r2, #4
+ bhs 1b
+ .endm
+
+.Lmisaligned_copy_1_3:
+ mis_src_copy shift=8
+ b .Lsrc_misaligned_tail
+
+.Lmisaligned_copy_3_1:
+ mis_src_copy shift=24
+ b .Lsrc_misaligned_tail
+
+.Lmisaligned_copy_2_2:
+ /* For 2_2 misalignment, ldr is still faster than 2 x ldrh. */
+ mis_src_copy shift=16
+
+.Lsrc_misaligned_tail:
+ adds r2, #4
+ subs r1, ip
+ pop {r4, r5}
+
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+.Lbyte_copy:
+ subs r2, #4
+ blo .Lcopy_less_than_4
+
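+	/* At least 4 bytes remain here: the loop copies that count minus 3,
+	   and the three unconditional byte copies after it finish the job.  */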
+.Lbyte_copy_loop:
+ subs r2, #1
+ ldrb r3, [r1], #1
+ strb r3, [r0], #1
+ bhs .Lbyte_copy_loop
+
+ ldrb r3, [r1]
+ strb r3, [r0]
+ ldrb r3, [r1, #1]
+ strb r3, [r0, #1]
+ ldrb r3, [r1, #2]
+ strb r3, [r0, #2]
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ mov r0, ip
+#else
+ pop {r0}
+#endif
+ bx lr
+
+ .size memcpy, .-memcpy