cygwin.com/git/newlib-cygwin.git
path: root/newlib
author    Jeff Johnston <jjohnstn@redhat.com>  2013-04-13 02:12:29 +0400
committer Jeff Johnston <jjohnstn@redhat.com>  2013-04-13 02:12:29 +0400
commit    62f41c2b1cc9329d956e33bbdc20aeb450bb3ac7 (patch)
tree      3c5e91830d164566675738b0394ecbc426cd88db /newlib
parent    f7a4b388b0f7c49bc1542ae9829e6fb31bc9c611 (diff)
2013-04-12  Will Newton  <will.newton@linaro.org>

        * libc/machine/arm/memcpy-stub.c: Use generic memcpy if unaligned
        access is not enabled.
        * libc/machine/arm/memcpy.S: Faster memcpy implementation for
        Cortex A15 cores using NEON and VFP if available.
Diffstat (limited to 'newlib')
-rw-r--r--  newlib/ChangeLog                        |    7
-rw-r--r--  newlib/libc/machine/arm/memcpy-stub.c   |    2
-rw-r--r--  newlib/libc/machine/arm/memcpy.S        | 1004
3 files changed, 611 insertions(+), 402 deletions(-)
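
The change is driven entirely by predefined compiler macros, so which memcpy actually gets built depends on the toolchain flags. As a rough, illustrative probe (not part of the patch; the program itself is hypothetical), the following stand-alone C snippet evaluates the same preprocessor conditions the patch adds to memcpy-stub.c and memcpy.S and reports which variant would be selected:

#include <stdio.h>

int main (void)
{
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
     (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED))))
  /* Same guard as memcpy-stub.c: fall back to the generic C memcpy.  */
  puts ("generic C memcpy (memcpy-stub.c path)");
#elif defined (__ARM_NEON__)
  /* memcpy.S builds its NEON/VFP variant (FRAME_SIZE 4).  */
  puts ("optimised memcpy.S: NEON + VFP variant");
#elif !defined (__SOFTFP__)
  /* memcpy.S builds its VFP-only variant (FRAME_SIZE 32).  */
  puts ("optimised memcpy.S: VFP variant");
#else
  /* memcpy.S builds the integer LDRD/STRD variant (FRAME_SIZE 32).  */
  puts ("optimised memcpy.S: integer (LDRD/STRD) variant");
#endif
  return 0;
}
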
diff --git a/newlib/ChangeLog b/newlib/ChangeLog
index 0bfad1c01..c389318f9 100644
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,3 +1,10 @@
+2013-04-12 Will Newton <will.newton@linaro.org>
+
+ * libc/machine/arm/memcpy-stub.c: Use generic memcpy if unaligned
+ access is not enabled.
+ * libc/machine/arm/memcpy.S: Faster memcpy implementation for
+ Cortex A15 cores using NEON and VFP if available.
+
2013-04-12 Bin Cheng <bin.cheng@arm.com>
* acconfig.h (_WCHAR_ORIENT): Undef
diff --git a/newlib/libc/machine/arm/memcpy-stub.c b/newlib/libc/machine/arm/memcpy-stub.c
index 536b869cc..513631a9f 100644
--- a/newlib/libc/machine/arm/memcpy-stub.c
+++ b/newlib/libc/machine/arm/memcpy-stub.c
@@ -29,7 +29,7 @@
/* The sole purpose of this file is to include the plain memcpy provided in newlib.
An optimized version of memcpy is provided in the assembly file memcpy.S in this directory. */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
- (!(defined (__ARM_ARCH_7A__))))
+ (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED))))
#include "../../string/memcpy.c"
diff --git a/newlib/libc/machine/arm/memcpy.S b/newlib/libc/machine/arm/memcpy.S
index e408ed0e0..bc54bb3f5 100644
--- a/newlib/libc/machine/arm/memcpy.S
+++ b/newlib/libc/machine/arm/memcpy.S
@@ -1,423 +1,625 @@
-/*
- * Copyright (c) 2011 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/* Copyright (c) 2013, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
+
+ Assumptions:
+
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+ LDRD/STRD support unaligned word accesses
+
*/
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
- (!(defined (__ARM_ARCH_7A__))))
+ (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED))))
/* Do nothing here. See memcpy-stub.c in the same directory. */
#else
- /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
- /* Use the version of memcpy implemented using LDRD and STRD.
- This version is tuned for Cortex-A15.
- This might not be the best for other ARMv7-A CPUs,
- but there is no predefine to distinguish between
- different CPUs in the same architecture,
- and this version is better than the plain memcpy provided in newlib.
+ .syntax unified
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef __ARM_NEON__
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
+
+/* Old versions of GAS incorrectly implement the NEON align semantics. */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
+#else
+#define ALIGN(addr, align) addr:align
+#endif
- Therefore, we use this version for all ARMv7-A CPUS. */
+#define PC_OFFSET 8 /* PC pipeline compensation. */
+#define INSN_SIZE 4
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+#define D_l r8
+#define D_h r9
+#endif
- /* To make the same code compile for both ARM and Thumb instruction
- sets, switch to unified syntax at the beginning of this function.
- However, by using the same code, we may be missing optimization
- opportunities. For instance, in LDRD/STRD instructions, the first
- destination register must be even and the second consecutive in
- ARM state, but not in Thumb state. */
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+
+ .macro cpy_tail_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+#endif
- .syntax unified
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn memcpy p2align=6
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bge .Lcpy_not_short
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+ and tmp1, count, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+
+ tst count, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
+ /* Jump directly into the sequence below at the correct offset. */
+ add pc, pc, tmp1, lsl #1
+
+ ldr tmp1, [src, #-60] /* 15 words to go. */
+ str tmp1, [dst, #-60]
+
+ ldr tmp1, [src, #-56] /* 14 words to go. */
+ str tmp1, [dst, #-56]
+ ldr tmp1, [src, #-52]
+ str tmp1, [dst, #-52]
+
+ ldr tmp1, [src, #-48] /* 12 words to go. */
+ str tmp1, [dst, #-48]
+ ldr tmp1, [src, #-44]
+ str tmp1, [dst, #-44]
+
+ ldr tmp1, [src, #-40] /* 10 words to go. */
+ str tmp1, [dst, #-40]
+ ldr tmp1, [src, #-36]
+ str tmp1, [dst, #-36]
+
+ ldr tmp1, [src, #-32] /* 8 words to go. */
+ str tmp1, [dst, #-32]
+ ldr tmp1, [src, #-28]
+ str tmp1, [dst, #-28]
+
+ ldr tmp1, [src, #-24] /* 6 words to go. */
+ str tmp1, [dst, #-24]
+ ldr tmp1, [src, #-20]
+ str tmp1, [dst, #-20]
+
+ ldr tmp1, [src, #-16] /* 4 words to go. */
+ str tmp1, [dst, #-16]
+ ldr tmp1, [src, #-12]
+ str tmp1, [dst, #-12]
+
+ ldr tmp1, [src, #-8] /* 2 words to go. */
+ str tmp1, [dst, #-8]
+ ldr tmp1, [src, #-4]
+ str tmp1, [dst, #-4]
+#endif
-#if defined (__thumb__)
- .thumb
- .thumb_func
+ lsls count, count, #31
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
+ bx lr
+
+.Lcpy_not_short:
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ and tmp2, src, #3
+ and tmp1, dst, #3
+ cmp tmp1, tmp2
+ bne .Lcpy_notaligned
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
#endif
- .global memcpy
- .type memcpy, %function
-memcpy:
-
- /* Assumes that n >= 0, and dst, src are valid pointers.
- If there is at least 8 bytes to copy, use LDRD/STRD.
- If src and dst are misaligned with different offsets,
- first copy byte by byte until dst is aligned,
- and then copy using LDRD/STRD and shift if needed.
- When less than 8 left, copy a word and then byte by byte. */
-
- /* Save registers (r0 holds the return value):
- optimized push {r0, r4, r5, lr}.
- To try and improve performance, stack layout changed,
- i.e., not keeping the stack looking like users expect
- (highest numbered register at highest address). */
- push {r0, lr}
- strd r4, r5, [sp, #-8]!
-
- /* TODO: Add debug frame directives.
- We don't need exception unwind directives, because the code below
- does not throw any exceptions and does not call any other functions.
- Generally, newlib functions like this lack debug information for
- assembler source. */
-
- /* Get copying of tiny blocks out of the way first. */
- /* Is there at least 4 bytes to copy? */
- subs r2, r2, #4
- blt copy_less_than_4 /* If n < 4. */
-
- /* Check word alignment. */
- ands ip, r0, #3 /* ip = last 2 bits of dst. */
- bne dst_not_word_aligned /* If dst is not word-aligned. */
-
- /* Get here if dst is word-aligned. */
- ands ip, r1, #3 /* ip = last 2 bits of src. */
- bne src_not_word_aligned /* If src is not word-aligned. */
-word_aligned:
- /* Get here if source and dst both are word-aligned.
- The number of bytes remaining to copy is r2+4. */
-
- /* Is there is at least 64 bytes to copy? */
- subs r2, r2, #60
- blt copy_less_than_64 /* If r2 + 4 < 64. */
-
- /* First, align the destination buffer to 8-bytes,
- to make sure double loads and stores don't cross cache line boundary,
- as they are then more expensive even if the data is in the cache
- (require two load/store issue cycles instead of one).
- If only one of the buffers is not 8-bytes aligned,
- then it's more important to align dst than src,
- because there is more penalty for stores
- than loads that cross cacheline boundary.
- This check and realignment are only worth doing
- if there is a lot to copy. */
-
- /* Get here if dst is word aligned,
- i.e., the 2 least significant bits are 0.
- If dst is not 2w aligned (i.e., the 3rd bit is not set in dst),
- then copy 1 word (4 bytes). */
- ands r3, r0, #4
- beq 11f /* If dst already two-word aligned. */
- ldr r3, [r1], #4
- str r3, [r0], #4
- subs r2, r2, #4
- blt copy_less_than_64
-
-11:
- /* TODO: Align to cacheline (useful for PLD optimization). */
-
- /* Every loop iteration copies 64 bytes. */
+ /* SRC and DST have the same mutual 32-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
+
1:
- .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
- ldrd r4, r5, [r1, \offset]
- strd r4, r5, [r0, \offset]
- .endr
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blt .Ltail63aligned
+
+ cmp tmp2, #512
+ bge .Lcpy_body_long
- add r0, r0, #64
- add r1, r1, #64
- subs r2, r2, #64
- bge 1b /* If there is more to copy. */
+.Lcpy_body_medium: /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ vldr d0, [src, #0]
+ subs tmp2, tmp2, #64
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
+ add src, src, #64
+ vstr d1, [dst, #56]
+ add dst, dst, #64
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+
+ vldr d0, [src, #-56] /* 14 words to go. */
+ vstr d0, [dst, #-56]
+ vldr d0, [src, #-48] /* 12 words to go. */
+ vstr d0, [dst, #-48]
+ vldr d0, [src, #-40] /* 10 words to go. */
+ vstr d0, [dst, #-40]
+ vldr d0, [src, #-32] /* 8 words to go. */
+ vstr d0, [dst, #-32]
+ vldr d0, [src, #-24] /* 6 words to go. */
+ vstr d0, [dst, #-24]
+ vldr d0, [src, #-16] /* 4 words to go. */
+ vstr d0, [dst, #-16]
+ vldr d0, [src, #-8] /* 2 words to go. */
+ vstr d0, [dst, #-8]
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
+ subs tmp2, tmp2, #64
+ bge 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ bx lr
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 32-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
+ strd A_l, A_h, [dst, #-56]
+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
+ strd A_l, A_h, [dst, #-48]
+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
+ strd A_l, A_h, [dst, #-40]
+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
+ strd A_l, A_h, [dst, #-32]
+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
+ strd A_l, A_h, [dst, #-24]
+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
+ strd A_l, A_h, [dst, #-16]
+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
+ strd A_l, A_h, [dst, #-8]
-copy_less_than_64:
+#endif
+ tst tmp2, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
+
+.Ldone:
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+
+.Lcpy_body_long: /* Count in tmp2. */
+
+ /* Long copy. We know that there's at least (prefetch_lines * 64)
+ bytes to go. */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blt 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bge 1b
- /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
- Restore the count if there is more than 7 bytes to copy. */
- adds r2, r2, #56
- blt copy_less_than_8
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
+ add src, src, #96
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b .Lcpy_body_medium
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ pld [src, #8]
+ pld [src, #72]
+ subs tmp2, tmp2, #64
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #40
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne .Ltail63aligned
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+#endif
- /* Copy 8 bytes at a time. */
+.Lcpy_notaligned:
+ pld [src]
+ pld [src, #64]
+ /* There's at least 64 bytes to copy, but there is no mutual
+ alignment. */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ pld [src, #(2 * 64)]
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
+1:
+ pld [src, #(3 * 64)]
+ subs count, count, #64
+ ldrmi tmp2, [sp], #FRAME_SIZE
+ bmi .Ltail63unaligned
+ pld [src, #(4 * 64)]
+
+#ifdef USE_NEON
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bmi 2f
+1:
+ pld [src, #(4 * 64)]
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vld1.8 {d0-d3}, [src]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bpl 1b
2:
- ldrd r4, r5, [r1], #8
- strd r4, r5, [r0], #8
- subs r2, r2, #8
- bge 2b /* If there is more to copy. */
-
-copy_less_than_8:
-
- /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
- Check if there is more to copy. */
- cmn r2, #8
- beq return /* If r2 + 8 == 0. */
-
- /* Restore the count if there is more than 3 bytes to copy. */
- adds r2, r2, #4
- blt copy_less_than_4
-
- /* Copy 4 bytes. */
- ldr r3, [r1], #4
- str r3, [r0], #4
-
-copy_less_than_4:
- /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */
-
- /* Restore the count, check if there is more to copy. */
- adds r2, r2, #4
- beq return /* If r2 == 0. */
-
- /* Get here with r2 is in {1,2,3}={01,10,11}. */
- /* Logical shift left r2, insert 0s, update flags. */
- lsls r2, r2, #31
-
- /* Copy byte by byte.
- Condition ne means the last bit of r2 is 0.
- Condition cs means the second to last bit of r2 is set,
- i.e., r2 is 1 or 3. */
- itt ne
- ldrbne r3, [r1], #1
- strbne r3, [r0], #1
-
- itttt cs
- ldrbcs r4, [r1], #1
- ldrbcs r5, [r1]
- strbcs r4, [r0], #1
- strbcs r5, [r0]
-
-return:
- /* Restore registers: optimized pop {r0, r4, r5, pc} */
- ldrd r4, r5, [sp], #8
- pop {r0, pc} /* This is the only return point of memcpy. */
-
-#ifndef __ARM_FEATURE_UNALIGNED
-
- /* The following assembly macro implements misaligned copy in software.
- Assumes that dst is word aligned, src is at offset "pull" bits from
- word, push = 32 - pull, and the number of bytes that remain to copy
- is r2 + 4, r2 >= 0. */
-
- /* In the code below, r2 is the number of bytes that remain to be
- written. The number of bytes read is always larger, because we have
- partial words in the shift queue. */
-
- .macro miscopy pull push shiftleft shiftright
-
- /* Align src to the previous word boundary. */
- bic r1, r1, #3
-
- /* Initialize the shift queue. */
- ldr r5, [r1], #4 /* Load a word from source. */
-
- subs r2, r2, #4
- blt 6f /* Go to misaligned copy of less than 8 bytes. */
-
- /* Get here if there is more than 8 bytes to copy.
- The number of bytes to copy is r2+8, r2 >= 0. */
-
- /* Save registers: push { r6, r7 }.
- We need additional registers for LDRD and STRD, because in ARM state
- the first destination register must be even and the second
- consecutive. */
- strd r6, r7, [sp, #-8]!
-
- subs r2, r2, #56
- blt 4f /* Go to misaligned copy of less than 64 bytes. */
-
-3:
- /* Get here if there is more than 64 bytes to copy.
- The number of bytes to copy is r2+64, r2 >= 0. */
-
- /* Copy 64 bytes in every iteration.
- Use a partial word from the shift queue. */
- .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
- mov r6, r5, \shiftleft #\pull
- ldrd r4, r5, [r1, \offset]
- orr r6, r6, r4, \shiftright #\push
- mov r7, r4, \shiftleft #\pull
- orr r7, r7, r5, \shiftright #\push
- strd r6, r7, [r0, \offset]
- .endr
-
- add r1, r1, #64
- add r0, r0, #64
- subs r2, r2, #64
- bge 3b
-
-4:
- /* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0)
- and they are misaligned. */
-
- /* Restore the count if there is more than 7 bytes to copy. */
- adds r2, r2, #56
-
- /* If less than 8 bytes to copy,
- restore registers saved for this loop: optimized poplt { r6, r7 }. */
- itt lt
- ldrdlt r6, r7, [sp], #8
- blt 6f /* Go to misaligned copy of less than 8 bytes. */
-
-5:
- /* Copy 8 bytes at a time.
- Use a partial word from the shift queue. */
- mov r6, r5, \shiftleft #\pull
- ldrd r4, r5, [r1], #8
- orr r6, r6, r4, \shiftright #\push
- mov r7, r4, \shiftleft #\pull
- orr r7, r7, r5, \shiftright #\push
- strd r6, r7, [r0], #8
-
- subs r2, r2, #8
- bge 5b /* If there is more to copy. */
-
- /* Restore registers saved for this loop: optimized pop { r6, r7 }. */
- ldrd r6, r7, [sp], #8
-
-6:
- /* Get here if there less than 8 bytes to copy (-8 <= r2 < 0)
- and they are misaligned. */
-
- /* Check if there is more to copy. */
- cmn r2, #8
- beq return
-
- /* Check if there is less than 4 bytes to copy. */
- cmn r2, #4
-
- itt lt
- /* Restore src offset from word-align. */
- sublt r1, r1, #(\push / 8)
- blt copy_less_than_4
-
- /* Use a partial word from the shift queue. */
- mov r3, r5, \shiftleft #\pull
- /* Load a word from src, but without writeback
- (this word is not fully written to dst). */
- ldr r5, [r1]
-
- /* Restore src offset from word-align. */
- add r1, r1, #(\pull / 8)
-
- /* Shift bytes to create one dst word and store it. */
- orr r3, r3, r5, \shiftright #\push
- str r3, [r0], #4
-
- /* Use single byte copying of the remaining bytes. */
- b copy_less_than_4
-
- .endm
-
-#endif /* not __ARM_FEATURE_UNALIGNED */
-
-dst_not_word_aligned:
-
- /* Get here when dst is not aligned and ip has the last 2 bits of dst,
- i.e., ip is the offset of dst from word.
- The number of bytes that remains to copy is r2 + 4,
- i.e., there are at least 4 bytes to copy.
- Write a partial word (0 to 3 bytes), such that dst becomes
- word-aligned. */
-
- /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
- then there are (4 - ip) bytes to fill up to align dst to the next
- word. */
- rsb ip, ip, #4 /* ip = #4 - ip. */
- cmp ip, #2
-
- /* Copy byte by byte with conditionals. */
- itt gt
- ldrbgt r3, [r1], #1
- strbgt r3, [r0], #1
-
- itt ge
- ldrbge r4, [r1], #1
- strbge r4, [r0], #1
-
- ldrb lr, [r1], #1
- strb lr, [r0], #1
-
- /* Update the count.
- ip holds the number of bytes we have just copied. */
- subs r2, r2, ip /* r2 = r2 - ip. */
- blt copy_less_than_4 /* If r2 < ip. */
-
- /* Get here if there are more than 4 bytes to copy.
- Check if src is aligned. If beforehand src and dst were not word
- aligned but congruent (same offset), then now they are both
- word-aligned, and we can copy the rest efficiently (without
- shifting). */
- ands ip, r1, #3 /* ip = last 2 bits of src. */
- beq word_aligned /* If r1 is word-aligned. */
-
-src_not_word_aligned:
- /* Get here when src is not word-aligned, but dst is word-aligned.
- The number of bytes that remains to copy is r2+4. */
-
-#ifdef __ARM_FEATURE_UNALIGNED
- /* Copy word by word using LDR when alignment can be done in hardware,
- i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */
- subs r2, r2, #60
- blt 8f
-
-7:
- /* Copy 64 bytes in every loop iteration. */
- .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
- ldr r3, [r1, \offset]
- str r3, [r0, \offset]
- .endr
-
- add r0, r0, #64
- add r1, r1, #64
- subs r2, r2, #64
- bge 7b
-
-8:
- /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
- Check if there is more than 3 bytes to copy. */
- adds r2, r2, #60
- blt copy_less_than_4
-
-9:
- /* Get here if there is less than 64 but at least 4 bytes to copy,
- where the number of bytes to copy is r2+4. */
- ldr r3, [r1], #4
- str r3, [r0], #4
- subs r2, r2, #4
- bge 9b
-
- b copy_less_than_4
-
-#else /* not __ARM_FEATURE_UNALIGNED */
-
- /* ip has last 2 bits of src,
- i.e., ip is the offset of src from word, and ip > 0.
- Compute shifts needed to copy from src to dst. */
- cmp ip, #2
- beq miscopy_16_16 /* If ip == 2. */
- bge miscopy_24_8 /* If ip == 3. */
-
- /* Get here if ip == 1. */
-
- /* Endian independent macros for shifting bytes within registers. */
-
-#ifndef __ARMEB__
-miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
-miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
-miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
-#else /* not __ARMEB__ */
-miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
-miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
-miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
-#endif /* not __ARMEB__ */
-
-#endif /* not __ARM_FEATURE_UNALIGNED */
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #36
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ bne .Ltail63unaligned
+ bx lr
+
+ .size memcpy, . - memcpy
#endif /* memcpy */
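
For readers tracing the assembly, here is a loose C model (explanatory only; the helper name copy_tail63 is hypothetical and the unrolled sequences are collapsed into loops) of how memcpy.S finishes a copy once fewer than 64 bytes remain. In the assembly the remaining whole words are handled by branching into an unrolled load/store sequence at a computed offset ("add pc, pc, tmp1"), and the last three bytes are dispatched with "lsls count, count, #31", which puts bit 1 of the count into the carry flag (halfword left) and bit 0 into the N/Z flags (odd byte left):

#include <stddef.h>
#include <string.h>

/* C model of the .Ltail63* blocks: copy the final count (< 64) bytes.  */
void
copy_tail63 (unsigned char *dst, const unsigned char *src, size_t count)
{
  /* Whole 32-bit words: memcpy.S enters an unrolled ldr/str (or vldr/vstr,
     ldrd/strd) sequence at a computed offset; a loop is the plain C
     equivalent.  memcpy () keeps unaligned addresses legal in C.  */
  for (size_t words = count >> 2; words != 0; words--)
    {
      memcpy (dst, src, 4);
      dst += 4;
      src += 4;
    }
  if (count & 2)                  /* carry set by "lsls count, count, #31" */
    {
      memcpy (dst, src, 2);       /* ldrhcs / strhcs pair */
      dst += 2;
      src += 2;
    }
  if (count & 1)                  /* N set / Z clear: one odd byte remains */
    *dst = *src;                  /* ldrbne / strbne pair */
}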