Diffstat (limited to 'newlib/libc/machine/arm/aeabi_memcpy-armv7a.S')
-rw-r--r-- | newlib/libc/machine/arm/aeabi_memcpy-armv7a.S | 286
1 files changed, 0 insertions, 286 deletions
diff --git a/newlib/libc/machine/arm/aeabi_memcpy-armv7a.S b/newlib/libc/machine/arm/aeabi_memcpy-armv7a.S
deleted file mode 100644
index 53e3330ff..000000000
--- a/newlib/libc/machine/arm/aeabi_memcpy-armv7a.S
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (c) 2014 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "arm_asm.h"
-
-/* NOTE: This ifdef MUST match the one in aeabi_memcpy.c. */
-#if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
-    (defined (__ARM_NEON__) || !defined (__SOFTFP__))
-
-        .syntax unified
-        .global __aeabi_memcpy
-        .type   __aeabi_memcpy, %function
-__aeabi_memcpy:
-        /* Assumes that n >= 0, and dst, src are valid pointers.
-           If there is at least 8 bytes to copy, use LDRD/STRD.
-           If src and dst are misaligned with different offsets,
-           first copy byte by byte until dst is aligned,
-           and then copy using LDRD/STRD and shift if needed.
-           When less than 8 left, copy a word and then byte by byte. */
-
-        /* Save registers (r0 holds the return value):
-           optimized push {r0, r4, r5, lr}.
-           To try and improve performance, stack layout changed,
-           i.e., not keeping the stack looking like users expect
-           (highest numbered register at highest address). */
-        push    {r0, lr}
-        strd    r4, r5, [sp, #-8]!
-
-        /* Get copying of tiny blocks out of the way first. */
-        /* Is there at least 4 bytes to copy? */
-        subs    r2, r2, #4
-        blt     copy_less_than_4        /* If n < 4. */
-
-        /* Check word alignment. */
-        ands    ip, r0, #3              /* ip = last 2 bits of dst. */
-        bne     dst_not_word_aligned    /* If dst is not word-aligned. */
-
-        /* Get here if dst is word-aligned. */
-        ands    ip, r1, #3              /* ip = last 2 bits of src. */
-        bne     src_not_word_aligned    /* If src is not word-aligned. */
-word_aligned:
-        /* Get here if source and dst both are word-aligned.
-           The number of bytes remaining to copy is r2+4. */
-
-        /* Is there is at least 64 bytes to copy? */
-        subs    r2, r2, #60
-        blt     copy_less_than_64       /* If r2 + 4 < 64. */
-
-        /* First, align the destination buffer to 8-bytes,
-           to make sure double loads and stores don't cross cache line boundary,
-           as they are then more expensive even if the data is in the cache
-           (require two load/store issue cycles instead of one).
-           If only one of the buffers is not 8-bytes aligned,
-           then it's more important to align dst than src,
-           because there is more penalty for stores
-           than loads that cross cacheline boundary.
-           This check and realignment are only worth doing
-           if there is a lot to copy. */
-
-        /* Get here if dst is word aligned,
-           i.e., the 2 least significant bits are 0.
-           If dst is not 2w aligned (i.e., the 3rd bit is not set in dst),
-           then copy 1 word (4 bytes). */
-        ands    r3, r0, #4
-        beq     two_word_aligned        /* If dst already two-word aligned. */
-        ldr     r3, [r1], #4
-        str     r3, [r0], #4
-        subs    r2, r2, #4
-        blt     copy_less_than_64
-
-two_word_aligned:
-        /* TODO: Align to cacheline (useful for PLD optimization). */
-
-        /* Every loop iteration copies 64 bytes. */
-1:
-        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
-        ldrd    r4, r5, [r1, \offset]
-        strd    r4, r5, [r0, \offset]
-        .endr
-
-        add     r0, r0, #64
-        add     r1, r1, #64
-        subs    r2, r2, #64
-        bge     1b                      /* If there is more to copy. */
-
-copy_less_than_64:
-
-        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
-           Restore the count if there is more than 7 bytes to copy. */
-        adds    r2, r2, #56
-        blt     copy_less_than_8
-
-        /* Copy 8 bytes at a time. */
-2:
-        ldrd    r4, r5, [r1], #8
-        strd    r4, r5, [r0], #8
-        subs    r2, r2, #8
-        bge     2b                      /* If there is more to copy. */
-
-copy_less_than_8:
-
-        /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
-           Check if there is more to copy. */
-        cmn     r2, #8
-        beq     return                  /* If r2 + 8 == 0. */
-
-        /* Restore the count if there is more than 3 bytes to copy. */
-        adds    r2, r2, #4
-        blt     copy_less_than_4
-
-        /* Copy 4 bytes. */
-        ldr     r3, [r1], #4
-        str     r3, [r0], #4
-
-copy_less_than_4:
-        /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */
-
-        /* Restore the count, check if there is more to copy. */
-        adds    r2, r2, #4
-        beq     return                  /* If r2 == 0. */
-
-        /* Get here with r2 is in {1,2,3}={01,10,11}. */
-        /* Logical shift left r2, insert 0s, update flags. */
-        lsls    r2, r2, #31
-
-        /* Copy byte by byte.
-           Condition ne means the last bit of r2 is 0.
-           Condition cs means the second to last bit of r2 is set,
-           i.e., r2 is 1 or 3. */
-        itt     ne
-        ldrbne  r3, [r1], #1
-        strbne  r3, [r0], #1
-
-        itttt   cs
-        ldrbcs  r4, [r1], #1
-        ldrbcs  r5, [r1]
-        strbcs  r4, [r0], #1
-        strbcs  r5, [r0]
-
-return:
-        /* Restore registers: optimized pop {r0, r4, r5, pc} */
-        ldrd    r4, r5, [sp], #8
-        pop     {r0, pc}        /* This is the only return point of memcpy. */
-
-dst_not_word_aligned:
-
-        /* Get here when dst is not aligned and ip has the last 2 bits of dst,
-           i.e., ip is the offset of dst from word.
-           The number of bytes that remains to copy is r2 + 4,
-           i.e., there are at least 4 bytes to copy.
-           Write a partial word (0 to 3 bytes), such that dst becomes
-           word-aligned. */
-
-        /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
-           then there are (4 - ip) bytes to fill up to align dst to the next
-           word. */
-        rsb     ip, ip, #4              /* ip = #4 - ip. */
-        cmp     ip, #2
-
-        /* Copy byte by byte with conditionals. */
-        itt     gt
-        ldrbgt  r3, [r1], #1
-        strbgt  r3, [r0], #1
-
-        itt     ge
-        ldrbge  r4, [r1], #1
-        strbge  r4, [r0], #1
-
-        ldrb    lr, [r1], #1
-        strb    lr, [r0], #1
-
-        /* Update the count.
-           ip holds the number of bytes we have just copied. */
-        subs    r2, r2, ip              /* r2 = r2 - ip. */
-        blt     copy_less_than_4        /* If r2 < ip. */
-
-        /* Get here if there are more than 4 bytes to copy.
-           Check if src is aligned. If beforehand src and dst were not word
-           aligned but congruent (same offset), then now they are both
-           word-aligned, and we can copy the rest efficiently (without
-           shifting). */
-        ands    ip, r1, #3              /* ip = last 2 bits of src. */
-        beq     word_aligned            /* If r1 is word-aligned. */
-
-src_not_word_aligned:
-        /* Get here when src is not word-aligned, but dst is word-aligned.
-           The number of bytes that remains to copy is r2+4. */
-
-        /* Copy word by word using LDR when alignment can be done in hardware,
-           i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */
-        subs    r2, r2, #60
-        blt     8f
-
-7:
-        /* Copy 64 bytes in every loop iteration. */
-        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
-        ldr     r3, [r1, \offset]
-        str     r3, [r0, \offset]
-        .endr
-
-        add     r0, r0, #64
-        add     r1, r1, #64
-        subs    r2, r2, #64
-        bge     7b
-
-8:
-        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
-           Check if there is more than 3 bytes to copy. */
-        adds    r2, r2, #60
-        blt     copy_less_than_4
-
-9:
-        /* Get here if there is less than 64 but at least 4 bytes to copy,
-           where the number of bytes to copy is r2+4. */
-        ldr     r3, [r1], #4
-        str     r3, [r0], #4
-        subs    r2, r2, #4
-        bge     9b
-
-        b       copy_less_than_4
-
-
-        .syntax unified
-        .global __aeabi_memcpy4
-        .type   __aeabi_memcpy4, %function
-__aeabi_memcpy4:
-        /* Assumes that both of its arguments are 4-byte aligned. */
-
-        push    {r0, lr}
-        strd    r4, r5, [sp, #-8]!
-
-        /* Is there at least 4 bytes to copy? */
-        subs    r2, r2, #4
-        blt     copy_less_than_4        /* If n < 4. */
-
-        bl      word_aligned
-
-        .syntax unified
-        .global __aeabi_memcpy8
-        .type   __aeabi_memcpy8, %function
-__aeabi_memcpy8:
-        /* Assumes that both of its arguments are 8-byte aligned. */
-
-        push    {r0, lr}
-        strd    r4, r5, [sp, #-8]!
-
-        /* Is there at least 4 bytes to copy? */
-        subs    r2, r2, #4
-        blt     copy_less_than_4        /* If n < 4. */
-
-        /* Is there at least 8 bytes to copy? */
-        subs    r2, r2, #4
-        blt     copy_less_than_8        /* If n < 8. */
-
-        /* Is there at least 64 bytes to copy? */
-        subs    r2, r2, #56
-        blt     copy_less_than_64       /* if n + 8 < 64. */
-
-        bl      two_word_aligned
-
-#endif
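For readers following the deleted control flow, the strategy of __aeabi_memcpy above boils down to: byte-copy until dst is word-aligned, optionally copy one more word so dst is doubleword-aligned, stream 64-byte blocks with LDRD/STRD pairs, then drain the remainder in 8-, 4-, and 1-byte steps. The C below is a minimal sketch of that strategy written for this page, not newlib code: the names sketch_aeabi_memcpy and copy8 are invented, the byte loops stand in for the conditional IT-block copies, and the fixed-size memcpy accesses stand in for the unaligned LDR/STR path that ARMv7-A hardware handles when __ARM_FEATURE_UNALIGNED is set.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Models one LDRD/STRD pair: an 8-byte load followed by an 8-byte
       store.  A fixed-size memcpy is the portable idiom for an
       unaligned doubleword access; compilers lower it to plain
       loads and stores.  */
    static void copy8(unsigned char *d, const unsigned char *s)
    {
        uint64_t tmp;
        memcpy(&tmp, s, sizeof tmp);
        memcpy(d, &tmp, sizeof tmp);
    }

    /* Sketch of __aeabi_memcpy's strategy.  Like the AEABI helper it
       returns nothing; the assembly merely preserves dst in r0.  */
    void sketch_aeabi_memcpy(void *dst, const void *src, size_t n)
    {
        unsigned char *d = dst;
        const unsigned char *s = src;
        uint32_t w;

        /* dst_not_word_aligned: up to 3 single bytes so that dst
           becomes word-aligned.  */
        while (n > 0 && ((uintptr_t)d & 3) != 0) {
            *d++ = *s++;
            n--;
        }

        /* two_word_aligned setup: one extra word so dst is 8-byte
           aligned and the doubleword accesses stay within one cache
           line; like the assembly, only bother when a 64-byte run
           follows.  */
        if (n >= 64 && ((uintptr_t)d & 4) != 0) {
            memcpy(&w, s, 4);
            memcpy(d, &w, 4);
            d += 4; s += 4; n -= 4;
        }

        /* Label 1: 64 bytes per iteration, eight LDRD/STRD pairs.  */
        while (n >= 64) {
            for (int i = 0; i < 64; i += 8)
                copy8(d + i, s + i);
            d += 64; s += 64; n -= 64;
        }

        /* copy_less_than_64, label 2: 8 bytes at a time.  */
        while (n >= 8) {
            copy8(d, s);
            d += 8; s += 8; n -= 8;
        }

        /* copy_less_than_8: one word, then copy_less_than_4: the
           last 0-3 bytes (the IT-block byte copies above).  */
        if (n >= 4) {
            memcpy(&w, s, 4);
            memcpy(d, &w, 4);
            d += 4; s += 4; n -= 4;
        }
        while (n > 0) {
            *d++ = *s++;
            n--;
        }
    }

As the deleted comments explain, aligning dst is worth more than aligning src because stores that cross a cache line are costlier than loads that do.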
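The three entry points are the memory-copying helpers defined by the ARM run-time ABI (RTABI). They take memcpy's argument order but are declared to return void; the 4- and 8-suffixed variants let the caller, normally the compiler, guarantee that both pointers are 4- or 8-byte aligned, which is why __aeabi_memcpy4 can branch straight to word_aligned and __aeabi_memcpy8 to two_word_aligned, skipping the alignment probes. A hypothetical hand-written caller, for illustration only:

    #include <stddef.h>
    #include <stdint.h>

    /* Prototypes per the ARM RTABI; note the void return type.  */
    void __aeabi_memcpy (void *dest, const void *src, size_t n);
    void __aeabi_memcpy4(void *dest, const void *src, size_t n); /* args 4-byte aligned */
    void __aeabi_memcpy8(void *dest, const void *src, size_t n); /* args 8-byte aligned */

    uint32_t dst_words[16], src_words[16];

    void copy_words(void)
    {
        /* uint32_t arrays are at least 4-byte aligned, so an EABI
           compiler may lower this copy to __aeabi_memcpy4; calling
           the plain variant would also be correct, just slower.  */
        __aeabi_memcpy4(dst_words, src_words, sizeof dst_words);
    }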