Diffstat (limited to 'newlib/libc/machine/arm/memcpy-armv7m.S')
-rw-r--r-- | newlib/libc/machine/arm/memcpy-armv7m.S | 321
1 files changed, 321 insertions, 0 deletions
diff --git a/newlib/libc/machine/arm/memcpy-armv7m.S b/newlib/libc/machine/arm/memcpy-armv7m.S
new file mode 100644
index 000000000..8a70c7d73
--- /dev/null
+++ b/newlib/libc/machine/arm/memcpy-armv7m.S
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
+   unaligned access.
+
+   If compiled with GCC, this file should be enclosed within the following
+   pre-processing check:
+   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
+
+   Prototype: void *memcpy (void *dst, const void *src, size_t count);
+
+   The job is done in 5 steps:
+   Step 1: Align the src/dst pointers; if both cannot be aligned, fall
+           back to a misaligned copy.
+   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
+   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
+   Step 4: Copy word by word.
+   Step 5: Copy byte by byte.
+
+   Tunable options:
+     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes.  Defaults to 64.
+     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes.  Defaults to 16.
+ */
+#ifndef __OPT_BIG_BLOCK_SIZE
+#define __OPT_BIG_BLOCK_SIZE (4 * 16)
+#endif
+
+#ifndef __OPT_MID_BLOCK_SIZE
+#define __OPT_MID_BLOCK_SIZE (4 * 4)
+#endif
+
+#if __OPT_BIG_BLOCK_SIZE == 16
+#define BEGIN_UNROLL_BIG_BLOCK \
+        .irp offset, 0,4,8,12
+#elif __OPT_BIG_BLOCK_SIZE == 32
+#define BEGIN_UNROLL_BIG_BLOCK \
+        .irp offset, 0,4,8,12,16,20,24,28
+#elif __OPT_BIG_BLOCK_SIZE == 64
+#define BEGIN_UNROLL_BIG_BLOCK \
+        .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
+#else
+#error "Illegal __OPT_BIG_BLOCK_SIZE"
+#endif
+
+#if __OPT_MID_BLOCK_SIZE == 8
+#define BEGIN_UNROLL_MID_BLOCK \
+        .irp offset, 0,4
+#elif __OPT_MID_BLOCK_SIZE == 16
+#define BEGIN_UNROLL_MID_BLOCK \
+        .irp offset, 0,4,8,12
+#else
+#error "Illegal __OPT_MID_BLOCK_SIZE"
+#endif
+
+#define END_UNROLL .endr
+
+        .syntax unified
+        .text
+        .align  2
+        .global memcpy
+        .thumb
+        .thumb_func
+        .type   memcpy, %function
+memcpy:
+        @ r0: dst
+        @ r1: src
+        @ r2: len
+#ifdef __ARM_FEATURE_UNALIGNED
+        /* When unaligned access is supported, ip is not otherwise used in
+           the function body, so dst can be saved there instead of on the
+           stack.  */
+        mov     ip, r0
+#else
+        push    {r0}
+#endif
+        orr     r3, r1, r0
+        ands    r3, r3, #3
+        bne     .Lmisaligned_copy
+
+.Lbig_block:
+        subs    r2, __OPT_BIG_BLOCK_SIZE
+        blo     .Lmid_block
+
+        /* Kernel loop for big block copy */
+        .align 2
+.Lbig_block_loop:
+        BEGIN_UNROLL_BIG_BLOCK
+#ifdef __ARM_ARCH_7EM__
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        END_UNROLL
+#else /* __ARM_ARCH_7M__ */
+        ldr     r3, [r1, \offset]
+        str     r3, [r0, \offset]
+        END_UNROLL
+        adds    r0, __OPT_BIG_BLOCK_SIZE
+        adds    r1, __OPT_BIG_BLOCK_SIZE
+#endif
+        subs    r2, __OPT_BIG_BLOCK_SIZE
+        bhs     .Lbig_block_loop
+
+.Lmid_block:
+        adds    r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
+        blo     .Lcopy_word_by_word
+
+        /* Kernel loop for mid-block copy */
+        .align 2
+.Lmid_block_loop:
+        BEGIN_UNROLL_MID_BLOCK
+#ifdef __ARM_ARCH_7EM__
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        END_UNROLL
+#else /* __ARM_ARCH_7M__ */
+        ldr     r3, [r1, \offset]
+        str     r3, [r0, \offset]
+        END_UNROLL
+        adds    r0, __OPT_MID_BLOCK_SIZE
+        adds    r1, __OPT_MID_BLOCK_SIZE
+#endif
+        subs    r2, __OPT_MID_BLOCK_SIZE
+        bhs     .Lmid_block_loop
+
+.Lcopy_word_by_word:
+        adds    r2, __OPT_MID_BLOCK_SIZE - 4
+        blo     .Lcopy_less_than_4
+
+        /* Kernel loop for small block copy */
+        .align 2
+.Lcopy_word_by_word_loop:
+        ldr     r3, [r1], #4
+        str     r3, [r0], #4
+        subs    r2, #4
+        bhs     .Lcopy_word_by_word_loop
+
+.Lcopy_less_than_4:
+        adds    r2, #4
+        beq     .Ldone
+
+        lsls    r2, r2, #31
+        itt     ne
+        ldrbne  r3, [r1], #1
+        strbne  r3, [r0], #1
+
+        bcc     .Ldone
+#ifdef __ARM_FEATURE_UNALIGNED
+        ldrh    r3, [r1]
+        strh    r3, [r0]
+#else
+        ldrb    r3, [r1]
+        strb    r3, [r0]
+        ldrb    r3, [r1, #1]
+        strb    r3, [r0, #1]
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+.Ldone:
+#ifdef __ARM_FEATURE_UNALIGNED
+        mov     r0, ip
+#else
+        pop     {r0}
+#endif
+        bx      lr
+
+        .align 2
+.Lmisaligned_copy:
+#ifdef __ARM_FEATURE_UNALIGNED
+        /* Define label Ldst_aligned as an alias for Lbig_block: once the
+           destination has been aligned, control goes straight to the
+           aligned copy.  */
+#define Ldst_aligned Lbig_block
+
+        /* Copy word by word using LDR when alignment can be handled in
+           hardware, i.e., SCTLR.A is clear, so LDR and STR support
+           unaligned access.  */
+
+        cmp     r2, #8
+        blo     .Lbyte_copy
+
+        /* If src is aligned, just go to the big block loop.  */
+        lsls    r3, r1, #30
+        beq     .Ldst_aligned
+#else
+        /* If len < 12, the misalignment adjustment has more overhead than
+           a plain byte-to-byte copy.  Also, len must be >= 8 for the code
+           below to work correctly.  */
+        cmp     r2, #12
+        blo     .Lbyte_copy
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+        /* Align dst only, without trying to align src: handling an aligned
+           src with a misaligned dst costs more than the other way round.
+           In the worst case (src initially aligned) up to 3 additional
+           bytes are copied byte-by-byte, which is acceptable.  */
+
+        ands    r3, r0, #3
+        beq     .Ldst_aligned
+
+        rsb     r3, #4
+        subs    r2, r3
+
+        lsls    r3, r3, #31
+        itt     ne
+        ldrbne  r3, [r1], #1
+        strbne  r3, [r0], #1
+
+        bcc     .Ldst_aligned
+
+#ifdef __ARM_FEATURE_UNALIGNED
+        ldrh    r3, [r1], #2
+        strh    r3, [r0], #2
+        b       .Ldst_aligned
+#else
+        ldrb    r3, [r1], #1
+        strb    r3, [r0], #1
+        ldrb    r3, [r1], #1
+        strb    r3, [r0], #1
+        /* Now dst is aligned.  */
+.Ldst_aligned:
+        /* If r1 is aligned too, then r0 and r1 had the same misalignment
+           and both are aligned now.  Go to the aligned copy.  */
+        ands    r3, r1, #3
+        beq     .Lbig_block
+
+        /* dst is aligned, but src isn't.  Misaligned copy.  */
+
+        push    {r4, r5}
+        subs    r2, #4
+
+        /* Move r1 back by the misaligned bytes so that it is aligned.
+           Since r1 must be restored to the unaligned address after the
+           loop, keep the offset in ip and subtract it from r1 afterwards.  */
+        subs    r1, r3
+        rsb     ip, r3, #4
+
+        /* Pre-load one word.  */
+        ldr     r4, [r1], #4
+
+        cmp     r3, #2
+        beq     .Lmisaligned_copy_2_2
+        cmp     r3, #3
+        beq     .Lmisaligned_copy_3_1
+
+        .macro mis_src_copy shift
+1:
+        lsrs    r4, r4, \shift
+        ldr     r3, [r1], #4
+        lsls    r5, r3, 32-\shift
+        orr     r4, r4, r5
+        str     r4, [r0], #4
+        mov     r4, r3
+        subs    r2, #4
+        bhs     1b
+        .endm
+
+.Lmisaligned_copy_1_3:
+        mis_src_copy shift=8
+        b       .Lsrc_misaligned_tail
+
+.Lmisaligned_copy_3_1:
+        mis_src_copy shift=24
+        b       .Lsrc_misaligned_tail
+
+.Lmisaligned_copy_2_2:
+        /* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
+        mis_src_copy shift=16
+
+.Lsrc_misaligned_tail:
+        adds    r2, #4
+        subs    r1, ip
+        pop     {r4, r5}
+
+#endif /* __ARM_FEATURE_UNALIGNED */
+
+.Lbyte_copy:
+        subs    r2, #4
+        blo     .Lcopy_less_than_4
+
+.Lbyte_copy_loop:
+        subs    r2, #1
+        ldrb    r3, [r1], #1
+        strb    r3, [r0], #1
+        bhs     .Lbyte_copy_loop
+
+        ldrb    r3, [r1]
+        strb    r3, [r0]
+        ldrb    r3, [r1, #1]
+        strb    r3, [r0, #1]
+        ldrb    r3, [r1, #2]
+        strb    r3, [r0, #2]
+
+#ifdef __ARM_FEATURE_UNALIGNED
+        mov     r0, ip
+#else
+        pop     {r0}
+#endif
+        bx      lr
+
+        .size   memcpy, .-memcpy
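
Notes on the implementation (illustrative sketches, not part of the commit).

The five-step strategy described in the header comment maps onto straightforward C. The sketch below is a rough equivalent (staged_memcpy is a hypothetical name); it assumes step 1 has already succeeded, i.e. both pointers are word-aligned, and uses the default block sizes. Where C needs inner for loops, the assembly unrolls them at assembly time with BEGIN_UNROLL_BIG_BLOCK / BEGIN_UNROLL_MID_BLOCK.

    #include <stddef.h>
    #include <stdint.h>

    #define BIG_BLOCK_SIZE 64  /* mirrors the __OPT_BIG_BLOCK_SIZE default */
    #define MID_BLOCK_SIZE 16  /* mirrors the __OPT_MID_BLOCK_SIZE default */

    /* Assumes dst and src are already word-aligned (step 1 done).  */
    void *staged_memcpy(void *dst, const void *src, size_t len)
    {
        uint8_t *d = dst;
        const uint8_t *s = src;

        /* Step 2: big blocks (the assembly unrolls this inner loop 16x).  */
        while (len >= BIG_BLOCK_SIZE) {
            for (size_t i = 0; i < BIG_BLOCK_SIZE; i += 4)
                *(uint32_t *)(d + i) = *(const uint32_t *)(s + i);
            d += BIG_BLOCK_SIZE;
            s += BIG_BLOCK_SIZE;
            len -= BIG_BLOCK_SIZE;
        }

        /* Step 3: mid blocks (unrolled 4x in the assembly).  */
        while (len >= MID_BLOCK_SIZE) {
            for (size_t i = 0; i < MID_BLOCK_SIZE; i += 4)
                *(uint32_t *)(d + i) = *(const uint32_t *)(s + i);
            d += MID_BLOCK_SIZE;
            s += MID_BLOCK_SIZE;
            len -= MID_BLOCK_SIZE;
        }

        /* Step 4: word by word.  */
        for (; len >= 4; len -= 4) {
            *(uint32_t *)d = *(const uint32_t *)s;
            d += 4;
            s += 4;
        }

        /* Step 5: remaining 0-3 bytes.  */
        while (len--)
            *d++ = *s++;

        return dst;
    }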
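The prologue at .Lmisaligned_copy aligns dst by copying the 1-3 leading bytes, with rsb r3, #4 computing how many. A rough C equivalent, using the hypothetical helper name align_dst:

    #include <stddef.h>
    #include <stdint.h>

    /* Copy the 0-3 bytes needed to make dst word-aligned, leaving src's
       alignment alone, and return how many bytes were consumed.  Mirrors
       the "rsb r3, #4" prologue (which only runs when dst is misaligned,
       so there it is always 1-3 bytes).  */
    static size_t align_dst(uint8_t **d, const uint8_t **s)
    {
        size_t head = (4 - ((uintptr_t)*d & 3)) & 3;
        for (size_t i = 0; i < head; i++)
            *(*d)++ = *(*s)++;
        return head;
    }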
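The 0-3 byte tail at .Lcopy_less_than_4 avoids a loop by testing the two low bits of the remaining length, and it gets both tests from the single lsls r2, r2, #31: bit 0 of len ends up in the shift result (tested with ne), bit 1 in the carry flag (tested with bcc). In C terms (copy_tail is a hypothetical name):

    #include <stddef.h>
    #include <stdint.h>

    /* Copy the final len (0..3) bytes without a loop.  */
    static void copy_tail(uint8_t *d, const uint8_t *s, size_t len)
    {
        if (len & 1)       /* the ldrbne/strbne pair under "itt ne" */
            *d++ = *s++;
        if (len & 2) {     /* reached when "bcc .Ldone" is not taken */
            d[0] = s[0];   /* one ldrh/strh with __ARM_FEATURE_UNALIGNED, */
            d[1] = s[1];   /* two byte copies otherwise */
        }
    }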
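The mis_src_copy macro is the classic shift-and-or technique for a word-aligned dst with a misaligned src: round src down to an aligned address, read whole words, and stitch each pair of neighbouring words together. A little-endian C sketch (misaligned_words is a hypothetical name; mis is the src misalignment of 1-3 bytes, so the shift is 8, 16 or 24, matching the three mis_src_copy instantiations):

    #include <stddef.h>
    #include <stdint.h>

    /* dst is word-aligned; src is mis bytes past a word boundary.  Like
       the assembly (which does "subs r1, r3" first), this reads from src
       rounded *down*, so it touches up to 3 bytes before src.  */
    static void misaligned_words(uint32_t *dst, const uint8_t *src,
                                 size_t nwords, unsigned mis)
    {
        const uint32_t *w = (const uint32_t *)(src - mis); /* aligned down */
        unsigned shift = 8 * mis;        /* 8, 16 or 24 */
        uint32_t cur = *w++;             /* the "pre-load one word" */

        while (nwords--) {
            uint32_t next = *w++;        /* ldr r3, [r1], #4 */
            /* lsrs/lsls/orr: high bytes of cur joined with low bytes of
               next (little-endian, as on Cortex-M).  */
            *dst++ = (cur >> shift) | (next << (32 - shift));
            cur = next;
        }
    }

For the 2-byte misalignment case the comment in the source notes that a single ldr plus shifts is still faster than two ldrh loads, which is why all three cases share this one macro.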