diff options
Diffstat (limited to 'newlib/libc/machine/x86_64/memcpy.S')
-rw-r--r-- | newlib/libc/machine/x86_64/memcpy.S | 113 |
1 files changed, 113 insertions, 0 deletions
/*
 * ====================================================
 * Copyright (C) 2007 by Ellips BV. All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

  #include "x86_64mach.h"

/*
 * void *memcpy (void *dst, const void *src, size_t n)
 *
 * Syntax: GAS, AT&T operand order (source, destination).  Register
 *         names are written without a '%' prefix, exactly as in the
 *         rest of this file; presumably x86_64mach.h maps them to the
 *         real register names -- confirm against that header.
 *
 * In:     rdi = dst, rsi = src, rdx = n
 * Out:    rax = dst
 * Clobb:  rcx, rdx, rsi, rdi, r8-r11, flags
 *         (r12-r14 are used by the bulk path but saved and restored)
 * Stack:  the bulk path pushes four quad words (32 bytes) and pops
 *         them before returning.  Nothing is stored below rsp, so the
 *         routine does not depend on a red zone being available.
 *
 * Assumes the direction flag is clear on entry so that the string
 * instructions copy forward (NOTE(review): guaranteed by the SysV ABI;
 * confirm for any other calling convention this file is built for).
 *
 * Strategy:
 *   n < 16         : plain byte copy.
 *   otherwise      : copy 8 - (dst & 7) bytes first when dst is not
 *                    8-byte aligned, then
 *     rest < 256   : quad-word copy followed by a 0..7 byte tail;
 *     rest >= 256  : 128 bytes per iteration using non-temporal
 *                    stores, followed by a 0..127 byte tail.
 */
  .global SYM (memcpy)
  SOTYPE_FUNCTION(memcpy)

SYM (memcpy):
  movq    rdi, rax                /* Store destination in return value */
  cmpq    $16, rdx
  jb      .Lbyte_copy             /* Unsigned compare: n is a size */

  movq    rdi, r8                 /* Align destination on quad word boundary */
  andq    $7, r8
  jz      .Lquadword_aligned
  movq    $8, rcx
  subq    r8, rcx                 /* rcx = 8 - (dst & 7) = 1..7 head bytes */
  subq    rcx, rdx                /* n >= 16 here, so this cannot underflow */
  rep     movsb

.Lquadword_aligned:
  cmpq    $256, rdx
  jb      .Lquadword_copy

  /*
   * The loop below uses rax as scratch and the callee-saved registers
   * r12-r14, so all four are preserved with real pushes.  Storing them
   * at negative offsets from rsp would only be safe where a red zone
   * is guaranteed.
   */
  pushq   rax
  pushq   r12
  pushq   r13
  pushq   r14

  movq    rdx, rcx                /* Copy 128 bytes at a time with minimum cache pollution */
  shrq    $7, rcx                 /* rcx = number of 128-byte blocks, >= 2 here */

  .p2align 4
.Lblock_loop:
  prefetchnta 768 (rsi)           /* Non-temporal prefetch of source data ahead */
  prefetchnta 832 (rsi)

  movq       (rsi), rax           /* Load the first 64 bytes ... */
  movq     8 (rsi), r8
  movq    16 (rsi), r9
  movq    24 (rsi), r10
  movq    32 (rsi), r11
  movq    40 (rsi), r12
  movq    48 (rsi), r13
  movq    56 (rsi), r14

  movntiq rax,    (rdi)           /* ... and store them non-temporally */
  movntiq r8 ,  8 (rdi)
  movntiq r9 , 16 (rdi)
  movntiq r10, 24 (rdi)
  movntiq r11, 32 (rdi)
  movntiq r12, 40 (rdi)
  movntiq r13, 48 (rdi)
  movntiq r14, 56 (rdi)

  movq     64 (rsi), rax          /* Same again for the second 64 bytes */
  movq     72 (rsi), r8
  movq     80 (rsi), r9
  movq     88 (rsi), r10
  movq     96 (rsi), r11
  movq    104 (rsi), r12
  movq    112 (rsi), r13
  movq    120 (rsi), r14

  movntiq rax,  64 (rdi)
  movntiq r8 ,  72 (rdi)
  movntiq r9 ,  80 (rdi)
  movntiq r10,  88 (rdi)
  movntiq r11,  96 (rdi)
  movntiq r12, 104 (rdi)
  movntiq r13, 112 (rdi)
  movntiq r14, 120 (rdi)

  leaq    128 (rsi), rsi          /* Advance both pointers; lea leaves flags alone */
  leaq    128 (rdi), rdi

  decq    rcx
  jnz     .Lblock_loop

  sfence                          /* Order the non-temporal stores before continuing */
  movq    rdx, rcx
  andq    $127, rcx               /* Bytes left over after the 128-byte blocks */
  rep     movsb

  popq    r14                     /* Restore in reverse order of the pushes */
  popq    r13
  popq    r12
  popq    rax                     /* rax = original destination again */
  ret


.Lbyte_copy:
  movq    rdx, rcx                /* Fewer than 16 bytes: copy them one by one */
  rep     movsb
  ret


.Lquadword_copy:
  movq    rdx, rcx
  shrq    $3, rcx                 /* Number of whole quad words */
  .p2align 4
  rep     movsq
  movq    rdx, rcx
  andq    $7, rcx
  rep     movsb                   /* Copy the remaining bytes */
  ret