Welcome to mirror list, hosted at ThFree Co, Russian Federation.

memcpy.S « x86_64 « machine « libc « newlib - cygwin.com/git/newlib-cygwin.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
blob: 3178dfae27c93948210eabc780dc21b695d63df2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/*
 * ====================================================
 * Copyright (C) 2007 by Ellips BV. All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

  #include "x86_64mach.h"

  .global SYM (memcpy)
  SOTYPE_FUNCTION(memcpy)

/*
 * void *memcpy (void *dst, const void *src, size_t n)
 *
 * Syntax: GNU as, AT&T operand order (source, destination).  Register
 * names and SYM () are preprocessor macros from x86_64mach.h.
 *
 * In:    rdi = dst, rsi = src, rdx = n
 * Out:   rax = dst as passed in
 * Clobb: rcx, rdx, rsi, rdi, r8-r11, flags
 *        (r12-r14 are used on the bulk path but saved and restored)
 * Stack: 32 bytes pushed on the bulk path only; nothing is stored
 *        below the stack pointer, so no red zone is required.
 *
 * Strategy:
 *   n < 16             rep movsb
 *   otherwise          byte-copy until dst is 8-byte aligned, then
 *     rest < 256       rep movsq, then rep movsb for the last 0-7 bytes
 *     rest >= 256      128-byte blocks with non-temporal stores, then
 *                      rep movsb for the last 0-127 bytes
 *
 * The direction flag is never written here; the string instructions
 * copy forward only if it is clear on entry (assumed from the calling
 * convention - confirm for the target ABI).
 */
SYM (memcpy):
  movq    rdi, rax                /* Return value is the original destination */
  cmpq    $16, rdx
  jb      .Lmemcpy_byte_copy      /* Short copy: not worth aligning */

  movq    rdi, r8                 /* Align destination on quad word boundary */
  andq    $7, r8                  /* r8 = dst & 7 */
  jz      .Lmemcpy_quadword_aligned
  movq    $8, rcx
  subq    r8, rcx                 /* rcx = bytes needed to reach alignment (1-7) */
  subq    rcx, rdx                /* rdx = bytes left after the alignment copy */
  rep     movsb

.Lmemcpy_quadword_aligned:
  cmpq    $256, rdx
  jb      .Lmemcpy_quadword_copy

  /*
   * The bulk loop uses rax and r12-r14 as data registers.  Save them
   * with real pushes instead of storing below the stack pointer, so the
   * saved values cannot be overwritten by anything that runs on this
   * stack asynchronously (interrupts, signal handlers) on targets
   * without a red zone.
   */
  pushq   rax
  pushq   r12
  pushq   r13
  pushq   r14

  movq    rdx, rcx                /* Copy 128 bytes at a time with minimum cache pollution */
  shrq    $7, rcx                 /* rcx = number of 128-byte blocks, >= 2 here */

  .p2align 4
.Lmemcpy_block_loop:
  prefetchnta   768 (rsi)         /* Prefetch source 768 and 832 bytes ahead; */
  prefetchnta   832 (rsi)         /* non-temporal hint, may point past the buffer */

  movq       (rsi), rax           /* Load first 64 bytes of the block */
  movq     8 (rsi), r8
  movq    16 (rsi), r9
  movq    24 (rsi), r10
  movq    32 (rsi), r11
  movq    40 (rsi), r12
  movq    48 (rsi), r13
  movq    56 (rsi), r14

  movntiq rax,    (rdi)           /* Store them without allocating cache lines */
  movntiq r8 ,  8 (rdi)
  movntiq r9 , 16 (rdi)
  movntiq r10, 24 (rdi)
  movntiq r11, 32 (rdi)
  movntiq r12, 40 (rdi)
  movntiq r13, 48 (rdi)
  movntiq r14, 56 (rdi)

  movq     64 (rsi), rax          /* Load second 64 bytes of the block */
  movq     72 (rsi), r8
  movq     80 (rsi), r9
  movq     88 (rsi), r10
  movq     96 (rsi), r11
  movq    104 (rsi), r12
  movq    112 (rsi), r13
  movq    120 (rsi), r14

  movntiq rax,  64 (rdi)
  movntiq r8 ,  72 (rdi)
  movntiq r9 ,  80 (rdi)
  movntiq r10,  88 (rdi)
  movntiq r11,  96 (rdi)
  movntiq r12, 104 (rdi)
  movntiq r13, 112 (rdi)
  movntiq r14, 120 (rdi)

  leaq    128 (rsi), rsi          /* Advance both pointers; lea leaves flags alone */
  leaq    128 (rdi), rdi

  dec     rcx
  jnz     .Lmemcpy_block_loop

  sfence                          /* Order the non-temporal stores before later stores */
  movq    rdx, rcx
  andq    $127, rcx               /* rcx = bytes left over after the 128-byte blocks */
  rep     movsb

  popq    r14                     /* Restore in reverse order of the pushes */
  popq    r13
  popq    r12
  popq    rax                     /* rax = original destination again */
  ret


.Lmemcpy_byte_copy:
  movq    rdx, rcx                /* rcx = n (< 16) */
  rep     movsb
  ret


.Lmemcpy_quadword_copy:
  movq    rdx, rcx
  shrq    $3, rcx                 /* rcx = number of whole quad words */
  rep     movsq
  movq    rdx, rcx
  andq    $7, rcx                 /* rcx = n mod 8 */
  rep     movsb                   /* Copy the remaining bytes */
  ret