/* newlib/libc/machine/sh/strncpy.S */

/* Copyright 2003 SuperH Ltd.  */

#include "asm.h"

#ifdef __SH5__
#if __SHMEDIA__

/* ZPAD_MASK (src, dst): turn a zero-byte match mask into a zero-padding mask.

   At every use in this file SRC is the result of mcmpeq.b against r63,
   i.e. a per-byte mask marking the zero bytes of a data quadword, and DST
   is subsequently ANDed with that data quadword to clear the non-zero
   bytes that follow the first zero byte.

   Subtracting 1 borrows through the bytes below the lowest marked byte,
   turning them into all-ones (bytes to keep), and leaves the bytes above
   it as they were.  When nothing is marked the result is all-ones, so the
   whole quadword is kept.  On big-endian targets the first byte in memory
   is the most significant one, so the mask is byte-reversed before the
   subtraction and reversed back afterwards.  */
#ifdef __LITTLE_ENDIAN__
#define ZPAD_MASK(src, dst) addi src, -1, dst
#else
#define ZPAD_MASK(src, dst) \
 byterev src, dst; addi dst, -1, dst; byterev dst, dst
#endif


/* We assume that the destination is not in the first 16 bytes of memory.
   A typical linker script will put the text section first, and as
   this code is longer than 16 bytes, you have to go out of your way
   to put data there.

   SHmedia strncpy.  Register use, as read from the code below:
     r2   destination (store address); never written, so it is still
          intact at every return
     r3   source (read address)
     r4   size in bytes
     r18  return address (every exit is ptabs r18 followed by blink)
     r20  store end address (r2 + r4)
     r5   store end address minus 8
     r22  store cursor; a quad is stored at r22 - 8 after r22 was advanced
     r6   r3 - r2, so that ldx.q r22, r6 reads the source bytes that
          belong at destination address r22
     r0   data quadword in flight
     r1   mcmpeq.b zero-byte match mask of r0
     r63  used as the zero operand and as the discarded blink link
   SHHI and SHLO are not defined in this file; presumably they are the
   endian-dependent shift macros from asm.h.  */
ENTRY(strncpy)
 pt L_small, tr2
 /* Fetch the first, possibly partial, source quadword and look for a
    zero byte in it.  r19 is the source address scaled to a bit count and
    is used as the shift amount that reduces the match mask to r7 (see the
    L_small notes: r7 nonzero means a relevant zero was found).  */
 ldlo.q r3, 0, r0
 shlli r3, 3, r19
 mcmpeq.b r0, r63, r1
 SHHI r1, r19, r7
 add r2, r4, r20
 addi r20, -8, r5
 /* If the size is greater than 8, we know we can read beyond the first
    (possibly partial) quadword, and write out a full first and last
    (possibly unaligned and/or overlapping) quadword.  */
 bge/u r2, r5, tr2 // L_small
 pt L_found0, tr0
 /* r22 = r2 + 8 is what L_found0 expects if the first quad already
    contains the terminator.  */
 addi r2, 8, r22
 bnei/u r7, 0, tr0  // L_found0
 /* r38 = (r3 & 7) - 8, the negated count of bytes from r3 up to the next
    8-byte boundary.  Subtracting it advances r22 to the destination
    address that corresponds to the next aligned source quadword.  */
 ori r3, -8, r38
 pt L_end_early, tr1
 sub r2, r38, r22
 /* Store the first quad with an unaligned store pair.  */
 stlo.q r2, 0, r0
 sthi.q r2, 7, r0
 sub r3, r2, r6
 ldx.q r22, r6, r0
 /* Before each iteration, check that we can store in full the next quad we
    are about to fetch.  */
 addi r5, -8, r36
 bgtu/u r22, r36, tr1 // L_end_early
 pt L_scan0, tr1
 /* Main loop: store the quad in r0 at the cursor, leave for L_found0 as
    soon as it contains a zero byte, otherwise fetch the next source quad.
    Repeats while r22 <= r36 (store end minus 16).  */
L_scan0:
 addi r22, 8, r22
 mcmpeq.b r0, r63, r1
 stlo.q r22, -8, r0
 bnei/u r1, 0, tr0   // L_found0
 sthi.q r22, -1, r0
 ldx.q r22, r6, r0
 bgeu/l r36, r22, tr1 // L_scan0
L_end:
 // At end; we might re-read a few bytes when we fetch the last quad.
 // branch mispredict, so load is ready now.
 mcmpeq.b r0, r63, r1
 addi r22, 8, r22
 bnei/u r1, 0, tr0   // L_found0
 /* No terminator in r0.  r7 = source end; the ldlo/ldhi pair combined by
    the or below fetches the final 8 source bytes as one unaligned quad.  */
 add r3, r4, r7
 ldlo.q r7, -8, r1
 ldhi.q r7, -1, r7
 ptabs r18, tr0
 stlo.q r22, -8, r0
 or r1, r7, r1
 mcmpeq.b r1, r63, r7
 sthi.q r22, -1, r0
 ZPAD_MASK (r7, r7)
 and r1, r7, r1 // mask out non-zero bytes after first zero byte
 /* The last quad is written so that it ends exactly at the store end
    address; it may overlap the quad stored just before.  */
 stlo.q r20, -8, r1
 sthi.q r20, -1, r1
 blink tr0, r63

L_end_early:
 /* Check if we can store the current quad in full.  */
 pt L_end, tr1
 add r3, r4, r7
 bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but gap is short.
 /* If not, that means we can just proceed to process the last quad.
    Two pipeline stalls are unavoidable, as we don't have enough ILP.  */
 ldlo.q r7, -8, r1
 ldhi.q r7, -1, r7
 ptabs r18, tr0
 or r1, r7, r1
 mcmpeq.b r1, r63, r7
 ZPAD_MASK (r7, r7)
 and r1, r7, r1 // mask out non-zero bytes after first zero byte
 stlo.q r20, -8, r1
 sthi.q r20, -1, r1
 blink tr0, r63

L_found0:
 // r0: string to store, not yet zero-padding normalized.
 // r1: result of mcmpeq.b r0, r63, r1.
 // r22: store address plus 8.  I.e. address where zero padding beyond the
 //      string in r0 goes.
 // r20: store end address.
 // r5: store end address minus 8.
 pt L_write0_multiquad, tr0
 ZPAD_MASK (r1, r1)
 and r0, r1, r0 // mask out non-zero bytes after first zero byte
 stlo.q r22, -8, r0
 sthi.q r22, -1, r0
 andi r22, -8, r1 // Check if zeros to write fit in one quad word.
 bgtu/l r5, r1, tr0 // L_write0_multiquad
 ptabs r18, tr1
 /* r1 = byte distance from r22 to the store end.  It is scaled by 4 and
    applied twice, which gives a total shift of 8 bits per byte.  */
 sub r20, r22, r1
 shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is
 SHLO r0, r1, r0 // handled correctly.
 SHLO r0, r1, r0
 sthi.q r20, -1, r0
 blink tr1, r63

L_write0_multiquad:
 /* Zero padding that spans more than one quadword: r63 is stored with a
    low-part store at r22 and a high-part store ending at the store end,
    then r1 (r22 rounded down to 8, plus 8) walks the aligned quads in
    between.  */
 pt L_write0_loop, tr0
 ptabs r18, tr1
 stlo.q r22, 0, r63
 sthi.q r20, -1, r63
 addi r1, 8, r1
 bgeu/l r5, r1, tr0 // L_write0_loop
 blink tr1, r63

 /* Store aligned zero quads while r1 <= r5 (store end minus 8).  */
L_write0_loop:
 st.q r1, 0 ,r63
 addi r1, 8, r1
 bgeu/l r5, r1, tr0 // L_write0_loop
 blink tr1, r63

L_small:
 // r0: string to store, not yet zero-padding normalized.
 // r1: result of mcmpeq.b r0, r63, r1.
 // r7: nonzero indicates relevant zero found in r0.
 // r2: store address.
 // r3: read address.
 // r4: size, max 8
 // r20: store end address.
 // r5: store end address minus 8.
 pt L_nohi, tr0
 pt L_small_storelong, tr1
 ptabs r18, tr2
 /* r23 = -size.  The high part of the source quad is only fetched when no
    zero byte was found and the size exceeds the byte count already
    obtained by the entry ldlo.q; that count is the negation of the r7
    value computed by the ori below.  */
 sub r63, r4, r23
 bnei/u r7, 0, tr0  // L_nohi
 ori r3, -8, r7
 bge/l r23, r7, tr0 // L_nohi
 ldhi.q r3, 7, r1
 or r0, r1, r0
 mcmpeq.b r0, r63, r1
L_nohi:
 ZPAD_MASK (r1, r1)
 and r0, r1, r0
 /* Sizes 4..8 are written with two 4-byte stores in L_small_storelong;
    sizes 0..3 fall through to the byte stores below.  */
 movi 4, r19
 bge/u r4, r19, tr1 // L_small_storelong

 pt L_small_end, tr0
 /* On big-endian targets reverse the bytes first, so that the shlri by 8
    steps below bring the bytes into the low 8 bits in memory order.  */
#ifndef __LITTLE_ENDIAN__
 byterev r0, r0
#endif
 beqi/u r4, 0, tr0 // L_small_end
 st.b r2, 0, r0
 beqi/u r4, 1, tr0 // L_small_end
 shlri r0, 8, r0
 st.b r2, 1, r0
 beqi/u r4, 2, tr0 // L_small_end
 shlri r0, 8, r0
 st.b r2, 2, r0
L_small_end:
 blink tr2, r63

L_small_storelong:
 /* r7 = -size * 8 is the bit count used to derive r1 from r0.  r0 is
    stored as the 4 bytes at the store address and r1 as the 4 bytes
    ending at the store end address; the two stores overlap when the size
    is below 8.  */
 shlli r23, 3, r7
 SHHI r0, r7, r1
#ifdef __LITTLE_ENDIAN__
 shlri r1, 32, r1
#else
 shlri r0, 32, r0
#endif
 stlo.l r2, 0, r0
 sthi.l r2, 3, r0
 stlo.l r20, -4, r1
 sthi.l r20, -1, r1
 blink tr2, r63

#else /* SHcompact */

/* This code is optimized for size.  Instruction selection is SH5 specific.
   SH4 should use a different version.

   Byte-at-a-time strncpy.  Register use, as read from the code below:
     r2  destination; only read, so it is unchanged at the rts
     r3  source pointer, advanced by the post-increment load
     r4  size on entry, then the address of the last byte to write
     r5  write pointer, started one below r2 and incremented before use
     r6  constant 0
     r1  byte most recently loaded
     T   at the top of the loop: set once the byte just stored was zero
   The bt/s and bf/s branches have a delay slot: the instruction written
   directly after them executes whether or not the branch is taken.  */
ENTRY(strncpy)
 mov #0, r6
 cmp/eq r4, r6
 /* Size 0: nothing to write.  */
 bt return
 mov r2, r5
 add #-1, r5
 /* r4 = r2 - 1 + size, the address of the last byte to write.  */
 add r5, r4
 /* T is clear on first entry, because the bt above fell through.  */
loop:
 /* Once a zero byte has been stored, skip the load: r1 stays 0 and the
    rest of the buffer is padded with it.  The pointer increment sits in
    the delay slot, so it happens on both paths.  */
 bt/s found0
 add #1, r5
 mov.b @r3+, r1
found0:
 /* T = this is the last byte to write.  The mov.b store does not change
    T, so bf/s still sees that result.  */
 cmp/eq r5,r4
 mov.b r1, @r5
 bf/s loop
 /* Delay slot: T = (r1 == 0), consumed by the bt/s at the top of the
    loop.  */
 cmp/eq r1, r6
return:
 rts
 nop
 
#endif /* SHcompact */
#endif /* __SH5__ */