1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
#include "setarch.h"
#include "defines.h"
#ifdef __H8300SX__
.global _memcpy
_memcpy:
stm.l er4-er6,@-er7
; Set up source and destination pointers for movmd.
mov.l er0,er6
mov.l er1,er5
; See whether the copy is long enough to use the movmd.l code.
; Although the code can handle anything longer than 6 bytes,
; it can be more expensive than movmd.b for small moves.
; It's better to use a higher threshold to account for this.
;
; Note that the exact overhead of the movmd.l checks depends on
; the alignments of the length and pointers. They are faster when
; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
; are 0. This threshold is a compromise between the various cases.
cmp #16,LEN(r2)
blo simple
; movmd.l only works for even addresses. If one of the addresses
; is odd and the other is not, fall back on a simple move.
bld #0,r5l
bxor #0,r6l
bcs simple
; Make the addresses even.
bld #0,r5l
bcc word_aligned
mov.b @er5+,@er6+
sub #1,LEN(r2)
word_aligned:
; See if copying one word would make the first operand longword
; aligned. Although this is only really worthwhile if it aligns
; the second operand as well, it's no worse if doesn't, so it
; hardly seems worth the overhead of a "band" check.
bld #1,r6l
bcc fast_copy
mov.w @er5+,@er6+
sub #2,LEN(r2)
fast_copy:
; Set (e)r4 to the number of longwords to copy.
mov LEN(r2),LEN(r4)
shlr #2,LEN(r4)
#ifdef __NORMAL_MODE__
; 16-bit pointers and size_ts: one movmd.l is enough. This code
; is never reached with r4 == 0.
movmd.l
and.w #3,r2
simple:
mov.w r2,r4
beq quit
movmd.b
quit:
rts/l er4-er6
#else
; Skip the first iteration if the number of longwords is divisible
; by 0x10000.
mov.w r4,r4
beq fast_loop_next
; This loop copies r4 (!= 0) longwords the first time round and 65536
; longwords on each iteration after that.
fast_loop:
movmd.l
fast_loop_next:
sub.w #1,e4
bhs fast_loop
; Mop up any left-over bytes. We could just fall through to the
; simple code after the "and" but the version below is quicker
; and only takes 10 more bytes.
and.w #3,r2
beq quit
mov.w r2,r4
movmd.b
quit:
rts/l er4-er6
simple:
; Simple bytewise copy. We need to handle all lengths, including zero.
mov.w r2,r4
beq simple_loop_next
simple_loop:
movmd.b
simple_loop_next:
sub.w #1,e2
bhs simple_loop
rts/l er4-er6
#endif
#else
.global _memcpy
_memcpy:
; MOVP @(2/4,r7),A0P ; dst
; MOVP @(4/8,r7),A1P ; src
; MOVP @(6/12,r7),A2P ; len
MOVP A0P,A3P ; keep copy of final dst
ADDP A2P,A0P ; point to end of dst
CMPP A0P,A3P ; see if anything to do
beq quit
ADDP A2P,A1P ; point to end of src
; lets see if we can do this in words
or A0L,A2L ; or in the dst address
or A3L,A2L ; or the length
or A1L,A2L ; or the src address
btst #0,A2L ; see if the lsb is zero
bne byteloop
wordloop:
#ifdef __NORMAL_MODE__
sub #2,A1P
#else
subs #2,A1P ; point to word
#endif
mov.w @A1P,A2 ; get word
mov.w A2,@-A0P ; save word
CMPP A0P,A3P ; at the front again ?
bne wordloop
rts
byteloop:
#ifdef __NORMAL_MODE__
sub #1,A1P
#else
subs #1,A1P ; point to byte
#endif
mov.b @A1P,A2L ; get byte
mov.b A2L,@-A0P ; save byte
CMPP A0P,A3P ; at the front again ?
bne byteloop
; return with A0 pointing to dst
quit: rts
#endif
|