blob: 507054d847eaa99cf61aa8c25c7afe5183339f87 [file] [log] [blame]
Stefan Roese27c3e952021-09-02 17:00:17 +02001/* SPDX-License-Identifier: MIT */
2/*
3 * memcpy, memmove - copy memory area (overlapping buffers are handled)
4 *
5 * Copyright (c) 2012-2020, Arm Limited.
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, unaligned accesses.
11 *
12 */
13
14#include "asmdefs.h"
15
/* Register aliases used by the copy routine in this file.
   Every alias maps to one of x0-x17 (or the 32-bit w view of one).  */
/* Arguments, in register order: destination, source, byte count.  */
16#define dstin x0
17#define src x1
18#define count x2
/* Working pointers.  dst is the running destination pointer of the large
   forward copy; srcend and dstend are set to src + count and
   dstin + count on entry.  */
19#define dst x3
20#define srcend x4
21#define dstend x5
/* Data registers.  Each letter names an (_l, _h) pair that is moved with
   ldp/stp; the _lw names are the 32-bit views of the same registers, used
   for the 4-byte and 1-byte accesses.  */
22#define A_l x6
23#define A_lw w6
24#define A_h x7
25#define B_l x8
26#define B_lw w8
27#define B_h x9
28#define C_l x10
29#define C_lw w10
30#define C_h x11
31#define D_l x12
32#define D_h x13
33#define E_l x14
34#define E_h x15
35#define F_l x16
36#define F_h x17
/* G and H are not extra registers: they reuse count, dst, src and srcend.
   Loading into G or H therefore overwrites that pointer/counter, so they
   may only be used once the aliased value is no longer needed.  */
36#define G_l count
37#define G_h dst
38#define H_l src
39#define H_h srcend
/* Scratch register.  It shares x14 with E_l, so tmp1 and E_l must not be
   live at the same time.  */
40#define tmp1 x14
42
43/* This implementation handles overlaps and supports both memcpy and memmove
44 from a single entry point. It uses unaligned accesses and branchless
45 sequences to keep the code small, simple and improve performance.
46
47 Copies are split into 3 main cases: small copies of up to 32 bytes, medium
48 copies of up to 128 bytes, and large copies. The overhead of the overlap
49 check is negligible since it is only required for large copies.
50
51 Large copies use a software pipelined loop processing 64 bytes per iteration.
52 The destination pointer is 16-byte aligned to minimize unaligned accesses.
53 The loop tail is handled by always copying 64 bytes from the end.
54*/
55
/* memcpy / memmove: single shared entry point.
   In:  dstin (x0) = destination, src (x1) = source, count (x2) = bytes.
   Out: x0 is never written by this routine, so it still holds the
        destination pointer at every ret.
   ENTRY_ALIAS, ENTRY, PTR_ARG and SIZE_ARG are macros from asmdefs.h and
   are not visible here.  NOTE(review): presumably they emit the two global
   symbols and normalise the pointer/size arguments - confirm in asmdefs.h.
   This section dispatches on count and handles copies of 0..32 bytes.  In
   each small-copy path every load is issued before the first store, and the
   head and tail accesses are allowed to overlap each other.  */
56ENTRY_ALIAS (memmove)
57ENTRY (memcpy)
58 PTR_ARG (0)
59 PTR_ARG (1)
60 SIZE_ARG (2)
 /* srcend/dstend point one past the last source/destination byte. */
61 add srcend, src, count
62 add dstend, dstin, count
 /* Unsigned dispatch: count > 128 -> large copy, count > 32 -> medium. */
63 cmp count, 128
64 b.hi L(copy_long)
65 cmp count, 32
66 b.hi L(copy32_128)
67
68 /* Small copies: 0..32 bytes. */
69 cmp count, 16
70 b.lo L(copy16)
 /* 16..32 bytes: copy the first 16 and the last 16 bytes; the two
    16-byte windows overlap whenever count < 32. */
71 ldp A_l, A_h, [src]
72 ldp D_l, D_h, [srcend, -16]
73 stp A_l, A_h, [dstin]
74 stp D_l, D_h, [dstend, -16]
75 ret
76
77 /* Copy 8-15 bytes. */
78L(copy16):
 /* count < 16 here, so bit 3 clear means count < 8. */
79 tbz count, 3, L(copy8)
 /* First 8 and last 8 bytes; the windows overlap when count < 16. */
80 ldr A_l, [src]
81 ldr A_h, [srcend, -8]
82 str A_l, [dstin]
83 str A_h, [dstend, -8]
84 ret
85
86 .p2align 3
87 /* Copy 4-7 bytes. */
88L(copy8):
 /* count < 8 here, so bit 2 clear means count < 4. */
89 tbz count, 2, L(copy4)
 /* First 4 and last 4 bytes; the windows overlap when count < 8. */
90 ldr A_lw, [src]
91 ldr B_lw, [srcend, -4]
92 str A_lw, [dstin]
93 str B_lw, [dstend, -4]
94 ret
95
96 /* Copy 0..3 bytes using a branchless sequence. */
97L(copy4):
98 cbz count, L(copy0)
 /* tmp1 = count / 2.  Bytes at offsets 0, count / 2 and count - 1 are
    copied: count 1 -> 0,0,0; count 2 -> 0,1,1; count 3 -> 0,1,2. */
99 lsr tmp1, count, 1
100 ldrb A_lw, [src]
101 ldrb C_lw, [srcend, -1]
102 ldrb B_lw, [src, tmp1]
103 strb A_lw, [dstin]
104 strb B_lw, [dstin, tmp1]
105 strb C_lw, [dstend, -1]
 /* Shared exit: also the target for count == 0 and, in the large-copy
    path, for dstin == src. */
106L(copy0):
107 ret
108
109 .p2align 4
110 /* Medium copies: 33..128 bytes. */
 /* All source bytes are loaded into registers before the first store is
    issued, on every path through this section. */
111L(copy32_128):
 /* Load the first 32 bytes (A, B) and the last 32 bytes (C, D). */
112 ldp A_l, A_h, [src]
113 ldp B_l, B_h, [src, 16]
114 ldp C_l, C_h, [srcend, -32]
115 ldp D_l, D_h, [srcend, -16]
116 cmp count, 64
117 b.hi L(copy128)
 /* 33..64 bytes: the head and tail windows together cover the whole
    range and overlap each other when count < 64. */
118 stp A_l, A_h, [dstin]
119 stp B_l, B_h, [dstin, 16]
120 stp C_l, C_h, [dstend, -32]
121 stp D_l, D_h, [dstend, -16]
122 ret
123
124 .p2align 4
125 /* Copy 65..128 bytes. */
126L(copy128):
 /* Extend the head window to 64 bytes: E, F hold source bytes 32..63. */
127 ldp E_l, E_h, [src, 32]
128 ldp F_l, F_h, [src, 48]
129 cmp count, 96
130 b.ls L(copy96)
 /* 97..128 bytes: extend the tail window to 64 bytes as well.  G and H
    alias count, dst, src and srcend, so these two loads overwrite them;
    only dstin and dstend are used as addresses from here on. */
131 ldp G_l, G_h, [srcend, -64]
132 ldp H_l, H_h, [srcend, -48]
133 stp G_l, G_h, [dstend, -64]
134 stp H_l, H_h, [dstend, -48]
 /* Store the first 64 bytes and the last 32 bytes. */
135L(copy96):
136 stp A_l, A_h, [dstin]
137 stp B_l, B_h, [dstin, 16]
138 stp E_l, E_h, [dstin, 32]
139 stp F_l, F_h, [dstin, 48]
140 stp C_l, C_h, [dstend, -32]
141 stp D_l, D_h, [dstend, -16]
142 ret
143
144 .p2align 4
145 /* Copy more than 128 bytes. */
146L(copy_long):
147 /* Use backwards copy if there is an overlap. */
 /* tmp1 = dstin - src.  Zero means source and destination are the same
    buffer, so there is nothing to do.  As an unsigned value,
    tmp1 < count means dstin lies inside [src, src + count), where a
    forward copy would overwrite source bytes before reading them. */
148 sub tmp1, dstin, src
149 cbz tmp1, L(copy0)
150 cmp tmp1, count
151 b.lo L(copy_long_backwards)
152
153 /* Copy 16 bytes and then align dst to 16-byte alignment. */
154
 /* D holds the first 16 source bytes; they are stored unaligned at dstin
    below.  tmp1 = dstin & 15 is the misalignment of the destination: dst
    is dstin rounded down to 16 bytes, and src is moved back by the same
    amount so that src + k still pairs with dst + k. */
155 ldp D_l, D_h, [src]
156 and tmp1, dstin, 15
157 bic dst, dstin, 15
158 sub src, src, tmp1
159 add count, count, tmp1 /* Count is now 16 too large. */
 /* Prime the pipeline: A..D are loaded with the 64 bytes at src + 16 ..
    src + 79; the last load uses pre-index writeback, so src += 64. */
160 ldp A_l, A_h, [src, 16]
161 stp D_l, D_h, [dstin]
162 ldp B_l, B_h, [src, 32]
163 ldp C_l, C_h, [src, 48]
164 ldp D_l, D_h, [src, 64]!
 /* Besides the 16-byte excess, 128 bytes are taken off count: the 64
    bytes already held in A..D plus the final 64 bytes that the tail
    always copies from the end.  b.ls skips the loop when nothing is
    left beyond that. */
165 subs count, count, 128 + 16 /* Test and readjust count. */
166 b.ls L(copy64_from_end)
167
 /* Each iteration stores the 64 bytes loaded previously at the 16-byte
    aligned addresses dst + 16 .. dst + 64 and loads the next 64 bytes.
    The "!" forms advance dst and src by 64.  The loop repeats while
    count was above 64 before the subtraction. */
168L(loop64):
169 stp A_l, A_h, [dst, 16]
170 ldp A_l, A_h, [src, 16]
171 stp B_l, B_h, [dst, 32]
172 ldp B_l, B_h, [src, 32]
173 stp C_l, C_h, [dst, 48]
174 ldp C_l, C_h, [src, 48]
175 stp D_l, D_h, [dst, 64]!
176 ldp D_l, D_h, [src, 64]!
177 subs count, count, 64
178 b.hi L(loop64)
179
180 /* Write the last iteration and copy 64 bytes from the end. */
 /* The 64 bytes still held in A..D are stored at dst + 16 .. dst + 79
    while E, A, B, C are reloaded with the last 64 source bytes, which
    then go to dstend - 64 .. dstend - 1.  E_l shares x14 with tmp1,
    which is no longer needed at this point. */
181L(copy64_from_end):
182 ldp E_l, E_h, [srcend, -64]
183 stp A_l, A_h, [dst, 16]
184 ldp A_l, A_h, [srcend, -48]
185 stp B_l, B_h, [dst, 32]
186 ldp B_l, B_h, [srcend, -32]
187 stp C_l, C_h, [dst, 48]
188 ldp C_l, C_h, [srcend, -16]
189 stp D_l, D_h, [dst, 64]
190 stp E_l, E_h, [dstend, -64]
191 stp A_l, A_h, [dstend, -48]
192 stp B_l, B_h, [dstend, -32]
193 stp C_l, C_h, [dstend, -16]
194 ret
195
196 .p2align 4
197
198 /* Large backwards copy for overlapping copies.
199 Copy 16 bytes and then align dst to 16-byte alignment. */
 /* Reached from the large-copy path when dstin lies inside
    [src, src + count).  The copy runs from the end towards the start
    and mirrors the forward path. */
200L(copy_long_backwards):
 /* D holds the last 16 source bytes; they are stored unaligned at
    dstend - 16 below.  tmp1 = dstend & 15 is the misalignment of the
    destination end: srcend, count and (a few lines down) dstend are all
    reduced by it, which leaves dstend 16-byte aligned. */
201 ldp D_l, D_h, [srcend, -16]
202 and tmp1, dstend, 15
203 sub srcend, srcend, tmp1
204 sub count, count, tmp1
 /* Prime the pipeline: A..D are loaded with the 64 bytes just below the
    adjusted srcend; the last load uses pre-index writeback, so
    srcend -= 64. */
205 ldp A_l, A_h, [srcend, -16]
206 stp D_l, D_h, [dstend, -16]
207 ldp B_l, B_h, [srcend, -32]
208 ldp C_l, C_h, [srcend, -48]
209 ldp D_l, D_h, [srcend, -64]!
210 sub dstend, dstend, tmp1
 /* 128 bytes are taken off count: the 64 bytes already held in A..D plus
    the first 64 bytes that the tail always copies from the start.  b.ls
    skips the loop when nothing is left beyond that. */
211 subs count, count, 128
212 b.ls L(copy64_from_start)
213
 /* Each iteration stores the 64 bytes loaded previously just below
    dstend and loads the next lower 64 bytes.  The "!" forms move dstend
    and srcend down by 64.  The loop repeats while count was above 64
    before the subtraction. */
214L(loop64_backwards):
215 stp A_l, A_h, [dstend, -16]
216 ldp A_l, A_h, [srcend, -16]
217 stp B_l, B_h, [dstend, -32]
218 ldp B_l, B_h, [srcend, -32]
219 stp C_l, C_h, [dstend, -48]
220 ldp C_l, C_h, [srcend, -48]
221 stp D_l, D_h, [dstend, -64]!
222 ldp D_l, D_h, [srcend, -64]!
223 subs count, count, 64
224 b.hi L(loop64_backwards)
225
226 /* Write the last iteration and copy 64 bytes from the start. */
 /* The 64 bytes still held in A..D are stored below dstend while G, A,
    B, C are reloaded with the first 64 source bytes, which then go to
    dstin .. dstin + 63.  G aliases count and dst, so the first load
    overwrites them; neither is read again before ret. */
227L(copy64_from_start):
228 ldp G_l, G_h, [src, 48]
229 stp A_l, A_h, [dstend, -16]
230 ldp A_l, A_h, [src, 32]
231 stp B_l, B_h, [dstend, -32]
232 ldp B_l, B_h, [src, 16]
233 stp C_l, C_h, [dstend, -48]
234 ldp C_l, C_h, [src]
235 stp D_l, D_h, [dstend, -64]
236 stp G_l, G_h, [dstin, 48]
237 stp A_l, A_h, [dstin, 32]
238 stp B_l, B_h, [dstin, 16]
239 stp C_l, C_h, [dstin]
240 ret
241
242END (memcpy)