/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <private/bionic_asm.h>

#define dstin     x0
#define src       x1
#define count     x2
#define dst       x3
#define srcend    x4
#define dstend    x5
#define A_l       x6
#define A_lw      w6
#define A_h       x7
#define B_l       x8
#define B_lw      w8
#define B_h       x9
#define C_l       x10
#define C_lw      w10
#define C_h       x11
#define D_l       x12
#define D_h       x13
#define E_l       x14
#define E_h       x15
#define F_l       x16
#define F_h       x17
#define G_l       count
#define G_h       dst
#define H_l       src
#define H_h       srcend
#define tmp1      x14
#define tmp2      x16
#define SMALL_BUFFER_SIZE    48

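/* Note that several of the names above alias on purpose: tmp1 is x14 (E_l)
   and tmp2 is x16 (F_l), and G_l/G_h/H_l/H_h reuse count, dst, src and srcend.
   The aliased names are only used on paths where the original values are no
   longer needed.  */
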
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
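
/* For illustration only (not assembled): the size dispatch below is roughly
   equivalent to the following C-like sketch, where the large-copy path picks
   the non-temporal variant above SMALL_BUFFER_SIZE KiB.  The helper names are
   purely illustrative; the real code falls through between labels instead of
   calling functions.

       void *memcpy_like(void *dstin, const void *src, size_t count) {
           if (count <= 32)  return copy_small(dstin, src, count);    // 0..32 bytes
           if (count <= 128) return copy_medium(dstin, src, count);   // 33..128 bytes
           if (count > SMALL_BUFFER_SIZE * 1024)
               return copy_large_nontemporal(dstin, src, count);      // ldnp/stnp
           return copy_large(dstin, src, count);                      // ldp/stp loop
       }
*/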

ALIAS_SYMBOL (__memmove_aarch64_nt, __memcpy_aarch64_nt)
ENTRY (__memcpy_aarch64_nt)

    add    srcend, src, count
    add    dstend, dstin, count
    cmp    count, 128
    b.hi    L(copy_long)
    cmp    count, 32
    b.hi    L(copy32_128)

    /* Small copies: 0..32 bytes.  */
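    /* Illustration: for 16..32 bytes the two 16-byte copies below may overlap.
       E.g. with count = 20, bytes [0, 16) are copied from the start and bytes
       [4, 20) from the end; the 12 overlapping bytes are written twice with
       the same data, which is harmless.  */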
    cmp    count, 16
    b.lo    L(copy16)
    ldp    A_l, A_h, [src]
    ldp    D_l, D_h, [srcend, -16]
    stp    A_l, A_h, [dstin]
    stp    D_l, D_h, [dstend, -16]
    ret

    /* Copy 8-15 bytes.  */
L(copy16):
    tbz    count, 3, L(copy8)
    ldr    A_l, [src]
    ldr    A_h, [srcend, -8]
    str    A_l, [dstin]
    str    A_h, [dstend, -8]
    ret

    .p2align 3
    /* Copy 4-7 bytes.  */
L(copy8):
    tbz    count, 2, L(copy4)
    ldr    A_lw, [src]
    ldr    B_lw, [srcend, -4]
    str    A_lw, [dstin]
    str    B_lw, [dstend, -4]
    ret

    /* Copy 0..3 bytes using a branchless sequence.  */
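    /* Illustration: with count = 3, tmp1 = count >> 1 = 1, so the loads pick
       up src[0], src[1] and src[2] and the stores write dst[0], dst[1] and
       dst[2].  With count = 1 all three loads read the same byte and all
       three stores write the same byte.  */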
L(copy4):
    cbz    count, L(copy0)
    lsr    tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    C_lw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    C_lw, [dstend, -1]
L(copy0):
    ret

    .p2align 4
    /* Medium copies: 33..128 bytes.  */
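    /* The first 32 bytes and the last 32 bytes are always loaded before the
       size check, so a 33..64 byte copy is finished with just the four stores
       below; any overlap between the head and tail blocks is simply written
       twice.  */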
L(copy32_128):
    ldp    A_l, A_h, [src]
    ldp    B_l, B_h, [src, 16]
    ldp    C_l, C_h, [srcend, -32]
    ldp    D_l, D_h, [srcend, -16]
    cmp    count, 64
    b.hi    L(copy128)
    stp    A_l, A_h, [dstin]
    stp    B_l, B_h, [dstin, 16]
    stp    C_l, C_h, [dstend, -32]
    stp    D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    /* Copy 65..128 bytes.  */
L(copy128):
    ldp    E_l, E_h, [src, 32]
    ldp    F_l, F_h, [src, 48]
    cmp    count, 96
    b.ls    L(copy96)
    ldp    G_l, G_h, [srcend, -64]
    ldp    H_l, H_h, [srcend, -48]
    stp    G_l, G_h, [dstend, -64]
    stp    H_l, H_h, [dstend, -48]
L(copy96):
    stp    A_l, A_h, [dstin]
    stp    B_l, B_h, [dstin, 16]
    stp    E_l, E_h, [dstin, 32]
    stp    F_l, F_h, [dstin, 48]
    stp    C_l, C_h, [dstend, -32]
    stp    D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    /* Copy more than 128 bytes.  */
L(copy_long):
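    /* The next three instructions compare count against SMALL_BUFFER_SIZE KiB
       (48 * 1024 = 49152 bytes).  Larger copies take the non-temporal
       ldnp/stnp path at L(copy_long_nt), which hints that the copied data
       need not stay resident in the cache.  */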
    mov    tmp2, #SMALL_BUFFER_SIZE
    cmp    count, tmp2, LSL #10
    b.gt    L(copy_long_nt)
    /* Use backwards copy if there is an overlap.  */
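    /* The unsigned compare of (dstin - src) against count is true exactly
       when dstin lies inside [src, src + count), i.e. when a forward copy
       would overwrite source bytes that have not been read yet.  */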
    sub    tmp1, dstin, src
    cbz    tmp1, L(copy0)
    cmp    tmp1, count
    b.lo    L(copy_long_backwards)

    /* Copy 16 bytes and then align dst to 16-byte alignment.  */
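    /* Illustration: if dstin = 0x1009 then tmp1 = 9, dst = 0x1000 and src is
       moved back by 9 bytes, so [dst, 16] and [src, 16] still refer to
       matching source/destination offsets.  The loop's first store at
       [dst, 16] = 0x1010 re-writes part of the unaligned 16-byte head store
       (which covered 0x1009..0x1018) with the same data.  count is grown by
       tmp1 to account for the shifted base.  */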

    ldp    D_l, D_h, [src]
    and    tmp1, dstin, 15
    bic    dst, dstin, 15
    sub    src, src, tmp1
    add    count, count, tmp1    /* Count is now 16 too large.  */
    ldp    A_l, A_h, [src, 16]
    stp    D_l, D_h, [dstin]
    ldp    B_l, B_h, [src, 32]
    ldp    C_l, C_h, [src, 48]
    ldp    D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16    /* Test and readjust count.  */
    b.ls    L(copy64_from_end)

L(loop64):
    stp    A_l, A_h, [dst, 16]
    ldp    A_l, A_h, [src, 16]
    stp    B_l, B_h, [dst, 32]
    ldp    B_l, B_h, [src, 32]
    stp    C_l, C_h, [dst, 48]
    ldp    C_l, C_h, [src, 48]
    stp    D_l, D_h, [dst, 64]!
    ldp    D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    L(loop64)

    /* Write the last iteration and copy 64 bytes from the end.  */
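    /* The four stores to [dst, ...] below drain the registers loaded by the
       last loop iteration; the remaining tail is then covered by copying the
       final 64 bytes from srcend - 64, regardless of how many bytes are
       actually left, so no separate remainder loop is needed.  */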
L(copy64_from_end):
    ldp    E_l, E_h, [srcend, -64]
    stp    A_l, A_h, [dst, 16]
    ldp    A_l, A_h, [srcend, -48]
    stp    B_l, B_h, [dst, 32]
    ldp    B_l, B_h, [srcend, -32]
    stp    C_l, C_h, [dst, 48]
    ldp    C_l, C_h, [srcend, -16]
    stp    D_l, D_h, [dst, 64]
    stp    E_l, E_h, [dstend, -64]
    stp    A_l, A_h, [dstend, -48]
    stp    B_l, B_h, [dstend, -32]
    stp    C_l, C_h, [dstend, -16]
    ret

    .p2align 4

    /* Large backwards copy for overlapping copies.
       Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards):
    ldp    D_l, D_h, [srcend, -16]
    and    tmp1, dstend, 15
    sub    srcend, srcend, tmp1
    sub    count, count, tmp1
    ldp    A_l, A_h, [srcend, -16]
    stp    D_l, D_h, [dstend, -16]
    ldp    B_l, B_h, [srcend, -32]
    ldp    C_l, C_h, [srcend, -48]
    ldp    D_l, D_h, [srcend, -64]!
    sub    dstend, dstend, tmp1
    subs    count, count, 128
    b.ls    L(copy64_from_start)

L(loop64_backwards):
    stp    A_l, A_h, [dstend, -16]
    ldp    A_l, A_h, [srcend, -16]
    stp    B_l, B_h, [dstend, -32]
    ldp    B_l, B_h, [srcend, -32]
    stp    C_l, C_h, [dstend, -48]
    ldp    C_l, C_h, [srcend, -48]
    stp    D_l, D_h, [dstend, -64]!
    ldp    D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    L(loop64_backwards)

    /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
    ldp    G_l, G_h, [src, 48]
    stp    A_l, A_h, [dstend, -16]
    ldp    A_l, A_h, [src, 32]
    stp    B_l, B_h, [dstend, -32]
    ldp    B_l, B_h, [src, 16]
    stp    C_l, C_h, [dstend, -48]
    ldp    C_l, C_h, [src]
    stp    D_l, D_h, [dstend, -64]
    stp    G_l, G_h, [dstin, 48]
    stp    A_l, A_h, [dstin, 32]
    stp    B_l, B_h, [dstin, 16]
    stp    C_l, C_h, [dstin]
    ret

    .p2align 4
    /* Copy more than 48 KB using ldnp+stnp (non-temporal) instructions.  */
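    /* This path mirrors the ldp/stp code above, but ldnp/stnp carry a
       non-temporal hint, telling the core the data is unlikely to be reused
       soon.  These instructions have no pre/post-index writeback forms, so
       the pointer updates are done with explicit add instructions instead.  */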
L(copy_long_nt):
    /* Use backwards copy if there is an overlap.  */
    sub    tmp1, dstin, src
    cbz    tmp1, L(copy0)
    cmp    tmp1, count
    b.lo    L(copy_long_backwards_nt)

    /* Copy 16 bytes and then align dst to 16-byte alignment.  */

    ldnp    D_l, D_h, [src]
    and    tmp1, dstin, 15
    bic    dst, dstin, 15
    sub    src, src, tmp1
    add    count, count, tmp1    /* Count is now 16 too large.  */
    ldnp    A_l, A_h, [src, 16]
    stnp    D_l, D_h, [dstin]
    ldnp    B_l, B_h, [src, 32]
    ldnp    C_l, C_h, [src, 48]
    ldnp    D_l, D_h, [src, 64]
    add     src, src, #64
    subs    count, count, 128 + 16    /* Test and readjust count.  */
    b.ls    L(copy64_from_end_nt)

L(loop64_nt):
    stnp    A_l, A_h, [dst, 16]
    ldnp    A_l, A_h, [src, 16]
    stnp    B_l, B_h, [dst, 32]
    ldnp    B_l, B_h, [src, 32]
    stnp    C_l, C_h, [dst, 48]
    ldnp    C_l, C_h, [src, 48]
    stnp    D_l, D_h, [dst, 64]
    add     dst, dst, #64
    ldnp    D_l, D_h, [src, 64]
    add     src, src, #64
    subs    count, count, 64
    b.hi    L(loop64_nt)

    /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end_nt):
    ldnp    E_l, E_h, [srcend, -64]
    stnp    A_l, A_h, [dst, 16]
    ldnp    A_l, A_h, [srcend, -48]
    stnp    B_l, B_h, [dst, 32]
    ldnp    B_l, B_h, [srcend, -32]
    stnp    C_l, C_h, [dst, 48]
    ldnp    C_l, C_h, [srcend, -16]
    stnp    D_l, D_h, [dst, 64]
    stnp    E_l, E_h, [dstend, -64]
    stnp    A_l, A_h, [dstend, -48]
    stnp    B_l, B_h, [dstend, -32]
    stnp    C_l, C_h, [dstend, -16]
    ret

    .p2align 4

    /* Large backwards copy for overlapping copies.
       Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards_nt):
    ldnp    D_l, D_h, [srcend, -16]
    and    tmp1, dstend, 15
    sub    srcend, srcend, tmp1
    sub    count, count, tmp1
    ldnp    A_l, A_h, [srcend, -16]
    stnp    D_l, D_h, [dstend, -16]
    ldnp    B_l, B_h, [srcend, -32]
    ldnp    C_l, C_h, [srcend, -48]
    ldnp    D_l, D_h, [srcend, -64]
    add     srcend, srcend, #-64
    sub    dstend, dstend, tmp1
    subs    count, count, 128
    b.ls    L(copy64_from_start_nt)

L(loop64_backwards_nt):
    stnp    A_l, A_h, [dstend, -16]
    ldnp    A_l, A_h, [srcend, -16]
    stnp    B_l, B_h, [dstend, -32]
    ldnp    B_l, B_h, [srcend, -32]
    stnp    C_l, C_h, [dstend, -48]
    ldnp    C_l, C_h, [srcend, -48]
    stnp    D_l, D_h, [dstend, -64]
    add     dstend, dstend, #-64
    ldnp    D_l, D_h, [srcend, -64]
    add     srcend, srcend, #-64
    subs    count, count, 64
    b.hi    L(loop64_backwards_nt)

    /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start_nt):
    ldnp    G_l, G_h, [src, 48]
    stnp    A_l, A_h, [dstend, -16]
    ldnp    A_l, A_h, [src, 32]
    stnp    B_l, B_h, [dstend, -32]
    ldnp    B_l, B_h, [src, 16]
    stnp    C_l, C_h, [dstend, -48]
    ldnp    C_l, C_h, [src]
    stnp    D_l, D_h, [dstend, -64]
    stnp    G_l, G_h, [dstin, 48]
    stnp    A_l, A_h, [dstin, 32]
    stnp    B_l, B_h, [dstin, 16]
    stnp    C_l, C_h, [dstin]
    ret

END (__memcpy_aarch64_nt)