/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <private/bionic_asm.h>

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14
#define tmp2    x16
#define SMALL_BUFFER_SIZE 48

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.  */

ALIAS_SYMBOL (__memmove_aarch64_nt, __memcpy_aarch64_nt)
ENTRY (__memcpy_aarch64_nt)

        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        ldp     D_l, D_h, [srcend, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend, -64]
        ldp     H_l, H_h, [srcend, -48]
        stp     G_l, G_h, [dstend, -64]
        stp     H_l, H_h, [dstend, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

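        /* The large-copy dispatch that follows can be summarized by this
           C-like sketch (illustrative only; "diff" stands for the value
           computed into tmp1 below):

               if (count > (SMALL_BUFFER_SIZE << 10))   // more than 48 KiB
                   goto copy_long_nt;                   // non-temporal path (overlap check repeated there)
               diff = (uint64_t) dstin - (uint64_t) src;
               if (diff == 0)
                   return;                              // dst == src, nothing to move
               if (diff < count)
                   goto copy_long_backwards;            // dst starts inside the source region
               // otherwise copy forwards; when dst < src the unsigned diff
               // wraps to a large value, so that case also copies forwards.
        */
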
        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        mov     tmp2, #SMALL_BUFFER_SIZE
        cmp     count, tmp2, LSL#10
        b.gt    L(copy_long_nt)
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)

L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend, -16]
        and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret

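        /* The path below mirrors L(copy_long)/L(copy_long_backwards) but uses
           ldnp/stnp, which hint to the memory system that the data is
           non-temporal (unlikely to be reused soon), so copies above the
           48 KiB threshold are less likely to displace useful cache lines.
           Because ldnp/stnp have no pre-/post-indexed (writeback) addressing
           forms, these loops advance the pointers with explicit add
           instructions instead of the [reg, 64]! forms used above.  */
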
        .p2align 4
        /* Copy more than 48 KB using ldnp+stnp (non-temporal) instructions.  */
L(copy_long_nt):
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards_nt)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

        ldnp    D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldnp    A_l, A_h, [src, 16]
        stnp    D_l, D_h, [dstin]
        ldnp    B_l, B_h, [src, 32]
        ldnp    C_l, C_h, [src, 48]
        ldnp    D_l, D_h, [src, 64]
        add     src, src, #64
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end_nt)

L(loop64_nt):
        stnp    A_l, A_h, [dst, 16]
        ldnp    A_l, A_h, [src, 16]
        stnp    B_l, B_h, [dst, 32]
        ldnp    B_l, B_h, [src, 32]
        stnp    C_l, C_h, [dst, 48]
        ldnp    C_l, C_h, [src, 48]
        stnp    D_l, D_h, [dst, 64]
        add     dst, dst, #64
        ldnp    D_l, D_h, [src, 64]
        add     src, src, #64
        subs    count, count, 64
        b.hi    L(loop64_nt)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end_nt):
        ldnp    E_l, E_h, [srcend, -64]
        stnp    A_l, A_h, [dst, 16]
        ldnp    A_l, A_h, [srcend, -48]
        stnp    B_l, B_h, [dst, 32]
        ldnp    B_l, B_h, [srcend, -32]
        stnp    C_l, C_h, [dst, 48]
        ldnp    C_l, C_h, [srcend, -16]
        stnp    D_l, D_h, [dst, 64]
        stnp    E_l, E_h, [dstend, -64]
        stnp    A_l, A_h, [dstend, -48]
        stnp    B_l, B_h, [dstend, -32]
        stnp    C_l, C_h, [dstend, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards_nt):
        ldnp    D_l, D_h, [srcend, -16]
        and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldnp    A_l, A_h, [srcend, -16]
        stnp    D_l, D_h, [dstend, -16]
        ldnp    B_l, B_h, [srcend, -32]
        ldnp    C_l, C_h, [srcend, -48]
        ldnp    D_l, D_h, [srcend, -64]
        add     srcend, srcend, #-64
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start_nt)

L(loop64_backwards_nt):
        stnp    A_l, A_h, [dstend, -16]
        ldnp    A_l, A_h, [srcend, -16]
        stnp    B_l, B_h, [dstend, -32]
        ldnp    B_l, B_h, [srcend, -32]
        stnp    C_l, C_h, [dstend, -48]
        ldnp    C_l, C_h, [srcend, -48]
        stnp    D_l, D_h, [dstend, -64]
        add     dstend, dstend, #-64
        ldnp    D_l, D_h, [srcend, -64]
        add     srcend, srcend, #-64
        subs    count, count, 64
        b.hi    L(loop64_backwards_nt)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start_nt):
        ldnp    G_l, G_h, [src, 48]
        stnp    A_l, A_h, [dstend, -16]
        ldnp    A_l, A_h, [src, 32]
        stnp    B_l, B_h, [dstend, -32]
        ldnp    B_l, B_h, [src, 16]
        stnp    C_l, C_h, [dstend, -48]
        ldnp    C_l, C_h, [src]
        stnp    D_l, D_h, [dstend, -64]
        stnp    G_l, G_h, [dstin, 48]
        stnp    A_l, A_h, [dstin, 32]
        stnp    B_l, B_h, [dstin, 16]
        stnp    C_l, C_h, [dstin]
        ret

END (__memcpy_aarch64_nt)