/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#define FOR_SILVERMONT
#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE memmove_generic
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
#endif

#ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
#endif

#define DEST    PARMS
#define SRC     DEST+4
#define LEN     SRC+4

#define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (4); \
        cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-4); \
        cfi_restore (REG)

#define PUSH(REG)       pushl REG; CFI_PUSH (REG)
#define POP(REG)        popl REG; CFI_POP (REG)

#define PARMS           8       /* Preserve EBX.  */
#define ENTRANCE        PUSH (%ebx);
#define RETURN_END      POP (%ebx); ret
#define RETURN          RETURN_END; CFI_PUSH (%ebx)

        .section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
        ENTRANCE
        movl LEN(%esp), %ecx
        movl SRC(%esp), %eax
        movl DEST(%esp), %edx

/* Check whether we should copy backward or forward.  */
        cmp %eax, %edx
        je L(mm_return)
        jg L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
        cmp $16, %ecx
        jbe L(mm_len_0_16_bytes_forward)

        cmpl $32, %ecx
        ja L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
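/* Both 16-byte chunks are loaded before either store is issued, so this
   head/tail pair stays correct even when the 17..32 byte source region
   overlaps the destination.  */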
        movdqu (%eax), %xmm0
        movdqu -16(%eax, %ecx), %xmm1
        movdqu %xmm0, (%edx)
        movdqu %xmm1, -16(%edx, %ecx)
        jmp L(mm_return)

L(mm_len_32_or_more_forward):
        cmpl $64, %ecx
        ja L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu -16(%eax, %ecx), %xmm2
        movdqu -32(%eax, %ecx), %xmm3
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, -16(%edx, %ecx)
        movdqu %xmm3, -32(%edx, %ecx)
        jmp L(mm_return)

L(mm_len_64_or_more_forward):
        cmpl $128, %ecx
        ja L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu 32(%eax), %xmm2
        movdqu 48(%eax), %xmm3
        movdqu -64(%eax, %ecx), %xmm4
        movdqu -48(%eax, %ecx), %xmm5
        movdqu -32(%eax, %ecx), %xmm6
        movdqu -16(%eax, %ecx), %xmm7
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, 32(%edx)
        movdqu %xmm3, 48(%edx)
        movdqu %xmm4, -64(%edx, %ecx)
        movdqu %xmm5, -48(%edx, %ecx)
        movdqu %xmm6, -32(%edx, %ecx)
        movdqu %xmm7, -16(%edx, %ecx)
        jmp L(mm_return)

L(mm_len_128_or_more_forward):
        PUSH (%esi)
        PUSH (%edi)

/* Aligning the address of destination.  */
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu 32(%eax), %xmm2
        movdqu 48(%eax), %xmm3

        leal 64(%edx), %edi
        andl $-64, %edi
        subl %edx, %eax

        movdqu (%eax, %edi), %xmm4
        movdqu 16(%eax, %edi), %xmm5
        movdqu 32(%eax, %edi), %xmm6
        movdqu 48(%eax, %edi), %xmm7

        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, 32(%edx)
        movdqu %xmm3, 48(%edx)
        movdqa %xmm4, (%edi)
        movaps %xmm5, 16(%edi)
        movaps %xmm6, 32(%edi)
        movaps %xmm7, 48(%edi)
        addl $64, %edi

        leal (%edx, %ecx), %ebx
        andl $-64, %ebx
        cmp %edi, %ebx
        jbe L(mm_copy_remaining_forward)

        cmp $SHARED_CACHE_SIZE_HALF, %ecx
        jae L(mm_large_page_loop_forward)

        .p2align 4
L(mm_main_loop_forward):

        prefetcht0 128(%eax, %edi)

        movdqu (%eax, %edi), %xmm0
        movdqu 16(%eax, %edi), %xmm1
        movdqu 32(%eax, %edi), %xmm2
        movdqu 48(%eax, %edi), %xmm3
        movdqa %xmm0, (%edi)
        movaps %xmm1, 16(%edi)
        movaps %xmm2, 32(%edi)
        movaps %xmm3, 48(%edi)
        leal 64(%edi), %edi
        cmp %edi, %ebx
        ja L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
        addl %edx, %ecx
        subl %edi, %ecx
/* We have copied everything up to the %edi position in the destination.
   %ecx now holds the number of bytes left to copy.
   Now we need to advance %esi.  */
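/* %eax was reduced to the (source - destination) displacement above, so
   adding it to the aligned destination cursor in %edi gives the source
   address that corresponds to the bytes still to be copied.  */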
        leal (%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
        cmp $32, %ecx
        ja L(mm_remaining_33_64_bytes_forward)
        cmp $16, %ecx
        ja L(mm_remaining_17_32_bytes_forward)
        testl %ecx, %ecx
        .p2align 4,,2
        je L(mm_return_pop_all)

        cmpb $8, %cl
        ja L(mm_remaining_9_16_bytes_forward)
        cmpb $4, %cl
        .p2align 4,,5
        ja L(mm_remaining_5_8_bytes_forward)
        cmpb $2, %cl
        .p2align 4,,1
        ja L(mm_remaining_3_4_bytes_forward)
        movzbl -1(%esi,%ecx), %eax
        movzbl (%esi), %ebx
        movb %al, -1(%edi,%ecx)
        movb %bl, (%edi)
        jmp L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
        movdqu (%esi), %xmm0
        movdqu 16(%esi), %xmm1
        movdqu -32(%esi, %ecx), %xmm2
        movdqu -16(%esi, %ecx), %xmm3
        movdqu %xmm0, (%edi)
        movdqu %xmm1, 16(%edi)
        movdqu %xmm2, -32(%edi, %ecx)
        movdqu %xmm3, -16(%edi, %ecx)
        jmp L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
        movdqu (%esi), %xmm0
        movdqu -16(%esi, %ecx), %xmm1
        movdqu %xmm0, (%edi)
        movdqu %xmm1, -16(%edi, %ecx)
        jmp L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
        movq (%esi), %xmm0
        movq -8(%esi, %ecx), %xmm1
        movq %xmm0, (%edi)
        movq %xmm1, -8(%edi, %ecx)
        jmp L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
        movl (%esi), %eax
        movl -4(%esi,%ecx), %ebx
        movl %eax, (%edi)
        movl %ebx, -4(%edi,%ecx)
        jmp L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
        movzwl -2(%esi,%ecx), %eax
        movzwl (%esi), %ebx
        movw %ax, -2(%edi,%ecx)
        movw %bx, (%edi)
        jmp L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
        testb $24, %cl
        jne L(mm_len_9_16_bytes_forward)
        testb $4, %cl
        .p2align 4,,5
        jne L(mm_len_5_8_bytes_forward)
        testl %ecx, %ecx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %cl
        .p2align 4,,1
        jne L(mm_len_2_4_bytes_forward)
        movzbl -1(%eax,%ecx), %ebx
        movzbl (%eax), %eax
        movb %bl, -1(%edx,%ecx)
        movb %al, (%edx)
        jmp L(mm_return)

L(mm_len_2_4_bytes_forward):
        movzwl -2(%eax,%ecx), %ebx
        movzwl (%eax), %eax
        movw %bx, -2(%edx,%ecx)
        movw %ax, (%edx)
        jmp L(mm_return)

L(mm_len_5_8_bytes_forward):
        movl (%eax), %ebx
        movl -4(%eax,%ecx), %eax
        movl %ebx, (%edx)
        movl %eax, -4(%edx,%ecx)
        jmp L(mm_return)

L(mm_len_9_16_bytes_forward):
        movq (%eax), %xmm0
        movq -8(%eax, %ecx), %xmm1
        movq %xmm0, (%edx)
        movq %xmm1, -8(%edx, %ecx)
        jmp L(mm_return)

        CFI_POP (%edi)
        CFI_POP (%esi)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
   the main loop stops.  */
        movl %ebx, %ecx
        subl %edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
        cmp $16, %ecx
        jbe L(mm_len_0_16_bytes_backward)

        cmpl $32, %ecx
        jg L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
        movdqu (%eax), %xmm0
        movdqu -16(%eax, %ecx), %xmm1
        movdqu %xmm0, (%edx)
        movdqu %xmm1, -16(%edx, %ecx)
        jmp L(mm_return)

L(mm_len_32_or_more_backward):
        cmpl $64, %ecx
        jg L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
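/* As in the forward path, the block is addressed from both of its ends
   ((%eax) and -N(%eax, %ecx)) and every load precedes every store, so this
   single sequence covers any length in 33..64 bytes without a loop.  */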
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu -16(%eax, %ecx), %xmm2
        movdqu -32(%eax, %ecx), %xmm3
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, -16(%edx, %ecx)
        movdqu %xmm3, -32(%edx, %ecx)
        jmp L(mm_return)

L(mm_len_64_or_more_backward):
        cmpl $128, %ecx
        jg L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
        movdqu (%eax), %xmm0
        movdqu 16(%eax), %xmm1
        movdqu 32(%eax), %xmm2
        movdqu 48(%eax), %xmm3
        movdqu -64(%eax, %ecx), %xmm4
        movdqu -48(%eax, %ecx), %xmm5
        movdqu -32(%eax, %ecx), %xmm6
        movdqu -16(%eax, %ecx), %xmm7
        movdqu %xmm0, (%edx)
        movdqu %xmm1, 16(%edx)
        movdqu %xmm2, 32(%edx)
        movdqu %xmm3, 48(%edx)
        movdqu %xmm4, -64(%edx, %ecx)
        movdqu %xmm5, -48(%edx, %ecx)
        movdqu %xmm6, -32(%edx, %ecx)
        movdqu %xmm7, -16(%edx, %ecx)
        jmp L(mm_return)

L(mm_len_128_or_more_backward):
        PUSH (%esi)
        PUSH (%edi)

/* Aligning the address of destination. We need to save the last 64 bytes
   of the source so that they are not overwritten before they are copied.  */
        movdqu -16(%eax, %ecx), %xmm0
        movdqu -32(%eax, %ecx), %xmm1
        movdqu -48(%eax, %ecx), %xmm2
        movdqu -64(%eax, %ecx), %xmm3

        leal (%edx, %ecx), %edi
        andl $-64, %edi

        movl %eax, %esi
        subl %edx, %esi

        movdqu -16(%edi, %esi), %xmm4
        movdqu -32(%edi, %esi), %xmm5
        movdqu -48(%edi, %esi), %xmm6
        movdqu -64(%edi, %esi), %xmm7

        movdqu %xmm0, -16(%edx, %ecx)
        movdqu %xmm1, -32(%edx, %ecx)
        movdqu %xmm2, -48(%edx, %ecx)
        movdqu %xmm3, -64(%edx, %ecx)
        movdqa %xmm4, -16(%edi)
        movdqa %xmm5, -32(%edi)
        movdqa %xmm6, -48(%edi)
        movdqa %xmm7, -64(%edi)
        leal -64(%edi), %edi

        leal 64(%edx), %ebx
        andl $-64, %ebx

        cmp %edi, %ebx
        jae L(mm_main_loop_backward_end)

        cmp $SHARED_CACHE_SIZE_HALF, %ecx
        jae L(mm_large_page_loop_backward)

        .p2align 4
L(mm_main_loop_backward):

        prefetcht0 -128(%edi, %esi)

        movdqu -64(%edi, %esi), %xmm0
        movdqu -48(%edi, %esi), %xmm1
        movdqu -32(%edi, %esi), %xmm2
        movdqu -16(%edi, %esi), %xmm3
        movdqa %xmm0, -64(%edi)
        movdqa %xmm1, -48(%edi)
        movdqa %xmm2, -32(%edi)
        movdqa %xmm3, -16(%edi)
        leal -64(%edi), %edi
        cmp %edi, %ebx
        jb L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
        POP (%edi)
        POP (%esi)
        jmp L(mm_recalc_len)

/* Copy [0..16] and return.  */
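/* testb $24, %cl tests bits 3 and 4 of the length. Since the length is
   already known to be at most 16 here, a non-zero result means 8..16 bytes
   remain, so the 9_16 path below also handles a length of exactly 8.  */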
L(mm_len_0_16_bytes_backward):
        testb $24, %cl
        jnz L(mm_len_9_16_bytes_backward)
        testb $4, %cl
        .p2align 4,,5
        jnz L(mm_len_5_8_bytes_backward)
        testl %ecx, %ecx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %cl
        .p2align 4,,1
        jne L(mm_len_3_4_bytes_backward)
        movzbl -1(%eax,%ecx), %ebx
        movzbl (%eax), %eax
        movb %bl, -1(%edx,%ecx)
        movb %al, (%edx)
        jmp L(mm_return)

L(mm_len_3_4_bytes_backward):
        movzwl -2(%eax,%ecx), %ebx
        movzwl (%eax), %eax
        movw %bx, -2(%edx,%ecx)
        movw %ax, (%edx)
        jmp L(mm_return)

L(mm_len_9_16_bytes_backward):
        PUSH (%esi)
        movl -4(%eax,%ecx), %ebx
        movl -8(%eax,%ecx), %esi
        movl %ebx, -4(%edx,%ecx)
        movl %esi, -8(%edx,%ecx)
        subl $8, %ecx
        POP (%esi)
        jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
        movl (%eax), %ebx
        movl -4(%eax,%ecx), %eax
        movl %ebx, (%edx)
        movl %eax, -4(%edx,%ecx)

L(mm_return):
        movl %edx, %eax
        RETURN

L(mm_return_pop_all):
        movl %edx, %eax
        POP (%edi)
        POP (%esi)
        RETURN

/* Big length copy forward part.  */

        .p2align 4
L(mm_large_page_loop_forward):
        movdqu (%eax, %edi), %xmm0
        movdqu 16(%eax, %edi), %xmm1
        movdqu 32(%eax, %edi), %xmm2
        movdqu 48(%eax, %edi), %xmm3
        movntdq %xmm0, (%edi)
        movntdq %xmm1, 16(%edi)
        movntdq %xmm2, 32(%edi)
        movntdq %xmm3, 48(%edi)
        leal 64(%edi), %edi
        cmp %edi, %ebx
        ja L(mm_large_page_loop_forward)
        sfence
        jmp L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
        .p2align 4
L(mm_large_page_loop_backward):
        movdqu -64(%edi, %esi), %xmm0
        movdqu -48(%edi, %esi), %xmm1
        movdqu -32(%edi, %esi), %xmm2
        movdqu -16(%edi, %esi), %xmm3
        movntdq %xmm0, -64(%edi)
        movntdq %xmm1, -48(%edi)
        movntdq %xmm2, -32(%edi)
        movntdq %xmm3, -16(%edi)
        leal -64(%edi), %edi
        cmp %edi, %ebx
        jb L(mm_large_page_loop_backward)
        sfence
        POP (%edi)
        POP (%esi)
        jmp L(mm_recalc_len)

END (MEMMOVE)