/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#define FOR_SILVERMONT
#include "cache.h"
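/* cache.h (built with FOR_SILVERMONT) is expected to provide the cache-size
	constants, in particular SHARED_CACHE_SIZE_HALF, used below to decide
	when to switch to non-temporal stores.  */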

#ifndef MEMMOVE
# define MEMMOVE	memmove_generic
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)

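/* void *MEMMOVE(void *dst, const void *src, size_t len)

	Arguments are passed on the stack (i386 cdecl).  ENTRANCE pushes %ebx,
	so together with the return address the first argument lives at
	8(%esp), which is what PARMS accounts for.  The function returns dst.  */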
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
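/* Throughout: %ecx = len, %eax = src, %edx = dst (also the return value).  */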

/* Check whether we should copy backward or forward.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
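/* dst == src: nothing to do.  dst > src: copy backward so that an
	overlapping tail is not overwritten before it has been read.
	Otherwise copy forward.  */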

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address to a 64-byte boundary; the first 64 bytes
	are copied with unaligned stores before the aligned loop below.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax
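/* %edi = first 64-byte-aligned address strictly above dst.  %eax now holds
	src - dst, so (%eax, %edi) addresses the source byte that corresponds
	to destination %edi.  */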

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)
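/* %ebx = dst + len rounded down to a 64-byte boundary.  The main loop
	copies 64 bytes per iteration with aligned stores while %edi < %ebx.  */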

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)
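/* For very large copies (at least half the shared cache) use the
	non-temporal store loop below, so the copy does not evict the
	working set from the cache.  */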

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything up to %edi in the destination has been copied.
	%ecx now holds the number of bytes left to copy.
	Point %esi at the corresponding source position.  */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
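/* Here 0 <= len <= 16.  Dispatch on the length bits: bit 3 or 4 set
	means 8..16 bytes, bit 2 set means 4..7, bit 1 set means 2..3,
	otherwise 0 or 1.  */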
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

	CFI_POP (%edi)
	CFI_POP (%esi)
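/* The CFI_POPs above emit only unwind directives, no code: they roll the
	CFI state back for the text that follows, which is only reached after
	%edi and %esi have already been popped.  */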

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address to a 64-byte boundary. The last 64 bytes
	of the source are saved first so that the aligned stores below cannot
	overwrite them before they are read.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi
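/* %edi = dst + len rounded down to a 64-byte boundary.  %esi holds
	src - dst, so (%edi, %esi) addresses the source byte that corresponds
	to destination %edi.  */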

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
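/* Copy the last 8 bytes, then shrink len by 8 and redo the 0..16
	dispatch for the remaining head.  */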
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Large-size copy, forward direction: non-temporal stores.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
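/* Make the weakly-ordered non-temporal stores globally visible before
	the remaining bytes are copied with ordinary stores.  */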
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Large-size copy, backward direction: non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)
