1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
18#define ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
19
20#include "asm_support_arm.S"
21
22/*
23 * Optimized memcmp16() for ARM9.
24 * This would not be optimal on XScale or ARM11, where more prefetching
25 * and use of pld will be needed.
26 * The 2 major optimizations here are
27 * (1) The main loops compare 32 bytes (congruent pointers) or 16 bytes (non-congruent pointers) at a time
28 * (2) The loads are scheduled in a way they won't stall
29 */
30
31ARM_ENTRY __memcmp16
        /*
         * Compares two buffers of 16-bit units.
         *
         * In:   r0 = first buffer, r1 = second buffer,
         *       r2 = number of 16-bit units to compare (each ldrh pair
         *            below consumes 1, each ldr pair consumes 2).
         * Out:  r0 = 0 when no difference is found, otherwise the 16-bit
         *            value from the first buffer minus the 16-bit value from
         *            the second buffer at the lowest-addressed difference.
         * Uses: r3 (walking copy of the first pointer), ip and lr (data
         *       scratch), flags. r4 and lr are pushed/popped on the
         *       large-block path; r4 is not otherwise touched in this
         *       routine (presumably pushed only to keep the stack 8-byte
         *       aligned -- TODO confirm).
         *
         * NOTE: the original comments say "words" for 16-bit units; the
         * comments below say "half-words" for 16-bit and "32-bit words"
         * for what ldr loads.
         */
32        pld         [r0, #0]
33        pld         [r1, #0]
34
35        /* take care of the case where the length is zero or the buffers are the same */
36        cmp         r0, r1
37        cmpne       r2, #0
38        moveq       r0, #0
39        bxeq        lr
40
41        /* since r0 holds the result, move the first source
42         * pointer somewhere else (r3)
43         */
44
45        mov         r3, r0
46
47         /* blocks shorter than 12 half-words use the simple loop below; this
48          * simplifies the large-block code and avoids its setup overhead.
49          * bpl branches when r2 - 12 is non-negative (N flag clear). */
50
51        cmp         r2, #12
52        bpl         0f
53
54        /* small blocks (fewer than 12 half-words): one half-word per iteration */
55        pld         [r0, #32]
56        pld         [r1, #32]
57
581:      ldrh        r0, [r3], #2
59        ldrh        ip, [r1], #2
60        subs        r0, r0, ip
61        bxne        lr              /* mismatch: r0 already holds the difference */
62        subs        r2, r2, #1
63        bne         1b
64        bx          lr              /* count exhausted: r0 is 0 from the last subs */
65
66
67        /* large blocks: save registers (lr is reused as a data register below) */
680:      push        {r4, lr}
69        .cfi_def_cfa_offset 8
70        .cfi_rel_offset r4, 0
71        .cfi_rel_offset lr, 4
72
73        /* align first pointer to a 32-bit boundary by comparing one half-word */
74        tst         r3, #2
75        beq         0f
76
77        ldrh        r0, [r3], #2
78        ldrh        ip, [r1], #2
79        sub         r2, r2, #1      /* plain sub: flags must come from the subs below */
80        subs        r0, r0, ip
81        /* on a mismatch, restore registers and return the difference */
82        popne       {r4, lr}
83        bxne        lr
84
85
860:      /* here the first pointer is 32-bit aligned; the count passed the
87         * >= 12 test above and at most one half-word has been consumed.
88         */
89
90        /* see if the pointers are congruent (same value in address bit 1) */
91        eor         r0, r3, r1
92        ands        r0, r0, #2
93        bne         5f
94
95        /* congruent case, 16 half-words per iteration, with ip and lr
96         * alternating as the read-ahead register for the second buffer.
97         * We need to make sure there are at least 16+2 half-words left
98         * because we effectively read ahead one 32-bit word, and we could
99         * read past the buffer (and segfault) if we're not careful. */
100
101        ldr         ip, [r1]        /* prime the read-ahead; r1 is not advanced here */
102        subs        r2, r2, #(16 + 2)
103        bmi         1f
104
1050:      /* Each eors compares a word of the first buffer with the word of
         * the second buffer loaded one step earlier. Every step after the
         * first is predicated on eq, so after a mismatch the remaining
         * steps are skipped and Z stays clear for the bne at the end.
         */
106        pld         [r3, #64]
107        pld         [r1, #64]
108        ldr         r0, [r3], #4
109        ldr         lr, [r1, #4]!
110        eors        r0, r0, ip
111        ldreq       r0, [r3], #4
112        ldreq       ip, [r1, #4]!
113        eorseq      r0, r0, lr
114        ldreq       r0, [r3], #4
115        ldreq       lr, [r1, #4]!
116        eorseq      r0, r0, ip
117        ldreq       r0, [r3], #4
118        ldreq       ip, [r1, #4]!
119        eorseq      r0, r0, lr
120        ldreq       r0, [r3], #4
121        ldreq       lr, [r1, #4]!
122        eorseq      r0, r0, ip
123        ldreq       r0, [r3], #4
124        ldreq       ip, [r1, #4]!
125        eorseq      r0, r0, lr
126        ldreq       r0, [r3], #4
127        ldreq       lr, [r1, #4]!
128        eorseq      r0, r0, ip
129        ldreq       r0, [r3], #4
130        ldreq       ip, [r1, #4]!
131        eorseq      r0, r0, lr
132        bne         2f
133        subs        r2, r2, #16
134        bhs         0b
135
136        /* r2 is (remaining - 18) here; rebias it to (remaining - 2): do we have at least 2 half-words left? */
1371:      adds        r2, r2, #(16 - 2 + 2)
138        bmi         4f
139
140        /* finish off 2 half-words (one 32-bit word) at a time */
1413:      ldr         r0, [r3], #4
142        ldr         ip, [r1], #4
143        eors        r0, r0, ip
144        bne         2f
145        subs        r2, r2, #2
146        bhs         3b
147
148        /* r2 is (remaining - 2) here; undo the bias: are we done? */
1494:      adds        r2, r2, #2
150        bne         8f
151        /* nothing left: restore registers and return 0 */
152        mov         r0, #0
153        pop         {r4, pc}
154
1552:      /* the last 32-bit words differ: redo them as half-words, lower address first */
156        ldrh        r0, [r3, #-4]
157        ldrh        ip, [r1, #-4]
158        subs        r0, r0, ip
159        ldrheq      r0, [r3, #-2]
160        ldrheq      ip, [r1, #-2]
161        subseq      r0, r0, ip
162        /* restore registers and return the difference in r0 */
163        pop         {r4, pc}
164
165        /* process the last few half-words one at a time */
1668:      ldrh        r0, [r3], #2
167        ldrh        ip, [r1], #2
168        subs        r0, r0, ip
169        bne         9f
170        subs        r2, r2, #1
171        bne         8b
172
1739:      /* restore registers and return (r0 already holds the result) */
174        pop         {r4, pc}
175
1765:      /*************** non-congruent case ***************/
177
178        /* round the second pointer down to a 32-bit boundary and preload its
         * first word into lr; r2 is biased by -8 for the loop test below */
179        bic         r1, r1, #3
180        ldr         lr, [r1], #4
181        sub         r2, r2, #8
182
1836:      /* 8 half-words (four 32-bit words) per iteration. Each step builds
         * ip = (previous word >> 16) | (next word << 16) from the second
         * buffer and compares it with one word of the first buffer; steps
         * after the first are predicated on eq as in the congruent loop.
         * NOTE(review): this splice matches memory order only for
         * little-endian data accesses -- confirm for the target.
         */
184        pld         [r3, #64]
185        pld         [r1, #64]
186        mov         ip, lr, lsr #16
187        ldr         lr, [r1], #4
188        ldr         r0, [r3], #4
189        orr         ip, ip, lr, lsl #16
190        eors        r0, r0, ip
191        moveq       ip, lr, lsr #16
192        ldreq       lr, [r1], #4
193        ldreq       r0, [r3], #4
194        orreq       ip, ip, lr, lsl #16
195        eorseq      r0, r0, ip
196        moveq       ip, lr, lsr #16
197        ldreq       lr, [r1], #4
198        ldreq       r0, [r3], #4
199        orreq       ip, ip, lr, lsl #16
200        eorseq      r0, r0, ip
201        moveq       ip, lr, lsr #16
202        ldreq       lr, [r1], #4
203        ldreq       r0, [r3], #4
204        orreq       ip, ip, lr, lsl #16
205        eorseq      r0, r0, ip
206        bne         7f
207        subs        r2, r2, #8
208        bhs         6b
209        sub         r1, r1, #2      /* back r1 up by one half-word to undo the read-ahead */
210        /* undo the -8 bias; are we done? */
211        adds        r2, r2, #8
212        moveq       r0, #0
213        beq         9b
214        /* finish off the remaining half-words */
215        b           8b
216
2177:      /* mismatch in the loop above: back r1 up by 2 so that [r1, #-4] addresses the differing data, then reuse label 2 */
218        sub         r1, r1, #2
219        b           2b
220END __memcmp16
221
222
223#endif  // ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
224