/*
 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
 *
 * SPDX-License-Identifier:	GPL-2.0+
 */

/*
 * This is optimized primarily for the ARC700.
 * It would be possible to speed up the loops by one cycle per word
 * (respectively one cycle per byte) by forcing double-word alignment of
 * source 1, unrolling by a factor of two, and speculatively loading the
 * second word / byte of source 1; however, that would increase the
 * overhead for loop setup / finish, and strcmp might often terminate
 * early.
 */

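/*
 * If both source pointers are word aligned, the strings are compared a
 * word at a time, falling back to a simple byte loop otherwise.  As with
 * any strcmp, only the sign of the result is meaningful: zero for equal
 * strings, positive if source 1 is greater, negative if it is smaller.
 */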
.global strcmp
.align 4
strcmp:
	or	%r2, %r0, %r1
	bmsk_s	%r2, %r2, 1		/* keep the two low address bits */
	brne	%r2, 0, .Lcharloop	/* either pointer unaligned -> byte loop */
	mov_s	%r12, 0x01010101	/* 0x01 in every byte */
	ror	%r5, %r12		/* %r5 = 0x80808080 */
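	/*
	 * Word loop: %r4 = (%r2 - 0x01010101) & ~%r2 & 0x80808080 sets the
	 * flag bit (bit 7) of each byte of %r2 that is zero (it can also
	 * flag a 0x01 byte adjacent to a zero byte; see the big-endian
	 * note below), so %r4 != 0 exactly when this word of source 1
	 * contains a NUL byte.
	 */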
.Lwordloop:
	ld.ab	%r2, [%r0, 4]		/* load a word of source 1, advance */
	ld.ab	%r3, [%r1, 4]		/* load a word of source 2, advance */
	nop_s
	sub	%r4, %r2, %r12
	bic	%r4, %r4, %r2		/* %r4 = (%r2 - 0x01010101) & ~%r2 */
	and	%r4, %r4, %r5		/* ... & 0x80808080 */
	brne	%r4, 0, .Lfound0	/* NUL in this word of source 1 */
	breq	%r2, %r3, .Lwordloop	/* words equal -> keep going */
#ifdef __LITTLE_ENDIAN__
	xor	%r0, %r2, %r3	/* mask for difference */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
#endif /* __LITTLE_ENDIAN__ */
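	/*
	 * The words differ and this word of source 1 has no NUL: return
	 * +1 if the first differing character of source 1 is the larger
	 * one, otherwise a negative value (bit 31 set).  bset.lo executes
	 * in the delay slot of the return jump.
	 */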
	cmp_s	%r2, %r3
	mov_s	%r0, 1
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31

.balign 4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
	xor	%r0, %r2, %r3	/* mask for difference */
	or	%r0, %r0, %r4	/* or in zero indicator */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
	sub.f	%r0, %r2, %r3	/* 0 if the strings are equal */
	mov.hi	%r0, 1		/* source 1 greater -> +1 */
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31	/* source 1 smaller -> negative */
#else /* __BIG_ENDIAN__ */
	/*
	 * The zero-detection above can mis-detect 0x01 bytes as zeroes
	 * because of carry propagation from a less significant zero byte.
	 * We can compensate for this by checking that bit 0 is zero.
	 * This compensation is not necessary in the step where we
	 * get a low estimate for %r2, because in any affected bytes
	 * we already have 0x00 or 0x01, which will remain unchanged
	 * when bit 7 is cleared.
	 */
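	/*
	 * Illustrative example: for %r2 = 0x41010042 (string bytes 'A',
	 * 0x01, 0x00, trailing garbage), the test above gives
	 * %r4 = 0x00808000, flagging the 0x01 byte as well as the real
	 * zero byte.  A genuine zero byte has bit 0 clear, which is what
	 * the bic below relies on to drop the false flag.
	 */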
.balign 4
.Lfound0:
	lsr	%r0, %r4, 8
	lsr_s	%r1, %r2
	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
	bic_s	%r3, %r3, %r0
	rlc	%r0, 0		/* r0 := r2 > r3 ? 1 : 0 */
	cmp_s	%r2, %r3
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#endif /* __LITTLE_ENDIAN__ */

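/*
 * Byte-at-a-time loop, used when either source pointer is not word
 * aligned.  It stops at the first NUL in source 1 or at the first
 * mismatch and returns the difference of the bytes it stopped on.
 */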
.balign 4
.Lcharloop:
	ldb.ab	%r2, [%r0, 1]		/* load a byte of source 1, advance */
	ldb.ab	%r3, [%r1, 1]		/* load a byte of source 2, advance */
	nop_s
	breq	%r2, 0, .Lcmpend	/* end of source 1 */
	breq	%r2, %r3, .Lcharloop	/* bytes equal -> keep going */
.Lcmpend:
	j_s.d	[%blink]
	sub	%r0, %r2, %r3		/* return the byte difference */