/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/linkage.h>
#include <asm/asm.h>

ENTRY(__memmove)
WEAK(memmove)
	/*
	 * Here we determine if forward copy is possible. Forward copy is
	 * preferred to backward copy as it is more cache friendly.
	 *
	 * If a0 >= a1, t0 gives their distance; if t0 >= a2 then we can
	 * copy forward.
	 * If a0 < a1, we can always copy forward. This will make t0 negative,
	 * so an *unsigned* comparison will always have t0 >= a2.
	 *
	 * For forward copy we just delegate the task to memcpy.
	 */
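	/*
	 * E.g. memmove(dst = 0x1010, src = 0x1000, n = 0x20): t0 = 0x10 < n,
	 * so the regions overlap with dst above src and we fall through to
	 * the backward copy below.
	 */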
	sub	t0, a0, a1
	bltu	t0, a2, 1f
	tail	__memcpy
1:

	/*
	 * Register allocation for code below:
	 * a0 - end of uncopied dst
	 * a1 - end of uncopied src
	 * t0 - start of uncopied dst
	 */
	mv	t0, a0
	add	a0, a0, a2
	add	a1, a1, a2
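	/*
	 * t0 keeps the original dst, which is also the value memmove must
	 * return; it is moved back into a0 just before ret.
	 */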

	/*
	 * Use bytewise copy if too small.
	 *
	 * This threshold must be at least 2*SZREG to ensure at least one
	 * wordwise copy is performed. It is chosen to be 16 because it will
	 * save at least 7 iterations of bytewise copy, which pays off the
	 * fixed overhead.
	 */
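	/*
	 * With SZREG == 8 (RV64) the threshold of 16 equals 2*SZREG exactly;
	 * with SZREG == 4 (RV32) it sits well above the 2*SZREG minimum.
	 */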
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail

	/*
	 * Bytewise copy first to align a0 to word boundary.
	 */
	andi	a2, a0, ~(SZREG-1)
	beq	a0, a2, 2f
1:
	addi	a1, a1, -1
	lb	a5, 0(a1)
	addi	a0, a0, -1
	sb	a5, 0(a0)
	bne	a0, a2, 1b
2:
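	/* a1 has moved down in lockstep with a0, so src may still be misaligned. */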

	/*
	 * Now a0 is word-aligned. If a1 is also word-aligned, we can perform
	 * an aligned word-wise copy. Otherwise we need a misaligned word-wise
	 * copy.
	 */
	andi	a3, a1, SZREG-1
	bnez	a3, .Lmisaligned_word_copy
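	/*
	 * a3 is the byte offset of src within its word: zero on the aligned
	 * path below, and the source of the shift amounts in
	 * .Lmisaligned_word_copy.
	 */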

	/* Wordwise copy */
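	/*
	 * t0 is biased by SZREG-1 so the loop below stops as soon as fewer
	 * than SZREG uncopied bytes remain; no store can land below the start
	 * of dst, and the remainder falls through to .Lbyte_copy_tail.
	 */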
	addi	t0, t0, SZREG-1
	bleu	a0, t0, 2f
1:
	addi	a1, a1, -SZREG
	REG_L	a5, 0(a1)
	addi	a0, a0, -SZREG
	REG_S	a5, 0(a0)
	bgtu	a0, t0, 1b
2:
	addi	t0, t0, -(SZREG-1)

.Lbyte_copy_tail:
	/*
	 * Bytewise copy anything left.
	 */
	beq	a0, t0, 2f
1:
	addi	a1, a1, -1
	lb	a5, 0(a1)
	addi	a0, a0, -1
	sb	a5, 0(a0)
	bne	a0, t0, 1b
2:

	mv	a0, t0
	ret

.Lmisaligned_word_copy:
	/*
	 * Misaligned word-wise copy.
	 * For misaligned copy we still perform word-wise copy, but we need to
	 * use the value fetched from the previous iteration and do some
	 * shifts. This is safe because we never access more words than
	 * necessary.
	 */

	/* Calculate shifts */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */
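	/*
	 * E.g. on RV64 with a3 == 3: t3 == 24, and since sll/srl only use the
	 * low 6 bits of the amount, t4 acts as 64 - 24 == 40. Each stored
	 * word then combines the low 3 bytes of the higher source word with
	 * the high 5 bytes of the lower one.
	 */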

	/* Load the initial value and align a1 */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)
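	/*
	 * Little-endian: the low a3 bytes of this word are the last a3
	 * uncopied bytes of src; the upper bytes are discarded by the sll in
	 * the loop.
	 */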

	addi	t0, t0, SZREG-1
	/* At least one iteration will be executed here, no check */
1:
	sll	a4, a5, t4
	addi	a1, a1, -SZREG
	REG_L	a5, 0(a1)
	srl	a2, a5, t3
	or	a2, a2, a4
	addi	a0, a0, -SZREG
	REG_S	a2, 0(a0)
	bgtu	a0, t0, 1b

	/* Undo the SZREG-1 bias on t0 and re-add the alignment offset to a1 */
	addi	t0, t0, -(SZREG-1)
	add	a1, a1, a3

	j	.Lbyte_copy_tail

END(__memmove)