/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Regents of the University of California
 */

#include <linux/linkage.h>
#include <asm/asm.h>

/* void *memcpy(void *, const void *, size_t) */
/*
 * void *memcpy(void *dest, const void *src, size_t count)
 *
 * Word size is abstracted via SZREG / REG_L / REG_S from <asm/asm.h>,
 * so the same code serves RV32 and RV64.
 *
 * In:       a0 = dest, a1 = src, a2 = count (bytes)
 * Returns:  a0 = dest (original value, per the C memcpy contract)
 * Clobbers: a2-a7, t0-t6 (all caller-saved in the RISC-V psABI)
 */
ENTRY(__memcpy)
WEAK(memcpy)
	/* dest == src: nothing to move, return dest unchanged */
	beq	a0, a1, .copy_end
	/* Save for return value */
	mv	t6, a0

	/*
	 * Register allocation for code below:
	 * a0 - start of uncopied dst
	 * a1 - start of uncopied src
	 * t0 - end of uncopied dst
	 */
	add	t0, a0, a2

	/*
	 * Use bytewise copy if too small.
	 *
	 * This threshold must be at least 2*SZREG to ensure at least one
	 * wordwise copy is performed. It is chosen to be 16 because it will
	 * save at least 7 iterations of bytewise copy, which pays off the
	 * fixed overhead.
	 */
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail

	/*
	 * Bytewise copy first to align a0 to word boundary.
	 * The byte count in a2 is dead from here on (t0 carries the end
	 * pointer), so a2 is reused to hold dst rounded up to the next
	 * word boundary.
	 */
	addi	a2, a0, SZREG-1
	andi	a2, a2, ~(SZREG-1)
	beq	a0, a2, 2f		/* already aligned, skip byte loop */
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, a2, 1b		/* until dst reaches word alignment */
2:

	/*
	 * Now a0 is word-aligned. If a1 is also word aligned, we could perform
	 * aligned word-wise copy. Otherwise we need to perform misaligned
	 * word-wise copy.
	 */
	andi	a3, a1, SZREG-1		/* a3 = src misalignment in bytes */
	bnez	a3, .Lmisaligned_word_copy

	/*
	 * Unrolled wordwise copy, 16 words per iteration.
	 * Bias t0 down by 16*SZREG-1 so the strict "bltu a0, t0" test below
	 * is equivalent to "at least 16*SZREG bytes remain".
	 */
	addi	t0, t0, -(16*SZREG-1)
	bgeu	a0, t0, 2f		/* fewer than 16 words left */
1:
	REG_L	a2, 0(a1)
	REG_L	a3, SZREG(a1)
	REG_L	a4, 2*SZREG(a1)
	REG_L	a5, 3*SZREG(a1)
	REG_L	a6, 4*SZREG(a1)
	REG_L	a7, 5*SZREG(a1)
	REG_L	t1, 6*SZREG(a1)
	REG_L	t2, 7*SZREG(a1)
	REG_L	t3, 8*SZREG(a1)
	REG_L	t4, 9*SZREG(a1)
	REG_L	t5, 10*SZREG(a1)
	REG_S	a2, 0(a0)
	REG_S	a3, SZREG(a0)
	REG_S	a4, 2*SZREG(a0)
	REG_S	a5, 3*SZREG(a0)
	REG_S	a6, 4*SZREG(a0)
	REG_S	a7, 5*SZREG(a0)
	REG_S	t1, 6*SZREG(a0)
	REG_S	t2, 7*SZREG(a0)
	REG_S	t3, 8*SZREG(a0)
	REG_S	t4, 9*SZREG(a0)
	REG_S	t5, 10*SZREG(a0)
	/* remaining 5 words reuse a2-a6 after their first batch is stored */
	REG_L	a2, 11*SZREG(a1)
	REG_L	a3, 12*SZREG(a1)
	REG_L	a4, 13*SZREG(a1)
	REG_L	a5, 14*SZREG(a1)
	REG_L	a6, 15*SZREG(a1)
	addi	a1, a1, 16*SZREG
	REG_S	a2, 11*SZREG(a0)
	REG_S	a3, 12*SZREG(a0)
	REG_S	a4, 13*SZREG(a0)
	REG_S	a5, 14*SZREG(a0)
	REG_S	a6, 15*SZREG(a0)
	addi	a0, a0, 16*SZREG
	bltu	a0, t0, 1b
2:
	/*
	 * Post-loop increment by 16*SZREG-1 (undo the unroll bias) and
	 * pre-loop decrement by SZREG-1 (bias for the single-word loop):
	 * net adjustment is +15*SZREG.
	 */
	addi	t0, t0, 15*SZREG

	/* Wordwise copy while at least SZREG bytes remain */
	bgeu	a0, t0, 2f
1:
	REG_L	a5, 0(a1)
	addi	a1, a1, SZREG
	REG_S	a5, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b
2:
	addi	t0, t0, SZREG-1		/* restore t0 to the true end of dst */

.Lbyte_copy_tail:
	/*
	 * Bytewise copy anything left.
	 */
	beq	a0, t0, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, t0, 1b
2:

	mv	a0, t6			/* return the original dest */
.copy_end:
	ret

.Lmisaligned_word_copy:
	/*
	 * Misaligned word-wise copy.
	 * For misaligned copy we still perform word-wise copy, but we need to
	 * use the value fetched from the previous iteration and do some shifts.
	 * This is safe because we wouldn't access more words than necessary.
	 */

	/* Calculate shifts: t3 = misalignment in bits, t4 = SZREG*8 - t3 */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */

	/* Load the initial value and align a1 down to a word boundary */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)

	/* Bias t0 as in the aligned word loop above */
	addi	t0, t0, -(SZREG-1)
	/* At least one iteration will be executed here, no check */
1:
	srl	a4, a5, t3		/* tail bytes of the previous src word */
	REG_L	a5, SZREG(a1)
	addi	a1, a1, SZREG
	sll	a2, a5, t4		/* head bytes of the current src word */
	or	a2, a2, a4		/* merged into one aligned dst word */
	REG_S	a2, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b

	/* Update pointers to correct value */
	addi	t0, t0, SZREG-1		/* undo the SZREG-1 bias on dst end */
	add	a1, a1, a3		/* re-apply src misalignment for tail */

	j	.Lbyte_copy_tail
END(__memcpy)