/* SPDX-License-Identifier: MIT */
/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012-2021, Arm Limited.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include <asm/macro.h>
#include "asmdefs.h"

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5

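/*
 * Arguments (standard AAPCS64 memset call):
 *   x0 (dstin) - destination buffer, returned unmodified in x0
 *   w1 (valw)  - fill value (only the low 8 bits are used)
 *   x2 (count) - number of bytes to set
 */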
ENTRY (memset)
	PTR_ARG (0)
	SIZE_ARG (2)
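	/*
	 * PTR_ARG/SIZE_ARG are provided by asmdefs.h; in the Arm
	 * optimized-routines sources they sanitize 32-bit pointer/size
	 * arguments on ILP32-style targets and expand to nothing otherwise.
	 */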

	/*
	 * The optimized memset uses the DC ZVA instruction, which causes
	 * problems when the data cache is disabled. Check whether the cache
	 * is disabled and use a very simple memset implementation in that
	 * case; otherwise jump to the optimized version.
	 */
	switch_el x6, 3f, 2f, 1f
3:	mrs	x6, sctlr_el3
	b	0f
2:	mrs	x6, sctlr_el2
	b	0f
1:	mrs	x6, sctlr_el1
0:
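	/*
	 * CR_C is the SCTLR_ELx.C (data cache enable) bit; if it is set,
	 * branch to the optimized version at 9:, otherwise fall through
	 * to the cache-safe byte loop below.
	 */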
	tst	x6, #CR_C
	bne	9f

	/*
	 * A very "simple" memset implementation without the use of the
	 * dc opcode. Can be run with caches disabled.
	 */
	mov	x3, #0x0
	cmp	count, x3	/* check for zero length */
	beq	8f
4:	strb	valw, [dstin, x3]
	add	x3, x3, #0x1
	cmp	count, x3
	bne	4b
8:	ret
9:

	/* Here the optimized memset version starts */
	dup	v0.16B, valw
	add	dstend, dstin, count

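	/*
	 * Dispatch on size: 0..15 bytes are handled inline below,
	 * 16..96 bytes in L(set_medium)/L(set96), and larger sizes in
	 * L(set_long). v0 holds the fill byte replicated into all 16 lanes.
	 */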
	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]

	/* Set 0..15 bytes. */
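	/*
	 * Small sizes use a pair of potentially overlapping stores, one
	 * from the start and one from the end of the buffer, so no exact
	 * length computation is needed.
	 */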
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	.p2align 4
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes. */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end. */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

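	/*
	 * Long sets: the destination is first aligned down to 16 bytes.
	 * The DC ZVA path (which zeroes a whole 64-byte block per
	 * instruction) is only taken when the fill value is zero and count
	 * is at least 160 bytes; otherwise fall through to the STP loop
	 * at L(no_zva).
	 */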
	.p2align 4
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	cmp	count, 160
	ccmp	valw, 0, 0, hs
	b.ne	L(no_zva)

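	/*
	 * DCZID_EL0[3:0] encodes log2 of the DC ZVA block size in 4-byte
	 * words, so the value 4 means 64 bytes. Bit 4 (DZP, zeroing
	 * prohibited) also falls inside the mask, so a prohibited DC ZVA
	 * fails the compare and takes the L(no_zva) path as well.
	 */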
#ifndef SKIP_ZVA_CHECK
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes. */
	b.ne	L(no_zva)
#endif
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	sub	count, dstend, dst	/* Count is now 64 too large. */
	sub	count, count, 128	/* Adjust count and bias for loop. */

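	/*
	 * Each iteration zeroes one 64-byte block with DC ZVA. The unaligned
	 * head was already written with normal stores above, and the final
	 * 64 bytes are written with the two STPs after the loop.
	 */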
	.p2align 4
L(zva_loop):
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large. */
	sub	dst, dst, 16		/* Dst is biased by -32. */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop. */
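	/*
	 * Store 64 bytes per iteration with two quad-register STP pairs;
	 * the second STP uses pre-index writeback to advance dst. The last
	 * (up to) 64 bytes are written from dstend after the loop.
	 */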
L(no_zva_loop):
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]!
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (memset)