libc/memset: Implement function in assembler
Trace analysis of FVP_Base_AEMv8A model running in
Aarch32 mode with the build options listed below:
TRUSTED_BOARD_BOOT=1 GENERATE_COT=1
ARM_ROTPK_LOCATION=devel_ecdsa KEY_ALG=ecdsa
ROT_KEY=plat/arm/board/common/rotpk/arm_rotprivk_ecdsa.pem
shows that when auth_signature() gets called
71.84% of CPU execution time is spent in memset() function
written in C using single byte write operations,
see lib\libc\memset.c.
This patch replaces C memset() implementation with assembler
version giving the following results:
- for Aarch32 in auth_signature() call memset() CPU time
reduced to 24.84%.
- Number of CPU instructions executed during TF-A
boot stage before start of BL33 in RELEASE builds:
----------------------------------------------
| Arch | C | assembler | % |
----------------------------------------------
| Aarch32 | 2073275460 | 1487400003 | -28.25 |
| Aarch64 | 2056807158 | 1244898303 | -39.47 |
----------------------------------------------
The patch also replaces memset.c with aarch64/memset.S
in plat\nvidia\tegra\platform.mk.
Change-Id: Ifbf085a2f577a25491e2d28446ee95a4ac891597
Signed-off-by: Alexei Fedorov <Alexei.Fedorov@arm.com>
diff --git a/lib/libc/aarch64/memset.S b/lib/libc/aarch64/memset.S
new file mode 100644
index 0000000..8c65760
--- /dev/null
+++ b/lib/libc/aarch64/memset.S
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020, Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <asm_macros.S>
+
+ .global memset
+
+/* -----------------------------------------------------------------------
+ * void memset(void *dst, int val, size_t count)
+ *
+ * Copy the value of 'val' (converted to an unsigned char) into
+ * each of the first 'count' characters of the object pointed to by 'dst'.
+ *
+ * Returns the value of 'dst'.
+ * -----------------------------------------------------------------------
+ */
+func memset
+ cbz w2, exit /* exit if 'count' = 0 */
+ mov x3, x0 /* keep x0 */
+ tst x0, #7
+ b.eq aligned /* 8-bytes aligned */
+
+ /* Unaligned 'dst' */
+unaligned:
+ strb w1, [x3], #1
+ subs w2, w2, #1
+ b.eq exit /* exit if 0 */
+ tst x3, #7
+ b.ne unaligned /* continue while unaligned */
+
+ /* 8-bytes aligned */
+aligned:cbz x1, x1_zero
+ bfi w1, w1, #8, #8 /* propagate 'val' */
+ bfi w1, w1, #16, #16
+ bfi x1, x1, #32, #32
+
+x1_zero:ands w4, w2, #~0x3f
+ b.eq less_64
+
+write_64:
+ .rept 4
+ stp x1, x1, [x3], #16 /* write 64 bytes in a loop */
+ .endr
+ subs w4, w4, #64
+ b.ne write_64
+ ands w2, w2, #0x3f
+ b.eq exit /* exit if 0 */
+
+less_64:tbz w2, #5, less_32 /* < 32 bytes */
+ stp x1, x1, [x3], #16 /* write 32 bytes */
+ stp x1, x1, [x3], #16
+ ands w2, w2, #0x1f
+ b.eq exit
+
+less_32:tbz w2, #4, less_16 /* < 16 bytes */
+ stp x1, x1, [x3], #16 /* write 16 bytes */
+ ands w2, w2, #0xf
+ b.eq exit
+
+less_16:tbz w2, #3, less_8 /* < 8 bytes */
+ str x1, [x3], #8 /* write 8 bytes */
+ ands w2, w2, #7
+ b.eq exit
+
+less_8: tbz w2, #2, less_4 /* < 4 bytes */
+ str w1, [x3], #4 /* write 4 bytes */
+ ands w2, w2, #3
+ b.eq exit
+
+less_4: tbz w2, #1, less_2 /* < 2 bytes */
+ strh w1, [x3], #2 /* write 2 bytes */
+ tbz w2, #0, exit
+less_2: strb w1, [x3] /* write 1 byte */
+exit: ret
+
+endfunc memset