libc/memset: Implement function in assembler

Trace analysis of FVP_Base_AEMv8A model running in
Aarch32 mode with the build options listed below:
TRUSTED_BOARD_BOOT=1 GENERATE_COT=1
ARM_ROTPK_LOCATION=devel_ecdsa KEY_ALG=ecdsa
ROT_KEY=plat/arm/board/common/rotpk/arm_rotprivk_ecdsa.pem
shows that when auth_signature() gets called
71.84% of CPU execution time is spent in memset() function
written in C using single byte write operations,
see lib\libc\memset.c.
This patch replaces C memset() implementation with assembler
version giving the following results:
- for Aarch32 in auth_signature() call memset() CPU time
reduced to 24.84%.
- Number of CPU instructions executed during TF-A
boot stage before start of BL33 in RELEASE builds:
----------------------------------------------
|  Arch   |     C      |  assembler |    %   |
----------------------------------------------
| Aarch32 | 2073275460 | 1487400003 | -28.25 |
| Aarch64 | 2056807158 | 1244898303 | -39.47 |
----------------------------------------------
The patch also replaces memset.c with aarch64/memset.S
in plat\nvidia\tegra\platform.mk.

Change-Id: Ifbf085a2f577a25491e2d28446ee95a4ac891597
Signed-off-by: Alexei Fedorov <Alexei.Fedorov@arm.com>
diff --git a/lib/libc/aarch32/memset.S b/lib/libc/aarch32/memset.S
new file mode 100644
index 0000000..0b69897
--- /dev/null
+++ b/lib/libc/aarch32/memset.S
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020, Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <asm_macros.S>
+
+	.syntax unified
+	.global	memset
+
+/* -----------------------------------------------------------------------
+ * void memset(void *dst, int val, size_t count)
+ *
+ * Copy the value of 'val' (converted to an unsigned char) into
+ * each of the first 'count' characters of the object pointed to by 'dst'.
+ *
+ * Returns the value of 'dst'.
+ * -----------------------------------------------------------------------
+ */
+func memset
+	cmp	r2, #0
+	bxeq	lr			/* return if 'count' = 0 */
+	mov	r12, r0			/* keep r0 */
+	tst	r0, #3
+	beq	aligned			/* 4-bytes aligned */
+
+	/* Unaligned 'dst' */
+unaligned:
+	strb	r1, [r12], #1
+	subs	r2, r2, #1
+	bxeq	lr			/* return if 0 */
+	tst	r12, #3
+	bne	unaligned		/* continue while unaligned */
+
+	/* 4-bytes aligned */
+aligned:bfi	r1, r1, #8, #8		/* propagate 'val' */
+	bfi	r1, r1, #16, #16
+
+	mov	r3, r1
+
+	cmp	r2, #16
+	blo	less_16
+
+	push	{r4, lr}
+	mov	r4, r1
+	mov	lr, r1
+
+	cmp	r2, #32
+	blo	less_32
+
+write_32:
+	stmia	r12!, {r1, r3, r4, lr}	/* write 32 bytes in a loop */
+	stmia	r12!, {r1, r3, r4, lr}
+	subs	r2, r2, #32
+	popeq	{r4, pc}		/* return if 0 */
+	cmp	r2, #32
+	bhs	write_32
+
+less_32:cmp	r2, #16
+	stmiahs	r12!, {r1, r3, r4, lr}	/* write 16 bytes */
+	popeq	{r4, pc}		/* return if 16 */
+	pop	{r4, lr}
+
+less_16:lsls	r2, r2, #29		/* C = r2[3]; N = r2[2]; Z = r2[2:0] */
+	stmiacs	r12!, {r1, r3}		/* write 8 bytes */
+	bxeq	lr			/* return if 8 */
+	strmi	r1, [r12], #4		/* write 4 bytes */
+	lsls	r2, r2, #1		/* N = r2[1]; Z = r2[0] */
+	strhmi	r1, [r12], #2		/* write 2 bytes */
+	strbne	r1, [r12]		/* write 1 byte */
+	bx	lr
+
+endfunc memset
diff --git a/lib/libc/aarch64/memset.S b/lib/libc/aarch64/memset.S
new file mode 100644
index 0000000..8c65760
--- /dev/null
+++ b/lib/libc/aarch64/memset.S
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020, Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <asm_macros.S>
+
+	.global	memset
+
+/* -----------------------------------------------------------------------
+ * void memset(void *dst, int val, size_t count)
+ *
+ * Copy the value of 'val' (converted to an unsigned char) into
+ * each of the first 'count' characters of the object pointed to by 'dst'.
+ *
+ * Returns the value of 'dst'.
+ * -----------------------------------------------------------------------
+ */
+func memset
+	cbz	w2, exit		/* exit if 'count' = 0 */
+	mov	x3, x0			/* keep x0 */
+	tst	x0, #7
+	b.eq	aligned			/* 8-bytes aligned */
+
+	/* Unaligned 'dst' */
+unaligned:
+	strb	w1, [x3], #1
+	subs	w2, w2, #1
+	b.eq	exit			/* exit if 0 */
+	tst	x3, #7
+	b.ne	unaligned		/* continue while unaligned */
+
+	/* 8-bytes aligned */
+aligned:cbz	x1, x1_zero
+	bfi	w1, w1, #8, #8		/* propagate 'val' */
+	bfi	w1, w1, #16, #16
+	bfi	x1, x1, #32, #32
+
+x1_zero:ands	w4, w2, #~0x3f
+	b.eq	less_64
+
+write_64:
+	.rept	4
+	stp	x1, x1, [x3], #16	/* write 64 bytes in a loop */
+	.endr
+	subs	w4, w4, #64
+	b.ne	write_64
+	ands	w2, w2, #0x3f
+	b.eq	exit			/* exit if 0 */
+
+less_64:tbz	w2, #5, less_32		/* < 32 bytes */
+	stp	x1, x1, [x3], #16	/* write 32 bytes */
+	stp	x1, x1, [x3], #16
+	ands	w2, w2, #0x1f
+	b.eq	exit
+
+less_32:tbz	w2, #4, less_16		/* < 16 bytes */
+	stp	x1, x1, [x3], #16	/* write 16 bytes */
+	ands	w2, w2, #0xf
+	b.eq	exit
+
+less_16:tbz	w2, #3, less_8		/* < 8 bytes */
+	str	x1, [x3], #8		/* write 8 bytes */
+	ands	w2, w2, #7
+	b.eq	exit
+
+less_8:	tbz	w2, #2, less_4		/* < 4 bytes */
+	str	w1, [x3], #4		/* write 4 bytes */
+	ands	w2, w2, #3
+	b.eq	exit
+
+less_4:	tbz	w2, #1, less_2		/* < 2 bytes */
+	strh	w1, [x3], #2		/* write 2 bytes */
+	tbz	w2, #0, exit
+less_2:	strb	w1, [x3]		/* write 1 byte */
+exit:	ret
+
+endfunc	memset
diff --git a/lib/libc/libc.mk b/lib/libc/libc.mk
index 93d30d0..90a2a1e 100644
--- a/lib/libc/libc.mk
+++ b/lib/libc/libc.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2016-2019, ARM Limited and Contributors. All rights reserved.
+# Copyright (c) 2016-2020, ARM Limited and Contributors. All rights reserved.
 #
 # SPDX-License-Identifier: BSD-3-Clause
 #
@@ -13,7 +13,6 @@
 			memcpy.c			\
 			memmove.c			\
 			memrchr.c			\
-			memset.c			\
 			printf.c			\
 			putchar.c			\
 			puts.c				\
@@ -28,8 +27,14 @@
 
 ifeq (${ARCH},aarch64)
 LIBC_SRCS	+=	$(addprefix lib/libc/aarch64/,	\
+			memset.S			\
 			setjmp.S)
 endif
 
+ifeq (${ARCH},aarch32)
+LIBC_SRCS	+=	$(addprefix lib/libc/aarch32/,	\
+			memset.S)
+endif
+
 INCLUDES	+=	-Iinclude/lib/libc		\
 			-Iinclude/lib/libc/$(ARCH)	\
diff --git a/lib/libc/memset.c b/lib/libc/memset.c
deleted file mode 100644
index d8007d8..0000000
--- a/lib/libc/memset.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (c) 2013-2019, ARM Limited and Contributors. All rights reserved.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- */
-
-#include <stddef.h>
-#include <string.h>
-
-void *memset(void *dst, int val, size_t count)
-{
-	char *ptr = dst;
-
-	while (count--)
-		*ptr++ = val;
-
-	return dst;
-}
diff --git a/plat/nvidia/tegra/platform.mk b/plat/nvidia/tegra/platform.mk
index a4724e6..5cac46f 100644
--- a/plat/nvidia/tegra/platform.mk
+++ b/plat/nvidia/tegra/platform.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, ARM Limited and Contributors. All rights reserved.
+# Copyright (c) 2015-2020, ARM Limited and Contributors. All rights reserved.
 # Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
 #
 # SPDX-License-Identifier: BSD-3-Clause
@@ -69,11 +69,11 @@
 
 # override with necessary libc files for the Tegra platform
 override LIBC_SRCS :=	$(addprefix lib/libc/,		\
+			aarch64/memset.S		\
 			aarch64/setjmp.S		\
 			assert.c			\
 			memcpy.c			\
 			memmove.c			\
-			memset.c			\
 			printf.c			\
 			putchar.c			\
 			strlen.c			\