plat/arm: Introduce and use makefile

Trace analysis of FVP_Base_AEMv8A 0.0/6063 model
running in AArch32 mode with the build options
listed below:
ARM_ROTPK_LOCATION=devel_ecdsa KEY_ALG=ecdsa
shows that when auth_signature() gets called,
71.99% of CPU execution time is spent in the memset() function,
which is written in C using single-byte write operations
(see lib/libc/memset.c).
This patch introduces a new makefile which
replaces the C memset() implementation with an assembler
version, giving the following results:
- for AArch32, memset() CPU time in the auth_signature() call
  is reduced to 20.56%.
The number of CPU instructions (Inst) executed during
TF-A boot stage before start of BL33 in RELEASE builds
for different versions is presented in the tables below,
where:
- C TF-A: existing TF-A C code;
- C musl: "lightweight code" C "implementation of the
  standard library for Linux-based systems";
- Asm Opt: assembler version from "Arm Optimized Routines";
- Asm Linux: assembler version from Linux kernel;
- Asm TF-A: assembler version from this patch.

| Variant   | Set  | Size |     Inst     |  Ratio   |
| C TF-A    | T32  | 16   | 2122110003   | 1.000000 |
| C musl    | T32  | 156  | 1643917668   | 0.774662 |
| Asm Opt   | T32  | 84   | 1604810003   | 0.756233 |
| Asm Linux | A32  | 168  | 1566255018   | 0.738065 |
| Asm TF-A  | A32  | 160  | 1525865101   | 0.719032 |

| Variant   | Size |    Inst    |  Ratio   |
| C TF-A    | 28   | 2732497518 | 1.000000 |
| C musl    | 212  | 1802999999 | 0.659836 |
| Asm TF-A  | 140  | 1680260003 | 0.614917 |

This patch modifies 'plat/arm/common/'
by overriding the default libc makefile with the new one and
does not affect other platforms.

Change-Id: Ie89dd0b74ba1079420733a0d76b7366ad0157c2e
Signed-off-by: Alexei Fedorov <>
diff --git a/lib/libc/aarch32/memset.S b/lib/libc/aarch32/memset.S
new file mode 100644
index 0000000..880ba83
--- /dev/null
+++ b/lib/libc/aarch32/memset.S
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020, Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+#include <asm_macros.S>
+	.syntax unified
+	.global	memset
+/* -----------------------------------------------------------------------
+ * void *memset(void *dst, int val, size_t count)
+ *
+ * Copy the value of 'val' (converted to an unsigned char) into
+ * each of the first 'count' characters of the object pointed to by 'dst'.
+ *
+ * Returns the value of 'dst'.
+ *
+ * Register usage:
+ *   r0     - 'dst'; never modified, so it is returned unchanged
+ *   r1     - 'val'; its low byte is replicated into all four bytes
+ *            once the write pointer is 4-byte aligned
+ *   r2     - number of bytes still to be written
+ *   r3     - copy of the replicated fill word
+ *   r4, lr - further copies of the fill word; saved on the stack and
+ *            used only when at least 16 bytes remain after alignment
+ *   r12    - write pointer
+ *
+ * Clobbers: r1-r3, r12, condition flags.
+ * -----------------------------------------------------------------------
+ */
+func memset
+	mov	r12, r0			/* keep r0 */
+	tst	r0, #3
+	beq	aligned			/* 4-bytes aligned */
+
+	/* Unaligned 'dst': write single bytes until r12 is 4-byte aligned */
+unaligned:
+	subs	r2, r2, #1		/* C = 0 only if count was 0 */
+	strbhs	r1, [r12], #1		/* write 1 byte unless count was 0 */
+	bxls	lr			/* return if 0 */
+	tst	r12, #3
+	bne	unaligned		/* continue while unaligned */
+
+	/* 4-bytes aligned */
+aligned:bfi	r1, r1, #8, #8		/* propagate 'val' */
+	bfi	r1, r1, #16, #16
+
+	mov	r3, r1
+
+	cmp	r2, #16
+	blo	less_16			/* < 16 */
+
+	/* At least 16 bytes left: use four registers per store-multiple */
+	push	{r4, lr}
+	mov	r4, r1
+	mov	lr, r1
+
+write_32:
+	subs	r2, r2, #32
+	stmiahs	r12!, {r1, r3, r4, lr}
+	stmiahs	r12!, {r1, r3, r4, lr}
+	bhi	write_32		/* write 32 bytes in a loop */
+	popeq	{r4, pc}		/* return if 0 */
+
+	/*
+	 * Fewer than 32 bytes remain. r2 went negative, but subtracting 32
+	 * left bits [4:0] equal to those of the remaining byte count, so the
+	 * tail is dispatched by shifting those bits into the flags.
+	 */
+	lsls	r2, r2, #28		/* C = r2[4]; N = r2[3]; Z = r2[3:0] */
+	stmiacs	r12!, {r1, r3, r4, lr}	/* write 16 bytes */
+	popeq	{r4, pc}		/* return if 16 */
+	stmiami	r12!, {r1, r3}		/* write 8 bytes */
+	lsls	r2, r2, #2		/* C = r2[2]; N = r2[1]; Z = r2[1:0] */
+	strcs	r1, [r12], #4		/* write 4 bytes */
+	popeq	{r4, pc}		/* return if no 1-3 byte tail is left */
+	strhmi	r1, [r12], #2		/* write 2 bytes */
+	lsls	r2, r2, #1		/* N = Z = r2[0] */
+	strbmi	r1, [r12]		/* write 1 byte */
+	pop	{r4, pc}
+
+	/* 0..15 bytes left; r4 and lr were not saved, so use r1 and r3 only */
+less_16:lsls	r2, r2, #29		/* C = r2[3]; N = r2[2]; Z = r2[2:0] */
+	stmiacs	r12!, {r1, r3}		/* write 8 bytes */
+	bxeq	lr			/* return if 0 or 8 */
+	strmi	r1, [r12], #4		/* write 4 bytes */
+	lsls	r2, r2, #2		/* C = r2[1]; N = Z = r2[0] */
+	strhcs	r1, [r12], #2		/* write 2 bytes */
+	strbmi	r1, [r12]		/* write 1 byte */
+	bx	lr
+
+endfunc memset