Introduce unified API to zero memory

Introduce zeromem_dczva function on AArch64 that can handle unaligned
addresses and makes use of the DC ZVA instruction to zero a whole block
at a time. This zeroing takes place directly in the cache to speed it up
without doing external memory access.

Remove the zeromem16 function on AArch64 and replace it with an alias to
zeromem. This zeromem16 function is now deprecated.

Remove the 16-byte alignment constraint on __BSS_START__ in
firmware-design.md as it is no longer mandatory (it was only there to
comply with zeromem16 requirements).

Change the 16-byte alignment constraints in SP min's linker script to an
8-byte alignment constraint as the AArch32 zeromem implementation is now
more efficient on 8-byte aligned addresses.

Introduce zero_normalmem and zeromem helpers in platform agnostic header
that are implemented this way:
* AArch32:
	* zero_normalmem: zero using usual data access
	* zeromem: alias for zero_normalmem
* AArch64:
	* zero_normalmem: zero normal memory using DC ZVA instruction
	                  (needs MMU enabled)
	* zeromem: zero using usual data access

Usage guidelines: in most cases, zero_normalmem should be preferred.

There are 2 scenarios where zeromem (or memset) must be used instead:
* Code that must run with MMU disabled (which means all memory is
  considered device memory for data accesses).
* Code that fills device memory with null bytes.

Optionally, the following rule can be applied if performance is
important:
* Code zeroing small areas (few bytes) that are not secrets should use
  memset to take advantage of compiler optimizations.

  Note: Code zeroing security-related critical information should use
  zero_normalmem/zeromem instead of memset, as compiler optimizations
  (or misbehaving versions of GCC) may remove the memset call.

Fixes ARM-software/tf-issues#408

Change-Id: Iafd9663fc1070413c3e1904e54091cf60effaa82
Signed-off-by: Douglas Raillard <douglas.raillard@arm.com>
diff --git a/lib/aarch32/misc_helpers.S b/lib/aarch32/misc_helpers.S
index bf4084a..dc84799 100644
--- a/lib/aarch32/misc_helpers.S
+++ b/lib/aarch32/misc_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -34,6 +34,7 @@
 
 	.globl	smc
 	.globl	zeromem
+	.globl	zero_normalmem
 	.globl	memcpy4
 	.globl	disable_mmu_icache_secure
 	.globl	disable_mmu_secure
@@ -50,30 +51,108 @@
 endfunc smc
 
 /* -----------------------------------------------------------------------
- * void zeromem(void *mem, unsigned int length);
+ * void zeromem(void *mem, unsigned int length)
  *
- * Initialise a memory region to 0.
- * The memory address and length must be 4-byte aligned.
+ * Initialise a region in normal memory to 0. This function complies with the
+ * AAPCS and can be called from C code.
+ *
  * -----------------------------------------------------------------------
  */
 func zeromem
-#if ASM_ASSERTION
-	tst	r0, #0x3
-	ASM_ASSERT(eq)
-	tst	r1, #0x3
-	ASM_ASSERT(eq)
-#endif
-	add	r2, r0, r1
-	mov	r1, #0
-z_loop:
-	cmp	r2, r0
-	beq	z_end
-	str	r1, [r0], #4
-	b	z_loop
-z_end:
+	/*
+	 * Readable names for registers
+	 *
+	 * On entry, r0 and r1 hold the two arguments (mem and length). Only
+	 * r0-r3 and r12 are written, which the AAPCS lets a callee clobber,
+	 * so nothing has to be saved and the stack is not used.
+	 */
+	cursor       .req r0 /* Start address and then current address */
+	length       .req r1 /* Length in bytes of the region to zero out */
+	/*
+	 * stop_address reuses r1: length is only needed at the beginning of
+	 * the function, to compute stop_address.
+	 */
+	stop_address .req r1  /* Address past the last zeroed byte */
+	zeroreg1     .req r2  /* Source register filled with 0 */
+	zeroreg2     .req r3  /* Source register filled with 0 */
+	tmp	     .req r12 /* Temporary scratch register */
+
+	mov	zeroreg1, #0
+
+	/* stop_address is the address just past the last byte to zero */
+	add	stop_address, cursor, length
+
+	/*
+	 * Length cannot be used anymore as it shares the same register with
+	 * stop_address.
+	 */
+	.unreq	length
+
+	/*
+	 * If the start address is already aligned to 8 bytes, skip this loop.
+	 */
+	tst	cursor, #(8-1)
+	beq	.Lzeromem_8bytes_aligned
+
+	/* Calculate the next address aligned to 8 bytes */
+	orr	tmp, cursor, #(8-1)
+	adds	tmp, tmp, #1
+	/* If it wrapped around to 0, fall back to byte per byte zeroing */
+	beq	.Lzeromem_1byte_aligned
+	/* Fall back too if the aligned address is at or after stop_address */
+	cmp	tmp, stop_address
+	bhs	.Lzeromem_1byte_aligned
+
+	/* Zero byte per byte up to the 8-byte aligned address in tmp */
+1:
+	strb	zeroreg1, [cursor], #1
+	cmp	cursor, tmp
+	bne	1b
+
+	/* Zero 8 bytes at a time */
+.Lzeromem_8bytes_aligned:
+
+	/* Round stop_address down to an 8-byte boundary. */
+	bic	tmp, stop_address, #(8-1)
+
+	cmp	cursor, tmp
+	bhs	2f
+
+	mov	zeroreg2, #0
+1:
+	stmia	cursor!, {zeroreg1, zeroreg2}
+	cmp	cursor, tmp
+	blo	1b
+2:
+
+	/* Zero the remaining bytes one at a time */
+.Lzeromem_1byte_aligned:
+	cmp	cursor, stop_address
+	beq	2f
+1:
+	strb	zeroreg1, [cursor], #1
+	cmp	cursor, stop_address
+	bne	1b
+2:
 	bx	lr
+
+	.unreq	cursor
+	/*
+	 * length is already unreq'ed to reuse the register for another
+	 * variable.
+	 */
+	.unreq	stop_address
+	.unreq	zeroreg1
+	.unreq	zeroreg2
+	.unreq	tmp
 endfunc zeromem
 
+/*
+ * Unlike AArch64 with its DC ZVA instruction, AArch32 has no dedicated way of
+ * zeroing normal memory, so zero_normalmem is simply an alias of zeromem.
+ */
+.equ	zero_normalmem, zeromem
+
 /* --------------------------------------------------------------------------
  * void memcpy4(void *dest, const void *src, unsigned int length)
  *