Properly initialise the C runtime environment

This patch makes sure the C runtime environment is properly
initialised before executing any C code.

  - Zero-initialise NOBITS sections (the .bss and coherent memory sections).
  - Relocate BL1 data from ROM to RAM.

Change-Id: I0da81b417b2f0d1f7ef667cc5131b1e47e22571f
diff --git a/bl1/aarch64/bl1_entrypoint.S b/bl1/aarch64/bl1_entrypoint.S
index 7fe3bc6..119ae93 100644
--- a/bl1/aarch64/bl1_entrypoint.S
+++ b/bl1/aarch64/bl1_entrypoint.S
@@ -93,6 +93,30 @@
 
 _do_cold_boot:
 	/* ---------------------------------------------
+	 * Init C runtime environment.
+	 *   - Zero-initialise the NOBITS sections.
+	 *     There are 2 of them:
+	 *       - the .bss section;
+	 *       - the coherent memory section.
+	 *   - Copy the data section from BL1 image
+	 *     (stored in ROM) to the correct location
+	 *     in RAM.
+	 * ---------------------------------------------
+	 */
+	ldr	x0, =__BSS_START__
+	ldr	x1, =__BSS_SIZE__
+	bl	zeromem16
+
+	ldr	x0, =__COHERENT_RAM_START__
+	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
+	bl	zeromem16
+
+	ldr	x0, =__DATA_RAM_START__
+	ldr	x1, =__DATA_ROM_START__
+	ldr	x2, =__DATA_SIZE__
+	bl	memcpy16
+
+	/* ---------------------------------------------
 	 * Initialize platform and jump to our c-entry
 	 * point for this type of reset
 	 * ---------------------------------------------
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index a85c51a..0255d65 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -77,6 +77,20 @@
 
 	isb
 
+	/* ---------------------------------------------
+	 * Zero out NOBITS sections. There are 2 of them:
+	 *   - the .bss section;
+	 *   - the coherent memory section.
+	 * ---------------------------------------------
+	 */
+	ldr	x0, =__BSS_START__
+	ldr	x1, =__BSS_SIZE__
+	bl	zeromem16
+
+	ldr	x0, =__COHERENT_RAM_START__
+	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
+	bl	zeromem16
+
 	/* --------------------------------------------
 	 * Give ourselves a small coherent stack to
 	 * ease the pain of initializing the MMU
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index cb481b8..13725d7 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -93,6 +93,20 @@
 	bl	platform_is_primary_cpu
 	cbz	x0, _panic
 
+	/* ---------------------------------------------
+	 * Zero out NOBITS sections. There are 2 of them:
+	 *   - the .bss section;
+	 *   - the coherent memory section.
+	 * ---------------------------------------------
+	 */
+	ldr	x0, =__BSS_START__
+	ldr	x1, =__BSS_SIZE__
+	bl	zeromem16
+
+	ldr	x0, =__COHERENT_RAM_START__
+	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
+	bl	zeromem16
+
 	/* --------------------------------------------
 	 * Give ourselves a small coherent stack to
 	 * ease the pain of initializing the MMU
diff --git a/lib/arch/aarch64/misc_helpers.S b/lib/arch/aarch64/misc_helpers.S
index 05e90f9..e36fdfa 100644
--- a/lib/arch/aarch64/misc_helpers.S
+++ b/lib/arch/aarch64/misc_helpers.S
@@ -75,6 +75,8 @@
 	.globl	eret
 	.globl	smc
 
+	.globl	zeromem16
+	.globl	memcpy16
 
 	.section	.text, "ax"
 
@@ -285,3 +287,54 @@
 
 smc:; .type smc, %function
 	smc	#0
+
+/* -----------------------------------------------------------------------
+ * void zeromem16(void *mem, unsigned int length);
+ *
+ * Initialise a memory region to 0.
+ * The memory address must be 16-byte aligned.
+ * In: x0 = mem, x1 = length in bytes (read as a full 64-bit value).
+ * On return x0 = mem + length. Clobbers: x2, x3, flags.
+ * -----------------------------------------------------------------------
+ */
+zeromem16:; .type zeromem16, %function
+	add	x2, x0, x1		/* x2 = end of the region */
+z_loop16:
+	sub	x3, x2, x0		/* x3 = bytes left to zero */
+	cmp	x3, #16
+	b.lo	z_loop1			/* unsigned: x3 is a byte count */
+	stp	xzr, xzr, [x0], #16	/* zero 16 bytes at a time */
+	b	z_loop16
+z_loop1:
+	cmp	x0, x2
+	b.eq	z_end
+	strb	wzr, [x0], #1		/* zero the tail byte per byte */
+	b	z_loop1
+z_end:	ret
+
+
+/* --------------------------------------------------------------------------
+ * void memcpy16(void *dest, const void *src, unsigned int length)
+ *
+ * Copy length bytes from memory area src to memory area dest.
+ * The memory areas should not overlap.
+ * Destination and source addresses must be 16-byte aligned.
+ * In: x0 = dest, x1 = src, x2 = length in bytes (read as a 64-bit value).
+ * On return x0/x1 are advanced by length, x2 = 0. Clobbers: x3, x4, flags.
+ * --------------------------------------------------------------------------
+ */
+memcpy16:; .type memcpy16, %function
+m_loop16:
+	cmp	x2, #16
+	b.lo	m_loop1			/* unsigned: x2 is a byte count */
+	ldp	x3, x4, [x1], #16	/* copy 16 bytes at a time */
+	stp	x3, x4, [x0], #16
+	sub	x2, x2, #16
+	b	m_loop16
+m_loop1:
+	cbz	x2, m_end		/* no tail bytes to copy */
+	ldrb	w3, [x1], #1		/* copy the tail byte per byte */
+	strb	w3, [x0], #1
+	subs	x2, x2, #1
+	b.ne	m_loop1
+m_end:	ret
diff --git a/plat/fvp/bl1_plat_setup.c b/plat/fvp/bl1_plat_setup.c
index 74b79d1..822a100 100644
--- a/plat/fvp/bl1_plat_setup.c
+++ b/plat/fvp/bl1_plat_setup.c
@@ -42,7 +42,6 @@
  ******************************************************************************/
 extern unsigned long __COHERENT_RAM_START__;
 extern unsigned long __COHERENT_RAM_END__;
-extern unsigned long __COHERENT_RAM_UNALIGNED_SIZE__;
 
 extern unsigned long __BL1_RAM_START__;
 extern unsigned long __BL1_RAM_END__;
@@ -56,8 +55,6 @@
  */
 #define BL1_COHERENT_RAM_BASE (unsigned long)(&__COHERENT_RAM_START__)
 #define BL1_COHERENT_RAM_LIMIT (unsigned long)(&__COHERENT_RAM_END__)
-#define BL1_COHERENT_RAM_LENGTH \
-	(unsigned long)(&__COHERENT_RAM_UNALIGNED_SIZE__)
 
 #define BL1_RAM_BASE (unsigned long)(&__BL1_RAM_START__)
 #define BL1_RAM_LIMIT (unsigned long)(&__BL1_RAM_END__)
@@ -113,13 +110,6 @@
  ******************************************************************************/
 void bl1_platform_setup(void)
 {
-	/*
-	 * This should zero out our coherent stacks as well but we don't care
-	 * as they are not being used right now.
-	 */
-	memset((void *) BL1_COHERENT_RAM_BASE, 0,
-	       (size_t) BL1_COHERENT_RAM_LENGTH);
-
 	/* Enable and initialize the System level generic timer */
 	mmio_write_32(SYS_CNTCTL_BASE + CNTCR_OFF, CNTCR_EN);