Fix board init code to respect the C runtime environment

board_init_f_mem() alters the C runtime environment's
stack it is actually already using. This is not a valid
behaviour within a C runtime environment.

Split board_init_f_mem into C functions which do not alter
their own stack and always behave properly with respect to
their C runtime environment.

Signed-off-by: Albert ARIBAUD <albert.u.boot@aribaud.net>
Acked-by: Thomas Chou <thomas@wytron.com.tw>
diff --git a/arch/arc/lib/start.S b/arch/arc/lib/start.S
index 26a5934..90ee7e0 100644
--- a/arch/arc/lib/start.S
+++ b/arch/arc/lib/start.S
@@ -50,18 +50,20 @@
 1:
 #endif
 
-	/* Setup stack- and frame-pointers */
+	/* Establish C runtime stack and frame */
 	mov	%sp, CONFIG_SYS_INIT_SP_ADDR
 	mov	%fp, %sp
 
-	/* Allocate and zero GD, update SP */
+	/* Allocate reserved area from current top of stack */
 	mov	%r0, %sp
-	bl	board_init_f_mem
-
-	/* Update stack- and frame-pointers */
+	bl	board_init_f_alloc_reserve
+	/* Set stack below reserved area, adjust frame pointer accordingly */
 	mov	%sp, %r0
 	mov	%fp, %sp
 
+	/* Initialize reserved area - note: r0 already contains address */
+	bl	board_init_f_init_reserve
+
 	/* Zero the one and only argument of "board_init_f" */
 	mov_s	%r0, 0
 	j	board_init_f
diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S
index 80548eb..4f2a712 100644
--- a/arch/arm/lib/crt0.S
+++ b/arch/arm/lib/crt0.S
@@ -83,8 +83,9 @@
 	bic	sp, sp, #7	/* 8-byte alignment for ABI compliance */
 #endif
 	mov	r0, sp
-	bl	board_init_f_mem
+	bl	board_init_f_alloc_reserve
 	mov	sp, r0
+	bl	board_init_f_init_reserve
 
 	mov	r0, #0
 	bl	board_init_f
diff --git a/arch/arm/lib/crt0_64.S b/arch/arm/lib/crt0_64.S
index cef1c71..b4fc760 100644
--- a/arch/arm/lib/crt0_64.S
+++ b/arch/arm/lib/crt0_64.S
@@ -75,8 +75,10 @@
 	ldr	x0, =(CONFIG_SYS_INIT_SP_ADDR)
 #endif
 	bic	sp, x0, #0xf	/* 16-byte alignment for ABI compliance */
-	bl	board_init_f_mem
+	mov	x0, sp
+	bl	board_init_f_alloc_reserve
 	mov	sp, x0
+	bl	board_init_f_init_reserve
 
 	mov	x0, #0
 	bl	board_init_f
diff --git a/arch/microblaze/cpu/start.S b/arch/microblaze/cpu/start.S
index 14f46a8..206be3e 100644
--- a/arch/microblaze/cpu/start.S
+++ b/arch/microblaze/cpu/start.S
@@ -25,7 +25,7 @@
 
 	addi	r8, r0, __end
 	mts	rslr, r8
-	/* TODO: Redo this code to call board_init_f_mem() */
+	/* TODO: Redo this code to call board_init_f_*() */
 #if defined(CONFIG_SPL_BUILD)
 	addi	r1, r0, CONFIG_SPL_STACK_ADDR
 	mts	rshr, r1
@@ -142,7 +142,7 @@
 	ori	r12, r12, 0x1a0
 	mts	rmsr, r12
 
-	/* TODO: Redo this code to call board_init_f_mem() */
+	/* TODO: Redo this code to call board_init_f_*() */
 clear_bss:
 	/* clear BSS segments */
 	addi	r5, r0, __bss_start
diff --git a/arch/nios2/cpu/start.S b/arch/nios2/cpu/start.S
index 54787c5..204d0cd 100644
--- a/arch/nios2/cpu/start.S
+++ b/arch/nios2/cpu/start.S
@@ -106,14 +106,18 @@
 	stw	r0, 4(sp)
 	mov	fp, sp
 
-	/* Allocate and zero GD, update SP */
+	/* Allocate and initialize reserved area, update SP */
 	mov	r4, sp
-	movhi	r2, %hi(board_init_f_mem@h)
-	ori	r2, r2, %lo(board_init_f_mem@h)
+	movhi	r2, %hi(board_init_f_alloc_reserve@h)
+	ori	r2, r2, %lo(board_init_f_alloc_reserve@h)
 	callr	r2
-
-	/* Update stack- and frame-pointers */
 	mov	sp, r2
+	mov	r4, sp
+	movhi	r2, %hi(board_init_f_init_reserve@h)
+	ori	r2, r2, %lo(board_init_f_init_reserve@h)
+	callr	r2
+
+	/* Update frame-pointer */
 	mov	fp, sp
 
 	/* Call board_init_f -- never returns */
diff --git a/arch/powerpc/cpu/ppc4xx/start.S b/arch/powerpc/cpu/ppc4xx/start.S
index 3dd0557..137afce 100644
--- a/arch/powerpc/cpu/ppc4xx/start.S
+++ b/arch/powerpc/cpu/ppc4xx/start.S
@@ -762,8 +762,9 @@
 	bl	cpu_init_f	/* run low-level CPU init code	   (from Flash) */
 #ifdef CONFIG_SYS_GENERIC_BOARD
 	mr	r3, r1
-	bl	board_init_f_mem
+	bl	board_init_f_alloc_reserve
 	mr	r1, r3
+	bl	board_init_f_init_reserve
 	li	r0,0
 	stwu	r0, -4(r1)
 	stwu	r0, -4(r1)
@@ -1038,8 +1039,9 @@
 	bl	cpu_init_f	/* run low-level CPU init code	   (from Flash) */
 #ifdef CONFIG_SYS_GENERIC_BOARD
 	mr	r3, r1
-	bl	board_init_f_mem
+	bl	board_init_f_alloc_reserve
 	mr	r1, r3
+	bl	board_init_f_init_reserve
 	stwu	r0, -4(r1)
 	stwu	r0, -4(r1)
 #endif
diff --git a/arch/x86/cpu/start.S b/arch/x86/cpu/start.S
index 5b4ee79..485868f 100644
--- a/arch/x86/cpu/start.S
+++ b/arch/x86/cpu/start.S
@@ -123,8 +123,9 @@
 #endif
 	/* Set up global data */
 	mov	%esp, %eax
-	call	board_init_f_mem
+	call	board_init_f_alloc_reserve
 	mov	%eax, %esp
+	call	board_init_f_init_reserve
 
 #ifdef CONFIG_DEBUG_UART
 	call	debug_uart_init
diff --git a/arch/x86/lib/fsp/fsp_common.c b/arch/x86/lib/fsp/fsp_common.c
index 5276ce6..8479af1 100644
--- a/arch/x86/lib/fsp/fsp_common.c
+++ b/arch/x86/lib/fsp/fsp_common.c
@@ -90,8 +90,8 @@
 		/*
 		 * The second time we enter here, adjust the size of malloc()
 		 * pool before relocation. Given gd->malloc_base was adjusted
-		 * after the call to board_init_f_mem() in arch/x86/cpu/start.S,
-		 * we should fix up gd->malloc_limit here.
+		 * after the call to board_init_f_init_reserve() in arch/x86/
+		 * cpu/start.S, we should fix up gd->malloc_limit here.
 		 */
 		gd->malloc_limit += CONFIG_FSP_SYS_MALLOC_F_LEN;
 	}
diff --git a/common/init/board_init.c b/common/init/board_init.c
index 1c6126d..e649e07 100644
--- a/common/init/board_init.c
+++ b/common/init/board_init.c
@@ -29,31 +29,120 @@
 }
 #endif /* !CONFIG_X86 */
 
-ulong board_init_f_mem(ulong top)
+/*
+ * Allocate reserved space for use as 'globals' from 'top' address and
+ * return 'bottom' address of allocated space
+ *
+ * Notes:
+ *
+ * Actual reservation cannot be done from within this function as
+ * it requires altering the C stack pointer, so this will be done by
+ * the caller upon return from this function.
+ *
+ * IMPORTANT:
+ *
+ * Alignment constraints may differ for each 'chunk' allocated. For now:
+ *
+ * - GD is aligned down on a 16-byte boundary
+ *
+ *  - the early malloc arena is not aligned, therefore it follows the stack
+ *   alignment constraint of the architecture for which we are bulding.
+ *
+ *  - GD is allocated last, so that the return value of this functions is
+ *   both the bottom of the reserved area and the address of GD, should
+ *   the calling context need it.
+ */
+
+ulong board_init_f_alloc_reserve(ulong top)
+{
+	/* Reserve early malloc arena */
+#if defined(CONFIG_SYS_MALLOC_F)
+	top -= CONFIG_SYS_MALLOC_F_LEN;
+#endif
+	/* LAST : reserve GD (rounded up to a multiple of 16 bytes) */
+	top = rounddown(top-sizeof(struct global_data), 16);
+
+	return top;
+}
+
+/*
+ * Initialize reserved space (which has been safely allocated on the C
+ * stack from the C runtime environment handling code).
+ *
+ * Notes:
+ *
+ * Actual reservation was done by the caller; the locations from base
+ * to base+size-1 (where 'size' is the value returned by the allocation
+ * function above) can be accessed freely without risk of corrupting the
+ * C runtime environment.
+ *
+ * IMPORTANT:
+ *
+ * Upon return from the allocation function above, on some architectures
+ * the caller will set gd to the lowest reserved location. Therefore, in
+ * this initialization function, the global data MUST be placed at base.
+ *
+ * ALSO IMPORTANT:
+ *
+ * On some architectures, gd will already be good when entering this
+ * function. On others, it will only be good once arch_setup_gd() returns.
+ * Therefore, global data accesses must be done:
+ *
+ * - through gd_ptr if before the call to arch_setup_gd();
+ *
+ * - through gd once arch_setup_gd() has been called.
+ *
+ * Do not use 'gd->' until arch_setup_gd() has been called!
+ *
+ * IMPORTANT TOO:
+ *
+ * Initialization for each "chunk" (GD, early malloc arena...) ends with
+ * an incrementation line of the form 'base += <some size>'. The last of
+ * these incrementations seems useless, as base will not be used any
+ * more after this incrementation; but if/when a new "chunk" is appended,
+ * this increment will be essential as it will give base right value for
+ * this new chunk (which will have to end with its own incrementation
+ * statement). Besides, the compiler's optimizer will silently detect
+ * and remove the last base incrementation, therefore leaving that last
+ * (seemingly useless) incrementation causes no code increase.
+ */
+
+void board_init_f_init_reserve(ulong base)
 {
 	struct global_data *gd_ptr;
 #ifndef _USE_MEMCPY
 	int *ptr;
 #endif
 
-	/* Leave space for the stack we are running with now */
-	top -= 0x40;
+	/*
+	 * clear GD entirely and set it up.
+	 * Use gd_ptr, as gd may not be properly set yet.
+	 */
 
-	top -= sizeof(struct global_data);
-	top = ALIGN(top, 16);
-	gd_ptr = (struct global_data *)top;
+	gd_ptr = (struct global_data *)base;
+	/* zero the area */
 #ifdef _USE_MEMCPY
 	memset(gd_ptr, '\0', sizeof(*gd));
 #else
 	for (ptr = (int *)gd_ptr; ptr < (int *)(gd_ptr + 1); )
 		*ptr++ = 0;
 #endif
+	/* set GD unless architecture did it already */
+#ifndef CONFIG_X86
 	arch_setup_gd(gd_ptr);
+#endif
+	/* next alloc will be higher by one GD plus 16-byte alignment */
+	base += roundup(sizeof(struct global_data), 16);
+
+	/*
+	 * record early malloc arena start.
+	 * Use gd as it is now properly set for all architectures.
+	 */
 
 #if defined(CONFIG_SYS_MALLOC_F)
-	top -= CONFIG_SYS_MALLOC_F_LEN;
-	gd->malloc_base = top;
+	/* go down one 'early malloc arena' */
+	gd->malloc_base = base;
+	/* next alloc will be higher by one 'early malloc arena' size */
+	base += CONFIG_SYS_MALLOC_F_LEN;
 #endif
-
-	return top;
 }
diff --git a/include/common.h b/include/common.h
index 75c78d5..7bed0cc 100644
--- a/include/common.h
+++ b/include/common.h
@@ -224,32 +224,26 @@
 void board_init_r(gd_t *, ulong) __attribute__ ((noreturn));
 
 /**
- * board_init_f_mem() - Allocate global data and set stack position
+ * ulong board_init_f_alloc_reserve - allocate reserved area
  *
  * This function is called by each architecture very early in the start-up
- * code to set up the environment for board_init_f(). It allocates space for
- * global_data (see include/asm-generic/global_data.h) and places the stack
- * below this.
+ * code to allow the C runtime to reserve space on the stack for writable
+ * 'globals' such as GD and the malloc arena.
  *
- * This function requires a stack[1] Normally this is at @top. The function
- * starts allocating space from 64 bytes below @top. First it creates space
- * for global_data. Then it calls arch_setup_gd() which sets gd to point to
- * the global_data space and can reserve additional bytes of space if
- * required). Finally it allocates early malloc() memory
- * (CONFIG_SYS_MALLOC_F_LEN). The new top of the stack is just below this,
- * and it returned by this function.
+ * @top:	top of the reserve area, growing down.
+ * @return:	bottom of reserved area
+ */
+ulong board_init_f_alloc_reserve(ulong top);
+
+/**
+ * board_init_f_init_reserve - initialize the reserved area(s)
  *
- * [1] Strictly speaking it would be possible to implement this function
- * in C on many archs such that it does not require a stack. However this
- * does not seem hugely important as only 64 byte are wasted. The 64 bytes
- * are used to handle the calling standard which generally requires pushing
- * addresses or registers onto the stack. We should be able to get away with
- * less if this becomes important.
+ * This function is called once the C runtime has allocated the reserved
+ * area on the stack. It must initialize the GD at the base of that area.
  *
- * @top:	Top of available memory, also normally the top of the stack
- * @return:	New stack location
+ * @base:	top from which reservation was done
  */
-ulong board_init_f_mem(ulong top);
+void board_init_f_init_reserve(ulong base);
 
 /**
  * arch_setup_gd() - Set up the global_data pointer