Merge pull request #43 from danh-arm/dh/tf-issues#129

Move console.c to pl011 specific driver location
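
The underlying change enables dead code elimination at link time: the
Makefile now compiles with -ffunction-sections/-fdata-sections and links
with --gc-sections, the linker scripts match the resulting per-function
sections via .text*/.data*/.bss* wildcards (with KEEP() around
rt_svc_descs so the runtime service descriptors are never discarded),
and a new `func` macro in include/asm_macros.S places each assembly
function in its own section. As an illustrative sketch, a function
converted to the macro, such as read_midr, roughly expands to:

    .section .text.read_midr, "ax"    /* one input section per function */
    .type read_midr, %function
    read_midr:
        mrs     x0, midr_el1
        ret

allowing the linker to drop the whole .text.read_midr input section when
nothing references it.
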
diff --git a/Makefile b/Makefile
index 9d02141..f4b74fe 100644
--- a/Makefile
+++ b/Makefile
@@ -150,8 +150,10 @@
 CFLAGS			:= 	-nostdinc -pedantic -ffreestanding -Wall	\
 				-Werror -mgeneral-regs-only -std=c99 -c -Os	\
 				-DDEBUG=${DEBUG} ${INCLUDES} ${CFLAGS}
+CFLAGS			+=	-ffunction-sections -fdata-sections
 
 LDFLAGS			+=	--fatal-warnings -O1
+LDFLAGS			+=	--gc-sections
 
 
 vpath %.ld.S bl1:bl2:bl31
diff --git a/arch/aarch64/cpu/cpu_helpers.S b/arch/aarch64/cpu/cpu_helpers.S
index 009f08a..573d0b8 100644
--- a/arch/aarch64/cpu/cpu_helpers.S
+++ b/arch/aarch64/cpu/cpu_helpers.S
@@ -29,13 +29,12 @@
  */
 
 #include <arch.h>
+#include <asm_macros.S>
 
 	.weak	cpu_reset_handler
 
 
-	.section	.text, "ax"; .align 3
-
-cpu_reset_handler: ; .type cpu_reset_handler, %function
+func cpu_reset_handler
 	mov	x19, x30 // lr
 
 	/* ---------------------------------------------
diff --git a/arch/system/gic/aarch64/gic_v3_sysregs.S b/arch/system/gic/aarch64/gic_v3_sysregs.S
index d686aeb..2a96da7 100644
--- a/arch/system/gic/aarch64/gic_v3_sysregs.S
+++ b/arch/system/gic/aarch64/gic_v3_sysregs.S
@@ -28,6 +28,8 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <asm_macros.S>
+
 	.globl	read_icc_sre_el1
 	.globl	read_icc_sre_el2
 	.globl	read_icc_sre_el3
@@ -48,42 +50,40 @@
 #define ICC_CTLR_EL3    S3_6_C12_C12_4
 #define ICC_PMR_EL1     S3_0_C4_C6_0
 
-	.section	.text, "ax"; .align 3
-
-read_icc_sre_el1: ; .type read_icc_sre_el1, %function
+func read_icc_sre_el1
 	mrs	x0, ICC_SRE_EL1
 	ret
 
 
-read_icc_sre_el2: ; .type read_icc_sre_el2, %function
+func read_icc_sre_el2
 	mrs	x0, ICC_SRE_EL2
 	ret
 
 
-read_icc_sre_el3: ; .type read_icc_sre_el3, %function
+func read_icc_sre_el3
 	mrs	x0, ICC_SRE_EL3
 	ret
 
 
-write_icc_sre_el1: ; .type write_icc_sre_el1, %function
+func write_icc_sre_el1
 	msr	ICC_SRE_EL1, x0
 	isb
 	ret
 
 
-write_icc_sre_el2: ; .type write_icc_sre_el2, %function
+func write_icc_sre_el2
 	msr	ICC_SRE_EL2, x0
 	isb
 	ret
 
 
-write_icc_sre_el3: ; .type write_icc_sre_el3, %function
+func write_icc_sre_el3
 	msr	ICC_SRE_EL3, x0
 	isb
 	ret
 
 
-write_icc_pmr_el1: ; .type write_icc_pmr_el1, %function
+func write_icc_pmr_el1
 	msr	ICC_PMR_EL1, x0
 	isb
 	ret
diff --git a/bl1/aarch64/bl1_entrypoint.S b/bl1/aarch64/bl1_entrypoint.S
index dc63f39..012b779 100644
--- a/bl1/aarch64/bl1_entrypoint.S
+++ b/bl1/aarch64/bl1_entrypoint.S
@@ -29,12 +29,11 @@
  */
 
 #include <arch.h>
+#include <asm_macros.S>
 
 	.globl	bl1_entrypoint
 
 
-	.section	.text, "ax"; .align 3
-
 	/* -----------------------------------------------------
 	 * bl1_entrypoint() is the entry point into the trusted
 	 * firmware code when a cpu is released from warm or
@@ -42,7 +41,7 @@
 	 * -----------------------------------------------------
 	 */
 
-bl1_entrypoint: ; .type bl1_entrypoint, %function
+func bl1_entrypoint
 	/* ---------------------------------------------
 	 * Perform any processor specific actions upon
 	 * reset e.g. cache, tlb invalidations etc.
diff --git a/bl1/aarch64/bl1_exceptions.S b/bl1/aarch64/bl1_exceptions.S
index a80d178..e57f228 100644
--- a/bl1/aarch64/bl1_exceptions.S
+++ b/bl1/aarch64/bl1_exceptions.S
@@ -178,8 +178,7 @@
 
 	.align	7
 
-	.section	.text, "ax"
-process_exception:
+func process_exception
 	sub	sp, sp, #0x40
 	stp	x0, x1, [sp, #0x0]
 	stp	x2, x3, [sp, #0x10]
diff --git a/bl1/bl1.ld.S b/bl1/bl1.ld.S
index 012ff58..81c5443 100644
--- a/bl1/bl1.ld.S
+++ b/bl1/bl1.ld.S
@@ -43,8 +43,8 @@
 {
     ro : {
         __RO_START__ = .;
-        *bl1_entrypoint.o(.text)
-        *(.text)
+        *bl1_entrypoint.o(.text*)
+        *(.text*)
         *(.rodata*)
         *(.vectors)
         __RO_END__ = .;
@@ -57,7 +57,7 @@
     . = NEXT(16);        /* Align LMA */
     .data : ALIGN(16) {  /* Align VMA */
         __DATA_RAM_START__ = .;
-        *(.data)
+        *(.data*)
         __DATA_RAM_END__ = .;
     } >RAM AT>ROM
 
@@ -73,7 +73,7 @@
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
-        *(.bss)
+        *(.bss*)
         *(COMMON)
         __BSS_END__ = .;
     } >RAM
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index ebb3b94..6bc779a 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -30,15 +30,14 @@
 
 #include <bl_common.h>
 #include <arch.h>
+#include <asm_macros.S>
 
 
 	.globl	bl2_entrypoint
 
 
-	.section	.text, "ax"; .align 3
-
 
-bl2_entrypoint: ; .type bl2_entrypoint, %function
+func bl2_entrypoint
 	/*---------------------------------------------
 	 * Store the extents of the tzram available to
 	 * BL2 for future use. Use the opcode param to
diff --git a/bl2/bl2.ld.S b/bl2/bl2.ld.S
index 09dec75..edb676a 100644
--- a/bl2/bl2.ld.S
+++ b/bl2/bl2.ld.S
@@ -47,8 +47,8 @@
 
     ro . : {
         __RO_START__ = .;
-        *bl2_entrypoint.o(.text)
-        *(.text)
+        *bl2_entrypoint.o(.text*)
+        *(.text*)
         *(.rodata*)
         *(.vectors)
         __RO_END_UNALIGNED__ = .;
@@ -63,7 +63,7 @@
 
     .data . : {
         __DATA_START__ = .;
-        *(.data)
+        *(.data*)
         __DATA_END__ = .;
     } >RAM
 
@@ -79,7 +79,7 @@
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
-        *(SORT_BY_ALIGNMENT(.bss))
+        *(SORT_BY_ALIGNMENT(.bss*))
         *(COMMON)
         __BSS_END__ = .;
     } >RAM
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index 97f59f3..d35b50a 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -32,20 +32,19 @@
 #include <platform.h>
 #include <arch.h>
 #include "cm_macros.S"
+#include <asm_macros.S>
 
 
 	.globl	bl31_entrypoint
 
 
-	.section	.text, "ax"; .align 3
-
 	/* -----------------------------------------------------
 	 * bl31_entrypoint() is the cold boot entrypoint,
 	 * executed only by the primary cpu.
 	 * -----------------------------------------------------
 	 */
 
-bl31_entrypoint: ; .type bl31_entrypoint, %function
+func bl31_entrypoint
 	/* ---------------------------------------------
 	 * BL2 has populated x0 with the opcode
 	 * indicating BL31 should be run, x3 with
diff --git a/bl31/aarch64/context.S b/bl31/aarch64/context.S
index 5fe8bab..2b2e7bf 100644
--- a/bl31/aarch64/context.S
+++ b/bl31/aarch64/context.S
@@ -29,6 +29,7 @@
  */
 
 #include <context.h>
+#include <asm_macros.S>
 
 /* -----------------------------------------------------
  * The following function strictly follows the AArch64
@@ -39,7 +40,7 @@
  * -----------------------------------------------------
  */
 	.global el3_sysregs_context_save
-el3_sysregs_context_save:
+func el3_sysregs_context_save
 
 	mrs	x9, scr_el3
 	mrs	x10, sctlr_el3
@@ -75,7 +76,7 @@
  * -----------------------------------------------------
  */
 	.global el3_sysregs_context_restore
-el3_sysregs_context_restore:
+func el3_sysregs_context_restore
 
 	ldp	x11, xzr, [x0, #CTX_CPTR_EL3]
 	msr	cptr_el3, x11
@@ -112,7 +113,7 @@
  * -----------------------------------------------------
  */
 	.global el1_sysregs_context_save
-el1_sysregs_context_save:
+func el1_sysregs_context_save
 
 	mrs	x9, spsr_el1
 	mrs	x10, elr_el1
@@ -193,7 +194,7 @@
  * -----------------------------------------------------
  */
 	.global el1_sysregs_context_restore
-el1_sysregs_context_restore:
+func el1_sysregs_context_restore
 
 	ldp	x9, x10, [x0, #CTX_SPSR_EL1]
 	msr	spsr_el1, x9
@@ -284,7 +285,7 @@
  * -----------------------------------------------------
  */
 	.global fpregs_context_save
-fpregs_context_save:
+func fpregs_context_save
 	stp	q0, q1, [x0, #CTX_FP_Q0]
 	stp	q2, q3, [x0, #CTX_FP_Q2]
 	stp	q4, q5, [x0, #CTX_FP_Q4]
@@ -327,7 +328,7 @@
  * -----------------------------------------------------
  */
 	.global fpregs_context_restore
-fpregs_context_restore:
+func fpregs_context_restore
 	ldp	q0, q1, [x0, #CTX_FP_Q0]
 	ldp	q2, q3, [x0, #CTX_FP_Q2]
 	ldp	q4, q5, [x0, #CTX_FP_Q4]
diff --git a/bl31/aarch64/runtime_exceptions.S b/bl31/aarch64/runtime_exceptions.S
index 223514e..d00c1d7 100644
--- a/bl31/aarch64/runtime_exceptions.S
+++ b/bl31/aarch64/runtime_exceptions.S
@@ -203,7 +203,6 @@
 
 	.align	7
 
-	.section	.text, "ax"
 	/* -----------------------------------------------------
 	 * The following code handles secure monitor calls.
 	 * Depending upon the execution state from where the SMC
@@ -217,6 +216,7 @@
 	 * used here
 	 * -----------------------------------------------------
 	 */
+func smc_handler
 smc_handler32:
 	/* Check whether aarch32 issued an SMC64 */
 	tbnz	x0, #FUNCID_CC_SHIFT, smc_prohibited
@@ -330,6 +330,9 @@
 	 * This routine assumes that the SP_EL3 is pointing to
 	 * a valid context structure from where the gp regs and
 	 * other special registers can be retrieved.
+	 *
+	 * Keep it in the same section as smc_handler as this
+	 * function uses a fall-through to el3_exit
 	 * -----------------------------------------------------
 	 */
 el3_exit: ; .type el3_exit, %function
@@ -383,7 +386,7 @@
 	 * within the 32 instructions per exception vector.
 	 * -----------------------------------------------------
 	 */
-save_scratch_registers: ; .type save_scratch_registers, %function
+func save_scratch_registers
 	stp	x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
 	stp	x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
 	stp	x4, x5, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X4]
@@ -397,7 +400,7 @@
 	stp	x18, x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X18]
 	ret
 
-restore_scratch_registers: ; .type restore_scratch_registers, %function
+func restore_scratch_registers
 	ldp	x0, x1, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X0]
 	ldp	x2, x3, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X2]
 
@@ -430,23 +433,14 @@
 	 * reporting unhandled exceptions
 	 * -----------------------------------------------------
 	 */
-get_exception_stack: ; .type get_exception_stack, %function
+func get_exception_stack
 	mov	x10, x30 // lr
-	bl	platform_get_core_pos
-	add	x0, x0, #1
-	mov	x1, #PCPU_EXCEPTION_STACK_SIZE
-	mul	x0, x0, x1
-	ldr	x1, =pcpu_exception_stack
-	add	x0, x1, x0
+	get_mp_stack pcpu_exception_stack, PCPU_EXCEPTION_STACK_SIZE
 	ret	x10
 
 	/* -----------------------------------------------------
 	 * Per-cpu exception stacks in normal memory.
 	 * -----------------------------------------------------
 	 */
-	.section	tzfw_normal_stacks, "aw", %nobits; .align 6
-
-pcpu_exception_stack:
-	/* Zero fill */
-	.space (PLATFORM_CORE_COUNT * PCPU_EXCEPTION_STACK_SIZE), 0
-
+declare_stack pcpu_exception_stack, tzfw_normal_stacks, \
+		PCPU_EXCEPTION_STACK_SIZE, PLATFORM_CORE_COUNT
diff --git a/bl31/bl31.ld.S b/bl31/bl31.ld.S
index 844f169..1b818f5 100644
--- a/bl31/bl31.ld.S
+++ b/bl31/bl31.ld.S
@@ -48,14 +48,14 @@
 
     ro . : {
         __RO_START__ = .;
-        *bl31_entrypoint.o(.text)
-        *(.text)
+        *bl31_entrypoint.o(.text*)
+        *(.text*)
         *(.rodata*)
 
-        /* Ensure 8-byte alignment for descriptors */
+        /* Ensure 8-byte alignment for descriptors and ensure inclusion */
         . = ALIGN(8);
         __RT_SVC_DESCS_START__ = .;
-        *(rt_svc_descs)
+        KEEP(*(rt_svc_descs))
         __RT_SVC_DESCS_END__ = .;
 
         *(.vectors)
@@ -71,7 +71,7 @@
 
     .data . : {
         __DATA_START__ = .;
-        *(.data)
+        *(.data*)
         __DATA_END__ = .;
     } >RAM
 
@@ -87,7 +87,7 @@
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
-        *(.bss)
+        *(.bss*)
         *(COMMON)
         __BSS_END__ = .;
     } >RAM
diff --git a/bl32/tsp/aarch64/tsp_entrypoint.S b/bl32/tsp/aarch64/tsp_entrypoint.S
index fd02fd8..8bfe454 100644
--- a/bl32/tsp/aarch64/tsp_entrypoint.S
+++ b/bl32/tsp/aarch64/tsp_entrypoint.S
@@ -31,6 +31,7 @@
 #include <bl_common.h>
 #include <arch.h>
 #include <tsp.h>
+#include <asm_macros.S>
 
 
 	.globl	tsp_entrypoint
@@ -53,10 +54,8 @@
 	smc	#0
 	.endm
 
-	.section	.text, "ax"; .align 3
 
-
-tsp_entrypoint: ; .type tsp_entrypoint, %function
+func tsp_entrypoint
 	/*---------------------------------------------
 	 * Store the extents of the tzram available to
 	 * BL32 for future use.
@@ -161,7 +160,7 @@
 	 * here except for acknowledging the request.
 	 * ---------------------------------------------
 	 */
-tsp_cpu_off_entry: ; .type tsp_cpu_off_entry, %function
+func tsp_cpu_off_entry
 	bl	tsp_cpu_off_main
 	restore_args_call_smc
 
@@ -176,7 +175,7 @@
 	 * will be aarch64 and exceptions masked.
 	 * ---------------------------------------------
 	 */
-tsp_cpu_on_entry: ; .type tsp_cpu_on_entry, %function
+func tsp_cpu_on_entry
 	/* ---------------------------------------------
 	 * Set the exception vector to something sane.
 	 * ---------------------------------------------
@@ -236,7 +235,7 @@
 	 * the EL1 state.
 	 * ---------------------------------------------
 	 */
-tsp_cpu_suspend_entry: ; .type tsp_cpu_suspend_entry, %function
+func tsp_cpu_suspend_entry
 	bl	tsp_cpu_suspend_main
 	restore_args_call_smc
 
@@ -250,7 +249,7 @@
 	 * acknowledging the request.
 	 * ---------------------------------------------
 	 */
-tsp_cpu_resume_entry: ; .type tsp_cpu_resume_entry, %function
+func tsp_cpu_resume_entry
 	bl	tsp_cpu_resume_main
 	restore_args_call_smc
 tsp_cpu_resume_panic:
@@ -261,7 +260,7 @@
 	 * the TSP to service a fast smc request.
 	 * ---------------------------------------------
 	 */
-tsp_fast_smc_entry: ; .type tsp_fast_smc_entry, %function
+func tsp_fast_smc_entry
 	bl	tsp_fast_smc_handler
 	restore_args_call_smc
 tsp_fast_smc_entry_panic:
diff --git a/bl32/tsp/aarch64/tsp_request.S b/bl32/tsp/aarch64/tsp_request.S
index 13e5931..da7d6e6 100644
--- a/bl32/tsp/aarch64/tsp_request.S
+++ b/bl32/tsp/aarch64/tsp_request.S
@@ -29,18 +29,17 @@
  */
 
 #include <tsp.h>
+#include <asm_macros.S>
 
 	.globl tsp_get_magic
 
-	.section	.text, "ax"; .align 3
-
 
 /*
  * This function raises an SMC to retrieve arguments from secure
  * monitor/dispatcher, saves the returned arguments the array received in x0,
  * and then returns to the caller
  */
-tsp_get_magic:
+func tsp_get_magic
 	/* Save address to stack */
 	stp	x0, xzr, [sp, #-16]!
 
diff --git a/bl32/tsp/tsp-fvp.mk b/bl32/tsp/tsp-fvp.mk
index ead30ef..02fae09 100644
--- a/bl32/tsp/tsp-fvp.mk
+++ b/bl32/tsp/tsp-fvp.mk
@@ -34,4 +34,5 @@
 
 # TSP source files specific to FVP platform
 BL32_SOURCES		+=	bl32_plat_setup.c			\
+				platform_mp_stack.S			\
 				plat_common.c
diff --git a/bl32/tsp/tsp.ld.S b/bl32/tsp/tsp.ld.S
index 74b03ad..53bce7d 100644
--- a/bl32/tsp/tsp.ld.S
+++ b/bl32/tsp/tsp.ld.S
@@ -48,8 +48,8 @@
 
     ro . : {
         __RO_START__ = .;
-        *tsp_entrypoint.o(.text)
-        *(.text)
+        *tsp_entrypoint.o(.text*)
+        *(.text*)
         *(.rodata*)
         *(.vectors)
         __RO_END_UNALIGNED__ = .;
@@ -64,7 +64,7 @@
 
     .data . : {
         __DATA_START__ = .;
-        *(.data)
+        *(.data*)
         __DATA_END__ = .;
     } >RAM
 
@@ -80,7 +80,7 @@
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
-        *(SORT_BY_ALIGNMENT(.bss))
+        *(SORT_BY_ALIGNMENT(.bss*))
         *(COMMON)
         __BSS_END__ = .;
     } >RAM
diff --git a/docs/porting-guide.md b/docs/porting-guide.md
index 5dca6fd..868eb33 100644
--- a/docs/porting-guide.md
+++ b/docs/porting-guide.md
@@ -81,7 +81,7 @@
 
 Each platform must export a header file of this name with the following
 constants defined. In the ARM FVP port, this file is found in
-[../plat/fvp/platform.h].
+[plat/fvp/platform.h].
 
 *   **#define : PLATFORM_LINKER_FORMAT**
 
@@ -96,7 +96,14 @@
 *   **#define : PLATFORM_STACK_SIZE**
 
     Defines the normal stack memory available to each CPU. This constant is used
-    by `platform_set_stack()`.
+    by [plat/common/aarch64/platform_mp_stack.S] and
+    [plat/common/aarch64/platform_up_stack.S].
+
+*   **#define : PCPU_DV_MEM_STACK_SIZE**
+
+    Defines the coherent stack memory available to each CPU. This constant is used
+    by [plat/common/aarch64/platform_mp_stack.S] and
+    [plat/common/aarch64/platform_up_stack.S].
 
 *   **#define : FIRMWARE_WELCOME_STR**
 
@@ -192,21 +199,7 @@
 
 The following mandatory modifications may be implemented in any file
 the implementer chooses. In the ARM FVP port, they are implemented in
-[../plat/fvp/aarch64/plat_common.c].
-
-*   **Variable : unsigned char platform_normal_stacks[X][Y]**
-
-        where  X = PLATFORM_STACK_SIZE
-          and  Y = PLATFORM_CORE_COUNT
-
-    Each platform must allocate a block of memory with Normal Cacheable, Write
-    back, Write allocate and Inner Shareable attributes aligned to the size (in
-    bytes) of the largest cache line amongst all caches implemented in the
-    system. A pointer to this memory should be exported with the name
-    `platform_normal_stacks`. This pointer is used by the common platform helper
-    functions `platform_set_stack()` (to allocate a stack for each CPU in the
-    platform)  & `platform_get_stack()` (to return the base address of that
-    stack) (see [../plat/common/aarch64/platform_helpers.S]).
+[plat/fvp/aarch64/plat_common.c].
 
 *   **Function : uint64_t plat_get_syscnt_freq(void)**
 
@@ -216,6 +209,7 @@
     In the ARM FVP port, it returns the base frequency of the system counter,
     which is retrieved from the first entry in the frequency modes table.
 
+
 2.2 Common optional modifications
 ---------------------------------
 
@@ -253,13 +247,18 @@
 *   Flushing caches prior to powering down a CPU or cluster.
 
 Each BL stage allocates this coherent stack memory for each CPU in the
-`tzfw_coherent_mem` section. A pointer to this memory (`pcpu_dv_mem_stack`) is
-used by this function to allocate a coherent stack for each CPU. A CPU is
-identified by its `MPIDR`, which is passed as an argument to this function.
+`tzfw_coherent_mem` section.
 
-The size of the stack allocated to each CPU is specified by the constant
+This function sets the current stack pointer to the coherent stack that
+has been allocated for the CPU specified by MPIDR. For BL images that only
+require a stack for the primary CPU the parameter is ignored. The size of
+the stack allocated to each CPU is specified by the platform defined constant
 `PCPU_DV_MEM_STACK_SIZE`.
 
+Common implementations of this function for the UP and MP BL images are
+provided in [plat/common/aarch64/platform_up_stack.S] and
+[plat/common/aarch64/platform_mp_stack.S].
+
 
 ### Function : platform_is_primary_cpu()
 
@@ -277,13 +276,15 @@
     Argument : unsigned long
     Return   : void
 
-This function uses the `platform_normal_stacks` pointer variable to allocate
-stacks to each CPU. Further details are given in the description of the
-`platform_normal_stacks` variable below. A CPU is identified by its `MPIDR`,
-which is passed as the argument.
+This function sets the current stack pointer to the normal memory stack that
+has been allocated for the CPU specified by MPIDR. For BL images that only
+require a stack for the primary CPU the parameter is ignored. The size of
+the stack allocated to each CPU is specified by the platform defined constant
+`PLATFORM_STACK_SIZE`.
 
-The size of the stack allocated to each CPU is specified by the platform defined
-constant `PLATFORM_STACK_SIZE`.
+Common implementations of this function for the UP and MP BL images are
+provided in [plat/common/aarch64/platform_up_stack.S] and
+[plat/common/aarch64/platform_mp_stack.S].
 
 
 ### Function : platform_get_stack()
@@ -291,13 +292,15 @@
     Argument : unsigned long
     Return   : unsigned long
 
-This function uses the `platform_normal_stacks` pointer variable to return the
-base address of the stack memory reserved for a CPU. Further details are given
-in the description of the `platform_normal_stacks` variable below. A CPU is
-identified by its `MPIDR`, which is passed as the argument.
+This function returns the base address of the normal memory stack that
+has been allocated for the CPU specified by MPIDR. For BL images that only
+require a stack for the primary CPU the parameter is ignored. The size of
+the stack allocated to each CPU is specified by the platform defined constant
+`PLATFORM_STACK_SIZE`.
 
-The size of the stack allocated to each CPU is specified by the platform defined
-constant `PLATFORM_STACK_SIZE`.
+Common implementations of this function for the UP and MP BL images are
+provided in [plat/common/aarch64/platform_up_stack.S] and
+[plat/common/aarch64/platform_mp_stack.S].
 
 
 ### Function : plat_report_exception()
@@ -319,7 +322,7 @@
 about the way the platform displays its status information.
 
 This function receives the exception type as its argument. Possible values for
-exceptions types are listed in the [../include/runtime_svc.h] header file. Note
+exception types are listed in the [include/runtime_svc.h] header file. Note
 that these constants are not related to any architectural exception code; they
 are just an ARM Trusted Firmware convention.
 
@@ -933,7 +936,7 @@
 the passed pointer with a pointer to BL3-1's private `plat_pm_ops` structure.
 
 A description of each member of this structure is given below. Please refer to
-the ARM FVP specific implementation of these handlers in [../plat/fvp/plat_pm.c]
+the ARM FVP specific implementation of these handlers in [plat/fvp/plat_pm.c]
 as an example. A platform port may choose not implement some of the power
 management operations. For example, the ARM FVP port does not implement the
 `affinst_standby()` function.
@@ -1135,8 +1138,9 @@
 [User Guide]: user-guide.md
 [FreeBSD]:    http://www.freebsd.org
 
-[../plat/common/aarch64/platform_helpers.S]: ../plat/common/aarch64/platform_helpers.S
-[../plat/fvp/platform.h]:                    ../plat/fvp/platform.h
-[../plat/fvp/aarch64/plat_common.c]:          ../plat/fvp/aarch64/plat_common.c
-[../plat/fvp/plat_pm.c]:                      ../plat/fvp/plat_pm.c
-[../include/runtime_svc.h]:                  ../include/runtime_svc.h
+[plat/common/aarch64/platform_mp_stack.S]: ../plat/common/aarch64/platform_mp_stack.S
+[plat/common/aarch64/platform_up_stack.S]: ../plat/common/aarch64/platform_up_stack.S
+[plat/fvp/platform.h]:                     ../plat/fvp/platform.h
+[plat/fvp/aarch64/plat_common.c]:          ../plat/fvp/aarch64/plat_common.c
+[plat/fvp/plat_pm.c]:                      ../plat/fvp/plat_pm.c
+[include/runtime_svc.h]:                   ../include/runtime_svc.h
diff --git a/include/asm_macros.S b/include/asm_macros.S
index f94d75f..8bcb7d2 100644
--- a/include/asm_macros.S
+++ b/include/asm_macros.S
@@ -80,3 +80,54 @@
 	    .error "Vector exceeds 32 instructions"
 	  .endif
 	.endm
+
+	/*
+	 * This macro is used to create a function label and place the
+	 * code into a separate text section based on the function name
+	 * to enable elimination of unused code during linking
+	 */
+	.macro func _name
+	.section .text.\_name, "ax"
+	.type \_name, %function
+	\_name:
+	.endm
+
+	/*
+	 * This macro declares an array of 1 or more stacks, properly
+	 * aligned and in the requested section
+	 */
+#define STACK_ALIGN	6
+
+	.macro declare_stack _name, _section, _size, _count
+	.if ((\_size & ((1 << STACK_ALIGN) - 1)) <> 0)
+	  .error "Stack size not correctly aligned"
+	.endif
+	.section    \_section, "aw", %nobits
+	.align STACK_ALIGN
+	\_name:
+	.space ((\_count) * (\_size)), 0
+	.endm
+
+	/*
+	 * This macro calculates the base address of an MP stack using the
+	 * platform_get_core_pos() index, the name of the stack storage and
+	 * the size of each stack
+	 * In: X0 = MPIDR of CPU whose stack is wanted
+	 * Out: X0 = physical address of stack base
+	 * Clobber: X30, X1, X2
+	 */
+	.macro get_mp_stack _name, _size
+	bl  platform_get_core_pos
+	ldr x2, =(\_name + \_size)
+	mov x1, #\_size
+	madd x0, x0, x1, x2
+	.endm
+
+	/*
+	 * This macro calculates the base address of a UP stack using the
+	 * name of the stack storage and the size of the stack
+	 * Out: X0 = physical address of stack base
+	 */
+	.macro get_up_stack _name, _size
+	ldr x0, =(\_name + \_size)
+	.endm
diff --git a/lib/arch/aarch64/cache_helpers.S b/lib/arch/aarch64/cache_helpers.S
index 26e4ba7..2696d90 100644
--- a/lib/arch/aarch64/cache_helpers.S
+++ b/lib/arch/aarch64/cache_helpers.S
@@ -44,58 +44,56 @@
 	.globl	dcsw_op_louis
 	.globl	dcsw_op_all
 
-	.section	.text, "ax"; .align 3
-
-dcisw: ; .type dcisw, %function
+func dcisw
 	dc	isw, x0
 	dsb	sy
 	isb
 	ret
 
 
-dccisw: ; .type dccisw, %function
+func dccisw
 	dc	cisw, x0
 	dsb	sy
 	isb
 	ret
 
 
-dccsw: ; .type dccsw, %function
+func dccsw
 	dc	csw, x0
 	dsb	sy
 	isb
 	ret
 
 
-dccvac: ; .type dccvac, %function
+func dccvac
 	dc	cvac, x0
 	dsb	sy
 	isb
 	ret
 
 
-dcivac: ; .type dcivac, %function
+func dcivac
 	dc	ivac, x0
 	dsb	sy
 	isb
 	ret
 
 
-dccivac: ; .type dccivac, %function
+func dccivac
 	dc	civac, x0
 	dsb	sy
 	isb
 	ret
 
 
-dccvau: ; .type dccvau, %function
+func dccvau
 	dc	cvau, x0
 	dsb	sy
 	isb
 	ret
 
 
-dczva: ; .type dczva, %function
+func dczva
 	dc	zva, x0
 	dsb	sy
 	isb
@@ -107,7 +105,7 @@
 	 * size. 'x0' = addr, 'x1' = size
 	 * ------------------------------------------
 	 */
-flush_dcache_range: ; .type flush_dcache_range, %function
+func flush_dcache_range
 	dcache_line_size x2, x3
 	add	x1, x0, x1
 	sub	x3, x2, #1
@@ -126,7 +124,7 @@
 	 * size. 'x0' = addr, 'x1' = size
 	 * ------------------------------------------
 	 */
-inv_dcache_range: ; .type inv_dcache_range, %function
+func inv_dcache_range
 	dcache_line_size x2, x3
 	add	x1, x0, x1
 	sub	x3, x2, #1
@@ -151,7 +149,7 @@
 	 * x14
 	 * ----------------------------------
 	 */
-dcsw_op: ; .type dcsw_op, %function
+func dcsw_op
 all_start_at_level:
 	add	x2, x10, x10, lsr #1            // work out 3x current cache level
 	lsr	x1, x0, x2                      // extract cache type bits from clidr
@@ -197,7 +195,7 @@
 	ret
 
 
-do_dcsw_op: ; .type do_dcsw_op, %function
+func do_dcsw_op
 	cbz	x3, exit
 	cmp	x0, #DCISW
 	b.eq	dc_isw
@@ -221,13 +219,13 @@
 	ret
 
 
-dcsw_op_louis: ; .type dcsw_op_louis, %function
+func dcsw_op_louis
 	dsb	sy
 	setup_dcsw_op_args x10, x3, x9, #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
 	b	do_dcsw_op
 
 
-dcsw_op_all: ; .type dcsw_op_all, %function
+func dcsw_op_all
 	dsb	sy
 	setup_dcsw_op_args x10, x3, x9, #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
 	b	do_dcsw_op
diff --git a/lib/arch/aarch64/misc_helpers.S b/lib/arch/aarch64/misc_helpers.S
index 324be76..e3b4ab5 100644
--- a/lib/arch/aarch64/misc_helpers.S
+++ b/lib/arch/aarch64/misc_helpers.S
@@ -30,6 +30,7 @@
 
 #include <arch_helpers.h>
 #include <runtime_svc.h>
+#include <asm_macros.S>
 
 	.globl	enable_irq
 	.globl	disable_irq
@@ -79,16 +80,15 @@
 	.globl	zeromem16
 	.globl	memcpy16
 
-	.section	.text, "ax"
 
-get_afflvl_shift: ; .type get_afflvl_shift, %function
+func get_afflvl_shift
 	cmp	x0, #3
 	cinc	x0, x0, eq
 	mov	x1, #MPIDR_AFFLVL_SHIFT
 	lsl	x0, x0, x1
 	ret
 
-mpidr_mask_lower_afflvls: ; .type mpidr_mask_lower_afflvls, %function
+func mpidr_mask_lower_afflvls
 	cmp	x1, #3
 	cinc	x1, x1, eq
 	mov	x2, #MPIDR_AFFLVL_SHIFT
@@ -101,57 +101,57 @@
 	 * Asynchronous exception manipulation accessors
 	 * -----------------------------------------------------
 	 */
-enable_irq: ; .type enable_irq, %function
+func enable_irq
 	msr	daifclr, #DAIF_IRQ_BIT
 	ret
 
 
-enable_fiq: ; .type enable_fiq, %function
+func enable_fiq
 	msr	daifclr, #DAIF_FIQ_BIT
 	ret
 
 
-enable_serror: ; .type enable_serror, %function
+func enable_serror
 	msr	daifclr, #DAIF_ABT_BIT
 	ret
 
 
-enable_debug_exceptions:
+func enable_debug_exceptions
 	msr	daifclr, #DAIF_DBG_BIT
 	ret
 
 
-disable_irq: ; .type disable_irq, %function
+func disable_irq
 	msr	daifset, #DAIF_IRQ_BIT
 	ret
 
 
-disable_fiq: ; .type disable_fiq, %function
+func disable_fiq
 	msr	daifset, #DAIF_FIQ_BIT
 	ret
 
 
-disable_serror: ; .type disable_serror, %function
+func disable_serror
 	msr	daifset, #DAIF_ABT_BIT
 	ret
 
 
-disable_debug_exceptions:
+func disable_debug_exceptions
 	msr	daifset, #DAIF_DBG_BIT
 	ret
 
 
-read_daif: ; .type read_daif, %function
+func read_daif
 	mrs	x0, daif
 	ret
 
 
-write_daif: ; .type write_daif, %function
+func write_daif
 	msr	daif, x0
 	ret
 
 
-read_spsr: ; .type read_spsr, %function
+func read_spsr
 	mrs	x0, CurrentEl
 	cmp	x0, #(MODE_EL1 << MODE_EL_SHIFT)
 	b.eq	read_spsr_el1
@@ -161,22 +161,22 @@
 	b.eq	read_spsr_el3
 
 
-read_spsr_el1: ; .type read_spsr_el1, %function
+func read_spsr_el1
 	mrs	x0, spsr_el1
 	ret
 
 
-read_spsr_el2: ; .type read_spsr_el2, %function
+func read_spsr_el2
 	mrs	x0, spsr_el2
 	ret
 
 
-read_spsr_el3: ; .type read_spsr_el3, %function
+func read_spsr_el3
 	mrs	x0, spsr_el3
 	ret
 
 
-write_spsr: ; .type write_spsr, %function
+func write_spsr
 	mrs	x1, CurrentEl
 	cmp	x1, #(MODE_EL1 << MODE_EL_SHIFT)
 	b.eq	write_spsr_el1
@@ -186,25 +186,25 @@
 	b.eq	write_spsr_el3
 
 
-write_spsr_el1: ; .type write_spsr_el1, %function
+func write_spsr_el1
 	msr	spsr_el1, x0
 	isb
 	ret
 
 
-write_spsr_el2: ; .type write_spsr_el2, %function
+func write_spsr_el2
 	msr	spsr_el2, x0
 	isb
 	ret
 
 
-write_spsr_el3: ; .type write_spsr_el3, %function
+func write_spsr_el3
 	msr	spsr_el3, x0
 	isb
 	ret
 
 
-read_elr: ; .type read_elr, %function
+func read_elr
 	mrs	x0, CurrentEl
 	cmp	x0, #(MODE_EL1 << MODE_EL_SHIFT)
 	b.eq	read_elr_el1
@@ -214,22 +214,22 @@
 	b.eq	read_elr_el3
 
 
-read_elr_el1: ; .type read_elr_el1, %function
+func read_elr_el1
 	mrs	x0, elr_el1
 	ret
 
 
-read_elr_el2: ; .type read_elr_el2, %function
+func read_elr_el2
 	mrs	x0, elr_el2
 	ret
 
 
-read_elr_el3: ; .type read_elr_el3, %function
+func read_elr_el3
 	mrs	x0, elr_el3
 	ret
 
 
-write_elr: ; .type write_elr, %function
+func write_elr
 	mrs	x1, CurrentEl
 	cmp	x1, #(MODE_EL1 << MODE_EL_SHIFT)
 	b.eq	write_elr_el1
@@ -239,54 +239,54 @@
 	b.eq	write_elr_el3
 
 
-write_elr_el1: ; .type write_elr_el1, %function
+func write_elr_el1
 	msr	elr_el1, x0
 	isb
 	ret
 
 
-write_elr_el2: ; .type write_elr_el2, %function
+func write_elr_el2
 	msr	elr_el2, x0
 	isb
 	ret
 
 
-write_elr_el3: ; .type write_elr_el3, %function
+func write_elr_el3
 	msr	elr_el3, x0
 	isb
 	ret
 
 
-dsb: ; .type dsb, %function
+func dsb
 	dsb	sy
 	ret
 
 
-isb: ; .type isb, %function
+func isb
 	isb
 	ret
 
 
-sev: ; .type sev, %function
+func sev
 	sev
 	ret
 
 
-wfe: ; .type wfe, %function
+func wfe
 	wfe
 	ret
 
 
-wfi: ; .type wfi, %function
+func wfi
 	wfi
 	ret
 
 
-eret: ; .type eret, %function
+func eret
 	eret
 
 
-smc: ; .type smc, %function
+func smc
 	smc	#0
 
 /* -----------------------------------------------------------------------
@@ -296,7 +296,7 @@
  * The memory address must be 16-byte aligned.
  * -----------------------------------------------------------------------
  */
-zeromem16:
+func zeromem16
 	add	x2, x0, x1
 /* zero 16 bytes at a time */
 z_loop16:
@@ -322,7 +322,7 @@
  * Destination and source addresses must be 16-byte aligned.
  * --------------------------------------------------------------------------
  */
-memcpy16:
+func memcpy16
 /* copy 16 bytes at a time */
 m_loop16:
 	cmp	x2, #16
diff --git a/lib/arch/aarch64/sysreg_helpers.S b/lib/arch/aarch64/sysreg_helpers.S
index dfd0f2f..8e816f0 100644
--- a/lib/arch/aarch64/sysreg_helpers.S
+++ b/lib/arch/aarch64/sysreg_helpers.S
@@ -29,6 +29,7 @@
  */
 
 #include <arch_helpers.h>
+#include <asm_macros.S>
 
 	.globl	read_vbar_el1
 	.globl	read_vbar_el2
@@ -164,19 +165,17 @@
 #endif
 
 
-	.section	.text, "ax"
-
-read_current_el: ; .type read_current_el, %function
+func read_current_el
 	mrs	x0, CurrentEl
 	ret
 
 
-read_id_pfr1_el1: ; .type read_id_pfr1_el1, %function
+func read_id_pfr1_el1
 	mrs	x0, id_pfr1_el1
 	ret
 
 
-read_id_aa64pfr0_el1: ; .type read_id_aa64pfr0_el1, %function
+func read_id_aa64pfr0_el1
 	mrs	x0, id_aa64pfr0_el1
 	ret
 
@@ -185,34 +184,34 @@
 	 * VBAR accessors
 	 * -----------------------------------------------------
 	 */
-read_vbar_el1: ; .type read_vbar_el1, %function
+func read_vbar_el1
 	mrs	x0, vbar_el1
 	ret
 
 
-read_vbar_el2: ; .type read_vbar_el2, %function
+func read_vbar_el2
 	mrs	x0, vbar_el2
 	ret
 
 
-read_vbar_el3: ; .type read_vbar_el3, %function
+func read_vbar_el3
 	mrs	x0, vbar_el3
 	ret
 
 
-write_vbar_el1: ; .type write_vbar_el1, %function
+func write_vbar_el1
 	msr	vbar_el1, x0
 	isb
 	ret
 
 
-write_vbar_el2: ; .type write_vbar_el2, %function
+func write_vbar_el2
 	msr	vbar_el2, x0
 	isb
 	ret
 
 
-write_vbar_el3: ; .type write_vbar_el3, %function
+func write_vbar_el3
 	msr	vbar_el3, x0
 	isb
 	ret
@@ -222,34 +221,34 @@
 	 * AFSR0 accessors
 	 * -----------------------------------------------------
 	 */
-read_afsr0_el1: ; .type read_afsr0_el1, %function
+func read_afsr0_el1
 	mrs	x0, afsr0_el1
 	ret
 
 
-read_afsr0_el2: ; .type read_afsr0_el2, %function
+func read_afsr0_el2
 	mrs	x0, afsr0_el2
 	ret
 
 
-read_afsr0_el3: ; .type read_afsr0_el3, %function
+func read_afsr0_el3
 	mrs	x0, afsr0_el3
 	ret
 
 
-write_afsr0_el1: ; .type write_afsr0_el1, %function
+func write_afsr0_el1
 	msr	afsr0_el1, x0
 	isb
 	ret
 
 
-write_afsr0_el2: ; .type write_afsr0_el2, %function
+func write_afsr0_el2
 	msr	afsr0_el2, x0
 	isb
 	ret
 
 
-write_afsr0_el3: ; .type write_afsr0_el3, %function
+func write_afsr0_el3
 	msr	afsr0_el3, x0
 	isb
 	ret
@@ -259,34 +258,34 @@
 	 * FAR accessors
 	 * -----------------------------------------------------
 	 */
-read_far_el1: ; .type read_far_el1, %function
+func read_far_el1
 	mrs	x0, far_el1
 	ret
 
 
-read_far_el2: ; .type read_far_el2, %function
+func read_far_el2
 	mrs	x0, far_el2
 	ret
 
 
-read_far_el3: ; .type read_far_el3, %function
+func read_far_el3
 	mrs	x0, far_el3
 	ret
 
 
-write_far_el1: ; .type write_far_el1, %function
+func write_far_el1
 	msr	far_el1, x0
 	isb
 	ret
 
 
-write_far_el2: ; .type write_far_el2, %function
+func write_far_el2
 	msr	far_el2, x0
 	isb
 	ret
 
 
-write_far_el3: ; .type write_far_el3, %function
+func write_far_el3
 	msr	far_el3, x0
 	isb
 	ret
@@ -296,34 +295,34 @@
 	 * MAIR accessors
 	 * -----------------------------------------------------
 	 */
-read_mair_el1: ; .type read_mair_el1, %function
+func read_mair_el1
 	mrs	x0, mair_el1
 	ret
 
 
-read_mair_el2: ; .type read_mair_el2, %function
+func read_mair_el2
 	mrs	x0, mair_el2
 	ret
 
 
-read_mair_el3: ; .type read_mair_el3, %function
+func read_mair_el3
 	mrs	x0, mair_el3
 	ret
 
 
-write_mair_el1: ; .type write_mair_el1, %function
+func write_mair_el1
 	msr	mair_el1, x0
 	isb
 	ret
 
 
-write_mair_el2: ; .type write_mair_el2, %function
+func write_mair_el2
 	msr	mair_el2, x0
 	isb
 	ret
 
 
-write_mair_el3: ; .type write_mair_el3, %function
+func write_mair_el3
 	msr	mair_el3, x0
 	isb
 	ret
@@ -333,34 +332,34 @@
 	 * AMAIR accessors
 	 * -----------------------------------------------------
 	 */
-read_amair_el1: ; .type read_amair_el1, %function
+func read_amair_el1
 	mrs	x0, amair_el1
 	ret
 
 
-read_amair_el2: ; .type read_amair_el2, %function
+func read_amair_el2
 	mrs	x0, amair_el2
 	ret
 
 
-read_amair_el3: ; .type read_amair_el3, %function
+func read_amair_el3
 	mrs	x0, amair_el3
 	ret
 
 
-write_amair_el1: ; .type write_amair_el1, %function
+func write_amair_el1
 	msr	amair_el1, x0
 	isb
 	ret
 
 
-write_amair_el2: ; .type write_amair_el2, %function
+func write_amair_el2
 	msr	amair_el2, x0
 	isb
 	ret
 
 
-write_amair_el3: ; .type write_amair_el3, %function
+func write_amair_el3
 	msr	amair_el3, x0
 	isb
 	ret
@@ -370,17 +369,17 @@
 	 * RVBAR accessors
 	 * -----------------------------------------------------
 	 */
-read_rvbar_el1: ; .type read_rvbar_el1, %function
+func read_rvbar_el1
 	mrs	x0, rvbar_el1
 	ret
 
 
-read_rvbar_el2: ; .type read_rvbar_el2, %function
+func read_rvbar_el2
 	mrs	x0, rvbar_el2
 	ret
 
 
-read_rvbar_el3: ; .type read_rvbar_el3, %function
+func read_rvbar_el3
 	mrs	x0, rvbar_el3
 	ret
 
@@ -389,34 +388,34 @@
 	 * RMR accessors
 	 * -----------------------------------------------------
 	 */
-read_rmr_el1: ; .type read_rmr_el1, %function
+func read_rmr_el1
 	mrs	x0, rmr_el1
 	ret
 
 
-read_rmr_el2: ; .type read_rmr_el2, %function
+func read_rmr_el2
 	mrs	x0, rmr_el2
 	ret
 
 
-read_rmr_el3: ; .type read_rmr_el3, %function
+func read_rmr_el3
 	mrs	x0, rmr_el3
 	ret
 
 
-write_rmr_el1: ; .type write_rmr_el1, %function
+func write_rmr_el1
 	msr	rmr_el1, x0
 	isb
 	ret
 
 
-write_rmr_el2: ; .type write_rmr_el2, %function
+func write_rmr_el2
 	msr	rmr_el2, x0
 	isb
 	ret
 
 
-write_rmr_el3: ; .type write_rmr_el3, %function
+func write_rmr_el3
 	msr	rmr_el3, x0
 	isb
 	ret
@@ -426,34 +425,34 @@
 	 * AFSR1 accessors
 	 * -----------------------------------------------------
 	 */
-read_afsr1_el1: ; .type read_afsr1_el1, %function
+func read_afsr1_el1
 	mrs	x0, afsr1_el1
 	ret
 
 
-read_afsr1_el2: ; .type read_afsr1_el2, %function
+func read_afsr1_el2
 	mrs	x0, afsr1_el2
 	ret
 
 
-read_afsr1_el3: ; .type read_afsr1_el3, %function
+func read_afsr1_el3
 	mrs	x0, afsr1_el3
 	ret
 
 
-write_afsr1_el1: ; .type write_afsr1_el1, %function
+func write_afsr1_el1
 	msr	afsr1_el1, x0
 	isb
 	ret
 
 
-write_afsr1_el2: ; .type write_afsr1_el2, %function
+func write_afsr1_el2
 	msr	afsr1_el2, x0
 	isb
 	ret
 
 
-write_afsr1_el3: ; .type write_afsr1_el3, %function
+func write_afsr1_el3
 	msr	afsr1_el3, x0
 	isb
 	ret
@@ -463,36 +462,36 @@
 	 * SCTLR accessors
 	 * -----------------------------------------------------
 	 */
-read_sctlr_el1: ; .type read_sctlr_el1, %function
+func read_sctlr_el1
 	mrs	x0, sctlr_el1
 	ret
 
 
-read_sctlr_el2: ; .type read_sctlr_el2, %function
+func read_sctlr_el2
 	mrs	x0, sctlr_el2
 	ret
 
 
-read_sctlr_el3: ; .type read_sctlr_el3, %function
+func read_sctlr_el3
 	mrs	x0, sctlr_el3
 	ret
 
 
-write_sctlr_el1: ; .type write_sctlr_el1, %function
+func write_sctlr_el1
 	msr	sctlr_el1, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_sctlr_el2: ; .type write_sctlr_el2, %function
+func write_sctlr_el2
 	msr	sctlr_el2, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_sctlr_el3: ; .type write_sctlr_el3, %function
+func write_sctlr_el3
 	msr	sctlr_el3, x0
 	dsb	sy
 	isb
@@ -503,36 +502,36 @@
 	 * ACTLR accessors
 	 * -----------------------------------------------------
 	 */
-read_actlr_el1: ; .type read_actlr_el1, %function
+func read_actlr_el1
 	mrs	x0, actlr_el1
 	ret
 
 
-read_actlr_el2: ; .type read_actlr_el2, %function
+func read_actlr_el2
 	mrs	x0, actlr_el2
 	ret
 
 
-read_actlr_el3: ; .type read_actlr_el3, %function
+func read_actlr_el3
 	mrs	x0, actlr_el3
 	ret
 
 
-write_actlr_el1: ; .type write_actlr_el1, %function
+func write_actlr_el1
 	msr	actlr_el1, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_actlr_el2: ; .type write_actlr_el2, %function
+func write_actlr_el2
 	msr	actlr_el2, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_actlr_el3: ; .type write_actlr_el3, %function
+func write_actlr_el3
 	msr	actlr_el3, x0
 	dsb	sy
 	isb
@@ -543,36 +542,36 @@
 	 * ESR accessors
 	 * -----------------------------------------------------
 	 */
-read_esr_el1: ; .type read_esr_el1, %function
+func read_esr_el1
 	mrs	x0, esr_el1
 	ret
 
 
-read_esr_el2: ; .type read_esr_el2, %function
+func read_esr_el2
 	mrs	x0, esr_el2
 	ret
 
 
-read_esr_el3: ; .type read_esr_el3, %function
+func read_esr_el3
 	mrs	x0, esr_el3
 	ret
 
 
-write_esr_el1: ; .type write_esr_el1, %function
+func write_esr_el1
 	msr	esr_el1, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_esr_el2: ; .type write_esr_el2, %function
+func write_esr_el2
 	msr	esr_el2, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_esr_el3: ; .type write_esr_el3, %function
+func write_esr_el3
 	msr	esr_el3, x0
 	dsb	sy
 	isb
@@ -583,36 +582,36 @@
 	 * TCR accessors
 	 * -----------------------------------------------------
 	 */
-read_tcr_el1: ; .type read_tcr_el1, %function
+func read_tcr_el1
 	mrs	x0, tcr_el1
 	ret
 
 
-read_tcr_el2: ; .type read_tcr_el2, %function
+func read_tcr_el2
 	mrs	x0, tcr_el2
 	ret
 
 
-read_tcr_el3: ; .type read_tcr_el3, %function
+func read_tcr_el3
 	mrs	x0, tcr_el3
 	ret
 
 
-write_tcr_el1: ; .type write_tcr_el1, %function
+func write_tcr_el1
 	msr	tcr_el1, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_tcr_el2: ; .type write_tcr_el2, %function
+func write_tcr_el2
 	msr	tcr_el2, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_tcr_el3: ; .type write_tcr_el3, %function
+func write_tcr_el3
 	msr	tcr_el3, x0
 	dsb	sy
 	isb
@@ -623,33 +622,33 @@
 	 * CPTR accessors
 	 * -----------------------------------------------------
 	 */
-read_cptr_el1: ; .type read_cptr_el1, %function
+func read_cptr_el1
 	b	read_cptr_el1
 	ret
 
 
-read_cptr_el2: ; .type read_cptr_el2, %function
+func read_cptr_el2
 	mrs	x0, cptr_el2
 	ret
 
 
-read_cptr_el3: ; .type read_cptr_el3, %function
+func read_cptr_el3
 	mrs	x0, cptr_el3
 	ret
 
 
-write_cptr_el1: ; .type write_cptr_el1, %function
+func write_cptr_el1
 	b	write_cptr_el1
 
 
-write_cptr_el2: ; .type write_cptr_el2, %function
+func write_cptr_el2
 	msr	cptr_el2, x0
 	dsb	sy
 	isb
 	ret
 
 
-write_cptr_el3: ; .type write_cptr_el3, %function
+func write_cptr_el3
 	msr	cptr_el3, x0
 	dsb	sy
 	isb
@@ -660,34 +659,34 @@
 	 * TTBR0 accessors
 	 * -----------------------------------------------------
 	 */
-read_ttbr0_el1: ; .type read_ttbr0_el1, %function
+func read_ttbr0_el1
 	mrs	x0, ttbr0_el1
 	ret
 
 
-read_ttbr0_el2: ; .type read_ttbr0_el2, %function
+func read_ttbr0_el2
 	mrs	x0, ttbr0_el2
 	ret
 
 
-read_ttbr0_el3: ; .type read_ttbr0_el3, %function
+func read_ttbr0_el3
 	mrs	x0, ttbr0_el3
 	ret
 
 
-write_ttbr0_el1: ; .type write_ttbr0_el1, %function
+func write_ttbr0_el1
 	msr	ttbr0_el1, x0
 	isb
 	ret
 
 
-write_ttbr0_el2: ; .type write_ttbr0_el2, %function
+func write_ttbr0_el2
 	msr	ttbr0_el2, x0
 	isb
 	ret
 
 
-write_ttbr0_el3: ; .type write_ttbr0_el3, %function
+func write_ttbr0_el3
 	msr	ttbr0_el3, x0
 	isb
 	ret
@@ -697,121 +696,121 @@
 	 * TTBR1 accessors
 	 * -----------------------------------------------------
 	 */
-read_ttbr1_el1: ; .type read_ttbr1_el1, %function
+func read_ttbr1_el1
 	mrs	x0, ttbr1_el1
 	ret
 
 
-read_ttbr1_el2: ; .type read_ttbr1_el2, %function
+func read_ttbr1_el2
 	b	read_ttbr1_el2
 
 
-read_ttbr1_el3: ; .type read_ttbr1_el3, %function
+func read_ttbr1_el3
 	b	read_ttbr1_el3
 
 
-write_ttbr1_el1: ; .type write_ttbr1_el1, %function
+func write_ttbr1_el1
 	msr	ttbr1_el1, x0
 	isb
 	ret
 
 
-write_ttbr1_el2: ; .type write_ttbr1_el2, %function
+func write_ttbr1_el2
 	b	write_ttbr1_el2
 
 
-write_ttbr1_el3: ; .type write_ttbr1_el3, %function
+func write_ttbr1_el3
 	b	write_ttbr1_el3
 
 
-read_hcr: ; .type read_hcr, %function
+func read_hcr
 	mrs	x0, hcr_el2
 	ret
 
 
-write_hcr: ; .type write_hcr, %function
+func write_hcr
 	msr	hcr_el2, x0
 	dsb	sy
 	isb
 	ret
 
 
-read_cpacr: ; .type read_cpacr, %function
+func read_cpacr
 	mrs	x0, cpacr_el1
 	ret
 
 
-write_cpacr: ; .type write_cpacr, %function
+func write_cpacr
 	msr	cpacr_el1, x0
 	ret
 
 
-read_cntfrq_el0: ; .type read_cntfrq_el0, %function
+func read_cntfrq_el0
 	mrs	x0, cntfrq_el0
 	ret
 
 
-write_cntfrq_el0: ; .type write_cntfrq_el0, %function
+func write_cntfrq_el0
 	msr	cntfrq_el0, x0
 	ret
 
 
-read_cpuectlr: ; .type read_cpuectlr, %function
+func read_cpuectlr
 	mrs	x0, CPUECTLR_EL1
 	ret
 
 
-write_cpuectlr: ; .type write_cpuectlr, %function
+func write_cpuectlr
 	msr	CPUECTLR_EL1, x0
 	dsb	sy
 	isb
 	ret
 
 
-read_cnthctl_el2: ; .type read_cnthctl_el2, %function
+func read_cnthctl_el2
 	mrs	x0, cnthctl_el2
 	ret
 
 
-write_cnthctl_el2: ; .type write_cnthctl_el2, %function
+func write_cnthctl_el2
 	msr	cnthctl_el2, x0
 	ret
 
 
-read_cntfrq: ; .type read_cntfrq, %function
+func read_cntfrq
 	mrs	x0, cntfrq_el0
 	ret
 
 
-write_cntfrq: ; .type write_cntfrq, %function
+func write_cntfrq
 	msr	cntfrq_el0, x0
 	ret
 
 
-write_scr: ; .type write_scr, %function
+func write_scr
 	msr	scr_el3, x0
 	dsb	sy
 	isb
 	ret
 
 
-read_scr: ; .type read_scr, %function
+func read_scr
 	mrs	x0, scr_el3
 	ret
 
 
-read_midr: ; .type read_midr, %function
+func read_midr
 	mrs	x0, midr_el1
 	ret
 
 
-read_mpidr: ; .type read_mpidr, %function
+func read_mpidr
 	mrs	x0, mpidr_el1
 	ret
 
 
 #if SUPPORT_VFP
-enable_vfp: ; .type enable_vfp, %function
+func enable_vfp
 	mrs	x0, cpacr_el1
 	orr	x0, x0, #CPACR_VFP_BITS
 	msr	cpacr_el1, x0
@@ -822,14 +821,12 @@
 	ret
 
 
-	// int read_fpexc(void)
-read_fpexc: ; .type read_fpexc, %function
+func read_fpexc
 	b	read_fpexc
 	ret
 
 
-	// void write_fpexc(int fpexc)
-write_fpexc: ; .type write_fpexc, %function
+func write_fpexc
 	b	write_fpexc
 	ret
 
diff --git a/lib/arch/aarch64/tlb_helpers.S b/lib/arch/aarch64/tlb_helpers.S
index 69fd9f5..4244974 100644
--- a/lib/arch/aarch64/tlb_helpers.S
+++ b/lib/arch/aarch64/tlb_helpers.S
@@ -29,6 +29,7 @@
  */
 
 #include <arch_helpers.h>
+#include <asm_macros.S>
 
 	.globl	tlbialle1
 	.globl	tlbialle1is
@@ -39,50 +40,48 @@
 	.globl	tlbivmalle1
 
 
-	.section	.text, "ax"
-
-tlbialle1: ; .type tlbialle1, %function
+func tlbialle1
 	tlbi	alle1
 	dsb	sy
 	isb
 	ret
 
 
-tlbialle1is: ; .type tlbialle1is, %function
+func tlbialle1is
 	tlbi	alle1is
 	dsb	sy
 	isb
 	ret
 
 
-tlbialle2: ; .type tlbialle2, %function
+func tlbialle2
 	tlbi	alle2
 	dsb	sy
 	isb
 	ret
 
 
-tlbialle2is: ; .type tlbialle2is, %function
+func tlbialle2is
 	tlbi	alle2is
 	dsb	sy
 	isb
 	ret
 
 
-tlbialle3: ; .type tlbialle3, %function
+func tlbialle3
 	tlbi	alle3
 	dsb	sy
 	isb
 	ret
 
 
-tlbialle3is: ; .type tlbialle3is, %function
+func tlbialle3is
 	tlbi	alle3is
 	dsb	sy
 	isb
 	ret
 
-tlbivmalle1: ; .type tlbivmalle1, %function
+func tlbivmalle1
 	tlbi	vmalle1
 	dsb	sy
 	isb
diff --git a/lib/semihosting/aarch64/semihosting_call.S b/lib/semihosting/aarch64/semihosting_call.S
index 9ece624..e6a9675 100644
--- a/lib/semihosting/aarch64/semihosting_call.S
+++ b/lib/semihosting/aarch64/semihosting_call.S
@@ -28,10 +28,10 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-	.globl	semihosting_call
+#include <asm_macros.S>
 
-	.section	.text, "ax"
+	.globl	semihosting_call
 
-semihosting_call: ; .type semihosting_call, %function
+func semihosting_call
 	hlt	#0xf000
 	ret
diff --git a/lib/sync/locks/exclusive/spinlock.S b/lib/sync/locks/exclusive/spinlock.S
index 79d19e7..5eae2b0 100644
--- a/lib/sync/locks/exclusive/spinlock.S
+++ b/lib/sync/locks/exclusive/spinlock.S
@@ -28,13 +28,13 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <asm_macros.S>
+
 	.globl	spin_lock
 	.globl	spin_unlock
 
 
-	.section	.text, "ax";
-
-spin_lock: ; .type spin_lock, %function
+func spin_lock
 	mov	w2, #1
 	sevl
 l1:	wfe
@@ -45,6 +45,6 @@
 	ret
 
 
-spin_unlock: ; .type spin_unlock, %function
+func spin_unlock
 	stlr	wzr, [x0]
 	ret
diff --git a/plat/common/aarch64/platform_helpers.S b/plat/common/aarch64/platform_helpers.S
index 563f198..29268ba 100644
--- a/plat/common/aarch64/platform_helpers.S
+++ b/plat/common/aarch64/platform_helpers.S
@@ -30,62 +30,26 @@
 
 #include <arch.h>
 #include <platform.h>
+#include <asm_macros.S>
 
 
-	.globl	pcpu_dv_mem_stack
 	.weak	platform_get_core_pos
-	.weak	platform_set_stack
-	.weak	platform_get_stack
 	.weak	platform_is_primary_cpu
-	.weak	platform_set_coherent_stack
 	.weak	platform_check_mpidr
 	.weak	plat_report_exception
 
 	/* -----------------------------------------------------
-	 * Coherent stack sizes for debug and release builds
-	 * -----------------------------------------------------
-	 */
-#if DEBUG
-#define PCPU_DV_MEM_STACK_SIZE	0x400
-#else
-#define PCPU_DV_MEM_STACK_SIZE	0x300
-#endif
-
-	.section	.text, "ax"; .align 3
-
-	/* -----------------------------------------------------
-	 * unsigned long long platform_set_coherent_stack
-	 *                                    (unsigned mpidr);
-	 * For a given mpidr, this function returns the stack
-	 * pointer allocated in device memory. This stack can
-	 * be used by C code which enables/disables the SCTLR.M
-	 * SCTLR.C bit e.g. while powering down a cpu
-	 * -----------------------------------------------------
-	 */
-platform_set_coherent_stack: ; .type platform_set_coherent_stack, %function
-	mov	x5, x30 // lr
-	bl	platform_get_core_pos
-	add	x0, x0, #1
-	mov	x1, #PCPU_DV_MEM_STACK_SIZE
-	mul	x0, x0, x1
-	ldr	x1, =pcpu_dv_mem_stack
-	add	sp, x1, x0
-	ret	x5
-
-
-	/* -----------------------------------------------------
 	 *  int platform_get_core_pos(int mpidr);
 	 *  With this function: CorePos = (ClusterId * 4) +
 	 *  				  CoreId
 	 * -----------------------------------------------------
 	 */
-platform_get_core_pos: ; .type platform_get_core_pos, %function
+func platform_get_core_pos
 	and	x1, x0, #MPIDR_CPU_MASK
 	and	x0, x0, #MPIDR_CLUSTER_MASK
 	add	x0, x1, x0, LSR #6
 	ret
 
-
 	/* -----------------------------------------------------
 	 * void platform_is_primary_cpu (unsigned int mpid);
 	 *
@@ -93,42 +57,18 @@
 	 * cpu (applicable ony after a cold boot)
 	 * -----------------------------------------------------
 	 */
-platform_is_primary_cpu: ; .type platform_is_primary_cpu, %function
+func platform_is_primary_cpu
 	and	x0, x0, #(MPIDR_CLUSTER_MASK | MPIDR_CPU_MASK)
 	cmp	x0, #PRIMARY_CPU
 	cset	x0, eq
 	ret
 
 	/* -----------------------------------------------------
-	 * void platform_get_stack (unsigned long mpidr)
-	 * -----------------------------------------------------
-	 */
-platform_get_stack: ; .type platform_get_stack, %function
-	mov	x10, x30 // lr
-	bl	platform_get_core_pos
-	add	x0, x0, #1
-	mov	x1, #PLATFORM_STACK_SIZE
-	mul	x0, x0, x1
-	ldr	x1, =platform_normal_stacks
-	add	x0, x1, x0
-	ret	x10
-
-	/* -----------------------------------------------------
-	 * void platform_set_stack (unsigned long mpidr)
-	 * -----------------------------------------------------
-	 */
-platform_set_stack: ; .type platform_set_stack, %function
-	mov	x9, x30 // lr
-	bl	platform_get_stack
-	mov	sp, x0
-	ret	x9
-
-	/* -----------------------------------------------------
 	 * Placeholder function which should be redefined by
 	 * each platform.
 	 * -----------------------------------------------------
 	 */
-platform_check_mpidr: ; .type platform_check_mpidr, %function
+func platform_check_mpidr
 	mov	x0, xzr
 	ret
 
@@ -137,18 +77,5 @@
 	 * each platform.
 	 * -----------------------------------------------------
 	 */
-plat_report_exception:
+func plat_report_exception
 	ret
-
-	/* -----------------------------------------------------
-	 * Per-cpu stacks in device memory.
-	 * Used for C code just before power down or right after
-	 * power up when the MMU or caches need to be turned on
-	 * or off. Each cpu gets a stack of 512 bytes.
-	 * -----------------------------------------------------
-	 */
-	.section	tzfw_coherent_mem, "aw", %nobits; .align 6
-
-pcpu_dv_mem_stack:
-	/* Zero fill */
-	.space (PLATFORM_CORE_COUNT * PCPU_DV_MEM_STACK_SIZE), 0
diff --git a/plat/common/aarch64/platform_mp_stack.S b/plat/common/aarch64/platform_mp_stack.S
new file mode 100644
index 0000000..1438814
--- /dev/null
+++ b/plat/common/aarch64/platform_mp_stack.S
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014, ARM Limited and Contributors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of ARM nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arch.h>
+#include <platform.h>
+#include <asm_macros.S>
+
+
+	.local	pcpu_dv_mem_stack
+	.local	platform_normal_stacks
+	.weak	platform_set_stack
+	.weak	platform_get_stack
+	.weak	platform_set_coherent_stack
+
+
+	/* -----------------------------------------------------
+	 * void platform_set_coherent_stack (unsigned long mpidr)
+	 *
+	 * For a given CPU, this function sets the stack pointer
+	 * to a stack allocated in device memory. This stack can
+	 * be used by C code which enables/disables the SCTLR.M
+	 * and SCTLR.C bits, e.g. while powering down a cpu
+	 * -----------------------------------------------------
+	 */
+func platform_set_coherent_stack
+	mov x5, x30 // lr
+	get_mp_stack pcpu_dv_mem_stack, PCPU_DV_MEM_STACK_SIZE
+	mov sp, x0
+	ret x5
+
+	/* -----------------------------------------------------
+	 * unsigned long platform_get_stack (unsigned long mpidr)
+	 *
+	 * For a given CPU, this function returns the stack
+	 * pointer for a stack allocated in normal memory.
+	 * -----------------------------------------------------
+	 */
+func platform_get_stack
+	mov x10, x30 // lr
+	get_mp_stack platform_normal_stacks, PLATFORM_STACK_SIZE
+	ret x10
+
+	/* -----------------------------------------------------
+	 * void platform_set_stack (unsigned long mpidr)
+	 *
+	 * For a given CPU, this function sets the stack pointer
+	 * to a stack allocated in normal memory.
+	 * -----------------------------------------------------
+	 */
+func platform_set_stack
+	mov x9, x30 // lr
+	bl  platform_get_stack
+	mov sp, x0
+	ret x9
+
+	/* -----------------------------------------------------
+	 * Per-cpu stacks in normal memory.
+	 * Used for C code during runtime execution (when coherent
+	 * stacks are not required).
+	 * Each cpu gets a stack of PLATFORM_STACK_SIZE bytes.
+	 * -----------------------------------------------------
+	 */
+declare_stack platform_normal_stacks, tzfw_normal_stacks, \
+		PLATFORM_STACK_SIZE, PLATFORM_CORE_COUNT
+
+	/* -----------------------------------------------------
+	 * Per-cpu stacks in device memory.
+	 * Used for C code just before power down or right after
+	 * power up when the MMU or caches need to be turned on
+	 * or off.
+	 * Each cpu gets a stack of PCPU_DV_MEM_STACK_SIZE bytes.
+	 * -----------------------------------------------------
+	 */
+declare_stack pcpu_dv_mem_stack, tzfw_coherent_mem, \
+		PCPU_DV_MEM_STACK_SIZE, PLATFORM_CORE_COUNT
diff --git a/plat/common/aarch64/platform_up_stack.S b/plat/common/aarch64/platform_up_stack.S
new file mode 100644
index 0000000..b321a4e
--- /dev/null
+++ b/plat/common/aarch64/platform_up_stack.S
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2014, ARM Limited and Contributors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of ARM nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arch.h>
+#include <platform.h>
+#include <asm_macros.S>
+
+
+	.local	pcpu_dv_mem_stack
+	.local	platform_normal_stacks
+	.globl	platform_set_stack
+	.globl	platform_get_stack
+	.globl	platform_set_coherent_stack
+
+
+	/* -----------------------------------------------------
+	 * void platform_set_coherent_stack (unsigned long)
+	 *
+	 * For cold-boot BL images, only the primary CPU needs a
+	 * stack. This function sets the stack pointer to a stack
+	 * allocated in device memory.
+	 * -----------------------------------------------------
+	 */
+func platform_set_coherent_stack
+	get_up_stack pcpu_dv_mem_stack, PCPU_DV_MEM_STACK_SIZE
+	mov sp, x0
+	ret
+
+
+	/* -----------------------------------------------------
+	 * unsigned long platform_get_stack (unsigned long)
+	 *
+	 * For cold-boot BL images, only the primary CPU needs a
+	 * stack. This function returns the stack pointer for a
+	 * stack allocated in normal memory.
+	 * -----------------------------------------------------
+	 */
+func platform_get_stack
+	get_up_stack platform_normal_stacks, PLATFORM_STACK_SIZE
+	ret
+
+	/* -----------------------------------------------------
+	 * void platform_set_stack (unsigned long)
+	 *
+	 * For cold-boot BL images, only the primary CPU needs a
+	 * stack. This function sets the stack pointer to a stack
+	 * allocated in normal memory.
+	 * -----------------------------------------------------
+	 */
+func platform_set_stack
+	get_up_stack platform_normal_stacks, PLATFORM_STACK_SIZE
+	mov sp, x0
+	ret
+
+	/* -----------------------------------------------------
+	 * Single cpu stack in normal memory.
+	 * Used for C code during boot; PLATFORM_STACK_SIZE bytes
+	 * are allocated.
+	 * -----------------------------------------------------
+	 */
+declare_stack platform_normal_stacks, tzfw_normal_stacks, \
+		PLATFORM_STACK_SIZE, 1
+
+	/* -----------------------------------------------------
+	 * Single cpu stack in device/coherent memory.
+	 * PCPU_DV_MEM_STACK_SIZE bytes are allocated.
+	 * -----------------------------------------------------
+	 */
+declare_stack pcpu_dv_mem_stack, tzfw_coherent_mem, \
+		PCPU_DV_MEM_STACK_SIZE, 1
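
In C terms, each declare_stack invocation reserves roughly what the static
array removed from plat_common.c later in this patch used to provide: a
cache-line-aligned buffer placed in a named linker section. A rough sketch
only (the real definition is now emitted by the assembler macro, and the
exact alignment it uses may differ from PLATFORM_CACHE_LINE_SIZE):

#include <platform.h>	/* PLATFORM_STACK_SIZE, PLATFORM_CACHE_LINE_SIZE */

/* Approximate C equivalent of:
 *   declare_stack platform_normal_stacks, tzfw_normal_stacks,
 *                 PLATFORM_STACK_SIZE, 1
 */
unsigned char platform_normal_stacks[1][PLATFORM_STACK_SIZE]
__attribute__ ((aligned(PLATFORM_CACHE_LINE_SIZE),
		section("tzfw_normal_stacks")));

The UP variant reserves a single slot; the MP variant earlier in this patch
passes PLATFORM_CORE_COUNT as the count instead.
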
diff --git a/plat/fvp/aarch64/bl1_plat_helpers.S b/plat/fvp/aarch64/bl1_plat_helpers.S
index c487f03..15e3a21 100644
--- a/plat/fvp/aarch64/bl1_plat_helpers.S
+++ b/plat/fvp/aarch64/bl1_plat_helpers.S
@@ -32,15 +32,13 @@
 #include <platform.h>
 #include <fvp_pwrc.h>
 #include <gic.h>
+#include <asm_macros.S>
 
 	.globl	platform_get_entrypoint
 	.globl	platform_cold_boot_init
 	.globl	plat_secondary_cold_boot_setup
 
 
-	.section	.text, "ax"; .align 3
-
-
 	.macro	platform_choose_gicmmap  param1, param2, x_tmp, w_tmp, res
 	ldr	\x_tmp, =VE_SYSREGS_BASE + V2M_SYS_ID
 	ldr	\w_tmp, [\x_tmp]
@@ -60,12 +58,7 @@
 	 * that the request has gone through.
 	 * -----------------------------------------------------
 	 */
-plat_secondary_cold_boot_setup: ; .type plat_secondary_cold_boot_setup, %function
-	bl	read_mpidr
-	mov	x19, x0
-	bl	platform_get_core_pos
-	mov	x20, x0
-
+func plat_secondary_cold_boot_setup
 	/* ---------------------------------------------
 	 * Power down this cpu.
 	 * TODO: Do we need to worry about powering the
@@ -74,8 +67,9 @@
 	 * loader zeroes out the zi section.
 	 * ---------------------------------------------
 	 */
+	bl	read_mpidr
 	ldr	x1, =PWRC_BASE
-	str	w19, [x1, #PPOFFR_OFF]
+	str	w0, [x1, #PPOFFR_OFF]
 
 	/* ---------------------------------------------
 	 * Deactivate the gic cpu interface as well
@@ -120,7 +114,7 @@
 	 * 	reset all cpus will read the same WK field
 	 * -----------------------------------------------------
 	 */
-platform_get_entrypoint: ; .type platform_get_entrypoint, %function
+func platform_get_entrypoint
 	mov	x9, x30 // lr
 	mov	x2, x0
 	ldr	x1, =PWRC_BASE
@@ -160,7 +154,7 @@
 	 * BL1 will always read the mailboxes with the MMU off
 	 * -----------------------------------------------------
 	 */
-platform_mem_init: ; .type platform_mem_init, %function
+func platform_mem_init
 	ldr	x0, =TZDRAM_BASE + MBOX_OFF
 	stp	xzr, xzr, [x0, #0]
 	stp	xzr, xzr, [x0, #0x10]
@@ -176,7 +170,7 @@
 	 * boot to perform early platform initialization
 	 * -----------------------------------------------------
 	 */
-platform_cold_boot_init: ; .type platform_cold_boot_init, %function
+func platform_cold_boot_init
 	mov	x20, x0
 	bl	platform_mem_init
 	bl	read_mpidr
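
The rewritten plat_secondary_cold_boot_setup no longer computes a linear core
position first: the power controller's PPOFFR register is written with the
CPU's MPIDR value directly. A minimal C sketch of that write, assuming
PWRC_BASE and PPOFFR_OFF from fvp_pwrc.h and using an invented helper name:

#include <stdint.h>
#include <fvp_pwrc.h>	/* PWRC_BASE, PPOFFR_OFF */

/* Hypothetical helper mirroring 'bl read_mpidr; str w0, [x1, #PPOFFR_OFF]'
 * above: ask the FVP power controller to power the calling CPU off. */
static inline void fvp_pwrc_request_cpu_off(unsigned long mpidr)
{
	volatile uint32_t *ppoffr =
			(volatile uint32_t *)(PWRC_BASE + PPOFFR_OFF);

	*ppoffr = (uint32_t)mpidr;	/* 32-bit store, matching 'str w0' */
}
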
diff --git a/plat/fvp/aarch64/plat_common.c b/plat/fvp/aarch64/plat_common.c
index a5d9f1d..16645cb 100644
--- a/plat/fvp/aarch64/plat_common.c
+++ b/plat/fvp/aarch64/plat_common.c
@@ -34,10 +34,6 @@
 #include <platform.h>
 #include <xlat_tables.h>
 
-unsigned char platform_normal_stacks[PLATFORM_STACK_SIZE][PLATFORM_CORE_COUNT]
-__attribute__ ((aligned(PLATFORM_CACHE_LINE_SIZE),
-		section("tzfw_normal_stacks")));
-
 /*******************************************************************************
  * This array holds the characteristics of the differences between the three
  * FVP platforms (Base, A53_A57 & Foundation). It will be populated during cold
@@ -126,7 +122,7 @@
 * This doesn't include TZRAM as the 'mem_layout' argument passed to
 * configure_mmu() will give the available subset of that.
  */
-const mmap_region mmap[] = {
+const mmap_region fvp_mmap[] = {
 	{ TZROM_BASE,	TZROM_SIZE,	MT_MEMORY | MT_RO | MT_SECURE },
 	{ TZDRAM_BASE,	TZDRAM_SIZE,	MT_MEMORY | MT_RW | MT_SECURE },
 	{ FLASH0_BASE,	FLASH0_SIZE,	MT_MEMORY | MT_RO | MT_SECURE },
@@ -157,7 +153,7 @@
 	mmap_add_region(coh_start, coh_limit - coh_start,
 				MT_DEVICE | MT_RW | MT_SECURE);
 
-	mmap_add(mmap);
+	mmap_add(fvp_mmap);
 
 	init_xlat_tables();
 
diff --git a/plat/fvp/aarch64/plat_helpers.S b/plat/fvp/aarch64/plat_helpers.S
index 032b393..ab826f4 100644
--- a/plat/fvp/aarch64/plat_helpers.S
+++ b/plat/fvp/aarch64/plat_helpers.S
@@ -30,11 +30,10 @@
 
 #include <arch.h>
 #include <platform.h>
+#include <asm_macros.S>
 
 	.globl	plat_report_exception
 
-	.section	.text, "ax"
-
 	/* ---------------------------------------------
 	 * void plat_report_exception(unsigned int type)
 	 * Function to report an unhandled exception
@@ -43,7 +42,7 @@
 	 * to indicate where we are
 	 * ---------------------------------------------
 	 */
-plat_report_exception:
+func plat_report_exception
 	mrs	x1, CurrentEl
 	lsr	x1, x1, #MODE_EL_SHIFT
 	lsl	x1, x1, #SYS_LED_EL_SHIFT
diff --git a/plat/fvp/platform.h b/plat/fvp/platform.h
index 5f4adc3..c594357 100644
--- a/plat/fvp/platform.h
+++ b/plat/fvp/platform.h
@@ -47,7 +47,16 @@
 /*******************************************************************************
  * Generic platform constants
  ******************************************************************************/
-#define PLATFORM_STACK_SIZE		0x800
+
+/* Size of cacheable stacks */
+#define PLATFORM_STACK_SIZE	0x800
+
+/* Size of coherent stacks for debug and release builds */
+#if DEBUG
+#define PCPU_DV_MEM_STACK_SIZE	0x400
+#else
+#define PCPU_DV_MEM_STACK_SIZE	0x300
+#endif
 
 #define FIRMWARE_WELCOME_STR		"Booting trusted firmware boot loader stage 1\n\r"
 
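
The coherent stacks are carved out of the tzfw_coherent_mem section, so for
an MP image such as BL31 the reservation scales with the core count; a DEBUG
build pays an extra 0x100 bytes per CPU over a release build. A throwaway
sketch of the arithmetic (the macro name below is an assumption, not part of
the port):

#include <platform.h>	/* PCPU_DV_MEM_STACK_SIZE, PLATFORM_CORE_COUNT */

/* Total device-memory stack reservation made by
 * 'declare_stack pcpu_dv_mem_stack, tzfw_coherent_mem, ...' in the MP case. */
#define TOTAL_PCPU_DV_MEM_STACK_BYTES \
	(PCPU_DV_MEM_STACK_SIZE * PLATFORM_CORE_COUNT)
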
diff --git a/plat/fvp/platform.mk b/plat/fvp/platform.mk
index e3b4848..36090f2 100644
--- a/plat/fvp/platform.mk
+++ b/plat/fvp/platform.mk
@@ -76,14 +76,17 @@
 BL1_SOURCES		+=	bl1_plat_setup.c			\
 				bl1_plat_helpers.S			\
 				plat_helpers.S				\
+				platform_up_stack.S			\
 				plat_common.c				\
 				cci400.c
 
 BL2_SOURCES		+=	bl2_plat_setup.c			\
+				platform_up_stack.S			\
 				plat_common.c
 
 BL31_SOURCES		+=	bl31_plat_setup.c			\
 				plat_helpers.S				\
+				platform_mp_stack.S			\
 				plat_common.c				\
 				plat_pm.c				\
 				plat_topology.c				\
diff --git a/services/spd/tspd/tspd_helpers.S b/services/spd/tspd/tspd_helpers.S
index b56b2aa..27fbb5a 100644
--- a/services/spd/tspd/tspd_helpers.S
+++ b/services/spd/tspd/tspd_helpers.S
@@ -44,7 +44,7 @@
 	 * saved.
 	 * ---------------------------------------------
 	 */
-tspd_enter_sp:
+func tspd_enter_sp
 	/* Make space for the registers that we're going to save */
 	mov	x3, sp
 	str	x3, [x0, #0]
@@ -79,7 +79,7 @@
 	 * ---------------------------------------------
 	 */
 	.global tspd_exit_sp
-tspd_exit_sp:
+func tspd_exit_sp
 	/* Restore the previous stack */
 	mov	sp, x0
 
diff --git a/services/std_svc/psci/psci_entry.S b/services/std_svc/psci/psci_entry.S
index 361dfde..bdd571e 100644
--- a/services/std_svc/psci/psci_entry.S
+++ b/services/std_svc/psci/psci_entry.S
@@ -41,8 +41,6 @@
 	.globl	__psci_cpu_off
 	.globl	__psci_cpu_suspend
 
-	.section	.text, "ax"; .align 3
-
 	/* -----------------------------------------------------
 	 * This cpu has been physically powered up. Depending
 	 * upon whether it was resumed from suspend or simply
@@ -55,7 +53,7 @@
 	 * all this is done.
 	 * -----------------------------------------------------
 	 */
-psci_aff_on_finish_entry:
+func psci_aff_on_finish_entry
 	adr	x23, psci_afflvl_on_finishers
 	b	psci_aff_common_finish_entry
 
@@ -120,7 +118,7 @@
 	 * suffering from stack coherency issues
 	 * -----------------------------------------------------
 	 */
-__psci_cpu_off:
+func __psci_cpu_off
 	func_prologue
 	sub	sp, sp, #0x10
 	stp	x19, x20, [sp, #0]
@@ -137,7 +135,7 @@
 	func_epilogue
 	ret
 
-__psci_cpu_suspend:
+func __psci_cpu_suspend
 	func_prologue
 	sub	sp, sp, #0x20
 	stp	x19, x20, [sp, #0]
@@ -162,7 +160,7 @@
 	func_epilogue
 	ret
 
-final_wfi:
+func final_wfi
 	dsb	sy
 	wfi
 wfi_spill: