Merge pull request #62 from athoelke/set-little-endian-v2

Set processor endianness immediately after RESET v2
diff --git a/bl1/aarch64/bl1_arch_setup.c b/bl1/aarch64/bl1_arch_setup.c
index 1b14246..5725bac 100644
--- a/bl1/aarch64/bl1_arch_setup.c
+++ b/bl1/aarch64/bl1_arch_setup.c
@@ -43,6 +43,7 @@
 	tmp_reg = read_sctlr_el3();
 	tmp_reg |= (SCTLR_A_BIT | SCTLR_SA_BIT);
 	write_sctlr_el3(tmp_reg);
+	isb();
 
 	/*
 	 * Enable HVCs, route FIQs to EL3, set the next EL to be AArch64, route
diff --git a/bl1/aarch64/bl1_entrypoint.S b/bl1/aarch64/bl1_entrypoint.S
index 62e1218..7259601 100644
--- a/bl1/aarch64/bl1_entrypoint.S
+++ b/bl1/aarch64/bl1_entrypoint.S
@@ -96,7 +96,6 @@
 	mrs	x0, sctlr_el3
 	orr	x0, x0, #SCTLR_I_BIT
 	msr	sctlr_el3, x0
-
 	isb
 
 _wait_for_entrypoint:
@@ -108,10 +107,10 @@
 	 * their turn to be woken up
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_get_entrypoint
 	cbnz	x0, _do_warm_boot
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbnz	x0, _do_cold_boot
 
diff --git a/bl1/aarch64/bl1_exceptions.S b/bl1/aarch64/bl1_exceptions.S
index 68d088b..a87b20f 100644
--- a/bl1/aarch64/bl1_exceptions.S
+++ b/bl1/aarch64/bl1_exceptions.S
@@ -189,7 +189,7 @@
 	mov	x0, #SYNC_EXCEPTION_AARCH64
 	bl	plat_report_exception
 
-	bl	read_esr_el3
+	mrs	x0, esr_el3
 	ubfx	x1, x0, #ESR_EC_SHIFT, #ESR_EC_LENGTH
 	cmp	x1, #EC_AARCH64_SMC
 	b.ne	panic
@@ -201,10 +201,8 @@
 	mov	x2, x3
 	mov	x3, x4
 	bl	display_boot_progress
-	mov	x0, x20
-	bl	write_elr
-	mov	x0, x21
-	bl	write_spsr
+	msr	elr_el3, x20
+	msr	spsr_el3, x21
 	ubfx	x0, x21, #MODE_EL_SHIFT, #2
 	cmp	x0, #MODE_EL3
 	b.ne	skip_mmu_teardown
@@ -212,18 +210,11 @@
 	/* ---------------------------------------------
 	 * If BL31 is to be executed in EL3 as well
 	 * then turn off the MMU so that it can perform
-	 * its own setup. TODO: Assuming flat mapped
-	 * translations here. Also all should go into a
-	 * separate MMU teardown function
+	 * its own setup.
 	 * ---------------------------------------------
 	 */
-	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
-	bl	read_sctlr_el3
-	bic	x0, x0, x1
-	bl	write_sctlr_el3
-	mov	x0, #DCCISW
-	bl	dcsw_op_all
-	bl	tlbialle3
+	bl	disable_mmu_icache_el3
+	tlbi	alle3
 skip_mmu_teardown:
 	ldp     x6, x7, [sp, #0x30]
 	ldp     x4, x5, [sp, #0x20]
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index b8af9a5..4f7565f 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -54,8 +54,7 @@
 	 * So, make sure no secondary has lost its way.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbz	x0, _panic
 
@@ -73,7 +72,6 @@
 	mrs	x0, sctlr_el1
 	orr	x0, x0, #SCTLR_I_BIT
 	msr	sctlr_el1, x0
-
 	isb
 
 	/* ---------------------------------------------
@@ -103,7 +101,7 @@
 	 * ease the pain of initializing the MMU
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -121,7 +119,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index 39fa605..13bd5b8 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -89,7 +89,6 @@
 	mrs	x1, sctlr_el3
 	orr	x1, x1, #SCTLR_I_BIT
 	msr	sctlr_el3, x1
-
 	isb
 
 	/* ---------------------------------------------
@@ -108,8 +107,7 @@
 	 * So, make sure no secondary has lost its way.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbz	x0, _panic
 
@@ -138,7 +136,7 @@
 	 * ease the pain of initializing the MMU
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -155,7 +153,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
diff --git a/bl31/bl31_main.c b/bl31/bl31_main.c
index 01f00f2..755320d 100644
--- a/bl31/bl31_main.c
+++ b/bl31/bl31_main.c
@@ -100,6 +100,7 @@
 	assert(cm_get_context(mpidr, NON_SECURE));
 	cm_set_next_eret_context(NON_SECURE);
 	write_vbar_el3((uint64_t) runtime_exceptions);
+	isb();
 	next_image_type = NON_SECURE;
 
 	/*
diff --git a/drivers/arm/gic/aarch64/gic_v3_sysregs.S b/drivers/arm/gic/aarch64/gic_v3_sysregs.S
index 2a96da7..ddf85a8 100644
--- a/drivers/arm/gic/aarch64/gic_v3_sysregs.S
+++ b/drivers/arm/gic/aarch64/gic_v3_sysregs.S
@@ -67,23 +67,19 @@
 
 func write_icc_sre_el1
 	msr	ICC_SRE_EL1, x0
-	isb
 	ret
 
 
 func write_icc_sre_el2
 	msr	ICC_SRE_EL2, x0
-	isb
 	ret
 
 
 func write_icc_sre_el3
 	msr	ICC_SRE_EL3, x0
-	isb
 	ret
 
 
 func write_icc_pmr_el1
 	msr	ICC_PMR_EL1, x0
-	isb
 	ret
diff --git a/include/common/asm_macros.S b/include/common/asm_macros.S
index 6cf1a19..3dbd9f2 100644
--- a/include/common/asm_macros.S
+++ b/include/common/asm_macros.S
@@ -58,20 +58,13 @@
 
 
 	.macro	smc_check  label
-	bl	read_esr
+	mrs	x0, esr_el3
 	ubfx	x0, x0, #ESR_EC_SHIFT, #ESR_EC_LENGTH
 	cmp	x0, #EC_AARCH64_SMC
 	b.ne	$label
 	.endm
 
 
-	.macro	setup_dcsw_op_args  start_level, end_level, clidr, shift, fw, ls
-	mrs	\clidr, clidr_el1
-	mov	\start_level, xzr
-	ubfx	\end_level, \clidr, \shift, \fw
-	lsl	\end_level, \end_level, \ls
-	.endm
-
 	/*
 	 * This macro verifies that the a given vector doesn't exceed the
 	 * architectural limit of 32 instructions. This is meant to be placed
diff --git a/include/drivers/arm/pl011.h b/include/drivers/arm/pl011.h
index 28aef54..1254920 100644
--- a/include/drivers/arm/pl011.h
+++ b/include/drivers/arm/pl011.h
@@ -78,10 +78,6 @@
 #define PL011_UARTCR_LBE          (1 << 7)	/* Loopback enable */
 #define PL011_UARTCR_UARTEN       (1 << 0)	/* UART Enable */
 
-#if !defined(PL011_BASE)
-#error "The PL011_BASE macro must be defined."
-#endif
-
 #if !defined(PL011_BAUDRATE)
 #define PL011_BAUDRATE  115200
 #endif
diff --git a/include/lib/aarch64/arch_helpers.h b/include/lib/aarch64/arch_helpers.h
index 565b1b4..517e25a 100644
--- a/include/lib/aarch64/arch_helpers.h
+++ b/include/lib/aarch64/arch_helpers.h
@@ -78,6 +78,9 @@
 extern void dcsw_op_louis(unsigned int);
 extern void dcsw_op_all(unsigned int);
 
+extern void disable_mmu_el3(void);
+extern void disable_mmu_icache_el3(void);
+
 /*******************************************************************************
  * Misc. accessor prototypes
  ******************************************************************************/
@@ -191,9 +194,7 @@
 extern unsigned long read_ttbr0_el2(void);
 extern unsigned long read_ttbr0_el3(void);
 
-extern unsigned long read_ttbr1(void);
 extern unsigned long read_ttbr1_el1(void);
-extern unsigned long read_ttbr1_el2(void);
 
 extern unsigned long read_cptr_el2(void);
 extern unsigned long read_cptr_el3(void);
@@ -225,12 +226,10 @@
 extern void write_esr_el2(unsigned long);
 extern void write_esr_el3(unsigned long);
 
-extern void write_afsr0(unsigned long);
 extern void write_afsr0_el1(unsigned long);
 extern void write_afsr0_el2(unsigned long);
 extern void write_afsr0_el3(unsigned long);
 
-extern void write_afsr1(unsigned long);
 extern void write_afsr1_el1(unsigned long);
 extern void write_afsr1_el2(unsigned long);
 extern void write_afsr1_el3(unsigned long);
@@ -260,7 +259,6 @@
 extern void write_ttbr0_el3(unsigned long);
 
 extern void write_ttbr1_el1(unsigned long);
-extern void write_ttbr1_el2(unsigned long);
 
 extern void write_cpuectlr(unsigned long);
 extern void write_cptr_el2(unsigned long);
diff --git a/lib/aarch64/cache_helpers.S b/lib/aarch64/cache_helpers.S
index 2649ad0..a5b918c 100644
--- a/lib/aarch64/cache_helpers.S
+++ b/lib/aarch64/cache_helpers.S
@@ -46,57 +46,41 @@
 
 func dcisw
 	dc	isw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccisw
 	dc	cisw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccsw
 	dc	csw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccvac
 	dc	cvac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dcivac
 	dc	ivac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccivac
 	dc	civac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccvau
 	dc	cvau, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dczva
 	dc	zva, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -138,94 +122,92 @@
 	ret
 
 
-	/* ------------------------------------------
-	 * Data cache operations by set/way to the
-	 * level specified
-	 * ------------------------------------------
-	 * ----------------------------------
-	 * Call this func with the clidr in
-	 * x0, starting cache level in x10,
-	 * last cache level in x3 & cm op in
-	 * x14
-	 * ----------------------------------
+	/* ---------------------------------------------------------------
+	 * Data cache operations by set/way to the level specified
+	 *
+	 * The main function, do_dcsw_op requires:
+	 * x0: The operation type (0-2), as defined in arch.h
+	 * x3: The last cache level to operate on
+	 * x9: clidr_el1
+	 * and will carry out the operation on each data cache from level 0
+	 * to the level in x3 in sequence
+	 *
+	 * The dcsw_op macro sets up the x3 and x9 parameters based on
+	 * clidr_el1 cache information before invoking the main function
+	 * ---------------------------------------------------------------
 	 */
-func dcsw_op
-all_start_at_level:
-	add	x2, x10, x10, lsr #1            // work out 3x current cache level
-	lsr	x1, x0, x2                      // extract cache type bits from clidr
-	and	x1, x1, #7                      // mask of the bits for current cache only
-	cmp	x1, #2                          // see what cache we have at this level
-	b.lt	skip                            // skip if no cache, or just i-cache
-	msr	csselr_el1, x10                 // select current cache level in csselr
-	isb                                     // isb to sych the new cssr&csidr
-	mrs	x1, ccsidr_el1                  // read the new ccsidr
-	and	x2, x1, #7                      // extract the length of the cache lines
-	add	x2, x2, #4                      // add 4 (line length offset)
-	mov	x4, #0x3ff
-	and	x4, x4, x1, lsr #3              // find maximum number on the way size
-	clz	w5, w4                          // find bit position of way size increment
-	mov	x7, #0x7fff
-	and	x7, x7, x1, lsr #13             // extract max number of the index size
-loop2:
-	mov	x9, x4                          // create working copy of max way size
-loop3:
-	lsl	x6, x9, x5
-	orr	x11, x10, x6                    // factor way and cache number into x11
-	lsl	x6, x7, x2
-	orr	x11, x11, x6                    // factor index number into x11
-	mov	x12, x0
-	mov	x13, x30 // lr
-	mov	x0, x11
-	blr	x14
-	mov	x0, x12
-	mov	x30, x13 // lr
-	subs	x9, x9, #1                      // decrement the way
-	b.ge    loop3
-	subs	x7, x7, #1                      // decrement the index
-	b.ge    loop2
-skip:
-	add	x10, x10, #2                    // increment cache number
-	cmp	x3, x10
-	b.gt    all_start_at_level
-finished:
-	mov	x10, #0                         // swith back to cache level 0
-	msr	csselr_el1, x10                 // select current cache level in csselr
-	dsb	sy
-	isb
-	ret
 
+	.macro	dcsw_op shift, fw, ls
+	mrs	x9, clidr_el1
+	ubfx	x3, x9, \shift, \fw
+	lsl	x3, x3, \ls
+	b	do_dcsw_op
+	.endm
 
 func do_dcsw_op
 	cbz	x3, exit
-	cmp	x0, #DCISW
-	b.eq	dc_isw
-	cmp	x0, #DCCISW
-	b.eq	dc_cisw
-	cmp	x0, #DCCSW
-	b.eq	dc_csw
-dc_isw:
-	mov	x0, x9
-	adr	x14, dcisw
-	b	dcsw_op
-dc_cisw:
+	mov	x10, xzr
+	adr	x14, dcsw_loop_table	// compute inner loop address
+	add	x14, x14, x0, lsl #5	// inner loop is 8x32-bit instructions
 	mov	x0, x9
-	adr	x14, dccisw
-	b	dcsw_op
-dc_csw:
-	mov	x0, x9
-	adr	x14, dccsw
-	b	dcsw_op
+	mov	w8, #1
+loop1:
+	add	x2, x10, x10, lsr #1	// work out 3x current cache level
+	lsr	x1, x0, x2		// extract cache type bits from clidr
+	and	x1, x1, #7		// mask the bits for current cache only
+	cmp	x1, #2			// see what cache we have at this level
+	b.lt	level_done		// nothing to do if no cache or icache
+
+	msr	csselr_el1, x10		// select current cache level in csselr
+	isb				// isb to sync the new csselr & ccsidr
+	mrs	x1, ccsidr_el1		// read the new ccsidr
+	and	x2, x1, #7		// extract the length of the cache lines
+	add	x2, x2, #4		// add 4 (line length offset)
+	ubfx	x4, x1, #3, #10		// maximum way number
+	clz	w5, w4			// bit position of way size increment
+	lsl	w9, w4, w5		// w9 = aligned max way number
+	lsl	w16, w8, w5		// w16 = way number loop decrement
+	orr	w9, w10, w9		// w9 = combine way and cache number
+	ubfx	w6, w1, #13, #15	// w6 = max set number
+	lsl	w17, w8, w2		// w17 = set number loop decrement
+	dsb	sy			// barrier before we start this level
+	br	x14			// jump to DC operation specific loop
+
+	.macro	dcsw_loop _op
+loop2_\_op:
+	lsl	w7, w6, w2		// w7 = aligned max set number
+
+loop3_\_op:
+	orr	w11, w9, w7		// combine cache, way and set number
+	dc	\_op, x11
+	subs	w7, w7, w17		// decrement set number
+	b.ge	loop3_\_op
+
+	subs	x9, x9, x16		// decrement way number
+	b.ge	loop2_\_op
+
+	b	level_done
+	.endm
+
+level_done:
+	add	x10, x10, #2		// increment cache number
+	cmp	x3, x10
+	b.gt    loop1
+	msr	csselr_el1, xzr		// select cache level 0 in csselr
+	dsb	sy			// barrier to complete final cache operation
+	isb
 exit:
 	ret
 
+dcsw_loop_table:
+	dcsw_loop isw
+	dcsw_loop cisw
+	dcsw_loop csw
+
 
 func dcsw_op_louis
-	dsb	sy
-	setup_dcsw_op_args x10, x3, x9, #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
-	b	do_dcsw_op
+	dcsw_op #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
 
 
 func dcsw_op_all
-	dsb	sy
-	setup_dcsw_op_args x10, x3, x9, #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
-	b	do_dcsw_op
+	dcsw_op #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
diff --git a/lib/aarch64/cpu_helpers.S b/lib/aarch64/cpu_helpers.S
index 573d0b8..abb996d 100644
--- a/lib/aarch64/cpu_helpers.S
+++ b/lib/aarch64/cpu_helpers.S
@@ -35,13 +35,11 @@
 
 
 func cpu_reset_handler
-	mov	x19, x30 // lr
-
 	/* ---------------------------------------------
 	 * As a bare minimal enable the SMP bit.
 	 * ---------------------------------------------
 	 */
-	bl	read_midr
+	mrs	x0, midr_el1
 	lsr	x0, x0, #MIDR_PN_SHIFT
 	and	x0, x0, #MIDR_PN_MASK
 	cmp	x0, #MIDR_PN_A57
@@ -49,8 +47,9 @@
 	cmp	x0, #MIDR_PN_A53
 	b.ne	smp_setup_end
 smp_setup_begin:
-	bl	read_cpuectlr
+	mrs	x0, CPUECTLR_EL1
 	orr	x0, x0, #CPUECTLR_SMP_BIT
-	bl	write_cpuectlr
+	msr	CPUECTLR_EL1, x0
+	isb
 smp_setup_end:
-	ret	x19
+	ret
diff --git a/lib/aarch64/misc_helpers.S b/lib/aarch64/misc_helpers.S
index e7b2331..e7ee015 100644
--- a/lib/aarch64/misc_helpers.S
+++ b/lib/aarch64/misc_helpers.S
@@ -46,22 +46,18 @@
 	.globl	read_daif
 	.globl	write_daif
 
-	.globl	read_spsr
 	.globl	read_spsr_el1
 	.globl	read_spsr_el2
 	.globl	read_spsr_el3
 
-	.globl	write_spsr
 	.globl	write_spsr_el1
 	.globl	write_spsr_el2
 	.globl	write_spsr_el3
 
-	.globl	read_elr
 	.globl	read_elr_el1
 	.globl	read_elr_el2
 	.globl	read_elr_el3
 
-	.globl	write_elr
 	.globl	write_elr_el1
 	.globl	write_elr_el2
 	.globl	write_elr_el3
@@ -79,6 +75,9 @@
 	.globl	zeromem16
 	.globl	memcpy16
 
+	.globl	disable_mmu_el3
+	.globl	disable_mmu_icache_el3
+
 
 func get_afflvl_shift
 	cmp	x0, #3
@@ -150,16 +149,6 @@
 	ret
 
 
-func read_spsr
-	mrs	x0, CurrentEl
-	cmp	x0, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el1
-	cmp	x0, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el2
-	cmp	x0, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el3
-
-
 func read_spsr_el1
 	mrs	x0, spsr_el1
 	ret
@@ -175,44 +164,21 @@
 	ret
 
 
-func write_spsr
-	mrs	x1, CurrentEl
-	cmp	x1, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el1
-	cmp	x1, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el2
-	cmp	x1, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el3
-
-
 func write_spsr_el1
 	msr	spsr_el1, x0
-	isb
 	ret
 
 
 func write_spsr_el2
 	msr	spsr_el2, x0
-	isb
 	ret
 
 
 func write_spsr_el3
 	msr	spsr_el3, x0
-	isb
 	ret
 
 
-func read_elr
-	mrs	x0, CurrentEl
-	cmp	x0, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	read_elr_el1
-	cmp	x0, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	read_elr_el2
-	cmp	x0, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	read_elr_el3
-
-
 func read_elr_el1
 	mrs	x0, elr_el1
 	ret
@@ -228,31 +194,18 @@
 	ret
 
 
-func write_elr
-	mrs	x1, CurrentEl
-	cmp	x1, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	write_elr_el1
-	cmp	x1, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	write_elr_el2
-	cmp	x1, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	write_elr_el3
-
-
 func write_elr_el1
 	msr	elr_el1, x0
-	isb
 	ret
 
 
 func write_elr_el2
 	msr	elr_el2, x0
-	isb
 	ret
 
 
 func write_elr_el3
 	msr	elr_el3, x0
-	isb
 	ret
 
 
@@ -338,3 +291,27 @@
 	subs	x2, x2, #1
 	b.ne	m_loop1
 m_end:	ret
+
+/* ---------------------------------------------------------------------------
+ * Disable the MMU at EL3
+ * This is implemented in assembler to ensure that the data cache is cleaned
+ * and invalidated after the MMU is disabled without any intervening cacheable
+ * data accesses
+ * ---------------------------------------------------------------------------
+ */
+
+func disable_mmu_el3
+	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT)
+do_disable_mmu:
+	mrs	x0, sctlr_el3
+	bic	x0, x0, x1
+	msr	sctlr_el3, x0
+	isb				// ensure MMU is off
+	mov	x0, #DCCISW		// DCache clean and invalidate
+	b	dcsw_op_all
+
+
+func disable_mmu_icache_el3
+	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
+	b	do_disable_mmu
+
diff --git a/lib/aarch64/sysreg_helpers.S b/lib/aarch64/sysreg_helpers.S
index 61468f9..376da49 100644
--- a/lib/aarch64/sysreg_helpers.S
+++ b/lib/aarch64/sysreg_helpers.S
@@ -125,10 +125,7 @@
 	.globl	write_ttbr0_el3
 
 	.globl	read_ttbr1_el1
-	.globl	read_ttbr1_el2
-	.globl	write_ttbr1
 	.globl	write_ttbr1_el1
-	.globl	write_ttbr1_el2
 
 	.globl	read_cpacr
 	.globl	write_cpacr
@@ -160,8 +157,6 @@
 
 #if SUPPORT_VFP
 	.globl	enable_vfp
-	.globl	read_fpexc
-	.globl	write_fpexc
 #endif
 
 
@@ -201,19 +196,16 @@
 
 func write_vbar_el1
 	msr	vbar_el1, x0
-	isb
 	ret
 
 
 func write_vbar_el2
 	msr	vbar_el2, x0
-	isb
 	ret
 
 
 func write_vbar_el3
 	msr	vbar_el3, x0
-	isb
 	ret
 
 
@@ -238,19 +230,16 @@
 
 func write_afsr0_el1
 	msr	afsr0_el1, x0
-	isb
 	ret
 
 
 func write_afsr0_el2
 	msr	afsr0_el2, x0
-	isb
 	ret
 
 
 func write_afsr0_el3
 	msr	afsr0_el3, x0
-	isb
 	ret
 
 
@@ -275,19 +264,16 @@
 
 func write_far_el1
 	msr	far_el1, x0
-	isb
 	ret
 
 
 func write_far_el2
 	msr	far_el2, x0
-	isb
 	ret
 
 
 func write_far_el3
 	msr	far_el3, x0
-	isb
 	ret
 
 
@@ -312,19 +298,16 @@
 
 func write_mair_el1
 	msr	mair_el1, x0
-	isb
 	ret
 
 
 func write_mair_el2
 	msr	mair_el2, x0
-	isb
 	ret
 
 
 func write_mair_el3
 	msr	mair_el3, x0
-	isb
 	ret
 
 
@@ -349,19 +332,16 @@
 
 func write_amair_el1
 	msr	amair_el1, x0
-	isb
 	ret
 
 
 func write_amair_el2
 	msr	amair_el2, x0
-	isb
 	ret
 
 
 func write_amair_el3
 	msr	amair_el3, x0
-	isb
 	ret
 
 
@@ -405,19 +385,16 @@
 
 func write_rmr_el1
 	msr	rmr_el1, x0
-	isb
 	ret
 
 
 func write_rmr_el2
 	msr	rmr_el2, x0
-	isb
 	ret
 
 
 func write_rmr_el3
 	msr	rmr_el3, x0
-	isb
 	ret
 
 
@@ -442,19 +419,16 @@
 
 func write_afsr1_el1
 	msr	afsr1_el1, x0
-	isb
 	ret
 
 
 func write_afsr1_el2
 	msr	afsr1_el2, x0
-	isb
 	ret
 
 
 func write_afsr1_el3
 	msr	afsr1_el3, x0
-	isb
 	ret
 
 
@@ -479,22 +453,16 @@
 
 func write_sctlr_el1
 	msr	sctlr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_sctlr_el2
 	msr	sctlr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_sctlr_el3
 	msr	sctlr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -519,22 +487,16 @@
 
 func write_actlr_el1
 	msr	actlr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_actlr_el2
 	msr	actlr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_actlr_el3
 	msr	actlr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -559,22 +521,16 @@
 
 func write_esr_el1
 	msr	esr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_esr_el2
 	msr	esr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_esr_el3
 	msr	esr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -599,22 +555,16 @@
 
 func write_tcr_el1
 	msr	tcr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_tcr_el2
 	msr	tcr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_tcr_el3
 	msr	tcr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -622,11 +572,6 @@
 	 * CPTR accessors
 	 * -----------------------------------------------------
 	 */
-func read_cptr_el1
-	b	read_cptr_el1
-	ret
-
-
 func read_cptr_el2
 	mrs	x0, cptr_el2
 	ret
@@ -636,22 +581,14 @@
 	mrs	x0, cptr_el3
 	ret
 
-
-func write_cptr_el1
-	b	write_cptr_el1
-
 
 func write_cptr_el2
 	msr	cptr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_cptr_el3
 	msr	cptr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -676,19 +613,16 @@
 
 func write_ttbr0_el1
 	msr	ttbr0_el1, x0
-	isb
 	ret
 
 
 func write_ttbr0_el2
 	msr	ttbr0_el2, x0
-	isb
 	ret
 
 
 func write_ttbr0_el3
 	msr	ttbr0_el3, x0
-	isb
 	ret
 
 
@@ -701,27 +635,10 @@
 	ret
 
 
-func read_ttbr1_el2
-	b	read_ttbr1_el2
-
-
-func read_ttbr1_el3
-	b	read_ttbr1_el3
-
-
 func write_ttbr1_el1
 	msr	ttbr1_el1, x0
-	isb
 	ret
 
-
-func write_ttbr1_el2
-	b	write_ttbr1_el2
-
-
-func write_ttbr1_el3
-	b	write_ttbr1_el3
-
 
 func read_hcr
 	mrs	x0, hcr_el2
@@ -730,8 +647,6 @@
 
 func write_hcr
 	msr	hcr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -762,8 +677,6 @@
 
 func write_cpuectlr
 	msr	CPUECTLR_EL1, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -789,8 +702,6 @@
 
 func write_scr
 	msr	scr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -818,16 +729,7 @@
 	mov	x1, #AARCH64_CPTR_TFP
 	bic	x0, x0, x1
 	msr	cptr_el3, x0
-	ret
-
-
-func read_fpexc
-	b	read_fpexc
-	ret
-
-
-func write_fpexc
-	b	write_fpexc
+	isb
 	ret
 
 #endif
diff --git a/lib/aarch64/tlb_helpers.S b/lib/aarch64/tlb_helpers.S
index ec1558b..8dfae12 100644
--- a/lib/aarch64/tlb_helpers.S
+++ b/lib/aarch64/tlb_helpers.S
@@ -41,47 +41,33 @@
 
 func tlbialle1
 	tlbi	alle1
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle1is
 	tlbi	alle1is
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle2
 	tlbi	alle2
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle2is
 	tlbi	alle2is
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle3
 	tlbi	alle3
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle3is
 	tlbi	alle3is
-	dsb	sy
-	isb
 	ret
 
 func tlbivmalle1
 	tlbi	vmalle1
-	dsb	sy
-	isb
 	ret
diff --git a/plat/fvp/aarch64/bl1_plat_helpers.S b/plat/fvp/aarch64/bl1_plat_helpers.S
index 92075ea..b4d4458 100644
--- a/plat/fvp/aarch64/bl1_plat_helpers.S
+++ b/plat/fvp/aarch64/bl1_plat_helpers.S
@@ -67,7 +67,7 @@
 	 * loader zeroes out the zi section.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	ldr	x1, =PWRC_BASE
 	str	w0, [x1, #PPOFFR_OFF]
 
@@ -173,8 +173,6 @@
 func platform_cold_boot_init
 	mov	x20, x0
 	bl	platform_mem_init
-	bl	read_mpidr
-	mov	x19, x0
 
 	/* ---------------------------------------------
 	 * Give ourselves a small coherent stack to
@@ -182,6 +180,7 @@
 	 * CCI in assembler
 	 * ---------------------------------------------
 	 */
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -200,7 +199,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
diff --git a/plat/fvp/aarch64/plat_common.c b/plat/fvp/aarch64/plat_common.c
index c8e529d..edeb6e0 100644
--- a/plat/fvp/aarch64/plat_common.c
+++ b/plat/fvp/aarch64/plat_common.c
@@ -69,6 +69,8 @@
 	ttbr = (unsigned long) l1_xlation_table;
 
 	if (GET_EL(current_el) == MODE_EL3) {
+		assert((read_sctlr_el3() & SCTLR_M_BIT) == 0);
+
 		write_mair_el3(mair);
 		tcr |= TCR_EL3_RES1;
 		/* Invalidate EL3 TLBs */
@@ -77,11 +79,19 @@
 		write_tcr_el3(tcr);
 		write_ttbr0_el3(ttbr);
 
+		/* ensure all translation table writes have drained into memory,
+		 * the TLB invalidation is complete, and translation register
+		 * writes are committed before enabling the MMU
+		 */
+		dsb();
+		isb();
+
 		sctlr = read_sctlr_el3();
 		sctlr |= SCTLR_WXN_BIT | SCTLR_M_BIT | SCTLR_I_BIT;
 		sctlr |= SCTLR_A_BIT | SCTLR_C_BIT;
 		write_sctlr_el3(sctlr);
 	} else {
+		assert((read_sctlr_el1() & SCTLR_M_BIT) == 0);
 
 		write_mair_el1(mair);
 		/* Invalidate EL1 TLBs */
@@ -90,32 +100,20 @@
 		write_tcr_el1(tcr);
 		write_ttbr0_el1(ttbr);
 
+		/* ensure all translation table writes have drained into memory,
+		 * the TLB invalidation is complete, and translation register
+		 * writes are committed before enabling the MMU
+		 */
+		dsb();
+		isb();
+
 		sctlr = read_sctlr_el1();
 		sctlr |= SCTLR_WXN_BIT | SCTLR_M_BIT | SCTLR_I_BIT;
 		sctlr |= SCTLR_A_BIT | SCTLR_C_BIT;
 		write_sctlr_el1(sctlr);
 	}
-
-	return;
-}
-
-void disable_mmu(void)
-{
-	unsigned long sctlr;
-	unsigned long current_el = read_current_el();
-
-	if (GET_EL(current_el) == MODE_EL3) {
-		sctlr = read_sctlr_el3();
-		sctlr = sctlr & ~(SCTLR_M_BIT | SCTLR_C_BIT);
-		write_sctlr_el3(sctlr);
-	} else {
-		sctlr = read_sctlr_el1();
-		sctlr = sctlr & ~(SCTLR_M_BIT | SCTLR_C_BIT);
-		write_sctlr_el1(sctlr);
-	}
-
-	/* Flush the caches */
-	dcsw_op_all(DCCISW);
+	/* ensure the MMU enable takes effect immediately */
+	isb();
 
 	return;
 }
diff --git a/plat/fvp/plat_gic.c b/plat/fvp/plat_gic.c
index 8457af1..db3c9cf 100644
--- a/plat/fvp/plat_gic.c
+++ b/plat/fvp/plat_gic.c
@@ -86,6 +86,7 @@
 	 */
 	scr_val = read_scr();
 	write_scr(scr_val | SCR_NS_BIT);
+	isb();	/* ensure NS=1 takes effect before accessing ICC_SRE_EL2 */
 
 	/*
 	 * By default EL2 and NS-EL1 software should be able to enable GICv3
@@ -103,9 +104,11 @@
 	write_icc_sre_el2(val | ICC_SRE_EN | ICC_SRE_SRE);
 
 	write_icc_pmr_el1(GIC_PRI_MASK);
+	isb();	/* commit ICC_* changes before setting NS=0 */
 
 	/* Restore SCR_EL3 */
 	write_scr(scr_val);
+	isb();	/* ensure NS=0 takes effect immediately */
 }
 
 /*******************************************************************************
diff --git a/plat/fvp/plat_pm.c b/plat/fvp/plat_pm.c
index 5430fff..f80e2d7 100644
--- a/plat/fvp/plat_pm.c
+++ b/plat/fvp/plat_pm.c
@@ -54,7 +54,11 @@
 	if (target_afflvl != MPIDR_AFFLVL0)
 		return PSCI_E_INVALID_PARAMS;
 
-	/* Enter standby state */
+	/*
+	 * Enter standby state
+	 * dsb is good practice before using wfi to enter low power states
+	 */
+	dsb();
 	wfi();
 
 	return PSCI_E_SUCCESS;
diff --git a/plat/fvp/platform.h b/plat/fvp/platform.h
index 1f4e432..3fe892e 100644
--- a/plat/fvp/platform.h
+++ b/plat/fvp/platform.h
@@ -298,7 +298,6 @@
 #define PL011_UART1_BASE		0x1c0a0000
 #define PL011_UART2_BASE		0x1c0b0000
 #define PL011_UART3_BASE		0x1c0c0000
-#define PL011_BASE			PL011_UART0_BASE
 
 
 /*******************************************************************************
@@ -371,7 +370,6 @@
 extern void bl31_plat_arch_setup(void);
 extern int platform_setup_pm(const struct plat_pm_ops **);
 extern unsigned int platform_get_core_pos(unsigned long mpidr);
-extern void disable_mmu(void);
 extern void enable_mmu(void);
 extern void configure_mmu(struct meminfo *,
 			  unsigned long,
diff --git a/services/std_svc/psci/psci_afflvl_off.c b/services/std_svc/psci/psci_afflvl_off.c
index e007bc3..21a4d1a 100644
--- a/services/std_svc/psci/psci_afflvl_off.c
+++ b/services/std_svc/psci/psci_afflvl_off.c
@@ -82,6 +82,7 @@
 	sctlr = read_sctlr_el3();
 	sctlr &= ~SCTLR_C_BIT;
 	write_sctlr_el3(sctlr);
+	isb();	/* ensure data cache disable takes immediate effect */
 
 	/*
 	 * CAUTION: This flush to the level of unification makes an assumption
diff --git a/services/std_svc/psci/psci_afflvl_suspend.c b/services/std_svc/psci/psci_afflvl_suspend.c
index dc12f7a..534e4a9 100644
--- a/services/std_svc/psci/psci_afflvl_suspend.c
+++ b/services/std_svc/psci/psci_afflvl_suspend.c
@@ -198,6 +198,7 @@
 	sctlr = read_sctlr_el3();
 	sctlr &= ~SCTLR_C_BIT;
 	write_sctlr_el3(sctlr);
+	isb();	/* ensure data cache disable takes immediate effect */
 
 	/*
 	 * CAUTION: This flush to the level of unification makes an assumption
diff --git a/services/std_svc/psci/psci_entry.S b/services/std_svc/psci/psci_entry.S
index e2c690d..25adaa1 100644
--- a/services/std_svc/psci/psci_entry.S
+++ b/services/std_svc/psci/psci_entry.S
@@ -75,10 +75,8 @@
 	 * ---------------------------------------------
 	 */
 	msr	spsel, #0
-	isb
 
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -86,14 +84,14 @@
 	 * level 0.
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	get_power_on_target_afflvl
 	cmp	x0, xzr
 	b.lt	_panic
 	mov	x3, x23
 	mov	x2, x0
-	mov	x0, x19
 	mov	x1, #MPIDR_AFFLVL0
+	mrs	x0, mpidr_el1
 	blr	x22
 
 	/* --------------------------------------------
@@ -101,7 +99,7 @@
 	 * -IS-WBWA memory
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	zero_callee_saved_regs
@@ -120,7 +118,7 @@
 	sub	sp, sp, #0x10
 	stp	x19, x20, [sp, #0]
 	mov	x19, sp
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 	bl	psci_cpu_off
 	mov	x1, #PSCI_E_SUCCESS
@@ -141,7 +139,7 @@
 	mov	x20, x0
 	mov	x21, x1
 	mov	x22, x2
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 	mov	x0, x20
 	mov	x1, x21
@@ -158,7 +156,7 @@
 	ret
 
 func final_wfi
-	dsb	sy
+	dsb	sy		// ensure write buffer empty
 	wfi
 wfi_spill:
 	b	wfi_spill