Merge pull request #58 from athoelke/optimise-cache-flush-v2

Optimise data cache clean/invalidate operation v2
diff --git a/bl1/aarch64/bl1_arch_setup.c b/bl1/aarch64/bl1_arch_setup.c
index 758b8e8..a1ebbdb 100644
--- a/bl1/aarch64/bl1_arch_setup.c
+++ b/bl1/aarch64/bl1_arch_setup.c
@@ -44,6 +44,7 @@
 	tmp_reg |= (SCTLR_A_BIT | SCTLR_SA_BIT);
 	tmp_reg &= ~SCTLR_EE_BIT;
 	write_sctlr_el3(tmp_reg);
+	isb();
 
 	/*
 	 * Enable HVCs, route FIQs to EL3, set the next EL to be AArch64, route
diff --git a/bl1/aarch64/bl1_entrypoint.S b/bl1/aarch64/bl1_entrypoint.S
index 012b779..c081af4 100644
--- a/bl1/aarch64/bl1_entrypoint.S
+++ b/bl1/aarch64/bl1_entrypoint.S
@@ -86,7 +86,6 @@
 	mrs	x0, sctlr_el3
 	orr	x0, x0, #SCTLR_I_BIT
 	msr	sctlr_el3, x0
-
 	isb
 
 _wait_for_entrypoint:
@@ -98,10 +97,10 @@
 	 * their turn to be woken up
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_get_entrypoint
 	cbnz	x0, _do_warm_boot
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbnz	x0, _do_cold_boot
 
diff --git a/bl1/aarch64/bl1_exceptions.S b/bl1/aarch64/bl1_exceptions.S
index 68d088b..a87b20f 100644
--- a/bl1/aarch64/bl1_exceptions.S
+++ b/bl1/aarch64/bl1_exceptions.S
@@ -189,7 +189,7 @@
 	mov	x0, #SYNC_EXCEPTION_AARCH64
 	bl	plat_report_exception
 
-	bl	read_esr_el3
+	mrs	x0, esr_el3
 	ubfx	x1, x0, #ESR_EC_SHIFT, #ESR_EC_LENGTH
 	cmp	x1, #EC_AARCH64_SMC
 	b.ne	panic
@@ -201,10 +201,8 @@
 	mov	x2, x3
 	mov	x3, x4
 	bl	display_boot_progress
-	mov	x0, x20
-	bl	write_elr
-	mov	x0, x21
-	bl	write_spsr
+	msr	elr_el3, x20
+	msr	spsr_el3, x21
 	ubfx	x0, x21, #MODE_EL_SHIFT, #2
 	cmp	x0, #MODE_EL3
 	b.ne	skip_mmu_teardown
@@ -212,18 +210,11 @@
 	/* ---------------------------------------------
 	 * If BL31 is to be executed in EL3 as well
 	 * then turn off the MMU so that it can perform
-	 * its own setup. TODO: Assuming flat mapped
-	 * translations here. Also all should go into a
-	 * separate MMU teardown function
+	 * its own setup.
 	 * ---------------------------------------------
 	 */
-	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
-	bl	read_sctlr_el3
-	bic	x0, x0, x1
-	bl	write_sctlr_el3
-	mov	x0, #DCCISW
-	bl	dcsw_op_all
-	bl	tlbialle3
+	bl	disable_mmu_icache_el3
+	tlbi	alle3
 skip_mmu_teardown:
 	ldp     x6, x7, [sp, #0x30]
 	ldp     x4, x5, [sp, #0x20]
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index b8af9a5..4f7565f 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -54,8 +54,7 @@
 	 * So, make sure no secondary has lost its way.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbz	x0, _panic
 
@@ -73,7 +72,6 @@
 	mrs	x0, sctlr_el1
 	orr	x0, x0, #SCTLR_I_BIT
 	msr	sctlr_el1, x0
-
 	isb
 
 	/* ---------------------------------------------
@@ -103,7 +101,7 @@
 	 * ease the pain of initializing the MMU
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -121,7 +119,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index 39fa605..13bd5b8 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -89,7 +89,6 @@
 	mrs	x1, sctlr_el3
 	orr	x1, x1, #SCTLR_I_BIT
 	msr	sctlr_el3, x1
-
 	isb
 
 	/* ---------------------------------------------
@@ -108,8 +107,7 @@
 	 * So, make sure no secondary has lost its way.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_is_primary_cpu
 	cbz	x0, _panic
 
@@ -138,7 +136,7 @@
 	 * ease the pain of initializing the MMU
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -155,7 +153,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
diff --git a/bl31/bl31_main.c b/bl31/bl31_main.c
index 01f00f2..755320d 100644
--- a/bl31/bl31_main.c
+++ b/bl31/bl31_main.c
@@ -100,6 +100,7 @@
 	assert(cm_get_context(mpidr, NON_SECURE));
 	cm_set_next_eret_context(NON_SECURE);
 	write_vbar_el3((uint64_t) runtime_exceptions);
+	isb();
 	next_image_type = NON_SECURE;
 
 	/*
diff --git a/drivers/arm/gic/aarch64/gic_v3_sysregs.S b/drivers/arm/gic/aarch64/gic_v3_sysregs.S
index 2a96da7..ddf85a8 100644
--- a/drivers/arm/gic/aarch64/gic_v3_sysregs.S
+++ b/drivers/arm/gic/aarch64/gic_v3_sysregs.S
@@ -67,23 +67,19 @@
 
 func write_icc_sre_el1
 	msr	ICC_SRE_EL1, x0
-	isb
 	ret
 
 
 func write_icc_sre_el2
 	msr	ICC_SRE_EL2, x0
-	isb
 	ret
 
 
 func write_icc_sre_el3
 	msr	ICC_SRE_EL3, x0
-	isb
 	ret
 
 
 func write_icc_pmr_el1
 	msr	ICC_PMR_EL1, x0
-	isb
 	ret
diff --git a/include/common/asm_macros.S b/include/common/asm_macros.S
index 3edd392..3dbd9f2 100644
--- a/include/common/asm_macros.S
+++ b/include/common/asm_macros.S
@@ -58,7 +58,7 @@
 
 
 	.macro	smc_check  label
-	bl	read_esr
+	mrs	x0, esr_el3
 	ubfx	x0, x0, #ESR_EC_SHIFT, #ESR_EC_LENGTH
 	cmp	x0, #EC_AARCH64_SMC
 	b.ne	$label
diff --git a/include/drivers/arm/pl011.h b/include/drivers/arm/pl011.h
index 28aef54..1254920 100644
--- a/include/drivers/arm/pl011.h
+++ b/include/drivers/arm/pl011.h
@@ -78,10 +78,6 @@
 #define PL011_UARTCR_LBE          (1 << 7)	/* Loopback enable */
 #define PL011_UARTCR_UARTEN       (1 << 0)	/* UART Enable */
 
-#if !defined(PL011_BASE)
-#error "The PL011_BASE macro must be defined."
-#endif
-
 #if !defined(PL011_BAUDRATE)
 #define PL011_BAUDRATE  115200
 #endif
diff --git a/include/lib/aarch64/arch_helpers.h b/include/lib/aarch64/arch_helpers.h
index 565b1b4..517e25a 100644
--- a/include/lib/aarch64/arch_helpers.h
+++ b/include/lib/aarch64/arch_helpers.h
@@ -78,6 +78,9 @@
 extern void dcsw_op_louis(unsigned int);
 extern void dcsw_op_all(unsigned int);
 
+extern void disable_mmu_el3(void);
+extern void disable_mmu_icache_el3(void);
+
 /*******************************************************************************
  * Misc. accessor prototypes
  ******************************************************************************/
@@ -191,9 +194,7 @@
 extern unsigned long read_ttbr0_el2(void);
 extern unsigned long read_ttbr0_el3(void);
 
-extern unsigned long read_ttbr1(void);
 extern unsigned long read_ttbr1_el1(void);
-extern unsigned long read_ttbr1_el2(void);
 
 extern unsigned long read_cptr_el2(void);
 extern unsigned long read_cptr_el3(void);
@@ -225,12 +226,10 @@
 extern void write_esr_el2(unsigned long);
 extern void write_esr_el3(unsigned long);
 
-extern void write_afsr0(unsigned long);
 extern void write_afsr0_el1(unsigned long);
 extern void write_afsr0_el2(unsigned long);
 extern void write_afsr0_el3(unsigned long);
 
-extern void write_afsr1(unsigned long);
 extern void write_afsr1_el1(unsigned long);
 extern void write_afsr1_el2(unsigned long);
 extern void write_afsr1_el3(unsigned long);
@@ -260,7 +259,6 @@
 extern void write_ttbr0_el3(unsigned long);
 
 extern void write_ttbr1_el1(unsigned long);
-extern void write_ttbr1_el2(unsigned long);
 
 extern void write_cpuectlr(unsigned long);
 extern void write_cptr_el2(unsigned long);
diff --git a/lib/aarch64/cache_helpers.S b/lib/aarch64/cache_helpers.S
index c272fc7..a5b918c 100644
--- a/lib/aarch64/cache_helpers.S
+++ b/lib/aarch64/cache_helpers.S
@@ -46,57 +46,41 @@
 
 func dcisw
 	dc	isw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccisw
 	dc	cisw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccsw
 	dc	csw, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccvac
 	dc	cvac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dcivac
 	dc	ivac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccivac
 	dc	civac, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dccvau
 	dc	cvau, x0
-	dsb	sy
-	isb
 	ret
 
 
 func dczva
 	dc	zva, x0
-	dsb	sy
-	isb
 	ret
 
 
diff --git a/lib/aarch64/cpu_helpers.S b/lib/aarch64/cpu_helpers.S
index 573d0b8..abb996d 100644
--- a/lib/aarch64/cpu_helpers.S
+++ b/lib/aarch64/cpu_helpers.S
@@ -35,13 +35,11 @@
 
 
 func cpu_reset_handler
-	mov	x19, x30 // lr
-
 	/* ---------------------------------------------
 	 * As a bare minimal enable the SMP bit.
 	 * ---------------------------------------------
 	 */
-	bl	read_midr
+	mrs	x0, midr_el1
 	lsr	x0, x0, #MIDR_PN_SHIFT
 	and	x0, x0, #MIDR_PN_MASK
 	cmp	x0, #MIDR_PN_A57
@@ -49,8 +47,9 @@
 	cmp	x0, #MIDR_PN_A53
 	b.ne	smp_setup_end
 smp_setup_begin:
-	bl	read_cpuectlr
+	mrs	x0, CPUECTLR_EL1
 	orr	x0, x0, #CPUECTLR_SMP_BIT
-	bl	write_cpuectlr
+	msr	CPUECTLR_EL1, x0
+	isb
 smp_setup_end:
-	ret	x19
+	ret
diff --git a/lib/aarch64/misc_helpers.S b/lib/aarch64/misc_helpers.S
index e7b2331..e7ee015 100644
--- a/lib/aarch64/misc_helpers.S
+++ b/lib/aarch64/misc_helpers.S
@@ -46,22 +46,18 @@
 	.globl	read_daif
 	.globl	write_daif
 
-	.globl	read_spsr
 	.globl	read_spsr_el1
 	.globl	read_spsr_el2
 	.globl	read_spsr_el3
 
-	.globl	write_spsr
 	.globl	write_spsr_el1
 	.globl	write_spsr_el2
 	.globl	write_spsr_el3
 
-	.globl	read_elr
 	.globl	read_elr_el1
 	.globl	read_elr_el2
 	.globl	read_elr_el3
 
-	.globl	write_elr
 	.globl	write_elr_el1
 	.globl	write_elr_el2
 	.globl	write_elr_el3
@@ -79,6 +75,9 @@
 	.globl	zeromem16
 	.globl	memcpy16
 
+	.globl	disable_mmu_el3
+	.globl	disable_mmu_icache_el3
+
 
 func get_afflvl_shift
 	cmp	x0, #3
@@ -150,16 +149,6 @@
 	ret
 
 
-func read_spsr
-	mrs	x0, CurrentEl
-	cmp	x0, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el1
-	cmp	x0, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el2
-	cmp	x0, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	read_spsr_el3
-
-
 func read_spsr_el1
 	mrs	x0, spsr_el1
 	ret
@@ -175,44 +164,21 @@
 	ret
 
 
-func write_spsr
-	mrs	x1, CurrentEl
-	cmp	x1, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el1
-	cmp	x1, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el2
-	cmp	x1, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	write_spsr_el3
-
-
 func write_spsr_el1
 	msr	spsr_el1, x0
-	isb
 	ret
 
 
 func write_spsr_el2
 	msr	spsr_el2, x0
-	isb
 	ret
 
 
 func write_spsr_el3
 	msr	spsr_el3, x0
-	isb
 	ret
 
 
-func read_elr
-	mrs	x0, CurrentEl
-	cmp	x0, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	read_elr_el1
-	cmp	x0, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	read_elr_el2
-	cmp	x0, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	read_elr_el3
-
-
 func read_elr_el1
 	mrs	x0, elr_el1
 	ret
@@ -228,31 +194,18 @@
 	ret
 
 
-func write_elr
-	mrs	x1, CurrentEl
-	cmp	x1, #(MODE_EL1 << MODE_EL_SHIFT)
-	b.eq	write_elr_el1
-	cmp	x1, #(MODE_EL2 << MODE_EL_SHIFT)
-	b.eq	write_elr_el2
-	cmp	x1, #(MODE_EL3 << MODE_EL_SHIFT)
-	b.eq	write_elr_el3
-
-
 func write_elr_el1
 	msr	elr_el1, x0
-	isb
 	ret
 
 
 func write_elr_el2
 	msr	elr_el2, x0
-	isb
 	ret
 
 
 func write_elr_el3
 	msr	elr_el3, x0
-	isb
 	ret
 
 
@@ -338,3 +291,27 @@
 	subs	x2, x2, #1
 	b.ne	m_loop1
 m_end:	ret
+
+/* ---------------------------------------------------------------------------
+ * Disable the MMU at EL3
+ * This is implemented in assembler to ensure that the data cache is cleaned
+ * and invalidated after the MMU is disabled without any intervening cacheable
+ * data accesses
+ * ---------------------------------------------------------------------------
+ */
+
+func disable_mmu_el3
+	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT)
+do_disable_mmu:
+	mrs	x0, sctlr_el3
+	bic	x0, x0, x1
+	msr	sctlr_el3, x0
+	isb				// ensure MMU is off
+	mov	x0, #DCCISW		// DCache clean and invalidate
+	b	dcsw_op_all
+
+
+func disable_mmu_icache_el3
+	mov	x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
+	b	do_disable_mmu
+
diff --git a/lib/aarch64/sysreg_helpers.S b/lib/aarch64/sysreg_helpers.S
index 61468f9..376da49 100644
--- a/lib/aarch64/sysreg_helpers.S
+++ b/lib/aarch64/sysreg_helpers.S
@@ -125,10 +125,7 @@
 	.globl	write_ttbr0_el3
 
 	.globl	read_ttbr1_el1
-	.globl	read_ttbr1_el2
-	.globl	write_ttbr1
 	.globl	write_ttbr1_el1
-	.globl	write_ttbr1_el2
 
 	.globl	read_cpacr
 	.globl	write_cpacr
@@ -160,8 +157,6 @@
 
 #if SUPPORT_VFP
 	.globl	enable_vfp
-	.globl	read_fpexc
-	.globl	write_fpexc
 #endif
 
 
@@ -201,19 +196,16 @@
 
 func write_vbar_el1
 	msr	vbar_el1, x0
-	isb
 	ret
 
 
 func write_vbar_el2
 	msr	vbar_el2, x0
-	isb
 	ret
 
 
 func write_vbar_el3
 	msr	vbar_el3, x0
-	isb
 	ret
 
 
@@ -238,19 +230,16 @@
 
 func write_afsr0_el1
 	msr	afsr0_el1, x0
-	isb
 	ret
 
 
 func write_afsr0_el2
 	msr	afsr0_el2, x0
-	isb
 	ret
 
 
 func write_afsr0_el3
 	msr	afsr0_el3, x0
-	isb
 	ret
 
 
@@ -275,19 +264,16 @@
 
 func write_far_el1
 	msr	far_el1, x0
-	isb
 	ret
 
 
 func write_far_el2
 	msr	far_el2, x0
-	isb
 	ret
 
 
 func write_far_el3
 	msr	far_el3, x0
-	isb
 	ret
 
 
@@ -312,19 +298,16 @@
 
 func write_mair_el1
 	msr	mair_el1, x0
-	isb
 	ret
 
 
 func write_mair_el2
 	msr	mair_el2, x0
-	isb
 	ret
 
 
 func write_mair_el3
 	msr	mair_el3, x0
-	isb
 	ret
 
 
@@ -349,19 +332,16 @@
 
 func write_amair_el1
 	msr	amair_el1, x0
-	isb
 	ret
 
 
 func write_amair_el2
 	msr	amair_el2, x0
-	isb
 	ret
 
 
 func write_amair_el3
 	msr	amair_el3, x0
-	isb
 	ret
 
 
@@ -405,19 +385,16 @@
 
 func write_rmr_el1
 	msr	rmr_el1, x0
-	isb
 	ret
 
 
 func write_rmr_el2
 	msr	rmr_el2, x0
-	isb
 	ret
 
 
 func write_rmr_el3
 	msr	rmr_el3, x0
-	isb
 	ret
 
 
@@ -442,19 +419,16 @@
 
 func write_afsr1_el1
 	msr	afsr1_el1, x0
-	isb
 	ret
 
 
 func write_afsr1_el2
 	msr	afsr1_el2, x0
-	isb
 	ret
 
 
 func write_afsr1_el3
 	msr	afsr1_el3, x0
-	isb
 	ret
 
 
@@ -479,22 +453,16 @@
 
 func write_sctlr_el1
 	msr	sctlr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_sctlr_el2
 	msr	sctlr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_sctlr_el3
 	msr	sctlr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -519,22 +487,16 @@
 
 func write_actlr_el1
 	msr	actlr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_actlr_el2
 	msr	actlr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_actlr_el3
 	msr	actlr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -559,22 +521,16 @@
 
 func write_esr_el1
 	msr	esr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_esr_el2
 	msr	esr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_esr_el3
 	msr	esr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -599,22 +555,16 @@
 
 func write_tcr_el1
 	msr	tcr_el1, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_tcr_el2
 	msr	tcr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_tcr_el3
 	msr	tcr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -622,11 +572,6 @@
 	 * CPTR accessors
 	 * -----------------------------------------------------
 	 */
-func read_cptr_el1
-	b	read_cptr_el1
-	ret
-
-
 func read_cptr_el2
 	mrs	x0, cptr_el2
 	ret
@@ -636,22 +581,14 @@
 	mrs	x0, cptr_el3
 	ret
 
-
-func write_cptr_el1
-	b	write_cptr_el1
-
 
 func write_cptr_el2
 	msr	cptr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
 func write_cptr_el3
 	msr	cptr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -676,19 +613,16 @@
 
 func write_ttbr0_el1
 	msr	ttbr0_el1, x0
-	isb
 	ret
 
 
 func write_ttbr0_el2
 	msr	ttbr0_el2, x0
-	isb
 	ret
 
 
 func write_ttbr0_el3
 	msr	ttbr0_el3, x0
-	isb
 	ret
 
 
@@ -701,27 +635,10 @@
 	ret
 
 
-func read_ttbr1_el2
-	b	read_ttbr1_el2
-
-
-func read_ttbr1_el3
-	b	read_ttbr1_el3
-
-
 func write_ttbr1_el1
 	msr	ttbr1_el1, x0
-	isb
 	ret
 
-
-func write_ttbr1_el2
-	b	write_ttbr1_el2
-
-
-func write_ttbr1_el3
-	b	write_ttbr1_el3
-
 
 func read_hcr
 	mrs	x0, hcr_el2
@@ -730,8 +647,6 @@
 
 func write_hcr
 	msr	hcr_el2, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -762,8 +677,6 @@
 
 func write_cpuectlr
 	msr	CPUECTLR_EL1, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -789,8 +702,6 @@
 
 func write_scr
 	msr	scr_el3, x0
-	dsb	sy
-	isb
 	ret
 
 
@@ -818,16 +729,7 @@
 	mov	x1, #AARCH64_CPTR_TFP
 	bic	x0, x0, x1
 	msr	cptr_el3, x0
-	ret
-
-
-func read_fpexc
-	b	read_fpexc
-	ret
-
-
-func write_fpexc
-	b	write_fpexc
+	isb
 	ret
 
 #endif
diff --git a/lib/aarch64/tlb_helpers.S b/lib/aarch64/tlb_helpers.S
index ec1558b..8dfae12 100644
--- a/lib/aarch64/tlb_helpers.S
+++ b/lib/aarch64/tlb_helpers.S
@@ -41,47 +41,33 @@
 
 func tlbialle1
 	tlbi	alle1
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle1is
 	tlbi	alle1is
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle2
 	tlbi	alle2
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle2is
 	tlbi	alle2is
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle3
 	tlbi	alle3
-	dsb	sy
-	isb
 	ret
 
 
 func tlbialle3is
 	tlbi	alle3is
-	dsb	sy
-	isb
 	ret
 
 func tlbivmalle1
 	tlbi	vmalle1
-	dsb	sy
-	isb
 	ret
diff --git a/plat/fvp/aarch64/bl1_plat_helpers.S b/plat/fvp/aarch64/bl1_plat_helpers.S
index 92075ea..b4d4458 100644
--- a/plat/fvp/aarch64/bl1_plat_helpers.S
+++ b/plat/fvp/aarch64/bl1_plat_helpers.S
@@ -67,7 +67,7 @@
 	 * loader zeroes out the zi section.
 	 * ---------------------------------------------
 	 */
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	ldr	x1, =PWRC_BASE
 	str	w0, [x1, #PPOFFR_OFF]
 
@@ -173,8 +173,6 @@
 func platform_cold_boot_init
 	mov	x20, x0
 	bl	platform_mem_init
-	bl	read_mpidr
-	mov	x19, x0
 
 	/* ---------------------------------------------
 	 * Give ourselves a small coherent stack to
@@ -182,6 +180,7 @@
 	 * CCI in assembler
 	 * ---------------------------------------------
 	 */
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -200,7 +199,7 @@
 	 * -IS-WBWA memory
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	/* ---------------------------------------------
diff --git a/plat/fvp/aarch64/plat_common.c b/plat/fvp/aarch64/plat_common.c
index c8e529d..edeb6e0 100644
--- a/plat/fvp/aarch64/plat_common.c
+++ b/plat/fvp/aarch64/plat_common.c
@@ -69,6 +69,8 @@
 	ttbr = (unsigned long) l1_xlation_table;
 
 	if (GET_EL(current_el) == MODE_EL3) {
+		assert((read_sctlr_el3() & SCTLR_M_BIT) == 0);
+
 		write_mair_el3(mair);
 		tcr |= TCR_EL3_RES1;
 		/* Invalidate EL3 TLBs */
@@ -77,11 +79,19 @@
 		write_tcr_el3(tcr);
 		write_ttbr0_el3(ttbr);
 
+		/* ensure all translation table writes have drained into memory,
+		 * the TLB invalidation is complete, and translation register
+		 * writes are committed before enabling the MMU
+		 */
+		dsb();
+		isb();
+
 		sctlr = read_sctlr_el3();
 		sctlr |= SCTLR_WXN_BIT | SCTLR_M_BIT | SCTLR_I_BIT;
 		sctlr |= SCTLR_A_BIT | SCTLR_C_BIT;
 		write_sctlr_el3(sctlr);
 	} else {
+		assert((read_sctlr_el1() & SCTLR_M_BIT) == 0);
 
 		write_mair_el1(mair);
 		/* Invalidate EL1 TLBs */
@@ -90,32 +100,20 @@
 		write_tcr_el1(tcr);
 		write_ttbr0_el1(ttbr);
 
+		/* ensure all translation table writes have drained into memory,
+		 * the TLB invalidation is complete, and translation register
+		 * writes are committed before enabling the MMU
+		 */
+		dsb();
+		isb();
+
 		sctlr = read_sctlr_el1();
 		sctlr |= SCTLR_WXN_BIT | SCTLR_M_BIT | SCTLR_I_BIT;
 		sctlr |= SCTLR_A_BIT | SCTLR_C_BIT;
 		write_sctlr_el1(sctlr);
 	}
-
-	return;
-}
-
-void disable_mmu(void)
-{
-	unsigned long sctlr;
-	unsigned long current_el = read_current_el();
-
-	if (GET_EL(current_el) == MODE_EL3) {
-		sctlr = read_sctlr_el3();
-		sctlr = sctlr & ~(SCTLR_M_BIT | SCTLR_C_BIT);
-		write_sctlr_el3(sctlr);
-	} else {
-		sctlr = read_sctlr_el1();
-		sctlr = sctlr & ~(SCTLR_M_BIT | SCTLR_C_BIT);
-		write_sctlr_el1(sctlr);
-	}
-
-	/* Flush the caches */
-	dcsw_op_all(DCCISW);
+	/* ensure the MMU enable takes effect immediately */
+	isb();
 
 	return;
 }
diff --git a/plat/fvp/plat_gic.c b/plat/fvp/plat_gic.c
index 8457af1..db3c9cf 100644
--- a/plat/fvp/plat_gic.c
+++ b/plat/fvp/plat_gic.c
@@ -86,6 +86,7 @@
 	 */
 	scr_val = read_scr();
 	write_scr(scr_val | SCR_NS_BIT);
+	isb();	/* ensure NS=1 takes effect before accessing ICC_SRE_EL2 */
 
 	/*
 	 * By default EL2 and NS-EL1 software should be able to enable GICv3
@@ -103,9 +104,11 @@
 	write_icc_sre_el2(val | ICC_SRE_EN | ICC_SRE_SRE);
 
 	write_icc_pmr_el1(GIC_PRI_MASK);
+	isb();	/* commit ICC_* changes before setting NS=0 */
 
 	/* Restore SCR_EL3 */
 	write_scr(scr_val);
+	isb();	/* ensure NS=0 takes effect immediately */
 }
 
 /*******************************************************************************
diff --git a/plat/fvp/plat_pm.c b/plat/fvp/plat_pm.c
index 5430fff..f80e2d7 100644
--- a/plat/fvp/plat_pm.c
+++ b/plat/fvp/plat_pm.c
@@ -54,7 +54,11 @@
 	if (target_afflvl != MPIDR_AFFLVL0)
 		return PSCI_E_INVALID_PARAMS;
 
-	/* Enter standby state */
+	/*
+	 * Enter standby state
+	 * dsb is good practice before using wfi to enter low power states
+	 */
+	dsb();
 	wfi();
 
 	return PSCI_E_SUCCESS;
diff --git a/plat/fvp/platform.h b/plat/fvp/platform.h
index 1f4e432..3fe892e 100644
--- a/plat/fvp/platform.h
+++ b/plat/fvp/platform.h
@@ -298,7 +298,6 @@
 #define PL011_UART1_BASE		0x1c0a0000
 #define PL011_UART2_BASE		0x1c0b0000
 #define PL011_UART3_BASE		0x1c0c0000
-#define PL011_BASE			PL011_UART0_BASE
 
 
 /*******************************************************************************
@@ -371,7 +370,6 @@
 extern void bl31_plat_arch_setup(void);
 extern int platform_setup_pm(const struct plat_pm_ops **);
 extern unsigned int platform_get_core_pos(unsigned long mpidr);
-extern void disable_mmu(void);
 extern void enable_mmu(void);
 extern void configure_mmu(struct meminfo *,
 			  unsigned long,
diff --git a/services/std_svc/psci/psci_afflvl_off.c b/services/std_svc/psci/psci_afflvl_off.c
index e007bc3..21a4d1a 100644
--- a/services/std_svc/psci/psci_afflvl_off.c
+++ b/services/std_svc/psci/psci_afflvl_off.c
@@ -82,6 +82,7 @@
 	sctlr = read_sctlr_el3();
 	sctlr &= ~SCTLR_C_BIT;
 	write_sctlr_el3(sctlr);
+	isb();	/* ensure data cache disable takes immediate effect */
 
 	/*
 	 * CAUTION: This flush to the level of unification makes an assumption
diff --git a/services/std_svc/psci/psci_afflvl_suspend.c b/services/std_svc/psci/psci_afflvl_suspend.c
index dc12f7a..534e4a9 100644
--- a/services/std_svc/psci/psci_afflvl_suspend.c
+++ b/services/std_svc/psci/psci_afflvl_suspend.c
@@ -198,6 +198,7 @@
 	sctlr = read_sctlr_el3();
 	sctlr &= ~SCTLR_C_BIT;
 	write_sctlr_el3(sctlr);
+	isb();	/* ensure data cache disable takes immediate effect */
 
 	/*
 	 * CAUTION: This flush to the level of unification makes an assumption
diff --git a/services/std_svc/psci/psci_entry.S b/services/std_svc/psci/psci_entry.S
index e2c690d..25adaa1 100644
--- a/services/std_svc/psci/psci_entry.S
+++ b/services/std_svc/psci/psci_entry.S
@@ -75,10 +75,8 @@
 	 * ---------------------------------------------
 	 */
 	msr	spsel, #0
-	isb
 
-	bl	read_mpidr
-	mov	x19, x0
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 
 	/* ---------------------------------------------
@@ -86,14 +84,14 @@
 	 * level 0.
 	 * ---------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	get_power_on_target_afflvl
 	cmp	x0, xzr
 	b.lt	_panic
 	mov	x3, x23
 	mov	x2, x0
-	mov	x0, x19
 	mov	x1, #MPIDR_AFFLVL0
+	mrs	x0, mpidr_el1
 	blr	x22
 
 	/* --------------------------------------------
@@ -101,7 +99,7 @@
 	 * -IS-WBWA memory
 	 * --------------------------------------------
 	 */
-	mov	x0, x19
+	mrs	x0, mpidr_el1
 	bl	platform_set_stack
 
 	zero_callee_saved_regs
@@ -120,7 +118,7 @@
 	sub	sp, sp, #0x10
 	stp	x19, x20, [sp, #0]
 	mov	x19, sp
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 	bl	psci_cpu_off
 	mov	x1, #PSCI_E_SUCCESS
@@ -141,7 +139,7 @@
 	mov	x20, x0
 	mov	x21, x1
 	mov	x22, x2
-	bl	read_mpidr
+	mrs	x0, mpidr_el1
 	bl	platform_set_coherent_stack
 	mov	x0, x20
 	mov	x1, x21
@@ -158,7 +156,7 @@
 	ret
 
 func final_wfi
-	dsb	sy
+	dsb	sy		// ensure write buffer empty
 	wfi
 wfi_spill:
 	b	wfi_spill