Merge pull request #999 from douglas-raillard-arm/dr/fix_tegra_CFLAGS

Fix Tegra CFLAGS usage
diff --git a/Makefile b/Makefile
index aec10c9..31964de 100644
--- a/Makefile
+++ b/Makefile
@@ -454,6 +454,7 @@
 $(eval $(call assert_boolean,USE_COHERENT_MEM))
 $(eval $(call assert_boolean,USE_TBBR_DEFS))
 $(eval $(call assert_boolean,WARMBOOT_ENABLE_DCACHE_EARLY))
+$(eval $(call assert_boolean,ENABLE_SPE_FOR_LOWER_ELS))
 
 $(eval $(call assert_numeric,ARM_ARCH_MAJOR))
 $(eval $(call assert_numeric,ARM_ARCH_MINOR))
@@ -493,6 +494,7 @@
 $(eval $(call add_define,USE_COHERENT_MEM))
 $(eval $(call add_define,USE_TBBR_DEFS))
 $(eval $(call add_define,WARMBOOT_ENABLE_DCACHE_EARLY))
+$(eval $(call add_define,ENABLE_SPE_FOR_LOWER_ELS))
 
 # Define the EL3_PAYLOAD_BASE flag only if it is provided.
 ifdef EL3_PAYLOAD_BASE
diff --git a/bl1/aarch32/bl1_entrypoint.S b/bl1/aarch32/bl1_entrypoint.S
index 39ebcf7..7780626 100644
--- a/bl1/aarch32/bl1_entrypoint.S
+++ b/bl1/aarch32/bl1_entrypoint.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -44,7 +44,7 @@
 * ---------------------------------------------------------------------
 */
 	el3_entrypoint_common					\
-		_set_endian=1					\
+		_init_sctlr=1					\
 		_warm_boot_mailbox=!PROGRAMMABLE_RESET_ADDRESS	\
 		_secondary_cold_boot=!COLD_BOOT_SINGLE_CPU	\
 		_init_memory=1					\
diff --git a/bl1/aarch64/bl1_entrypoint.S b/bl1/aarch64/bl1_entrypoint.S
index 36ce0d0..f7e02e9 100644
--- a/bl1/aarch64/bl1_entrypoint.S
+++ b/bl1/aarch64/bl1_entrypoint.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2015, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -25,7 +25,7 @@
 	 * ---------------------------------------------------------------------
 	 */
 	el3_entrypoint_common					\
-		_set_endian=1					\
+		_init_sctlr=1					\
 		_warm_boot_mailbox=!PROGRAMMABLE_RESET_ADDRESS	\
 		_secondary_cold_boot=!COLD_BOOT_SINGLE_CPU	\
 		_init_memory=1					\
diff --git a/bl1/bl1_fwu.c b/bl1/bl1_fwu.c
index 205ea92..85eee1a 100644
--- a/bl1/bl1_fwu.c
+++ b/bl1/bl1_fwu.c
@@ -176,18 +176,19 @@
 
 	checked_image_base = checked_info->image_base;
 	checked_image_end = checked_image_base + checked_info->image_size - 1;
-	/* No need to check for overlaps, it's done in bl1_fwu_image_copy(). */
+	/* No need to check for overflows, it's done in bl1_fwu_image_copy(). */
 
 	for (int i = 0; i < FWU_MAX_SIMULTANEOUS_IMAGES; i++) {
 
-		/* Don't check image against itself. */
-		if (bl1_fwu_loaded_ids[i] == image_id)
+		/* Skip INVALID_IMAGE_IDs and don't check image against itself */
+		if ((bl1_fwu_loaded_ids[i] == INVALID_IMAGE_ID) ||
+				(bl1_fwu_loaded_ids[i] == image_id))
 			continue;
 
 		image_desc = bl1_plat_get_image_desc(bl1_fwu_loaded_ids[i]);
 
 		/* Only check images that are loaded or being loaded. */
-		assert (image_desc->state != IMAGE_STATE_RESET);
+		assert (image_desc && image_desc->state != IMAGE_STATE_RESET);
 
 		info = &image_desc->image_info;
 
@@ -704,11 +705,15 @@
 			return -EPERM;
 		}
 
-		/* Clear the memory.*/
-		zero_normalmem((void *)image_desc->image_info.image_base,
-				image_desc->copied_size);
-		flush_dcache_range(image_desc->image_info.image_base,
-				image_desc->copied_size);
+		if (image_desc->copied_size) {
+			/* Clear the memory if the image is copied */
+			assert(GET_SECURITY_STATE(image_desc->ep_info.h.attr) == SECURE);
+
+			zero_normalmem((void *)image_desc->image_info.image_base,
+					image_desc->copied_size);
+			flush_dcache_range(image_desc->image_info.image_base,
+					image_desc->copied_size);
+		}
 
 		/* Reset status variables */
 		image_desc->copied_size = 0;
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index 6d10bce..419927d 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -36,12 +36,12 @@
 	 * bl31_entrypoint() during the cold boot flow, so the cold/warm boot
 	 * and primary/secondary CPU logic should not be executed in this case.
 	 *
-	 * Also, assume that the previous bootloader has already set up the CPU
-	 * endianness and has initialised the memory.
+	 * Also, assume that the previous bootloader has already initialised the
+	 * SCTLR_EL3, including the endianness, and has initialised the memory.
 	 * ---------------------------------------------------------------------
 	 */
 	el3_entrypoint_common					\
-		_set_endian=0					\
+		_init_sctlr=0					\
 		_warm_boot_mailbox=0				\
 		_secondary_cold_boot=0				\
 		_init_memory=0					\
@@ -62,7 +62,7 @@
 	 * ---------------------------------------------------------------------
 	 */
 	el3_entrypoint_common					\
-		_set_endian=1					\
+		_init_sctlr=1					\
 		_warm_boot_mailbox=!PROGRAMMABLE_RESET_ADDRESS	\
 		_secondary_cold_boot=!COLD_BOOT_SINGLE_CPU	\
 		_init_memory=1					\
@@ -136,7 +136,7 @@
 	 * 'el3_entrypoint_common' must be skipped:
 	 *
 	 *  - Only when the platform bypasses the BL1/BL31 entrypoint by
-	 *    programming the reset address do we need to set the CPU endianness.
+	 *    programming the reset address do we need to initialise SCTLR_EL3.
 	 *    In other cases, we assume this has been taken care by the
 	 *    entrypoint code.
 	 *
@@ -149,7 +149,7 @@
 	 *    it has been done once and for all on the cold boot path.
 	 */
 	el3_entrypoint_common					\
-		_set_endian=PROGRAMMABLE_RESET_ADDRESS		\
+		_init_sctlr=PROGRAMMABLE_RESET_ADDRESS		\
 		_warm_boot_mailbox=0				\
 		_secondary_cold_boot=0				\
 		_init_memory=0					\
diff --git a/bl32/sp_min/aarch32/entrypoint.S b/bl32/sp_min/aarch32/entrypoint.S
index e145511..b3fccde 100644
--- a/bl32/sp_min/aarch32/entrypoint.S
+++ b/bl32/sp_min/aarch32/entrypoint.S
@@ -49,12 +49,12 @@
 	 * sp_min_entrypoint() during the cold boot flow, so the cold/warm boot
 	 * and primary/secondary CPU logic should not be executed in this case.
 	 *
-	 * Also, assume that the previous bootloader has already set up the CPU
-	 * endianness and has initialised the memory.
+	 * Also, assume that the previous bootloader has already initialised the
+	 * SCTLR, including the CPU endianness, and has initialised the memory.
 	 * ---------------------------------------------------------------------
 	 */
 	el3_entrypoint_common					\
-		_set_endian=0					\
+		_init_sctlr=0					\
 		_warm_boot_mailbox=0				\
 		_secondary_cold_boot=0				\
 		_init_memory=0					\
@@ -75,7 +75,7 @@
 	 * ---------------------------------------------------------------------
 	 */
 	el3_entrypoint_common					\
-		_set_endian=1					\
+		_init_sctlr=1					\
 		_warm_boot_mailbox=!PROGRAMMABLE_RESET_ADDRESS	\
 		_secondary_cold_boot=!COLD_BOOT_SINGLE_CPU	\
 		_init_memory=1					\
@@ -174,7 +174,7 @@
 	 * 'el3_entrypoint_common' must be skipped:
 	 *
 	 *  - Only when the platform bypasses the BL1/BL32 (SP_MIN) entrypoint by
-	 *    programming the reset address do we need to set the CPU endianness.
+	 *    programming the reset address do we need to initialise the SCTLR.
 	 *    In other cases, we assume this has been taken care by the
 	 *    entrypoint code.
 	 *
@@ -187,7 +187,7 @@
 	 *    it has been done once and for all on the cold boot path.
 	 */
 	el3_entrypoint_common					\
-		_set_endian=PROGRAMMABLE_RESET_ADDRESS		\
+		_init_sctlr=PROGRAMMABLE_RESET_ADDRESS		\
 		_warm_boot_mailbox=0				\
 		_secondary_cold_boot=0				\
 		_init_memory=0					\
diff --git a/docs/firmware-design.md b/docs/firmware-design.md
index 358292a..746e413 100644
--- a/docs/firmware-design.md
+++ b/docs/firmware-design.md
@@ -220,6 +220,12 @@
     -   `DAIF`. The SError interrupt is enabled by clearing the SError interrupt
         mask bit.
 
+    -   `MDCR_EL3`. The trap controls, `MDCR_EL3.TDOSA`, `MDCR_EL3.TDA` and
+        `MDCR_EL3.TPM`, are set so that accesses to the registers they control
+        do not trap to EL3. AArch64 Secure self-hosted debug is disabled by
+        setting the `MDCR_EL3.SDD` bit. Also `MDCR_EL3.SPD32` is set to
+        disable AArch32 Secure self-hosted privileged debug from S-EL1.
+
 *   Control register setup (for AArch32)
     -   `SCTLR`. Instruction cache is enabled by setting the `SCTLR.I` bit.
         Alignment checking is enabled by setting the `SCTLR.A` bit.
@@ -243,6 +249,9 @@
     -   `CPSR.A`. The Asynchronous data abort interrupt is enabled by clearing
         the Asynchronous data abort interrupt mask bit.
 
+    -   `SDCR`. The `SDCR.SPD` field is set to disable AArch32 Secure
+        self-hosted privileged debug.
+
 #### Platform initialization
 
 On ARM platforms, BL1 performs the following platform initializations:
diff --git a/docs/user-guide.md b/docs/user-guide.md
index 0065ac0..d5423ca 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -542,6 +542,11 @@
     cluster platforms). If this option is enabled, then warm boot path
     enables D-caches immediately after enabling MMU. This option defaults to 0.
 
+*   `ENABLE_SPE_FOR_LOWER_ELS` : Boolean option to enable Statistical Profiling
+     extensions.  This is an optional architectural feature available only for
+     AArch64 8.2 onwards.  This option defaults to 1 but is automatically
+     disabled when the target architecture is AArch32 or AArch64 8.0/8.1.
+
 #### ARM development platform specific build options
 
 *   `ARM_BL31_IN_DRAM`: Boolean option to select loading of BL31 in TZC secured
diff --git a/include/common/aarch32/el3_common_macros.S b/include/common/aarch32/el3_common_macros.S
index e1261ea..6fc00dd 100644
--- a/include/common/aarch32/el3_common_macros.S
+++ b/include/common/aarch32/el3_common_macros.S
@@ -16,10 +16,18 @@
 	 */
 	.macro el3_arch_init_common _exception_vectors
 	/* ---------------------------------------------------------------------
-	 * Enable the instruction cache and alignment checks
+	 * SCTLR has already been initialised - read current value before
+	 * modifying.
+	 *
+	 * SCTLR.I: Enable the instruction cache.
+	 *
+	 * SCTLR.A: Enable Alignment fault checking. All instructions that load
+	 *  or store one or more registers have an alignment check that the
+	 *  address being accessed is aligned to the size of the data element(s)
+	 *  being accessed.
 	 * ---------------------------------------------------------------------
 	 */
-	ldr	r1, =(SCTLR_RES1 | SCTLR_I_BIT | SCTLR_A_BIT)
+	ldr	r1, =(SCTLR_I_BIT | SCTLR_A_BIT)
 	ldcopr	r0, SCTLR
 	orr	r0, r0, r1
 	stcopr	r0, SCTLR
@@ -34,13 +42,14 @@
 	stcopr	r0, MVBAR
 	isb
 
-	/* -----------------------------------------------------
-	 * Enable the SIF bit to disable instruction fetches
-	 * from Non-secure memory.
-	 * -----------------------------------------------------
+	/* ---------------------------------------------------------------------
+	 * Initialise SCR, setting all fields rather than relying on the hw.
+	 *
+	 * SCR.SIF: Enabled so that Secure state instruction fetches from
+	 *  Non-secure memory are not permitted.
+	 * ---------------------------------------------------------------------
 	 */
-	ldcopr	r0, SCR
-	orr	r0, r0, #SCR_SIF_BIT
+	ldr	r0, =(SCR_RESET_VAL | SCR_SIF_BIT)
 	stcopr	r0, SCR
 
 	/* -----------------------------------------------------
@@ -51,32 +60,61 @@
 	cpsie   a
 	isb
 
-	/* Enable access to Advanced SIMD registers */
+	/* ---------------------------------------------------------------------
+	 * Initialise NSACR, setting all the fields, except for the
+	 * IMPLEMENTATION DEFINED field, rather than relying on the hw. Some
+	 * fields are architecturally UNKNOWN on reset.
+	 *
+	 * NSACR_ENABLE_FP_ACCESS: Represents NSACR.cp11 and NSACR.cp10. The
+	 *  cp11 field is ignored, but is set to same value as cp10. The cp10
+	 *  field is set to allow access to Advanced SIMD and floating point
+	 *  features from both Security states.
+	 * ---------------------------------------------------------------------
+	 */
 	ldcopr	r0, NSACR
-	bic	r0, r0, #NSASEDIS_BIT
-	bic	r0, r0, #NSTRCDIS_BIT
-	orr	r0, r0, #(NASCR_CP10_BIT | NASCR_CP11_BIT)
+	and	r0, r0, #NSACR_IMP_DEF_MASK
+	orr	r0, r0, #(NSACR_RESET_VAL | NSACR_ENABLE_FP_ACCESS)
 	stcopr	r0, NSACR
 	isb
 
-	/*
-	 * Enable access to Advanced SIMD, Floating point and to the Trace
-	 * functionality as well.
+	/* ---------------------------------------------------------------------
+	 * Initialise CPACR, setting all fields rather than relying on hw. Some
+	 * fields are architecturally UNKNOWN on reset.
+	 *
+	 * CPACR.TRCDIS: Trap control for PL0 and PL1 System register accesses
+	 *  to trace registers. Set to zero to allow access.
+	 *
+	 * CPACR_ENABLE_FP_ACCESS: Represents CPACR.cp11 and CPACR.cp10. The
+	 *  cp11 field is ignored, but is set to same value as cp10. The cp10
+	 *  field is set to allow full access from PL0 and PL1 to floating-point
+	 *  and Advanced SIMD features.
+	 * ---------------------------------------------------------------------
 	 */
-	ldcopr	r0, CPACR
-	bic	r0, r0, #ASEDIS_BIT
-	bic	r0, r0, #TRCDIS_BIT
-	orr	r0, r0, #CPACR_ENABLE_FP_ACCESS
+	ldr	r0, =((CPACR_RESET_VAL | CPACR_ENABLE_FP_ACCESS) & ~(TRCDIS_BIT))
 	stcopr	r0, CPACR
 	isb
 
-	vmrs	r0, FPEXC
-	orr	r0, r0, #FPEXC_EN_BIT
+	/* ---------------------------------------------------------------------
+	 * Initialise FPEXC, setting all fields rather than relying on hw. Some
+	 * fields are architecturally UNKNOWN on reset and are set to zero
+	 * except for field(s) listed below.
+	 *
+	 * FPEXC.EN: Enable access to Advanced SIMD and floating point features
+	 *  from all exception levels.
+	 * ---------------------------------------------------------------------
+	 */
+	ldr	r0, =(FPEXC_RESET_VAL | FPEXC_EN_BIT)
 	vmsr	FPEXC, r0
 	isb
 
-	/* Disable secure self-hosted invasive debug. */
-	ldr	r0, =SDCR_DEF_VAL
+	/* ---------------------------------------------------------------------
+	 * Initialise SDCR, setting all the fields rather than relying on hw.
+	 *
+	 * SDCR.SPD: Disable AArch32 privileged debug. Debug exceptions from
+	 * Secure EL1 are disabled.
+	 * ---------------------------------------------------------------------
+	 */
+	ldr	r0, =(SDCR_RESET_VAL | SDCR_SPD(SDCR_SPD_DISABLE))
 	stcopr	r0, SDCR
 
 	.endm
@@ -91,8 +129,9 @@
  * why this macro is parameterised ; each parameter allows to enable/disable
  * some actions.
  *
- *  _set_endian:
- *	Whether the macro needs to configure the endianness of data accesses.
+ *  _init_sctlr:
+ *	Whether the macro needs to initialise the SCTLR register including
+ *	configuring the endianness of data accesses.
  *
  *  _warm_boot_mailbox:
  *	Whether the macro needs to detect the type of boot (cold/warm). The
@@ -120,7 +159,7 @@
  * -----------------------------------------------------------------------------
  */
 	.macro el3_entrypoint_common					\
-		_set_endian, _warm_boot_mailbox, _secondary_cold_boot,	\
+		_init_sctlr, _warm_boot_mailbox, _secondary_cold_boot,	\
 		_init_memory, _init_c_runtime, _exception_vectors
 
 	/* Make sure we are in Secure Mode */
@@ -130,17 +169,27 @@
 	ASM_ASSERT(eq)
 #endif
 
-	.if \_set_endian
+	.if \_init_sctlr
 		/* -------------------------------------------------------------
-		 * Set the CPU endianness before doing anything that might
-		 * involve memory reads or writes.
+		 * This is the initialisation of SCTLR and so must ensure that
+		 * all fields are explicitly set rather than relying on hw. Some
+		 * fields reset to an IMPLEMENTATION DEFINED value.
+		 *
+		 * SCTLR.TE: Set to zero so that exceptions to an Exception
+		 *  Level executing at PL1 are taken to A32 state.
+		 *
+		 * SCTLR.EE: Set the CPU endianness before doing anything that
+		 *  might involve memory reads or writes. Set to zero to select
+		 *  Little Endian.
+		 *
+		 * SCTLR.V: Set to zero to select the normal exception vectors
+		 *  with base address held in VBAR.
 		 * -------------------------------------------------------------
 		 */
-		ldcopr	r0, SCTLR
-		bic	r0, r0, #SCTLR_EE_BIT
+		ldr     r0, =(SCTLR_RESET_VAL & ~(SCTLR_TE_BIT | SCTLR_EE_BIT | SCTLR_V_BIT))
 		stcopr	r0, SCTLR
 		isb
-	.endif /* _set_endian */
+	.endif /* _init_sctlr */
 
 	/* Switch to monitor mode */
 	cps	#MODE32_mon
diff --git a/include/common/aarch64/el3_common_macros.S b/include/common/aarch64/el3_common_macros.S
index 674d52f..34fdaee 100644
--- a/include/common/aarch64/el3_common_macros.S
+++ b/include/common/aarch64/el3_common_macros.S
@@ -15,8 +15,20 @@
 	 */
 	.macro el3_arch_init_common _exception_vectors
 	/* ---------------------------------------------------------------------
-	 * Enable the instruction cache, stack pointer and data access alignment
-	 * checks
+	 * SCTLR_EL3 has already been initialised - read current value before
+	 * modifying.
+	 *
+	 * SCTLR_EL3.I: Enable the instruction cache.
+	 *
+	 * SCTLR_EL3.SA: Enable Stack Alignment check. A SP alignment fault
+	 *  exception is generated if a load or store instruction executed at
+	 *  EL3 uses the SP as the base address and the SP is not aligned to a
+	 *  16-byte boundary.
+	 *
+	 * SCTLR_EL3.A: Enable Alignment fault checking. All instructions that
+	 *  load or store one or more registers have an alignment check that the
+	 *  address being accessed is aligned to the size of the data element(s)
+	 *  being accessed.
 	 * ---------------------------------------------------------------------
 	 */
 	mov	x1, #(SCTLR_I_BIT | SCTLR_A_BIT | SCTLR_SA_BIT)
@@ -46,19 +58,73 @@
 	isb
 
 	/* ---------------------------------------------------------------------
-	 * Early set RES1 bits in SCR_EL3. Set EA bit to catch both
-	 * External Aborts and SError Interrupts in EL3 and also the SIF bit
-	 * to disable instruction fetches from Non-secure memory.
+	 * Initialise SCR_EL3, setting all fields rather than relying on hw.
+	 * All fields are architecturally UNKNOWN on reset. The following fields
+	 * do not change during the TF lifetime. The remaining fields are set to
+	 * zero here but are updated ahead of transitioning to a lower EL in the
+	 * function cm_init_context_common().
+	 *
+	 * SCR_EL3.TWE: Set to zero so that execution of WFE instructions at
+	 *  EL2, EL1 and EL0 are not trapped to EL3.
+	 *
+	 * SCR_EL3.TWI: Set to zero so that execution of WFI instructions at
+	 *  EL2, EL1 and EL0 are not trapped to EL3.
+	 *
+	 * SCR_EL3.SIF: Set to one to disable instruction fetches from
+	 *  Non-secure memory.
+	 *
+	 * SCR_EL3.SMD: Set to zero to enable SMC calls at EL1 and above, from
+	 *  both Security states and both Execution states.
+	 *
+	 * SCR_EL3.EA: Set to one to route External Aborts and SError Interrupts
+	 *  to EL3 when executing at any EL.
 	 * ---------------------------------------------------------------------
 	 */
-	mov	x0, #(SCR_RES1_BITS | SCR_EA_BIT | SCR_SIF_BIT)
+	mov	x0, #((SCR_RESET_VAL | SCR_EA_BIT | SCR_SIF_BIT) \
+			& ~(SCR_TWE_BIT | SCR_TWI_BIT | SCR_SMD_BIT))
 	msr	scr_el3, x0
 
 	/* ---------------------------------------------------------------------
-	 * Disable secure self-hosted invasive debug.
+	 * Initialise MDCR_EL3, setting all fields rather than relying on hw.
+	 * Some fields are architecturally UNKNOWN on reset.
+	 *
+	 * MDCR_EL3.SDD: Set to one to disable AArch64 Secure self-hosted debug.
+	 *  Debug exceptions, other than Breakpoint Instruction exceptions, are
+	 *  disabled from all ELs in Secure state.
+	 *
+	 * MDCR_EL3.SPD32: Set to 0b10 to disable AArch32 Secure self-hosted
+	 *  privileged debug from S-EL1.
+	 *
+	 * MDCR_EL3.NSPB (ARM v8.2): SPE enabled in non-secure state and
+	 * disabled in secure state. Accesses to SPE registers at SEL1 generate
+	 * trap exceptions to EL3.
+	 *
+	 * MDCR_EL3.TDOSA: Set to zero so that EL1 and EL2 System register
+	 *  access to the powerdown debug registers do not trap to EL3.
+	 *
+	 * MDCR_EL3.TDA: Set to zero to allow EL0, EL1 and EL2 access to the
+	 *  debug registers, other than those registers that are controlled by
+	 *  MDCR_EL3.TDOSA.
+	 *
+	 * MDCR_EL3.TPM: Set to zero so that EL0, EL1, and EL2 System register
+	 *  accesses to all Performance Monitors registers do not trap to EL3.
 	 * ---------------------------------------------------------------------
 	 */
-	mov_imm	x0, MDCR_DEF_VAL
+	mov_imm	x0, ((MDCR_EL3_RESET_VAL | MDCR_SDD_BIT | MDCR_SPD32(MDCR_SPD32_DISABLE)) \
+			& ~(MDCR_TDOSA_BIT | MDCR_TDA_BIT | MDCR_TPM_BIT))
+
+#if ENABLE_SPE_FOR_LOWER_ELS
+	/* Detect if SPE is implemented */
+	mrs	x1, id_aa64dfr0_el1
+	ubfx	x1, x1, #ID_AA64DFR0_PMS_SHIFT, #ID_AA64DFR0_PMS_LENGTH
+	cmp	x1, #0x1
+	b.ne	1f
+
+	/* Enable SPE for use by normal world */
+	orr	x0, x0, #MDCR_NSPB(MDCR_NSPB_EL1)
+1:
+#endif
+
 	msr	mdcr_el3, x0
 
 	/* ---------------------------------------------------------------------
@@ -69,28 +135,20 @@
 	msr	daifclr, #DAIF_ABT_BIT
 
 	/* ---------------------------------------------------------------------
-	 * The initial state of the Architectural feature trap register
-	 * (CPTR_EL3) is unknown and it must be set to a known state. All
-	 * feature traps are disabled. Some bits in this register are marked as
-	 * reserved and should not be modified.
+	 * Initialise CPTR_EL3, setting all fields rather than relying on hw.
+	 * All fields are architecturally UNKNOWN on reset.
 	 *
-	 * CPTR_EL3.TCPAC: This causes a direct access to the CPACR_EL1 from EL1
-	 *  or the CPTR_EL2 from EL2 to trap to EL3 unless it is trapped at EL2.
+	 * CPTR_EL3.TCPAC: Set to zero so that any accesses to CPACR_EL1,
+	 *  CPTR_EL2, CPACR, or HCPTR do not trap to EL3.
 	 *
-	 * CPTR_EL3.TTA: This causes access to the Trace functionality to trap
-	 *  to EL3 when executed from EL0, EL1, EL2, or EL3. If system register
-	 *  access to trace functionality is not supported, this bit is RES0.
+	 * CPTR_EL3.TTA: Set to zero so that System register accesses to the
+	 *  trace registers do not trap to EL3.
 	 *
-	 * CPTR_EL3.TFP: This causes instructions that access the registers
-	 *  associated with Floating Point and Advanced SIMD execution to trap
-	 *  to EL3 when executed from any exception level, unless trapped to EL1
-	 *  or EL2.
+	 * CPTR_EL3.TFP: Set to zero so that accesses to Advanced SIMD and
+	 *  floating-point functionality do not trap to EL3.
 	 * ---------------------------------------------------------------------
 	 */
-	mrs	x0, cptr_el3
-	bic	w0, w0, #TCPAC_BIT
-	bic	w0, w0, #TTA_BIT
-	bic	w0, w0, #TFP_BIT
+	mov_imm x0, (CPTR_EL3_RESET_VAL & ~(TCPAC_BIT | TTA_BIT | TFP_BIT))
 	msr	cptr_el3, x0
 	.endm
 
@@ -104,8 +162,9 @@
  * why this macro is parameterised ; each parameter allows to enable/disable
  * some actions.
  *
- *  _set_endian:
- *	Whether the macro needs to configure the endianness of data accesses.
+ *  _init_sctlr:
+ *	Whether the macro needs to initialise SCTLR_EL3, including configuring
+ *      the endianness of data accesses.
  *
  *  _warm_boot_mailbox:
  *	Whether the macro needs to detect the type of boot (cold/warm). The
@@ -133,20 +192,35 @@
  * -----------------------------------------------------------------------------
  */
 	.macro el3_entrypoint_common					\
-		_set_endian, _warm_boot_mailbox, _secondary_cold_boot,	\
+		_init_sctlr, _warm_boot_mailbox, _secondary_cold_boot,	\
 		_init_memory, _init_c_runtime, _exception_vectors
 
-	.if \_set_endian
+	.if \_init_sctlr
 		/* -------------------------------------------------------------
-		 * Set the CPU endianness before doing anything that might
-		 * involve memory reads or writes.
+		 * This is the initialisation of SCTLR_EL3 and so must ensure
+		 * that all fields are explicitly set rather than relying on hw.
+		 * Some fields reset to an IMPLEMENTATION DEFINED value and
+		 * others are architecturally UNKNOWN on reset.
+		 *
+		 * SCTLR_EL3.EE: Set the CPU endianness before doing anything
+		 *  that might involve memory reads or writes. Set to zero to
+		 *  select Little Endian.
+		 *
+		 * SCTLR_EL3.WXN: For the EL3 translation regime, this field can
+		 *  force all memory regions that are writeable to be treated as
+		 *  XN (Execute-never). Set to zero so that this control has no
+		 *  effect on memory access permissions.
+		 *
+		 * SCTLR_EL3.SA: Set to zero to disable Stack Alignment check.
+		 *
+		 * SCTLR_EL3.A: Set to zero to disable Alignment fault checking.
 		 * -------------------------------------------------------------
 		 */
-		mrs	x0, sctlr_el3
-		bic	x0, x0, #SCTLR_EE_BIT
+		mov_imm	x0, (SCTLR_RESET_VAL & ~(SCTLR_EE_BIT | SCTLR_WXN_BIT \
+				| SCTLR_SA_BIT | SCTLR_A_BIT))
 		msr	sctlr_el3, x0
 		isb
-	.endif /* _set_endian */
+	.endif /* _init_sctlr */
 
 	.if \_warm_boot_mailbox
 		/* -------------------------------------------------------------
diff --git a/include/common/ep_info.h b/include/common/ep_info.h
index 23d27c4..3f6213f 100644
--- a/include/common/ep_info.h
+++ b/include/common/ep_info.h
@@ -33,6 +33,7 @@
 			((x) = ((x) & ~PARAM_EP_SECURITY_MASK) | (security))
 
 #define EP_EE_MASK	U(0x2)
+#define EP_EE_SHIFT	1
 #define EP_EE_LITTLE	U(0x0)
 #define EP_EE_BIG	U(0x2)
 #define EP_GET_EE(x) (x & EP_EE_MASK)
diff --git a/include/lib/aarch32/arch.h b/include/lib/aarch32/arch.h
index d70e4c7..661dbf8 100644
--- a/include/lib/aarch32/arch.h
+++ b/include/lib/aarch32/arch.h
@@ -101,14 +101,19 @@
 #define SCTLR_TRE_BIT		(1 << 28)
 #define SCTLR_AFE_BIT		(1 << 29)
 #define SCTLR_TE_BIT		(1 << 30)
+#define SCTLR_RESET_VAL         (SCTLR_RES1 | SCTLR_NTWE_BIT |		\
+				SCTLR_NTWI_BIT | SCTLR_CP15BEN_BIT)
 
 /* SDCR definitions */
 #define SDCR_SPD(x)		((x) << 14)
 #define SDCR_SPD_LEGACY		0x0
 #define SDCR_SPD_DISABLE	0x2
 #define SDCR_SPD_ENABLE		0x3
+#define SDCR_RESET_VAL		0x0
 
+#if !ERROR_DEPRECATED
 #define SDCR_DEF_VAL		SDCR_SPD(SDCR_SPD_DISABLE)
+#endif
 
 /* HSCTLR definitions */
 #define HSCTLR_RES1 	((1 << 29) | (1 << 28) | (1 << 23) | (1 << 22)	\
@@ -145,6 +150,7 @@
 #define SCR_IRQ_BIT		(1 << 1)
 #define SCR_NS_BIT		(1 << 0)
 #define SCR_VALID_BIT_MASK	0x33ff
+#define SCR_RESET_VAL		0x0
 
 #define GET_NS_BIT(scr)		((scr) & SCR_NS_BIT)
 
@@ -152,9 +158,10 @@
 #define HCR_AMO_BIT		(1 << 5)
 #define HCR_IMO_BIT		(1 << 4)
 #define HCR_FMO_BIT		(1 << 3)
+#define HCR_RESET_VAL		0x0
 
 /* CNTHCTL definitions */
-#define EVNTEN_BIT		(1 << 2)
+#define CNTHCTL_RESET_VAL	0x0
 #define PL1PCEN_BIT		(1 << 1)
 #define PL1PCTEN_BIT		(1 << 0)
 
@@ -169,16 +176,42 @@
 #define EVNTI_MASK		0xf
 
 /* HCPTR definitions */
+#define HCPTR_RES1		((1 << 13) | (1<<12) | 0x3ff)
 #define TCPAC_BIT		(1 << 31)
 #define TTA_BIT			(1 << 20)
 #define TCP11_BIT		(1 << 10)
 #define TCP10_BIT		(1 << 10)
+#define HCPTR_RESET_VAL		HCPTR_RES1
+
+/* VTTBR definitions */
+#define VTTBR_RESET_VAL		ULL(0x0)
+#define VTTBR_VMID_MASK		ULL(0xff)
+#define VTTBR_VMID_SHIFT	48
+#define VTTBR_BADDR_MASK	0xffffffffffff
+#define VTTBR_BADDR_SHIFT	0
+
+/* HDCR definitions */
+#define HDCR_RESET_VAL		0x0
+
+/* HSTR definitions */
+#define HSTR_RESET_VAL		0x0
+
+/* CNTHP_CTL definitions */
+#define CNTHP_CTL_RESET_VAL	0x0
 
 /* NASCR definitions */
 #define NSASEDIS_BIT		(1 << 15)
 #define NSTRCDIS_BIT		(1 << 20)
+/* NOTE: correct typo in the definitions */
+#if !ERROR_DEPRECATED
 #define NASCR_CP11_BIT		(1 << 11)
 #define NASCR_CP10_BIT		(1 << 10)
+#endif
+#define NSACR_CP11_BIT		(1 << 11)
+#define NSACR_CP10_BIT		(1 << 10)
+#define NSACR_IMP_DEF_MASK	(0x7 << 16)
+#define NSACR_ENABLE_FP_ACCESS	(NSACR_CP11_BIT | NSACR_CP10_BIT)
+#define NSACR_RESET_VAL		0x0
 
 /* CPACR definitions */
 #define ASEDIS_BIT		(1 << 31)
@@ -187,9 +220,12 @@
 #define CPACR_CP10_SHIFT	20
 #define CPACR_ENABLE_FP_ACCESS	(0x3 << CPACR_CP11_SHIFT |\
 					0x3 << CPACR_CP10_SHIFT)
+#define CPACR_RESET_VAL         0x0
 
 /* FPEXC definitions */
+#define FPEXC_RES1		((1 << 10) | (1 << 9) | (1 << 8))
 #define FPEXC_EN_BIT		(1 << 30)
+#define FPEXC_RESET_VAL		FPEXC_RES1
 
 /* SPSR/CPSR definitions */
 #define SPSR_FIQ_BIT		(1 << 0)
@@ -369,6 +405,7 @@
 #define HSCTLR		p15, 4, c1, c0, 0
 #define HCR		p15, 4, c1, c1, 0
 #define HCPTR		p15, 4, c1, c1, 2
+#define HSTR		p15, 4, c1, c1, 3
 #define CNTHCTL		p15, 4, c14, c1, 0
 #define CNTKCTL		p15, 0, c14, c1, 0
 #define VPIDR		p15, 4, c0, c0, 0
diff --git a/include/lib/aarch32/arch_helpers.h b/include/lib/aarch32/arch_helpers.h
index bd1ac25..5d31836 100644
--- a/include/lib/aarch32/arch_helpers.h
+++ b/include/lib/aarch32/arch_helpers.h
@@ -251,6 +251,7 @@
 DEFINE_COPROCR_RW_FUNCS_64(ttbr1, TTBR1_64)
 DEFINE_COPROCR_RW_FUNCS_64(cntvoff, CNTVOFF_64)
 DEFINE_COPROCR_RW_FUNCS(csselr, CSSELR)
+DEFINE_COPROCR_RW_FUNCS(hstr, HSTR)
 
 DEFINE_COPROCR_RW_FUNCS(icc_sre_el1, ICC_SRE)
 DEFINE_COPROCR_RW_FUNCS(icc_sre_el2, ICC_HSRE)
diff --git a/include/lib/aarch64/arch.h b/include/lib/aarch64/arch.h
index e84c888..7bceea7 100644
--- a/include/lib/aarch64/arch.h
+++ b/include/lib/aarch64/arch.h
@@ -110,6 +110,11 @@
 #define ID_AA64PFR0_EL3_SHIFT	U(12)
 #define ID_AA64PFR0_ELX_MASK	U(0xf)
 
+/* ID_AA64DFR0_EL1.PMS definitions (for ARMv8.2+) */
+#define ID_AA64DFR0_PMS_SHIFT	U(32)
+#define ID_AA64DFR0_PMS_LENGTH	U(4)
+#define ID_AA64DFR0_PMS_MASK	U(0xf)
+
 #define EL_IMPL_NONE		U(0)
 #define EL_IMPL_A64ONLY		U(1)
 #define EL_IMPL_A64_A32		U(2)
@@ -135,16 +140,20 @@
 				 & ID_PFR1_VIRTEXT_MASK)
 
 /* SCTLR definitions */
-#define SCTLR_EL2_RES1  ((U(1) << 29) | (U(1) << 28) | (U(1) << 23) | \
+#define SCTLR_EL2_RES1	((U(1) << 29) | (U(1) << 28) | (U(1) << 23) | \
 			 (U(1) << 22) | (U(1) << 18) | (U(1) << 16) | \
 			 (U(1) << 11) | (U(1) << 5) | (U(1) << 4))
 
-#define SCTLR_EL1_RES1  ((U(1) << 29) | (U(1) << 28) | (U(1) << 23) | \
+#define SCTLR_EL1_RES1	((U(1) << 29) | (U(1) << 28) | (U(1) << 23) | \
 			 (U(1) << 22) | (U(1) << 20) | (U(1) << 11))
 #define SCTLR_AARCH32_EL1_RES1 \
 			((U(1) << 23) | (U(1) << 22) | (U(1) << 11) | \
 			 (U(1) << 4) | (U(1) << 3))
 
+#define SCTLR_EL3_RES1	((U(1) << 29) | (U(1) << 28) | (U(1) << 23) | \
+			(U(1) << 22) | (U(1) << 18) | (U(1) << 16) | \
+			(U(1) << 11) | (U(1) << 5) | (U(1) << 4))
+
 #define SCTLR_M_BIT		(U(1) << 0)
 #define SCTLR_A_BIT		(U(1) << 1)
 #define SCTLR_C_BIT		(U(1) << 2)
@@ -155,6 +164,7 @@
 #define SCTLR_NTWE_BIT		(U(1) << 18)
 #define SCTLR_WXN_BIT		(U(1) << 19)
 #define SCTLR_EE_BIT		(U(1) << 25)
+#define SCTLR_RESET_VAL		SCTLR_EL3_RES1
 
 /* CPACR_El1 definitions */
 #define CPACR_EL1_FPEN(x)	((x) << 20)
@@ -176,15 +186,52 @@
 #define SCR_IRQ_BIT		(U(1) << 1)
 #define SCR_NS_BIT		(U(1) << 0)
 #define SCR_VALID_BIT_MASK	U(0x2f8f)
+#define SCR_RESET_VAL		SCR_RES1_BITS
 
-/* MDCR definitions */
+/* MDCR_EL3 definitions */
 #define MDCR_SPD32(x)		((x) << 14)
 #define MDCR_SPD32_LEGACY	U(0x0)
 #define MDCR_SPD32_DISABLE	U(0x2)
 #define MDCR_SPD32_ENABLE	U(0x3)
 #define MDCR_SDD_BIT		(U(1) << 16)
+#define MDCR_NSPB(x)		((x) << 12)
+#define MDCR_NSPB_EL1		U(0x3)
+#define MDCR_TDOSA_BIT		(U(1) << 10)
+#define MDCR_TDA_BIT		(U(1) << 9)
+#define MDCR_TPM_BIT		(U(1) << 6)
+#define MDCR_EL3_RESET_VAL	U(0x0)
 
+#if !ERROR_DEPRECATED
 #define MDCR_DEF_VAL		(MDCR_SDD_BIT | MDCR_SPD32(MDCR_SPD32_DISABLE))
+#endif
+
+/* MDCR_EL2 definitions */
+#define MDCR_EL2_TPMS		(U(1) << 14)
+#define MDCR_EL2_E2PB(x)	((x) << 12)
+#define MDCR_EL2_E2PB_EL1	U(0x3)
+#define MDCR_EL2_TDRA_BIT	(U(1) << 11)
+#define MDCR_EL2_TDOSA_BIT	(U(1) << 10)
+#define MDCR_EL2_TDA_BIT	(U(1) << 9)
+#define MDCR_EL2_TDE_BIT	(U(1) << 8)
+#define MDCR_EL2_HPME_BIT	(U(1) << 7)
+#define MDCR_EL2_TPM_BIT	(U(1) << 6)
+#define MDCR_EL2_TPMCR_BIT	(U(1) << 5)
+#define MDCR_EL2_RESET_VAL	U(0x0)
+
+/* HSTR_EL2 definitions */
+#define HSTR_EL2_RESET_VAL	U(0x0)
+#define HSTR_EL2_T_MASK		U(0xff)
+
+/* CNTHP_CTL_EL2 definitions */
+#define CNTHP_CTL_ENABLE_BIT	(U(1) << 0)
+#define CNTHP_CTL_RESET_VAL	U(0x0)
+
+/* VTTBR_EL2 definitions */
+#define VTTBR_RESET_VAL		ULL(0x0)
+#define VTTBR_VMID_MASK		ULL(0xff)
+#define VTTBR_VMID_SHIFT	U(48)
+#define VTTBR_BADDR_MASK	ULL(0xffffffffffff)
+#define VTTBR_BADDR_SHIFT	U(0)
 
 /* HCR definitions */
 #define HCR_RW_SHIFT		U(31)
@@ -199,6 +246,7 @@
 #define ISR_F_SHIFT		U(6)
 
 /* CNTHCTL_EL2 definitions */
+#define CNTHCTL_RESET_VAL	U(0x0)
 #define EVNTEN_BIT		(U(1) << 2)
 #define EL1PCEN_BIT		(U(1) << 1)
 #define EL1PCTEN_BIT		(U(1) << 0)
@@ -217,6 +265,14 @@
 #define TCPAC_BIT		(U(1) << 31)
 #define TTA_BIT			(U(1) << 20)
 #define TFP_BIT			(U(1) << 10)
+#define CPTR_EL3_RESET_VAL	U(0x0)
+
+/* CPTR_EL2 definitions */
+#define CPTR_EL2_RES1		((U(1) << 13) | (U(1) << 12) | (U(0x3ff)))
+#define CPTR_EL2_TCPAC_BIT	(U(1) << 31)
+#define CPTR_EL2_TTA_BIT	(U(1) << 20)
+#define CPTR_EL2_TFP_BIT	(U(1) << 10)
+#define CPTR_EL2_RESET_VAL	CPTR_EL2_RES1
 
 /* CPSR/SPSR definitions */
 #define DAIF_FIQ_BIT		(U(1) << 0)
diff --git a/include/lib/aarch64/arch_helpers.h b/include/lib/aarch64/arch_helpers.h
index 32290e2..0d0d7d3 100644
--- a/include/lib/aarch64/arch_helpers.h
+++ b/include/lib/aarch64/arch_helpers.h
@@ -184,6 +184,7 @@
 DEFINE_SYSREG_READ_FUNC(par_el1)
 DEFINE_SYSREG_READ_FUNC(id_pfr1_el1)
 DEFINE_SYSREG_READ_FUNC(id_aa64pfr0_el1)
+DEFINE_SYSREG_READ_FUNC(id_aa64dfr0_el1)
 DEFINE_SYSREG_READ_FUNC(CurrentEl)
 DEFINE_SYSREG_RW_FUNCS(daif)
 DEFINE_SYSREG_RW_FUNCS(spsr_el1)
diff --git a/include/lib/el3_runtime/aarch64/context.h b/include/lib/el3_runtime/aarch64/context.h
index dead971..dcbf1c9 100644
--- a/include/lib/el3_runtime/aarch64/context.h
+++ b/include/lib/el3_runtime/aarch64/context.h
@@ -308,6 +308,7 @@
  * Function prototypes
  ******************************************************************************/
 void el1_sysregs_context_save(el1_sys_regs_t *regs);
+void el1_sysregs_context_save_post_ops(void);
 void el1_sysregs_context_restore(el1_sys_regs_t *regs);
 #if CTX_INCLUDE_FPREGS
 void fpregs_context_save(fp_regs_t *regs);
diff --git a/include/plat/arm/common/plat_arm.h b/include/plat/arm/common/plat_arm.h
index e61c22f..3a73776 100644
--- a/include/plat/arm/common/plat_arm.h
+++ b/include/plat/arm/common/plat_arm.h
@@ -219,4 +219,7 @@
 		uint32_t cookie_lo,
 		void *handle);
 
+/* Disable Statistical Profiling Extensions helper */
+void arm_disable_spe(void);
+
 #endif /* __PLAT_ARM_H__ */
diff --git a/lib/aarch32/cache_helpers.S b/lib/aarch32/cache_helpers.S
index 57b6b38..810af0f 100644
--- a/lib/aarch32/cache_helpers.S
+++ b/lib/aarch32/cache_helpers.S
@@ -20,6 +20,9 @@
  * This macro can be used for implementing various data cache operations `op`
  */
 .macro do_dcache_maintenance_by_mva op, coproc, opc1, CRn, CRm, opc2
+	/* Exit early if size is zero */
+	cmp	r1, #0
+	beq	exit_loop_\op
 	dcache_line_size r2, r3
 	add	r1, r0, r1
 	sub	r3, r2, #1
@@ -30,6 +33,7 @@
 	cmp	r0, r1
 	blo	loop_\op
 	dsb	sy
+exit_loop_\op:
 	bx	lr
 .endm
 
diff --git a/lib/aarch64/cache_helpers.S b/lib/aarch64/cache_helpers.S
index eef07a8..9c40b9d 100644
--- a/lib/aarch64/cache_helpers.S
+++ b/lib/aarch64/cache_helpers.S
@@ -20,6 +20,8 @@
  * This macro can be used for implementing various data cache operations `op`
  */
 .macro do_dcache_maintenance_by_mva op
+	/* Exit early if size is zero */
+	cbz	x1, exit_loop_\op
 	dcache_line_size x2, x3
 	add	x1, x0, x1
 	sub	x3, x2, #1
@@ -30,6 +32,7 @@
 	cmp	x0, x1
 	b.lo    loop_\op
 	dsb	sy
+exit_loop_\op:
 	ret
 .endm
 	/* ------------------------------------------
diff --git a/lib/el3_runtime/aarch32/context_mgmt.c b/lib/el3_runtime/aarch32/context_mgmt.c
index 020f3a3..3e7a5b7 100644
--- a/lib/el3_runtime/aarch32/context_mgmt.c
+++ b/lib/el3_runtime/aarch32/context_mgmt.c
@@ -75,36 +75,44 @@
 	if (security_state != SECURE)
 		scr |= SCR_NS_BIT;
 
-	/*
-	 * Set up SCTLR for the Non Secure context.
-	 * EE bit is taken from the entrypoint attributes
-	 * M, C and I bits must be zero (as required by PSCI specification)
-	 *
-	 * The target exception level is based on the spsr mode requested.
-	 * If execution is requested to hyp mode, HVC is enabled
-	 * via SCR.HCE.
-	 *
-	 * Always compute the SCTLR_EL1 value and save in the cpu_context
-	 * - the HYP registers are set up by cm_preapre_ns_entry() as they
-	 * are not part of the stored cpu_context
-	 *
-	 * TODO: In debug builds the spsr should be validated and checked
-	 * against the CPU support, security state, endianness and pc
-	 */
 	if (security_state != SECURE) {
-		sctlr = EP_GET_EE(ep->h.attr) ? SCTLR_EE_BIT : 0;
 		/*
-		 * In addition to SCTLR_RES1, set the CP15_BEN, nTWI & nTWE
-		 * bits that architecturally reset to 1.
+		 * Set up SCTLR for the Non-secure context.
+		 *
+		 * SCTLR.EE: Endianness is taken from the entrypoint attributes.
+		 *
+		 * SCTLR.M, SCTLR.C and SCTLR.I: These fields must be zero (as
+		 *  required by PSCI specification)
+		 *
+		 * Set remaining SCTLR fields to their architecturally defined
+		 * values. Some fields reset to an IMPLEMENTATION DEFINED value:
+		 *
+		 * SCTLR.TE: Set to zero so that exceptions to an Exception
+		 *  Level executing at PL1 are taken to A32 state.
+		 *
+		 * SCTLR.V: Set to zero to select the normal exception vectors
+		 *  with base address held in VBAR.
 		 */
-		sctlr |= SCTLR_RES1 | SCTLR_CP15BEN_BIT |
-				SCTLR_NTWI_BIT | SCTLR_NTWE_BIT;
+		assert(((ep->spsr >> SPSR_E_SHIFT) & SPSR_E_MASK) ==
+			(EP_GET_EE(ep->h.attr) >> EP_EE_SHIFT));
+
+		sctlr = EP_GET_EE(ep->h.attr) ? SCTLR_EE_BIT : 0;
+		sctlr |= (SCTLR_RESET_VAL & ~(SCTLR_TE_BIT | SCTLR_V_BIT));
 		write_ctx_reg(reg_ctx, CTX_NS_SCTLR, sctlr);
 	}
 
+	/*
+	 * The target exception level is based on the spsr mode requested. If
+	 * execution is requested to hyp mode, HVC is enabled via SCR.HCE.
+	 */
 	if (GET_M32(ep->spsr) == MODE32_hyp)
 		scr |= SCR_HCE_BIT;
 
+	/*
+	 * Store the initialised values for SCTLR and SCR in the cpu_context.
+	 * The Hyp mode registers are not part of the saved context and are
+	 * set-up in cm_prepare_el3_exit().
+	 */
 	write_ctx_reg(reg_ctx, CTX_SCR, scr);
 	write_ctx_reg(reg_ctx, CTX_LR, ep->pc);
 	write_ctx_reg(reg_ctx, CTX_SPSR, ep->spsr);
@@ -151,7 +159,7 @@
  ******************************************************************************/
 void cm_prepare_el3_exit(uint32_t security_state)
 {
-	uint32_t sctlr, scr, hcptr;
+	uint32_t hsctlr, scr;
 	cpu_context_t *ctx = cm_get_context(security_state);
 
 	assert(ctx);
@@ -160,9 +168,9 @@
 		scr = read_ctx_reg(get_regs_ctx(ctx), CTX_SCR);
 		if (scr & SCR_HCE_BIT) {
 			/* Use SCTLR value to initialize HSCTLR */
-			sctlr = read_ctx_reg(get_regs_ctx(ctx),
+			hsctlr = read_ctx_reg(get_regs_ctx(ctx),
 						 CTX_NS_SCTLR);
-			sctlr |= HSCTLR_RES1;
+			hsctlr |= HSCTLR_RES1;
 			/* Temporarily set the NS bit to access HSCTLR */
 			write_scr(read_scr() | SCR_NS_BIT);
 			/*
@@ -170,7 +178,7 @@
 			 * we can access HSCTLR
 			 */
 			isb();
-			write_hsctlr(sctlr);
+			write_hsctlr(hsctlr);
 			isb();
 
 			write_scr(read_scr() & ~SCR_NS_BIT);
@@ -184,48 +192,92 @@
 			write_scr(read_scr() | SCR_NS_BIT);
 			isb();
 
-			/* PL2 present but unused, need to disable safely */
-			write_hcr(0);
-
-			/* HSCTLR : can be ignored when bypassing */
+			/*
+			 * Hyp / PL2 present but unused, need to disable safely.
+			 * HSCTLR can be ignored in this case.
+			 *
+			 * Set HCR to its architectural reset value so that
+			 * Non-secure operations do not trap to Hyp mode.
+			 */
+			write_hcr(HCR_RESET_VAL);
 
-			/* HCPTR : disable all traps TCPAC, TTA, TCP */
-			hcptr = read_hcptr();
-			hcptr &= ~(TCPAC_BIT | TTA_BIT | TCP11_BIT | TCP10_BIT);
-			write_hcptr(hcptr);
+			/*
+			 * Set HCPTR to its architectural reset value so that
+			 * Non-secure access from EL1 or EL0 to trace and to
+			 * Advanced SIMD and floating point functionality does
+			 * not trap to Hyp mode.
+			 */
+			write_hcptr(HCPTR_RESET_VAL);
 
-			/* Enable EL1 access to timer */
-			write_cnthctl(PL1PCEN_BIT | PL1PCTEN_BIT);
+			/*
+			 * Initialise CNTHCTL. All fields are architecturally
+			 * UNKNOWN on reset and are set to zero except for
+			 * field(s) listed below.
+			 *
+			 * CNTHCTL.PL1PCEN: Disable traps to Hyp mode of
+			 *  Non-secure EL0 and EL1 accesses to the physical
+			 *  timer registers.
+			 *
+			 * CNTHCTL.PL1PCTEN: Disable traps to Hyp mode of
+			 *  Non-secure EL0 and EL1 accesses to the physical
+			 *  counter registers.
+			 */
+			write_cnthctl(CNTHCTL_RESET_VAL |
+					PL1PCEN_BIT | PL1PCTEN_BIT);
 
-			/* Reset CNTVOFF_EL2 */
+			/*
+			 * Initialise CNTVOFF to zero as it resets to an
+			 * IMPLEMENTATION DEFINED value.
+			 */
 			write64_cntvoff(0);
 
-			/* Set VPIDR, VMPIDR to match MIDR, MPIDR */
+			/*
+			 * Set VPIDR and VMPIDR to match MIDR and MPIDR
+			 * respectively.
+			 */
 			write_vpidr(read_midr());
 			write_vmpidr(read_mpidr());
 
 			/*
-			 * Reset VTTBR.
-			 * Needed because cache maintenance operations depend on
-			 * the VMID even when non-secure EL1&0 stage 2 address
-			 * translation are disabled.
+			 * Initialise VTTBR, setting all fields rather than
+			 * relying on the hw. Some fields are architecturally
+			 * UNKNOWN at reset.
+			 *
+			 * VTTBR.VMID: Set to zero which is the architecturally
+			 *  defined reset value. Even though EL1&0 stage 2
+			 *  address translation is disabled, cache maintenance
+			 *  operations depend on the VMID.
+			 *
+			 * VTTBR.BADDR: Set to zero as EL1&0 stage 2 address
+			 *  translation is disabled.
 			 */
-			write64_vttbr(0);
+			write64_vttbr(VTTBR_RESET_VAL &
+				~((VTTBR_VMID_MASK << VTTBR_VMID_SHIFT)
+				| (VTTBR_BADDR_MASK << VTTBR_BADDR_SHIFT)));
 
 			/*
-			 * Avoid unexpected debug traps in case where HDCR
-			 * is not completely reset by the hardware - set
-			 * HDCR.HPMN to PMCR.N and zero the remaining bits.
-			 * The HDCR.HPMN and PMCR.N fields are the same size
-			 * (5 bits) and HPMN is at offset zero within HDCR.
+			 * Initialise HDCR, setting all the fields rather than
+			 * relying on hw.
+			 *
+			 * HDCR.HPMN: Set to value of PMCR.N which is the
+			 *  architecturally-defined reset value.
 			 */
-			write_hdcr((read_pmcr() & PMCR_N_BITS) >> PMCR_N_SHIFT);
+			write_hdcr(HDCR_RESET_VAL |
+				((read_pmcr() & PMCR_N_BITS) >> PMCR_N_SHIFT));
 
 			/*
-			 * Reset CNTHP_CTL to disable the EL2 physical timer and
-			 * therefore prevent timer interrupts.
+			 * Set HSTR to its architectural reset value so that
+			 * accesses to system registers in the coproc=1111
+			 * encoding space do not trap to Hyp mode.
+			 */
+			write_hstr(HSTR_RESET_VAL);
+			/*
+			 * Set CNTHP_CTL to its architectural reset value to
+			 * disable the EL2 physical timer and prevent timer
+			 * interrupts. Some fields are architecturally UNKNOWN
+			 * on reset and are set to zero.
 			 */
-			write_cnthp_ctl(0);
+			write_cnthp_ctl(CNTHP_CTL_RESET_VAL);
 			isb();
 
 			write_scr(read_scr() & ~SCR_NS_BIT);
diff --git a/lib/el3_runtime/aarch64/context.S b/lib/el3_runtime/aarch64/context.S
index afe912a..8a6c11b 100644
--- a/lib/el3_runtime/aarch64/context.S
+++ b/lib/el3_runtime/aarch64/context.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -9,6 +9,7 @@
 #include <context.h>
 
 	.global	el1_sysregs_context_save
+	.global el1_sysregs_context_save_post_ops
 	.global	el1_sysregs_context_restore
 #if CTX_INCLUDE_FPREGS
 	.global	fpregs_context_save
@@ -111,6 +112,36 @@
 /* -----------------------------------------------------
  * The following function strictly follows the AArch64
  * PCS to use x9-x17 (temporary caller-saved registers)
+ * to do post operations after saving the EL1 system
+ * register context.
+ * -----------------------------------------------------
+ */
+func el1_sysregs_context_save_post_ops
+#if ENABLE_SPE_FOR_LOWER_ELS
+	/* Skip the drain unless the ID_AA64DFR0_EL1 PMS field is >= 1 */
+	mrs	x9, id_aa64dfr0_el1
+	ubfx	x9, x9, #ID_AA64DFR0_PMS_SHIFT, #ID_AA64DFR0_PMS_LENGTH
+	cmp	x9, #0x1
+	b.lo	1f
+
+	/*
+	 * Before switching from normal world to secure world the profiling
+	 * buffers need to be drained out to memory (PSB CSYNC followed by a
+	 * DSB).  This is required to avoid an invalid memory access when
+	 * TTBR is switched for entry to SEL1.  Only x9 and flags are used.
+	 */
+	.arch	armv8.2-a+profile
+	psb	csync
+	dsb	nsh
+	.arch	armv8-a
+1:
+#endif
+	ret
+endfunc el1_sysregs_context_save_post_ops
+
+/* -----------------------------------------------------
+ * The following function strictly follows the AArch64
+ * PCS to use x9-x17 (temporary caller-saved registers)
  * to restore EL1 system register context.  It assumes
  * that 'x0' is pointing to a 'el1_sys_regs' structure
  * from where the register context will be restored
@@ -343,7 +374,7 @@
 	ldp	x24, x25, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X24]
 	ldp	x26, x27, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X26]
 	ldp	x28, x29, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X28]
-	ldp	 x30, x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
+	ldp	x30, x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_LR]
 	msr	sp_el0, x17
 	ldp	x16, x17, [sp, #CTX_GPREGS_OFFSET + CTX_GPREG_X16]
 	eret
diff --git a/lib/el3_runtime/aarch64/context_mgmt.c b/lib/el3_runtime/aarch64/context_mgmt.c
index 0104c4e..5257bf1 100644
--- a/lib/el3_runtime/aarch64/context_mgmt.c
+++ b/lib/el3_runtime/aarch64/context_mgmt.c
@@ -71,77 +71,104 @@
 	zeromem(ctx, sizeof(*ctx));
 
 	/*
-	 * Base the context SCR on the current value, adjust for entry point
-	 * specific requirements and set trap bits from the IMF
-	 * TODO: provide the base/global SCR bits using another mechanism?
+	 * SCR_EL3 was initialised during reset sequence in macro
+	 * el3_arch_init_common. This code modifies the SCR_EL3 fields that
+	 * affect the next EL.
+	 *
+	 * The following fields are initially set to zero and then updated to
+	 * the required value depending on the state of the SPSR_EL3 and the
+	 * Security state and entrypoint attributes of the next EL.
 	 */
 	scr_el3 = read_scr();
 	scr_el3 &= ~(SCR_NS_BIT | SCR_RW_BIT | SCR_FIQ_BIT | SCR_IRQ_BIT |
 			SCR_ST_BIT | SCR_HCE_BIT);
-
+	/*
+	 * SCR_NS: Set the security state of the next EL.
+	 */
 	if (security_state != SECURE)
 		scr_el3 |= SCR_NS_BIT;
-
+	/*
+	 * SCR_EL3.RW: Set the execution state, AArch32 or AArch64, for next
+	 *  Exception level as specified by SPSR.
+	 */
 	if (GET_RW(ep->spsr) == MODE_RW_64)
 		scr_el3 |= SCR_RW_BIT;
-
+	/*
+	 * SCR_EL3.ST: Traps Secure EL1 accesses to the Counter-timer Physical
+	 *  Secure timer registers to EL3, from AArch64 state only, if specified
+	 *  by the entrypoint attributes.
+	 */
 	if (EP_GET_ST(ep->h.attr))
 		scr_el3 |= SCR_ST_BIT;
 
 #ifndef HANDLE_EA_EL3_FIRST
-	/* Explicitly stop to trap aborts from lower exception levels. */
+	/*
+	 * SCR_EL3.EA: Do not route External Aborts and SError Interrupts to
+	 *  EL3 when executing at a lower EL. When executing at EL3, External
+	 *  Aborts are taken to EL3.
+	 */
 	scr_el3 &= ~SCR_EA_BIT;
 #endif
 
 #ifdef IMAGE_BL31
 	/*
-	 * IRQ/FIQ bits only need setting if interrupt routing
-	 * model has been set up for BL31.
+	 * SCR_EL3.IRQ, SCR_EL3.FIQ: Enable the physical FIQ and IRQ routing as
+	 *  indicated by the interrupt routing model for BL31.
 	 */
 	scr_el3 |= get_scr_el3_from_routing_model(security_state);
 #endif
 
 	/*
-	 * Set up SCTLR_ELx for the target exception level:
-	 * EE bit is taken from the entrypoint attributes
-	 * M, C and I bits must be zero (as required by PSCI specification)
-	 *
-	 * The target exception level is based on the spsr mode requested.
-	 * If execution is requested to EL2 or hyp mode, HVC is enabled
-	 * via SCR_EL3.HCE.
+	 * SCR_EL3.HCE: Enable HVC instructions if next execution state is
+	 * AArch64 and next EL is EL2, or if next execution state is AArch32 and
+	 * next mode is Hyp.
+	 */
+	if ((GET_RW(ep->spsr) == MODE_RW_64
+	     && GET_EL(ep->spsr) == MODE_EL2)
+	    || (GET_RW(ep->spsr) != MODE_RW_64
+		&& GET_M32(ep->spsr) == MODE32_hyp)) {
+		scr_el3 |= SCR_HCE_BIT;
+	}
+
+	/*
+	 * Initialise SCTLR_EL1 to the reset value corresponding to the target
+	 * execution state setting all fields rather than relying on the hw.
+	 * Some fields have architecturally UNKNOWN reset values and these are
+	 * set to zero.
 	 *
-	 * Always compute the SCTLR_EL1 value and save in the cpu_context
-	 * - the EL2 registers are set up by cm_preapre_ns_entry() as they
-	 * are not part of the stored cpu_context
+	 * SCTLR.EE: Endianness is taken from the entrypoint attributes.
 	 *
-	 * TODO: In debug builds the spsr should be validated and checked
-	 * against the CPU support, security state, endianess and pc
+	 * SCTLR.M, SCTLR.C and SCTLR.I: These fields must be zero (as
+	 *  required by PSCI specification)
 	 */
 	sctlr_elx = EP_GET_EE(ep->h.attr) ? SCTLR_EE_BIT : 0;
 	if (GET_RW(ep->spsr) == MODE_RW_64)
 		sctlr_elx |= SCTLR_EL1_RES1;
 	else {
-		sctlr_elx |= SCTLR_AARCH32_EL1_RES1;
 		/*
-		 * If lower non-secure EL is AArch32, enable the CP15BEN, nTWI
-		 * & nTWI bits. This aligns with SCTLR initialization on
-		 * systems with an AArch32 EL3, where these bits
-		 * architecturally reset to 1.
+		 * If the target execution state is AArch32 then the following
+		 * fields need to be set.
+		 *
+		 * SCTLR_EL1.nTWE: Set to one so that EL0 execution of WFE
+		 *  instructions are not trapped to EL1.
+		 *
+		 * SCTLR_EL1.nTWI: Set to one so that EL0 execution of WFI
+		 *  instructions are not trapped to EL1.
+		 *
+		 * SCTLR_EL1.CP15BEN: Set to one to enable EL0 execution of the
+		 *  CP15DMB, CP15DSB, and CP15ISB instructions.
 		 */
-		if (security_state != SECURE)
-			sctlr_elx |= SCTLR_CP15BEN_BIT | SCTLR_NTWI_BIT
-						| SCTLR_NTWE_BIT;
+		sctlr_elx |= SCTLR_AARCH32_EL1_RES1 | SCTLR_CP15BEN_BIT
+					| SCTLR_NTWI_BIT | SCTLR_NTWE_BIT;
 	}
 
+	/*
+	 * Store the initialised SCTLR_EL1 value in the cpu_context - SCTLR_EL2
+	 * and other EL2 registers are set up by cm_prepare_el3_exit() as they
+	 * are not part of the stored cpu_context.
+	 */
 	write_ctx_reg(get_sysregs_ctx(ctx), CTX_SCTLR_EL1, sctlr_elx);
 
-	if ((GET_RW(ep->spsr) == MODE_RW_64
-	     && GET_EL(ep->spsr) == MODE_EL2)
-	    || (GET_RW(ep->spsr) != MODE_RW_64
-		&& GET_M32(ep->spsr) == MODE32_hyp)) {
-		scr_el3 |= SCR_HCE_BIT;
-	}
-
 	/* Populate EL3 state so that we've the right context before doing ERET */
 	state = get_el3state_ctx(ctx);
 	write_ctx_reg(state, CTX_SCR_EL3, scr_el3);
@@ -191,7 +218,7 @@
  ******************************************************************************/
 void cm_prepare_el3_exit(uint32_t security_state)
 {
-	uint32_t sctlr_elx, scr_el3, cptr_el2;
+	uint32_t sctlr_elx, scr_el3, mdcr_el2;
 	cpu_context_t *ctx = cm_get_context(security_state);
 
 	assert(ctx);
@@ -206,57 +233,167 @@
 			sctlr_elx |= SCTLR_EL2_RES1;
 			write_sctlr_el2(sctlr_elx);
 		} else if (EL_IMPLEMENTED(2)) {
-			/* EL2 present but unused, need to disable safely */
-
-			/* HCR_EL2 = 0, except RW bit set to match SCR_EL3 */
+			/*
+			 * EL2 present but unused, need to disable safely.
+			 * SCTLR_EL2 can be ignored in this case.
+			 *
+			 * Initialise all fields in HCR_EL2, except HCR_EL2.RW,
+			 * to zero so that Non-secure operations do not trap to
+			 * EL2.
+			 *
+			 * HCR_EL2.RW: Set this field to match SCR_EL3.RW
+			 */
 			write_hcr_el2((scr_el3 & SCR_RW_BIT) ? HCR_RW_BIT : 0);
 
-			/* SCTLR_EL2 : can be ignored when bypassing */
-
-			/* CPTR_EL2 : disable all traps TCPAC, TTA, TFP */
-			cptr_el2 = read_cptr_el2();
-			cptr_el2 &= ~(TCPAC_BIT | TTA_BIT | TFP_BIT);
-			write_cptr_el2(cptr_el2);
+			/*
+			 * Initialise CPTR_EL2 setting all fields rather than
+			 * relying on the hw. All fields have architecturally
+			 * UNKNOWN reset values.
+			 *
+			 * CPTR_EL2.TCPAC: Set to zero so that Non-secure EL1
+			 *  accesses to the CPACR_EL1 or CPACR from both
+			 *  Execution states do not trap to EL2.
+			 *
+			 * CPTR_EL2.TTA: Set to zero so that Non-secure System
+			 *  register accesses to the trace registers from both
+			 *  Execution states do not trap to EL2.
+			 *
+			 * CPTR_EL2.TFP: Set to zero so that Non-secure accesses
+			 *  to SIMD and floating-point functionality from both
+			 *  Execution states do not trap to EL2.
+			 */
+			write_cptr_el2(CPTR_EL2_RESET_VAL &
+					~(CPTR_EL2_TCPAC_BIT | CPTR_EL2_TTA_BIT
+					| CPTR_EL2_TFP_BIT));
 
-			/* Enable EL1 access to timer */
-			write_cnthctl_el2(EL1PCEN_BIT | EL1PCTEN_BIT);
+			/*
+			 * Initialise CNTHCTL_EL2. All fields are
+			 * architecturally UNKNOWN on reset and are set to zero
+			 * except for field(s) listed below.
+			 *
+			 * CNTHCTL_EL2.EL1PCEN: Set to one to disable traps to
+			 *  EL2 of Non-secure EL0 and EL1 accesses to the
+			 *  physical timer registers.
+			 *
+			 * CNTHCTL_EL2.EL1PCTEN: Set to one to disable traps to
+			 *  EL2 of Non-secure EL0 and EL1 accesses to the
+			 *  physical counter registers.
+			 */
+			write_cnthctl_el2(CNTHCTL_RESET_VAL |
+						EL1PCEN_BIT | EL1PCTEN_BIT);
 
-			/* Reset CNTVOFF_EL2 */
+			/*
+			 * Initialise CNTVOFF_EL2 to zero as it resets to an
+			 * architecturally UNKNOWN value.
+			 */
 			write_cntvoff_el2(0);
 
-			/* Set VPIDR, VMPIDR to match MIDR, MPIDR */
+			/*
+			 * Set VPIDR_EL2 and VMPIDR_EL2 to match MIDR_EL1 and
+			 * MPIDR_EL1 respectively.
+			 */
 			write_vpidr_el2(read_midr_el1());
 			write_vmpidr_el2(read_mpidr_el1());
 
 			/*
-			 * Reset VTTBR_EL2.
-			 * Needed because cache maintenance operations depend on
-			 * the VMID even when non-secure EL1&0 stage 2 address
-			 * translation are disabled.
+			 * Initialise VTTBR_EL2. All fields are architecturally
+			 * UNKNOWN on reset.
+			 *
+			 * VTTBR_EL2.VMID: Set to zero. Even though EL1&0 stage
+			 *  2 address translation is disabled, cache maintenance
+			 *  operations depend on the VMID.
+			 *
+			 * VTTBR_EL2.BADDR: Set to zero as EL1&0 stage 2 address
+			 *  translation is disabled.
 			 */
-			write_vttbr_el2(0);
+			write_vttbr_el2(VTTBR_RESET_VAL &
+				~((VTTBR_VMID_MASK << VTTBR_VMID_SHIFT)
+				| (VTTBR_BADDR_MASK << VTTBR_BADDR_SHIFT)));
+
 			/*
-			 * Avoid unexpected debug traps in case where MDCR_EL2
-			 * is not completely reset by the hardware - set
-			 * MDCR_EL2.HPMN to PMCR_EL0.N and zero the remaining
-			 * bits.
-			 * MDCR_EL2.HPMN and PMCR_EL0.N fields are the same size
-			 * (5 bits) and HPMN is at offset zero within MDCR_EL2.
+			 * Initialise MDCR_EL2, setting all fields rather than
+			 * relying on hw. Some fields are architecturally
+			 * UNKNOWN on reset.
+			 *
+			 * MDCR_EL2.TPMS (ARM v8.2): Do not trap statistical
+			 * profiling controls to EL2.
+			 *
+			 * MDCR_EL2.E2PB (ARM v8.2): SPE enabled in non-secure
+			 * state. Accesses to profiling buffer controls at
+			 * non-secure EL1 are not trapped to EL2.
+			 *
+			 * MDCR_EL2.TDRA: Set to zero so that Non-secure EL0 and
+			 *  EL1 System register accesses to the Debug ROM
+			 *  registers are not trapped to EL2.
+			 *
+			 * MDCR_EL2.TDOSA: Set to zero so that Non-secure EL1
+			 *  System register accesses to the powerdown debug
+			 *  registers are not trapped to EL2.
+			 *
+			 * MDCR_EL2.TDA: Set to zero so that System register
+			 *  accesses to the debug registers do not trap to EL2.
+			 *
+			 * MDCR_EL2.TDE: Set to zero so that debug exceptions
+			 *  are not routed to EL2.
+			 *
+			 * MDCR_EL2.HPME: Set to zero to disable EL2 Performance
+			 *  Monitors.
+			 *
+			 * MDCR_EL2.TPM: Set to zero so that Non-secure EL0 and
+			 *  EL1 accesses to all Performance Monitors registers
+			 *  are not trapped to EL2.
+			 *
+			 * MDCR_EL2.TPMCR: Set to zero so that Non-secure EL0
+			 *  and EL1 accesses to the PMCR_EL0 or PMCR are not
+			 *  trapped to EL2.
+			 *
+			 * MDCR_EL2.HPMN: Set to value of PMCR_EL0.N which is the
+			 *  architecturally-defined reset value.
 			 */
-			write_mdcr_el2((read_pmcr_el0() & PMCR_EL0_N_BITS)
-					>> PMCR_EL0_N_SHIFT);
+			mdcr_el2 = ((MDCR_EL2_RESET_VAL |
+					((read_pmcr_el0() & PMCR_EL0_N_BITS)
+					>> PMCR_EL0_N_SHIFT)) &
+					~(MDCR_EL2_TDRA_BIT | MDCR_EL2_TDOSA_BIT
+					| MDCR_EL2_TDA_BIT | MDCR_EL2_TDE_BIT
+					| MDCR_EL2_HPME_BIT | MDCR_EL2_TPM_BIT
+					| MDCR_EL2_TPMCR_BIT));
+
+#if ENABLE_SPE_FOR_LOWER_ELS
+			uint64_t id_aa64dfr0_el1;
+
+			/* Detect if SPE is implemented */
+			id_aa64dfr0_el1 = read_id_aa64dfr0_el1() >>
+				ID_AA64DFR0_PMS_SHIFT;
+			if ((id_aa64dfr0_el1 & ID_AA64DFR0_PMS_MASK) == 1) {
+				/*
+				 * Make sure traps to EL2 are not generated if
+				 * EL2 is implemented but not used.
+				 */
+				mdcr_el2 &= ~MDCR_EL2_TPMS;
+				mdcr_el2 |= MDCR_EL2_E2PB(MDCR_EL2_E2PB_EL1);
+			}
+#endif
+
+			write_mdcr_el2(mdcr_el2);
+
 			/*
-			 * Avoid unexpected traps of non-secure access to
-			 * certain system registers at EL1 or lower where
-			 * HSTR_EL2 is not completely reset to zero by the
-			 * hardware - zero the entire register.
+			 * Initialise HSTR_EL2. All fields are architecturally
+			 * UNKNOWN on reset.
+			 *
+			 * HSTR_EL2.T<n>: Set all these fields to zero so that
+			 *  Non-secure EL0 or EL1 accesses to System registers
+			 *  do not trap to EL2.
 			 */
-			write_hstr_el2(0);
+			write_hstr_el2(HSTR_EL2_RESET_VAL & ~(HSTR_EL2_T_MASK));
 			/*
-			 * Reset CNTHP_CTL_EL2 to disable the EL2 physical timer
-			 * and therefore prevent timer interrupts.
+			 * Initialise CNTHP_CTL_EL2. All fields are
+			 * architecturally UNKNOWN on reset.
+			 *
+			 * CNTHP_CTL_EL2.ENABLE: Set to zero to disable the EL2
+			 *  physical timer and prevent timer interrupts.
 			 */
-			write_cnthp_ctl_el2(0);
+			write_cnthp_ctl_el2(CNTHP_CTL_RESET_VAL &
+						~(CNTHP_CTL_ENABLE_BIT));
 		}
 	}
 
@@ -278,6 +415,7 @@
 	assert(ctx);
 
 	el1_sysregs_context_save(get_sysregs_ctx(ctx));
+	el1_sysregs_context_save_post_ops();
 }
 
 void cm_el1_sysregs_context_restore(uint32_t security_state)
diff --git a/lib/psci/psci_stat.c b/lib/psci/psci_stat.c
index 65b3f9b..3e79c5d 100644
--- a/lib/psci/psci_stat.c
+++ b/lib/psci/psci_stat.c
@@ -72,7 +72,7 @@
 void psci_stats_update_pwr_down(unsigned int end_pwrlvl,
 			const psci_power_state_t *state_info)
 {
-	int lvl, parent_idx, cpu_idx = plat_my_core_pos();
+	unsigned int lvl, parent_idx, cpu_idx = plat_my_core_pos();
 
 	assert(end_pwrlvl <= PLAT_MAX_PWR_LVL);
 	assert(state_info);
@@ -104,8 +104,8 @@
 void psci_stats_update_pwr_up(unsigned int end_pwrlvl,
 			const psci_power_state_t *state_info)
 {
-	int parent_idx, cpu_idx = plat_my_core_pos();
-	int lvl, stat_idx;
+	unsigned int lvl, parent_idx, cpu_idx = plat_my_core_pos();
+	int stat_idx;
 	plat_local_state_t local_state;
 	u_register_t residency;
 
@@ -162,10 +162,11 @@
  * local state for the highest power level expressed in the `power_state`
  * for the node represented by `target_cpu`.
  ******************************************************************************/
-int psci_get_stat(u_register_t target_cpu, unsigned int power_state,
+static int psci_get_stat(u_register_t target_cpu, unsigned int power_state,
 			 psci_stat_t *psci_stat)
 {
-	int rc, pwrlvl, lvl, parent_idx, stat_idx, target_idx;
+	int rc;
+	unsigned int pwrlvl, lvl, parent_idx, stat_idx, target_idx;
 	psci_power_state_t state_info = { {PSCI_LOCAL_STATE_RUN} };
 	plat_local_state_t local_state;
 
@@ -216,8 +217,8 @@
 		unsigned int power_state)
 {
 	psci_stat_t psci_stat;
-
 	int rc = psci_get_stat(target_cpu, power_state, &psci_stat);
+
 	if (rc == PSCI_E_SUCCESS)
 		return psci_stat.residency;
 	else
@@ -229,8 +230,8 @@
 	unsigned int power_state)
 {
 	psci_stat_t psci_stat;
-
 	int rc = psci_get_stat(target_cpu, power_state, &psci_stat);
+
 	if (rc == PSCI_E_SUCCESS)
 		return psci_stat.count;
 	else
diff --git a/lib/psci/psci_suspend.c b/lib/psci/psci_suspend.c
index fe68f44..4798892 100644
--- a/lib/psci/psci_suspend.c
+++ b/lib/psci/psci_suspend.c
@@ -292,7 +292,7 @@
 	 * Dispatcher to let it do any bookeeping. If the handler encounters an
 	 * error, it's expected to assert within
 	 */
-	if (psci_spd_pm && psci_spd_pm->svc_suspend) {
+	if (psci_spd_pm && psci_spd_pm->svc_suspend_finish) {
 		max_off_lvl = psci_find_max_off_lvl(state_info);
 		assert (max_off_lvl != PSCI_INVALID_PWR_LVL);
 		psci_spd_pm->svc_suspend_finish(max_off_lvl);
diff --git a/make_helpers/defaults.mk b/make_helpers/defaults.mk
index 2c8f82a..9946fea 100644
--- a/make_helpers/defaults.mk
+++ b/make_helpers/defaults.mk
@@ -136,3 +136,20 @@
 # required to enable cache coherency after warm reset (eg: single cluster
 # platforms).
 WARMBOOT_ENABLE_DCACHE_EARLY	:= 0
+
+# By default, enable Statistical Profiling Extensions for lower ELs.
+# The checks below turn this feature off again depending on the
+# target architecture (ARCH) and version number (ARM_ARCH_MAJOR/MINOR).
+ENABLE_SPE_FOR_LOWER_ELS	:= 1
+
+# SPE is enabled by default but only supported on AArch64 8.2 onwards.
+# Disable it for AArch32 (via override) and for ARMv8.0 / ARMv8.1 builds.
+ifeq (${ARCH},aarch32)
+    override ENABLE_SPE_FOR_LOWER_ELS := 0
+else
+    ifeq (${ARM_ARCH_MAJOR},8)
+        ifeq ($(ARM_ARCH_MINOR),$(filter $(ARM_ARCH_MINOR),0 1))
+            ENABLE_SPE_FOR_LOWER_ELS := 0
+        endif
+    endif
+endif
diff --git a/plat/arm/board/fvp/fvp_pm.c b/plat/arm/board/fvp/fvp_pm.c
index f4df658..e39a4d5 100644
--- a/plat/arm/board/fvp/fvp_pm.c
+++ b/plat/arm/board/fvp/fvp_pm.c
@@ -48,6 +48,14 @@
 {
 	uint64_t mpidr = read_mpidr_el1();
 
+#if ENABLE_SPE_FOR_LOWER_ELS
+	/*
+	 * On power down we need to disable statistical profiling extensions
+	 * before exiting coherency.
+	 */
+	arm_disable_spe();
+#endif
+
 	/* Disable coherency if this cluster is to be turned off */
 	fvp_interconnect_disable();
 
diff --git a/plat/arm/common/aarch64/arm_helpers.S b/plat/arm/common/aarch64/arm_helpers.S
index 1f20cb5..86565f5 100644
--- a/plat/arm/common/aarch64/arm_helpers.S
+++ b/plat/arm/common/aarch64/arm_helpers.S
@@ -12,6 +12,7 @@
 	.globl	plat_crash_console_putc
 	.globl	plat_crash_console_flush
 	.globl	platform_mem_init
+	.globl	arm_disable_spe
 
 
 	/* -----------------------------------------------------
@@ -86,3 +87,31 @@
 func platform_mem_init
 	ret
 endfunc platform_mem_init
+
+	/* -----------------------------------------------------
+	 * void arm_disable_spe (void);
+	 * -----------------------------------------------------
+	 */
+#if ENABLE_SPE_FOR_LOWER_ELS
+func arm_disable_spe
+	/* Nothing to do unless the ID_AA64DFR0_EL1 PMS field is >= 1 */
+	mrs	x0, id_aa64dfr0_el1
+	ubfx	x0, x0, #ID_AA64DFR0_PMS_SHIFT, #ID_AA64DFR0_PMS_LENGTH
+	cmp	x0, #0x1
+	b.lo	1f
+
+	/* Drain buffered profiling data */
+	.arch	armv8.2-a+profile
+	psb	csync
+	dsb	nsh
+
+	/* Disable Profiling Buffer by clearing bit 0 of PMBLIMITR_EL1 */
+	mrs	x0, pmblimitr_el1
+	bic	x0, x0, #1
+	msr	pmblimitr_el1, x0
+	isb
+	.arch	armv8-a
+1:
+	ret
+endfunc arm_disable_spe
+#endif