Merge pull request #1227 from geesun/qx/emmc_macros

emmc: add macros CMD21, BUS_WIDTH_DDR_4 and BUS_WIDTH_DDR_8
diff --git a/Makefile b/Makefile
index 1058b39..1cd6b62 100644
--- a/Makefile
+++ b/Makefile
@@ -500,6 +500,7 @@
 $(eval $(call assert_boolean,USE_COHERENT_MEM))
 $(eval $(call assert_boolean,USE_TBBR_DEFS))
 $(eval $(call assert_boolean,WARMBOOT_ENABLE_DCACHE_EARLY))
+$(eval $(call assert_boolean,BL2_AT_EL3))
 
 $(eval $(call assert_numeric,ARM_ARCH_MAJOR))
 $(eval $(call assert_numeric,ARM_ARCH_MINOR))
@@ -543,6 +544,7 @@
 $(eval $(call add_define,USE_COHERENT_MEM))
 $(eval $(call add_define,USE_TBBR_DEFS))
 $(eval $(call add_define,WARMBOOT_ENABLE_DCACHE_EARLY))
+$(eval $(call add_define,BL2_AT_EL3))
 
 # Define the EL3_PAYLOAD_BASE flag only if it is provided.
 ifdef EL3_PAYLOAD_BASE
@@ -584,8 +586,12 @@
 endif
 
 ifeq (${NEED_BL2},yes)
-$(if ${BL2}, $(eval $(call MAKE_TOOL_ARGS,2,${BL2},tb-fw)),\
-	$(eval $(call MAKE_BL,2,tb-fw)))
+ifeq (${BL2_AT_EL3}, 0)
+FIP_BL2_ARGS := tb-fw
+endif
+
+$(if ${BL2}, $(eval $(call MAKE_TOOL_ARGS,2,${BL2},${FIP_BL2_ARGS})),\
+	$(eval $(call MAKE_BL,2,${FIP_BL2_ARGS})))
 endif
 
 ifeq (${NEED_SCP_BL2},yes)
diff --git a/bl1/bl1.mk b/bl1/bl1.mk
index a026499..41ee1a7 100644
--- a/bl1/bl1.mk
+++ b/bl1/bl1.mk
@@ -13,7 +13,10 @@
 				lib/cpus/errata_report.c		\
 				lib/el3_runtime/${ARCH}/context_mgmt.c	\
 				plat/common/plat_bl1_common.c		\
-				plat/common/${ARCH}/platform_up_stack.S
+				plat/common/${ARCH}/platform_up_stack.S \
+				${MBEDTLS_COMMON_SOURCES}		\
+				${MBEDTLS_CRYPTO_SOURCES}		\
+				${MBEDTLS_X509_SOURCES}
 
 ifeq (${ARCH},aarch64)
 BL1_SOURCES		+=	lib/el3_runtime/aarch64/context.S
diff --git a/bl2/aarch32/bl2_el3_entrypoint.S b/bl2/aarch32/bl2_el3_entrypoint.S
new file mode 100644
index 0000000..997b069
--- /dev/null
+++ b/bl2/aarch32/bl2_el3_entrypoint.S
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <arch.h>
+#include <asm_macros.S>
+#include <bl_common.h>
+#include <el3_common_macros.S>
+
+
+	.globl	bl2_entrypoint
+	.globl	bl2_run_next_image
+
+
+func bl2_entrypoint
+	/* Save arguments x0-x3 from previous Boot loader */
+	mov	r9, r0
+	mov	r10, r1
+	mov	r11, r2
+	mov	r12, r3
+
+	el3_entrypoint_common                                   \
+                _init_sctlr=1                                   \
+                _warm_boot_mailbox=!PROGRAMMABLE_RESET_ADDRESS  \
+                _secondary_cold_boot=!COLD_BOOT_SINGLE_CPU      \
+                _init_memory=1                                  \
+                _init_c_runtime=1                               \
+                _exception_vectors=bl2_vector_table
+
+	/*
+	 * Restore parameters of boot rom
+	 */
+	mov	r0, r9
+	mov	r1, r10
+	mov	r2, r11
+	mov	r3, r12
+
+	bl	bl2_el3_early_platform_setup
+	bl	bl2_el3_plat_arch_setup
+
+	/* ---------------------------------------------
+	 * Jump to main function.
+	 * ---------------------------------------------
+	 */
+	bl	bl2_main
+
+	/* ---------------------------------------------
+	 * Should never reach this point.
+	 * ---------------------------------------------
+	 */
+	no_ret	plat_panic_handler
+
+endfunc bl2_entrypoint
+
+func bl2_run_next_image
+	mov	r8,r0
+
+	/*
+	 * MMU needs to be disabled because both BL2 and BL32 execute
+	 * in PL1, and therefore share the same address space.
+	 * BL32 will initialize the address space according to its
+	 * own requirement.
+	 */
+	bl	disable_mmu_icache_secure
+	stcopr	r0, TLBIALL
+	dsb	sy
+	isb
+	mov	r0, r8
+	bl	bl2_el3_plat_prepare_exit
+
+	/*
+	 * Extract PC and SPSR based on struct `entry_point_info_t`
+	 * and load it in LR and SPSR registers respectively.
+	 */
+	ldr	lr, [r8, #ENTRY_POINT_INFO_PC_OFFSET]
+	ldr	r1, [r8, #(ENTRY_POINT_INFO_PC_OFFSET + 4)]
+	msr	spsr, r1
+
+	add	r8, r8, #ENTRY_POINT_INFO_ARGS_OFFSET
+	ldm	r8, {r0, r1, r2, r3}
+	eret
+endfunc bl2_run_next_image
diff --git a/bl2/aarch32/bl2_el3_exceptions.S b/bl2/aarch32/bl2_el3_exceptions.S
new file mode 100644
index 0000000..11ddf37
--- /dev/null
+++ b/bl2/aarch32/bl2_el3_exceptions.S
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <arch.h>
+#include <asm_macros.S>
+#include <bl_common.h>
+
+	.globl	bl2_vector_table
+
+vector_base bl2_vector_table
+	b	bl2_entrypoint
+	b	report_exception	/* Undef */
+	b	report_exception	/* SVC call */
+	b	report_exception	/* Prefetch abort */
+	b	report_exception	/* Data abort */
+	b	report_exception	/* Reserved */
+	b	report_exception	/* IRQ */
+	b	report_exception	/* FIQ */
diff --git a/bl2/aarch64/bl2_el3_entrypoint.S b/bl2/aarch64/bl2_el3_entrypoint.S
new file mode 100644
index 0000000..2d3efd1
--- /dev/null
+++ b/bl2/aarch64/bl2_el3_entrypoint.S
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <arch.h>
+#include <asm_macros.S>
+#include <bl_common.h>
+#include <el3_common_macros.S>
+
+	.globl	bl2_entrypoint
+	.globl	bl2_vector_table
+	.globl	bl2_el3_run_image
+	.globl	bl2_run_next_image
+
+func bl2_entrypoint
+	/* Save arguments x0-x3 from previous Boot loader */
+	mov	x20, x0
+	mov	x21, x1
+	mov	x22, x2
+	mov	x23, x3
+
+	el3_entrypoint_common                                   \
+		_init_sctlr=1                                   \
+		_warm_boot_mailbox=!PROGRAMMABLE_RESET_ADDRESS  \
+		_secondary_cold_boot=!COLD_BOOT_SINGLE_CPU      \
+		_init_memory=1                                  \
+		_init_c_runtime=1                               \
+		_exception_vectors=bl2_el3_exceptions
+
+	/*
+	 * Restore parameters of boot rom
+	 */
+	mov	x0, x20
+	mov	x1, x21
+	mov	x2, x22
+	mov	x3, x23
+
+	bl	bl2_el3_early_platform_setup
+	bl	bl2_el3_plat_arch_setup
+
+	/* ---------------------------------------------
+	 * Jump to main function.
+	 * ---------------------------------------------
+	 */
+	bl	bl2_main
+
+	/* ---------------------------------------------
+	 * Should never reach this point.
+	 * ---------------------------------------------
+	 */
+	no_ret	plat_panic_handler
+endfunc bl2_entrypoint
+
+func bl2_run_next_image
+	mov	x20,x0
+        /*
+         * MMU needs to be disabled because both BL2 and BL31 execute
+         * in EL3, and therefore share the same address space.
+         * BL31 will initialize the address space according to its
+         * own requirement.
+         */
+	bl	disable_mmu_icache_el3
+	tlbi	alle3
+	bl	bl2_el3_plat_prepare_exit
+
+	ldp	x0, x1, [x20, #ENTRY_POINT_INFO_PC_OFFSET]
+	msr	elr_el3, x0
+	msr	spsr_el3, x1
+
+	ldp	x6, x7, [x20, #(ENTRY_POINT_INFO_ARGS_OFFSET + 0x30)]
+	ldp	x4, x5, [x20, #(ENTRY_POINT_INFO_ARGS_OFFSET + 0x20)]
+	ldp	x2, x3, [x20, #(ENTRY_POINT_INFO_ARGS_OFFSET + 0x10)]
+	ldp	x0, x1, [x20, #(ENTRY_POINT_INFO_ARGS_OFFSET + 0x0)]
+	eret
+endfunc bl2_run_next_image
diff --git a/bl2/aarch64/bl2_el3_exceptions.S b/bl2/aarch64/bl2_el3_exceptions.S
new file mode 100644
index 0000000..987f6e3
--- /dev/null
+++ b/bl2/aarch64/bl2_el3_exceptions.S
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <arch.h>
+#include <asm_macros.S>
+#include <bl1.h>
+#include <bl_common.h>
+#include <context.h>
+
+/* -----------------------------------------------------------------------------
+ * Very simple stackless exception handlers used by BL2.
+ * -----------------------------------------------------------------------------
+ */
+	.globl	bl2_el3_exceptions
+
+vector_base bl2_el3_exceptions
+
+	/* -----------------------------------------------------
+	 * Current EL with SP0 : 0x0 - 0x200
+	 * -----------------------------------------------------
+	 */
+vector_entry SynchronousExceptionSP0
+	mov	x0, #SYNC_EXCEPTION_SP_EL0
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size SynchronousExceptionSP0
+
+vector_entry IrqSP0
+	mov	x0, #IRQ_SP_EL0
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size IrqSP0
+
+vector_entry FiqSP0
+	mov	x0, #FIQ_SP_EL0
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size FiqSP0
+
+vector_entry SErrorSP0
+	mov	x0, #SERROR_SP_EL0
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size SErrorSP0
+
+	/* -----------------------------------------------------
+	 * Current EL with SPx: 0x200 - 0x400
+	 * -----------------------------------------------------
+	 */
+vector_entry SynchronousExceptionSPx
+	mov	x0, #SYNC_EXCEPTION_SP_ELX
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size SynchronousExceptionSPx
+
+vector_entry IrqSPx
+	mov	x0, #IRQ_SP_ELX
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size IrqSPx
+
+vector_entry FiqSPx
+	mov	x0, #FIQ_SP_ELX
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size FiqSPx
+
+vector_entry SErrorSPx
+	mov	x0, #SERROR_SP_ELX
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size SErrorSPx
+
+	/* -----------------------------------------------------
+	 * Lower EL using AArch64 : 0x400 - 0x600
+	 * -----------------------------------------------------
+	 */
+vector_entry SynchronousExceptionA64
+	mov	x0, #SYNC_EXCEPTION_AARCH64
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size SynchronousExceptionA64
+
+vector_entry IrqA64
+	mov	x0, #IRQ_AARCH64
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size IrqA64
+
+vector_entry FiqA64
+	mov	x0, #FIQ_AARCH64
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size FiqA64
+
+vector_entry SErrorA64
+	mov	x0, #SERROR_AARCH64
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size SErrorA64
+
+	/* -----------------------------------------------------
+	 * Lower EL using AArch32 : 0x600 - 0x800
+	 * -----------------------------------------------------
+	 */
+vector_entry SynchronousExceptionA32
+	mov	x0, #SYNC_EXCEPTION_AARCH32
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size SynchronousExceptionA32
+
+vector_entry IrqA32
+	mov	x0, #IRQ_AARCH32
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size IrqA32
+
+vector_entry FiqA32
+	mov	x0, #FIQ_AARCH32
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size FiqA32
+
+vector_entry SErrorA32
+	mov	x0, #SERROR_AARCH32
+	bl	plat_report_exception
+	no_ret	plat_panic_handler
+	check_vector_size SErrorA32
diff --git a/bl2/bl2.mk b/bl2/bl2.mk
index 32e3284..9d75286 100644
--- a/bl2/bl2.mk
+++ b/bl2/bl2.mk
@@ -5,10 +5,12 @@
 #
 
 BL2_SOURCES		+=	bl2/bl2_main.c				\
-				bl2/${ARCH}/bl2_entrypoint.S		\
 				bl2/${ARCH}/bl2_arch_setup.c		\
 				lib/locks/exclusive/${ARCH}/spinlock.S	\
-				plat/common/${ARCH}/platform_up_stack.S
+				plat/common/${ARCH}/platform_up_stack.S	\
+				${MBEDTLS_COMMON_SOURCES}               \
+				${MBEDTLS_CRYPTO_SOURCES}		\
+				${MBEDTLS_X509_SOURCES}
 
 ifeq (${ARCH},aarch64)
 BL2_SOURCES		+=	common/aarch64/early_exceptions.S
@@ -20,4 +22,15 @@
 BL2_SOURCES		+=	bl2/bl2_image_load.c
 endif
 
+ifeq (${BL2_AT_EL3},0)
+BL2_SOURCES		+=	bl2/${ARCH}/bl2_entrypoint.S
 BL2_LINKERFILE		:=	bl2/bl2.ld.S
+
+else
+BL2_SOURCES		+=	bl2/${ARCH}/bl2_el3_entrypoint.S	\
+				bl2/${ARCH}/bl2_el3_exceptions.S	\
+				plat/common/plat_bl2_el3_common.c	\
+				lib/cpus/${ARCH}/cpu_helpers.S		\
+				lib/cpus/errata_report.c
+BL2_LINKERFILE		:=	bl2/bl2_el3.ld.S
+endif
diff --git a/bl2/bl2_el3.ld.S b/bl2/bl2_el3.ld.S
new file mode 100644
index 0000000..57709e3
--- /dev/null
+++ b/bl2/bl2_el3.ld.S
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <platform_def.h>
+#include <xlat_tables_defs.h>
+
+OUTPUT_FORMAT(PLATFORM_LINKER_FORMAT)
+OUTPUT_ARCH(PLATFORM_LINKER_ARCH)
+ENTRY(bl2_entrypoint)
+
+MEMORY {
+    RAM (rwx): ORIGIN = BL2_BASE, LENGTH = BL2_LIMIT - BL2_BASE
+}
+
+
+SECTIONS
+{
+    . = BL2_BASE;
+    ASSERT(. == ALIGN(PAGE_SIZE),
+           "BL2_BASE address is not aligned on a page boundary.")
+
+#if SEPARATE_CODE_AND_RODATA
+    .text . : {
+        __TEXT_START__ = .;
+	__TEXT_RESIDENT_START__ = .;
+	*bl2_el3_entrypoint.o(.text*)
+	*(.text.asm.*)
+	__TEXT_RESIDENT_END__ = .;
+        *(.text*)
+        *(.vectors)
+        . = NEXT(PAGE_SIZE);
+        __TEXT_END__ = .;
+     } >RAM
+
+    .rodata . : {
+        __RODATA_START__ = .;
+        *(.rodata*)
+
+        /* Ensure 8-byte alignment for descriptors and ensure inclusion */
+        . = ALIGN(8);
+        __PARSER_LIB_DESCS_START__ = .;
+        KEEP(*(.img_parser_lib_descs))
+        __PARSER_LIB_DESCS_END__ = .;
+
+        /*
+         * Ensure 8-byte alignment for cpu_ops so that its fields are also
+         * aligned. Also ensure cpu_ops inclusion.
+         */
+        . = ALIGN(8);
+        __CPU_OPS_START__ = .;
+        KEEP(*(cpu_ops))
+        __CPU_OPS_END__ = .;
+
+        . = NEXT(PAGE_SIZE);
+        __RODATA_END__ = .;
+    } >RAM
+
+    ASSERT(__TEXT_RESIDENT_END__ - __TEXT_RESIDENT_START__ <= PAGE_SIZE,
+          "Resident part of BL2 has exceeded its limit.")
+#else
+    ro . : {
+        __RO_START__ = .;
+	__TEXT_RESIDENT_START__ = .;
+	*bl2_el3_entrypoint.o(.text*)
+	*(.text.asm.*)
+	__TEXT_RESIDENT_END__ = .;
+        *(.text*)
+        *(.rodata*)
+
+        /*
+         * Ensure 8-byte alignment for cpu_ops so that its fields are also
+         * aligned. Also ensure cpu_ops inclusion.
+         */
+        . = ALIGN(8);
+        __CPU_OPS_START__ = .;
+        KEEP(*(cpu_ops))
+        __CPU_OPS_END__ = .;
+
+        /* Ensure 8-byte alignment for descriptors and ensure inclusion */
+        . = ALIGN(8);
+        __PARSER_LIB_DESCS_START__ = .;
+        KEEP(*(.img_parser_lib_descs))
+        __PARSER_LIB_DESCS_END__ = .;
+
+        *(.vectors)
+        __RO_END_UNALIGNED__ = .;
+        /*
+         * Memory page(s) mapped to this section will be marked as
+         * read-only, executable.  No RW data from the next section must
+         * creep in.  Ensure the rest of the current memory page is unused.
+         */
+        . = NEXT(PAGE_SIZE);
+
+        __RO_END__ = .;
+    } >RAM
+#endif
+
+    ASSERT(__CPU_OPS_END__ > __CPU_OPS_START__,
+          "cpu_ops not defined for this platform.")
+
+    /*
+     * Define a linker symbol to mark start of the RW memory area for this
+     * image.
+     */
+    __RW_START__ = . ;
+
+    /*
+     * .data must be placed at a lower address than the stacks if the stack
+     * protector is enabled. Alternatively, the .data.stack_protector_canary
+     * section can be placed independently of the main .data section.
+     */
+    .data . : {
+        __DATA_START__ = .;
+        *(.data*)
+        __DATA_END__ = .;
+    } >RAM
+
+    stacks (NOLOAD) : {
+        __STACKS_START__ = .;
+        *(tzfw_normal_stacks)
+        __STACKS_END__ = .;
+    } >RAM
+
+    /*
+     * The .bss section gets initialised to 0 at runtime.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
+     */
+    .bss : ALIGN(16) {
+        __BSS_START__ = .;
+        *(SORT_BY_ALIGNMENT(.bss*))
+        *(COMMON)
+        __BSS_END__ = .;
+    } >RAM
+
+    /*
+     * The xlat_table section is for full, aligned page tables (4K).
+     * Removing them from .bss avoids forcing 4K alignment on
+     * the .bss section and eliminates the unnecessary zero init
+     */
+    xlat_table (NOLOAD) : {
+        *(xlat_table)
+    } >RAM
+
+#if USE_COHERENT_MEM
+    /*
+     * The base address of the coherent memory section must be page-aligned (4K)
+     * to guarantee that the coherent data are stored on their own pages and
+     * are not mixed with normal data.  This is required to set up the correct
+     * memory attributes for the coherent data page tables.
+     */
+    coherent_ram (NOLOAD) : ALIGN(PAGE_SIZE) {
+        __COHERENT_RAM_START__ = .;
+        *(tzfw_coherent_mem)
+        __COHERENT_RAM_END_UNALIGNED__ = .;
+        /*
+         * Memory page(s) mapped to this section will be marked
+         * as device memory.  No other unexpected data must creep in.
+         * Ensure the rest of the current memory page is unused.
+         */
+        . = NEXT(PAGE_SIZE);
+        __COHERENT_RAM_END__ = .;
+    } >RAM
+#endif
+
+    /*
+     * Define a linker symbol to mark end of the RW memory area for this
+     * image.
+     */
+    __RW_END__ = .;
+    __BL2_END__ = .;
+
+    __BSS_SIZE__ = SIZEOF(.bss);
+
+#if USE_COHERENT_MEM
+    __COHERENT_RAM_UNALIGNED_SIZE__ =
+        __COHERENT_RAM_END_UNALIGNED__ - __COHERENT_RAM_START__;
+#endif
+
+    ASSERT(. <= BL2_LIMIT, "BL2 image has exceeded its limit.")
+}
diff --git a/bl2/bl2_main.c b/bl2/bl2_main.c
index 018deb3..c85db2d 100644
--- a/bl2/bl2_main.c
+++ b/bl2/bl2_main.c
@@ -13,6 +13,11 @@
 #include <platform.h>
 #include "bl2_private.h"
 
+#ifdef AARCH32
+#define NEXT_IMAGE	"BL32"
+#else
+#define NEXT_IMAGE	"BL31"
+#endif
 
 /*******************************************************************************
  * The only thing to do in BL2 is to load further images and pass control to
@@ -49,6 +54,8 @@
 	disable_mmu_icache_secure();
 #endif /* AARCH32 */
 
+
+#if !BL2_AT_EL3
 	console_flush();
 
 	/*
@@ -57,4 +64,11 @@
 	 * be passed to next BL image as an argument.
 	 */
 	smc(BL1_SMC_RUN_IMAGE, (unsigned long)next_bl_ep_info, 0, 0, 0, 0, 0, 0);
+#else
+	NOTICE("BL2: Booting " NEXT_IMAGE "\n");
+	print_entry_point_info(next_bl_ep_info);
+	console_flush();
+
+	bl2_run_next_image(next_bl_ep_info);
+#endif
 }
diff --git a/bl2/bl2_private.h b/bl2/bl2_private.h
index 83b8047..ea2f33a 100644
--- a/bl2/bl2_private.h
+++ b/bl2/bl2_private.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -17,5 +17,6 @@
  *****************************************/
 void bl2_arch_setup(void);
 struct entry_point_info *bl2_load_images(void);
+void bl2_run_next_image(const entry_point_info_t *bl_ep_info);
 
 #endif /* __BL2_PRIVATE_H__ */
diff --git a/docs/firmware-design.rst b/docs/firmware-design.rst
index 3cb004a..1f8fcc8 100644
--- a/docs/firmware-design.rst
+++ b/docs/firmware-design.rst
@@ -418,6 +418,63 @@
 
 #. BL1 passes control to BL31 at the specified entrypoint at EL3.
 
+Running BL2 at EL3 execution level
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some platforms have a non-TF Boot ROM that expects the next boot stage
+to execute at EL3. On these platforms, TF BL1 is a waste of memory
+as its only purpose is to ensure TF BL2 is entered at S-EL1. To avoid
+this waste, a special mode enables BL2 to execute at EL3, which allows
+a non-TF Boot ROM to load and jump directly to BL2. This mode is selected
+when the build flag BL2_AT_EL3 is enabled. The main differences in this
+mode are:
+
+#. BL2 includes the reset code and the mailbox mechanism to differentiate
+   cold boot and warm boot. It runs at EL3 doing the arch
+   initialization required for EL3.
+
+#. BL2 does not receive the meminfo information from BL1 anymore. This
+   information can be passed by the Boot ROM or be internal to the
+   BL2 image.
+
+#. Since BL2 executes at EL3, BL2 jumps directly to the next image,
+   instead of invoking the RUN_IMAGE SMC call.
+
+
+We assume 3 different types of BootROM support on the platform:
+
+#. The Boot ROM always jumps to the same address, for both cold
+   and warm boot. In this case, we will need to keep a resident part
+   of BL2 whose memory cannot be reclaimed by any other image. The
+   linker script defines the symbols __TEXT_RESIDENT_START__ and
+   __TEXT_RESIDENT_END__ that allows the platform to configure
+   correctly the memory map.
+#. The platform has some mechanism to indicate the jump address to the
+   Boot ROM. Platform code can then program the jump address with
+   psci_warmboot_entrypoint during cold boot.
+#. The platform has some mechanism to program the reset address using
+   the PROGRAMMABLE_RESET_ADDRESS feature. Platform code can then
+   program the reset address with psci_warmboot_entrypoint during
+   cold boot, bypassing the boot ROM for warm boot.
+
+In the last 2 cases, no part of BL2 needs to remain resident at
+runtime. In the first 2 cases, we expect the Boot ROM to be able to
+differentiate between warm and cold boot, to avoid loading BL2 again
+during warm boot.
+
+This functionality can be tested with FVP loading the image directly
+in memory and changing the address where the system jumps at reset.
+For example:
+
+	-C cluster0.cpu0.RVBAR=0x4014000
+	--data cluster0.cpu0=bl2.bin@0x4014000
+
+With this configuration, FVP is like a platform of the first case,
+where the Boot ROM jumps always to the same address. For simplification,
+BL32 is loaded in DRAM in this case, to avoid other images reclaiming
+BL2 memory.
+
+
 AArch64 BL31
 ~~~~~~~~~~~~
 
diff --git a/docs/porting-guide.rst b/docs/porting-guide.rst
index 2e2cc4f..84bd2cd 100644
--- a/docs/porting-guide.rst
+++ b/docs/porting-guide.rst
@@ -1643,6 +1643,70 @@
 must return 0, otherwise it must return 1. The default implementation
 of this always returns 0.
 
+Boot Loader Stage 2 (BL2) at EL3
+--------------------------------
+
+When the platform has a non-TF Boot ROM it is desirable to jump
+directly to BL2 instead of TF BL1. In this case BL2 is expected to
+execute at EL3 instead of executing at EL1. Refer to the `Firmware
+Design`_ for more information.
+
+All mandatory functions of BL2 must be implemented, except the functions
+bl2\_early\_platform\_setup and bl2\_el3\_plat\_arch\_setup, because
+their work is done now by bl2\_el3\_early\_platform\_setup and
+bl2\_el3\_plat\_arch\_setup. These functions should generally implement
+the bl1\_plat\_xxx() and bl2\_plat\_xxx() functionality combined.
+
+
+Function : bl2\_el3\_early\_platform\_setup() [mandatory]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+	Argument : u_register_t, u_register_t, u_register_t, u_register_t
+	Return   : void
+
+This function executes with the MMU and data caches disabled. It is only called
+by the primary CPU. This function receives four parameters which can be used
+by the platform to pass any needed information from the Boot ROM to BL2.
+
+On ARM standard platforms, this function does the following:
+
+-  Initializes a UART (PL011 console), which enables access to the ``printf``
+   family of functions in BL2.
+
+-  Initializes the storage abstraction layer used to load further bootloader
+   images. It is necessary to do this early on platforms with a SCP\_BL2 image,
+   since the later ``bl2_platform_setup`` must be done after SCP\_BL2 is loaded.
+
+- Initializes the private variables that define the memory layout used.
+
+Function : bl2\_el3\_plat\_arch\_setup() [mandatory]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+	Argument : void
+	Return   : void
+
+This function executes with the MMU and data caches disabled. It is only called
+by the primary CPU.
+
+The purpose of this function is to perform any architectural initialization
+that varies across platforms.
+
+On ARM standard platforms, this function enables the MMU.
+
+Function : bl2\_el3\_plat\_prepare\_exit() [optional]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+	Argument : void
+	Return   : void
+
+This function is called prior to exiting BL2 and run the next image.
+It should be used to perform platform specific clean up or bookkeeping
+operations before transferring control to the next image. This function
+runs with MMU disabled.
+
 FWU Boot Loader Stage 2 (BL2U)
 ------------------------------
 
diff --git a/docs/user-guide.rst b/docs/user-guide.rst
index 0647e70..ed5ba18 100644
--- a/docs/user-guide.rst
+++ b/docs/user-guide.rst
@@ -245,6 +245,9 @@
    BL2U image. In this case, the BL2U in the ARM Trusted Firmware will not
    be built.
 
+- ``BL2_AT_EL3``: This is an optional build option that enables the use of
+   BL2 at EL3 execution level.
+
 -  ``BL31``: This is an optional build option which specifies the path to
    BL31 image for the ``fip`` target. In this case, the BL31 in the ARM
    Trusted Firmware will not be built.
diff --git a/drivers/auth/mbedtls/mbedtls_common.mk b/drivers/auth/mbedtls/mbedtls_common.mk
index f2b6f6e..8c4123d 100644
--- a/drivers/auth/mbedtls/mbedtls_common.mk
+++ b/drivers/auth/mbedtls/mbedtls_common.mk
@@ -29,7 +29,4 @@
 				platform.c 				\
 				)
 
-BL1_SOURCES		+=	${MBEDTLS_COMMON_SOURCES}
-BL2_SOURCES		+=	${MBEDTLS_COMMON_SOURCES}
-
 endif
diff --git a/drivers/auth/mbedtls/mbedtls_crypto.mk b/drivers/auth/mbedtls/mbedtls_crypto.mk
index 8eb4873..6b15e71 100644
--- a/drivers/auth/mbedtls/mbedtls_crypto.mk
+++ b/drivers/auth/mbedtls/mbedtls_crypto.mk
@@ -89,6 +89,3 @@
 # Needs to be set to drive mbed TLS configuration correctly
 $(eval $(call add_define,TF_MBEDTLS_KEY_ALG_ID))
 $(eval $(call add_define,TF_MBEDTLS_HASH_ALG_ID))
-
-BL1_SOURCES			+=	${MBEDTLS_CRYPTO_SOURCES}
-BL2_SOURCES			+=	${MBEDTLS_CRYPTO_SOURCES}
diff --git a/drivers/auth/mbedtls/mbedtls_x509.mk b/drivers/auth/mbedtls/mbedtls_x509.mk
index 0f28b65..a6f72e6 100644
--- a/drivers/auth/mbedtls/mbedtls_x509.mk
+++ b/drivers/auth/mbedtls/mbedtls_x509.mk
@@ -11,6 +11,3 @@
 				x509.c 						\
 				x509_crt.c 					\
 				)
-
-BL1_SOURCES		+=	${MBEDTLS_X509_SOURCES}
-BL2_SOURCES		+=	${MBEDTLS_X509_SOURCES}
diff --git a/drivers/synopsys/emmc/dw_mmc.c b/drivers/synopsys/emmc/dw_mmc.c
index e6904d1..701e6d5 100644
--- a/drivers/synopsys/emmc/dw_mmc.c
+++ b/drivers/synopsys/emmc/dw_mmc.c
@@ -146,7 +146,7 @@
 		if ((data & CMD_START) == 0)
 			break;
 		data = mmio_read_32(dw_params.reg_base + DWMMC_RINTSTS);
-		assert(data & INT_HLE);
+		assert((data & INT_HLE) == 0);
 	}
 }
 
diff --git a/include/common/aarch32/el3_common_macros.S b/include/common/aarch32/el3_common_macros.S
index 59e99f8..d654b65 100644
--- a/include/common/aarch32/el3_common_macros.S
+++ b/include/common/aarch32/el3_common_macros.S
@@ -260,9 +260,9 @@
 	 * ---------------------------------------------------------------------
 	 */
 	.if \_init_c_runtime
-#ifdef IMAGE_BL32
+#if defined(IMAGE_BL32) || (defined(IMAGE_BL2) && BL2_AT_EL3)
 		/* -----------------------------------------------------------------
-		 * Invalidate the RW memory used by the BL32 (SP_MIN) image. This
+		 * Invalidate the RW memory used by the image. This
 		 * includes the data and NOBITS sections. This is done to
 		 * safeguard against possible corruption of this memory by
 		 * dirty cache lines in a system cache as a result of use by
@@ -273,7 +273,7 @@
 		ldr	r1, =__RW_END__
 		sub	r1, r1, r0
 		bl	inv_dcache_range
-#endif /* IMAGE_BL32 */
+#endif
 
 		ldr	r0, =__BSS_START__
 		ldr	r1, =__BSS_SIZE__
diff --git a/include/common/aarch64/el3_common_macros.S b/include/common/aarch64/el3_common_macros.S
index defd4a2..4ebf77b 100644
--- a/include/common/aarch64/el3_common_macros.S
+++ b/include/common/aarch64/el3_common_macros.S
@@ -269,7 +269,7 @@
 	 * ---------------------------------------------------------------------
 	 */
 	.if \_init_c_runtime
-#ifdef IMAGE_BL31
+#if defined(IMAGE_BL31) || (defined(IMAGE_BL2) && BL2_AT_EL3)
 		/* -------------------------------------------------------------
 		 * Invalidate the RW memory used by the BL31 image. This
 		 * includes the data and NOBITS sections. This is done to
@@ -282,7 +282,7 @@
 		adr	x1, __RW_END__
 		sub	x1, x1, x0
 		bl	inv_dcache_range
-#endif /* IMAGE_BL31 */
+#endif
 
 		ldr	x0, =__BSS_START__
 		ldr	x1, =__BSS_SIZE__
diff --git a/include/common/asm_macros_common.S b/include/common/asm_macros_common.S
index 6a02e18..ca8c1ad 100644
--- a/include/common/asm_macros_common.S
+++ b/include/common/asm_macros_common.S
@@ -29,7 +29,7 @@
 	 * debugging experience.
 	 */
 	.cfi_sections .debug_frame
-	.section .text.\_name, "ax"
+	.section .text.asm.\_name, "ax"
 	.type \_name, %function
 	.func \_name
 	/*
diff --git a/include/lib/cpus/aarch32/cpu_macros.S b/include/lib/cpus/aarch32/cpu_macros.S
index e2e4316..0f3a572 100644
--- a/include/lib/cpus/aarch32/cpu_macros.S
+++ b/include/lib/cpus/aarch32/cpu_macros.S
@@ -9,6 +9,10 @@
 #include <arch.h>
 #include <errata_report.h>
 
+#if defined(IMAGE_BL1) || defined(IMAGE_BL32)  || (defined(IMAGE_BL2) && BL2_AT_EL3)
+#define IMAGE_AT_EL3
+#endif
+
 #define CPU_IMPL_PN_MASK	(MIDR_IMPL_MASK << MIDR_IMPL_SHIFT) | \
 				(MIDR_PN_MASK << MIDR_PN_SHIFT)
 
@@ -38,7 +42,7 @@
 CPU_MIDR: /* cpu_ops midr */
 	.space  4
 /* Reset fn is needed during reset */
-#if defined(IMAGE_BL1) || defined(IMAGE_BL32)
+#if defined(IMAGE_AT_EL3)
 CPU_RESET_FUNC: /* cpu_ops reset_func */
 	.space  4
 #endif
@@ -54,7 +58,7 @@
 #if REPORT_ERRATA
 CPU_ERRATA_FUNC: /* CPU errata status printing function */
 	.space  4
-#ifdef IMAGE_BL32
+#if defined(IMAGE_BL32)
 CPU_ERRATA_LOCK:
 	.space	4
 CPU_ERRATA_PRINTED:
@@ -120,7 +124,7 @@
 	.align 2
 	.type cpu_ops_\_name, %object
 	.word \_midr
-#if defined(IMAGE_BL1) || defined(IMAGE_BL32)
+#if defined(IMAGE_AT_EL3)
 	.word \_resetfunc
 #endif
 #ifdef IMAGE_BL32
diff --git a/include/lib/cpus/aarch64/cpu_macros.S b/include/lib/cpus/aarch64/cpu_macros.S
index a8c23e5..ccf5306 100644
--- a/include/lib/cpus/aarch64/cpu_macros.S
+++ b/include/lib/cpus/aarch64/cpu_macros.S
@@ -21,6 +21,10 @@
 /* Word size for 64-bit CPUs */
 #define CPU_WORD_SIZE			8
 
+#if defined(IMAGE_BL1) || defined(IMAGE_BL31) ||(defined(IMAGE_BL2) && BL2_AT_EL3)
+#define IMAGE_AT_EL3
+#endif
+
 /*
  * Whether errata status needs reporting. Errata status is printed in debug
  * builds for both BL1 and BL31 images.
@@ -38,7 +42,7 @@
 CPU_MIDR: /* cpu_ops midr */
 	.space  8
 /* Reset fn is needed in BL at reset vector */
-#if defined(IMAGE_BL1) || defined(IMAGE_BL31)
+#if defined(IMAGE_AT_EL3)
 CPU_RESET_FUNC: /* cpu_ops reset_func */
 	.space  8
 #endif
@@ -54,7 +58,7 @@
 #if REPORT_ERRATA
 CPU_ERRATA_FUNC:
 	.space	8
-#ifdef IMAGE_BL31
+#if defined(IMAGE_BL31)
 CPU_ERRATA_LOCK:
 	.space	8
 CPU_ERRATA_PRINTED:
@@ -124,7 +128,7 @@
 	.align 3
 	.type cpu_ops_\_name, %object
 	.quad \_midr
-#if defined(IMAGE_BL1) || defined(IMAGE_BL31)
+#if defined(IMAGE_AT_EL3)
 	.quad \_resetfunc
 #endif
 #ifdef IMAGE_BL31
diff --git a/include/lib/utils.h b/include/lib/utils.h
index cfc8302..3d215c3 100644
--- a/include/lib/utils.h
+++ b/include/lib/utils.h
@@ -19,7 +19,7 @@
 
 #include <types.h>
 
-typedef struct mem_region_t {
+typedef struct mem_region {
 	uintptr_t base;
 	size_t nbytes;
 } mem_region_t;
diff --git a/include/lib/xlat_tables/xlat_tables_v2_helpers.h b/include/lib/xlat_tables/xlat_tables_v2_helpers.h
index 1be99b7..de1c2d4 100644
--- a/include/lib/xlat_tables/xlat_tables_v2_helpers.h
+++ b/include/lib/xlat_tables/xlat_tables_v2_helpers.h
@@ -168,7 +168,7 @@
  * This IMAGE_EL macro must not to be used outside the library, and it is only
  * used in AArch64.
  */
-#if defined(IMAGE_BL1) || defined(IMAGE_BL31)
+#if defined(IMAGE_BL1) || defined(IMAGE_BL31) || (defined(IMAGE_BL2) && BL2_AT_EL3)
 # define IMAGE_EL	3
 # define IMAGE_XLAT_DEFAULT_REGIME EL3_REGIME
 #else
diff --git a/include/plat/arm/common/arm_def.h b/include/plat/arm/common/arm_def.h
index f38c357..697a0b0 100644
--- a/include/plat/arm/common/arm_def.h
+++ b/include/plat/arm/common/arm_def.h
@@ -334,6 +334,11 @@
 #define BL2_BASE			(BL1_RW_BASE - PLAT_ARM_MAX_BL2_SIZE)
 #define BL2_LIMIT			BL1_RW_BASE
 
+#elif BL2_AT_EL3
+
+#define BL2_BASE			ARM_BL_RAM_BASE
+#define BL2_LIMIT			(ARM_BL_RAM_BASE + ARM_BL_RAM_SIZE)
+
 #elif defined(AARCH32) || JUNO_AARCH32_EL3_RUNTIME
 /*
  * Put BL2 just below BL32.
diff --git a/include/plat/arm/common/plat_arm.h b/include/plat/arm/common/plat_arm.h
index abd7395..dfd7a20 100644
--- a/include/plat/arm/common/plat_arm.h
+++ b/include/plat/arm/common/plat_arm.h
@@ -145,6 +145,10 @@
 uint32_t arm_get_spsr_for_bl33_entry(void);
 int arm_bl2_handle_post_image_load(unsigned int image_id);
 
+/* BL2 at EL3 functions */
+void arm_bl2_el3_early_platform_setup(void);
+void arm_bl2_el3_plat_arch_setup(void);
+
 /* BL2U utility functions */
 void arm_bl2u_early_platform_setup(struct meminfo *mem_layout,
 				void *plat_info);
diff --git a/include/plat/common/platform.h b/include/plat/common/platform.h
index f11bee9..0960105 100644
--- a/include/plat/common/platform.h
+++ b/include/plat/common/platform.h
@@ -235,6 +235,21 @@
  * Optional BL2 functions (may be overridden)
  ******************************************************************************/
 
+
+/*******************************************************************************
+ * Mandatory BL2 at EL3 functions: Must be implemented if BL2_AT_EL3 image is
+ * supported
+ ******************************************************************************/
+void bl2_el3_early_platform_setup(u_register_t arg0, u_register_t arg1,
+				  u_register_t arg2, u_register_t arg3);
+void bl2_el3_plat_arch_setup(void);
+
+
+/*******************************************************************************
+ * Optional BL2 at EL3 functions (may be overridden)
+ ******************************************************************************/
+void bl2_el3_plat_prepare_exit(void);
+
 /*******************************************************************************
  * Mandatory BL2U functions.
  ******************************************************************************/
diff --git a/lib/cpus/aarch32/cpu_helpers.S b/lib/cpus/aarch32/cpu_helpers.S
index bfdc1e4..72e42c6 100644
--- a/lib/cpus/aarch32/cpu_helpers.S
+++ b/lib/cpus/aarch32/cpu_helpers.S
@@ -10,7 +10,7 @@
 #include <cpu_data.h>
 #include <cpu_macros.S>
 
-#if defined(IMAGE_BL1) || defined(IMAGE_BL32)
+#if defined(IMAGE_BL1) || defined(IMAGE_BL32) || (defined(IMAGE_BL2) && BL2_AT_EL3)
 	/*
 	 * The reset handler common to all platforms.  After a matching
 	 * cpu_ops structure entry is found, the correponding reset_handler
@@ -42,7 +42,7 @@
 	bx	lr
 endfunc reset_handler
 
-#endif /* IMAGE_BL1 || IMAGE_BL32 */
+#endif
 
 #ifdef IMAGE_BL32 /* The power down core and cluster is needed only in  BL32 */
 	/*
diff --git a/lib/cpus/aarch64/cpu_helpers.S b/lib/cpus/aarch64/cpu_helpers.S
index 2384553..ae1c3c2 100644
--- a/lib/cpus/aarch64/cpu_helpers.S
+++ b/lib/cpus/aarch64/cpu_helpers.S
@@ -7,7 +7,7 @@
 #include <arch.h>
 #include <asm_macros.S>
 #include <assert_macros.S>
-#ifdef IMAGE_BL31
+#if defined(IMAGE_BL31) || (defined(IMAGE_BL2) && BL2_AT_EL3)
 #include <cpu_data.h>
 #endif
 #include <cpu_macros.S>
@@ -15,7 +15,7 @@
 #include <errata_report.h>
 
  /* Reset fn is needed in BL at reset vector */
-#if defined(IMAGE_BL1) || defined(IMAGE_BL31)
+#if defined(IMAGE_BL1) || defined(IMAGE_BL31) || (defined(IMAGE_BL2) && BL2_AT_EL3)
 	/*
 	 * The reset handler common to all platforms.  After a matching
 	 * cpu_ops structure entry is found, the correponding reset_handler
@@ -47,7 +47,7 @@
 	ret
 endfunc reset_handler
 
-#endif /* IMAGE_BL1 || IMAGE_BL31 */
+#endif
 
 #ifdef IMAGE_BL31 /* The power down core and cluster is needed only in  BL31 */
 	/*
diff --git a/lib/cpus/errata_report.c b/lib/cpus/errata_report.c
index 8d9f704..182679d 100644
--- a/lib/cpus/errata_report.c
+++ b/lib/cpus/errata_report.c
@@ -20,6 +20,8 @@
 # define BL_STRING	"BL31"
 #elif defined(AARCH32) && defined(IMAGE_BL32)
 # define BL_STRING	"BL32"
+#elif defined(IMAGE_BL2) && BL2_AT_EL3
+# define BL_STRING "BL2"
 #else
 # error This image should not be printing errata status
 #endif
diff --git a/make_helpers/defaults.mk b/make_helpers/defaults.mk
index fa0d17d..643890f 100644
--- a/make_helpers/defaults.mk
+++ b/make_helpers/defaults.mk
@@ -27,6 +27,9 @@
 # Base commit to perform code check on
 BASE_COMMIT			:= origin/master
 
+# Execute BL2 at EL3
+BL2_AT_EL3			:= 0
+
 # By default, consider that the platform may release several CPUs out of reset.
 # The platform Makefile is free to override this value.
 COLD_BOOT_SINGLE_CPU		:= 0
diff --git a/make_helpers/tbbr/tbbr_tools.mk b/make_helpers/tbbr/tbbr_tools.mk
index 6e6e273..cda8d72 100644
--- a/make_helpers/tbbr/tbbr_tools.mk
+++ b/make_helpers/tbbr/tbbr_tools.mk
@@ -64,7 +64,9 @@
 $(if ${BL2},$(eval $(call CERT_ADD_CMD_OPT,${BL2},--tb-fw,true)),\
             $(eval $(call CERT_ADD_CMD_OPT,$(call IMG_BIN,2),--tb-fw,true)))
 $(eval $(call CERT_ADD_CMD_OPT,${BUILD_PLAT}/tb_fw.crt,--tb-fw-cert))
+ifeq (${BL2_AT_EL3}, 0)
 $(eval $(call FIP_ADD_PAYLOAD,${BUILD_PLAT}/tb_fw.crt,--tb-fw-cert))
+endif
 
 # Add the SCP_BL2 CoT (key cert + img cert + image)
 ifneq (${SCP_BL2},)
diff --git a/plat/arm/board/fvp/fvp_bl2_el3_setup.c b/plat/arm/board/fvp/fvp_bl2_el3_setup.c
new file mode 100644
index 0000000..69f2f7a
--- /dev/null
+++ b/plat/arm/board/fvp/fvp_bl2_el3_setup.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <plat_arm.h>
+#include "fvp_private.h"
+
+void bl2_el3_early_platform_setup(u_register_t arg0 __unused,
+				  u_register_t arg1 __unused,
+				  u_register_t arg2 __unused,
+				  u_register_t arg3 __unused)
+{
+	arm_bl2_el3_early_platform_setup();
+
+	/* Initialize the platform config for future decision making */
+	fvp_config_setup();
+
+	/*
+	 * Initialize Interconnect for this cluster during cold boot.
+	 * No need for locks as no other CPU is active.
+	 */
+	fvp_interconnect_init();
+	/*
+	 * Enable coherency in Interconnect for the primary CPU's cluster.
+	 */
+	fvp_interconnect_enable();
+}
diff --git a/plat/arm/board/fvp/platform.mk b/plat/arm/board/fvp/platform.mk
index 9d3c5f6..a257784 100644
--- a/plat/arm/board/fvp/platform.mk
+++ b/plat/arm/board/fvp/platform.mk
@@ -126,6 +126,13 @@
 				plat/arm/board/fvp/fvp_trusted_boot.c		\
 				${FVP_SECURITY_SOURCES}
 
+ifeq (${BL2_AT_EL3},1)
+BL2_SOURCES		+=	plat/arm/board/fvp/${ARCH}/fvp_helpers.S	\
+				plat/arm/board/fvp/fvp_bl2_el3_setup.c		\
+				${FVP_CPU_LIBS}					\
+				${FVP_INTERCONNECT_SOURCES}
+endif
+
 ifeq (${FVP_USE_SP804_TIMER},1)
 BL2_SOURCES		+=	drivers/arm/sp804/sp804_delay_timer.c
 endif
@@ -165,5 +172,9 @@
 # Add support for platform supplied linker script for BL31 build
 $(eval $(call add_define,PLAT_EXTRA_LD_SCRIPT))
 
+ifneq (${BL2_AT_EL3}, 0)
+    override BL1_SOURCES =
+endif
+
 include plat/arm/board/common/board_common.mk
 include plat/arm/common/arm_common.mk
diff --git a/plat/arm/common/arm_bl2_el3_setup.c b/plat/arm/common/arm_bl2_el3_setup.c
new file mode 100644
index 0000000..e70d115
--- /dev/null
+++ b/plat/arm/common/arm_bl2_el3_setup.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+#include <console.h>
+#include <generic_delay_timer.h>
+#include <plat_arm.h>
+#include <platform.h>
+
+#pragma weak bl2_el3_early_platform_setup
+#pragma weak bl2_el3_plat_arch_setup
+#pragma weak bl2_el3_plat_prepare_exit
+
+static meminfo_t bl2_el3_tzram_layout;
+
+/*
+ * Perform arm specific early platform setup. At this moment we only initialize
+ * the console and the memory layout.
+ */
+void arm_bl2_el3_early_platform_setup(void)
+{
+	/* Initialize the console to provide early debug support */
+	console_init(PLAT_ARM_BOOT_UART_BASE, PLAT_ARM_BOOT_UART_CLK_IN_HZ,
+			ARM_CONSOLE_BAUDRATE);
+
+	/*
+	 * Allow BL2 to see the whole Trusted RAM. This is determined
+	 * statically since we cannot rely on BL1 passing this information
+	 * in the BL2_AT_EL3 case.
+	 */
+	bl2_el3_tzram_layout.total_base = ARM_BL_RAM_BASE;
+	bl2_el3_tzram_layout.total_size = ARM_BL_RAM_SIZE;
+
+	/* Initialise the IO layer and register platform IO devices */
+	plat_arm_io_setup();
+}
+
+void bl2_el3_early_platform_setup(u_register_t arg0 __unused,
+				  u_register_t arg1 __unused,
+				  u_register_t arg2 __unused,
+				  u_register_t arg3 __unused)
+{
+	arm_bl2_el3_early_platform_setup();
+
+	/*
+	 * Initialize Interconnect for this cluster during cold boot.
+	 * No need for locks as no other CPU is active.
+	 */
+	plat_arm_interconnect_init();
+	/*
+	 * Enable Interconnect coherency for the primary CPU's cluster.
+	 */
+	plat_arm_interconnect_enter_coherency();
+
+	generic_delay_timer_init();
+}
+
+/*******************************************************************************
+ * Perform the very early platform specific architectural setup here. At the
+ * moment this is only initializes the mmu in a quick and dirty way.
+ ******************************************************************************/
+void arm_bl2_el3_plat_arch_setup(void)
+{
+	arm_setup_page_tables(bl2_el3_tzram_layout.total_base,
+			      bl2_el3_tzram_layout.total_size,
+			      BL_CODE_BASE,
+			      BL_CODE_END,
+			      BL_RO_DATA_BASE,
+			      BL_RO_DATA_END
+#if USE_COHERENT_MEM
+			      , BL_COHERENT_RAM_BASE,
+			      BL_COHERENT_RAM_END
+#endif
+			      );
+
+#ifdef AARCH32
+	enable_mmu_secure(0);
+#else
+	enable_mmu_el3(0);
+#endif
+}
+
+void bl2_el3_plat_arch_setup(void)
+{
+	arm_bl2_el3_plat_arch_setup();
+}
+
+void bl2_el3_plat_prepare_exit(void)
+{
+}
diff --git a/plat/arm/common/arm_common.mk b/plat/arm/common/arm_common.mk
index fab57f1..e6ce18a 100644
--- a/plat/arm/common/arm_common.mk
+++ b/plat/arm/common/arm_common.mk
@@ -150,6 +150,11 @@
 				drivers/io/io_storage.c				\
 				plat/arm/common/arm_bl2_setup.c			\
 				plat/arm/common/arm_io_storage.c
+
+ifeq (${BL2_AT_EL3},1)
+BL2_SOURCES		+=	plat/arm/common/arm_bl2_el3_setup.c
+endif
+
 ifeq (${LOAD_IMAGE_V2},1)
 # Because BL1/BL2 execute in AArch64 mode but BL32 in AArch32 we need to use
 # the AArch32 descriptors.
diff --git a/plat/common/plat_bl2_el3_common.c b/plat/common/plat_bl2_el3_common.c
new file mode 100644
index 0000000..358a02d
--- /dev/null
+++ b/plat/common/plat_bl2_el3_common.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2017, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <arch_helpers.h>
+#include <assert.h>
+#include <bl_common.h>
+#include <debug.h>
+#include <errno.h>
+#include <platform_def.h>
+
+/*
+ * The following platform functions are weakly defined. They
+ * are default implementations that allow BL2 to compile in
+ * absence of real definitions. The Platforms may override
+ * with more complex definitions.
+ */
+#pragma weak bl2_el3_plat_prepare_exit
+
+void bl2_el3_plat_prepare_exit(void)
+{
+}
diff --git a/plat/hisilicon/hikey960/aarch64/hikey960_helpers.S b/plat/hisilicon/hikey960/aarch64/hikey960_helpers.S
index c88f68e..d18399f 100644
--- a/plat/hisilicon/hikey960/aarch64/hikey960_helpers.S
+++ b/plat/hisilicon/hikey960/aarch64/hikey960_helpers.S
@@ -16,8 +16,6 @@
 	.globl	plat_crash_console_putc
 	.globl	plat_report_exception
 	.globl	plat_reset_handler
-	.globl	set_retention_ticks
-	.globl	clr_retention_ticks
 	.globl	clr_ex
 	.globl	nop
 
@@ -139,35 +137,6 @@
 endfunc plat_reset_handler
 
 	/* -----------------------------------------------------
-	 * void set_retention_ticks(unsigned int val);
-	 * Clobber list : x0
-	 * -----------------------------------------------------
-	 */
-func set_retention_ticks
-	mrs	x0, CORTEX_A53_ECTLR_EL1
-	bic	x0, x0, #CORTEX_A53_ECTLR_CPU_RET_CTRL_MASK
-	orr	x0, x0, #RETENTION_ENTRY_TICKS_8
-	msr	CORTEX_A53_ECTLR_EL1, x0
-	isb
-	dsb	sy
-	ret
-endfunc set_retention_ticks
-
-	/* -----------------------------------------------------
-	 * void clr_retention_ticks(unsigned int val);
-	 * Clobber list : x0
-	 * -----------------------------------------------------
-	 */
-func clr_retention_ticks
-	mrs	x0, CORTEX_A53_ECTLR_EL1
-	bic	x0, x0, #CORTEX_A53_ECTLR_CPU_RET_CTRL_MASK
-	msr	CORTEX_A53_ECTLR_EL1, x0
-	isb
-	dsb	sy
-	ret
-endfunc clr_retention_ticks
-
-	/* -----------------------------------------------------
 	 * void clrex(void);
 	 * -----------------------------------------------------
 	 */
diff --git a/plat/hisilicon/hikey960/hikey960_pm.c b/plat/hisilicon/hikey960/hikey960_pm.c
index 078f0d8..6609530 100644
--- a/plat/hisilicon/hikey960/hikey960_pm.c
+++ b/plat/hisilicon/hikey960/hikey960_pm.c
@@ -26,38 +26,6 @@
 #define SYSTEM_PWR_STATE(state) \
 	((state)->pwr_domain_state[PLAT_MAX_PWR_LVL])
 
-#define PSTATE_WIDTH		4
-#define PSTATE_MASK		((1 << PSTATE_WIDTH) - 1)
-
-#define MAKE_PWRSTATE(lvl2_state, lvl1_state, lvl0_state, pwr_lvl, type) \
-		(((lvl2_state) << (PSTATE_ID_SHIFT + PSTATE_WIDTH * 2)) | \
-		 ((lvl1_state) << (PSTATE_ID_SHIFT + PSTATE_WIDTH)) | \
-		 ((lvl0_state) << (PSTATE_ID_SHIFT)) | \
-		 ((pwr_lvl) << PSTATE_PWR_LVL_SHIFT) | \
-		 ((type) << PSTATE_TYPE_SHIFT))
-
-/*
- * The table storing the valid idle power states. Ensure that the
- * array entries are populated in ascending order of state-id to
- * enable us to use binary search during power state validation.
- * The table must be terminated by a NULL entry.
- */
-const unsigned int hikey960_pwr_idle_states[] = {
-	/* State-id - 0x001 */
-	MAKE_PWRSTATE(PLAT_MAX_RUN_STATE, PLAT_MAX_RUN_STATE,
-		      PLAT_MAX_STB_STATE, MPIDR_AFFLVL0, PSTATE_TYPE_STANDBY),
-	/* State-id - 0x002 */
-	MAKE_PWRSTATE(PLAT_MAX_RUN_STATE, PLAT_MAX_RUN_STATE,
-		      PLAT_MAX_RET_STATE, MPIDR_AFFLVL0, PSTATE_TYPE_STANDBY),
-	/* State-id - 0x003 */
-	MAKE_PWRSTATE(PLAT_MAX_RUN_STATE, PLAT_MAX_RUN_STATE,
-		      PLAT_MAX_OFF_STATE, MPIDR_AFFLVL0, PSTATE_TYPE_POWERDOWN),
-	/* State-id - 0x033 */
-	MAKE_PWRSTATE(PLAT_MAX_RUN_STATE, PLAT_MAX_OFF_STATE,
-		      PLAT_MAX_OFF_STATE, MPIDR_AFFLVL1, PSTATE_TYPE_POWERDOWN),
-	0,
-};
-
 #define DMAC_GLB_REG_SEC	0x694
 #define AXI_CONF_BASE		0x820
 
@@ -66,24 +34,17 @@
 static void hikey960_pwr_domain_standby(plat_local_state_t cpu_state)
 {
 	unsigned long scr;
-	unsigned int val = 0;
-
-	assert(cpu_state == PLAT_MAX_STB_STATE ||
-	       cpu_state == PLAT_MAX_RET_STATE);
 
 	scr = read_scr_el3();
 
-	/* Enable Physical IRQ and FIQ to wake the CPU*/
+	/* Enable Physical IRQ and FIQ to wake the CPU */
 	write_scr_el3(scr | SCR_IRQ_BIT | SCR_FIQ_BIT);
 
-	if (cpu_state == PLAT_MAX_RET_STATE)
-		set_retention_ticks(val);
-
+	/* Add barrier before CPU enter WFI state */
+	isb();
+	dsb();
 	wfi();
 
-	if (cpu_state == PLAT_MAX_RET_STATE)
-		clr_retention_ticks(val);
-
 	/*
 	 * Restore SCR to the original value, synchronisazion of
 	 * scr_el3 is done by eret while el3_exit to save some
@@ -161,34 +122,38 @@
 int hikey960_validate_power_state(unsigned int power_state,
 			       psci_power_state_t *req_state)
 {
-	unsigned int state_id;
+	unsigned int pstate = psci_get_pstate_type(power_state);
+	unsigned int pwr_lvl = psci_get_pstate_pwrlvl(power_state);
 	int i;
 
 	assert(req_state);
 
-	/*
-	 *  Currently we are using a linear search for finding the matching
-	 *  entry in the idle power state array. This can be made a binary
-	 *  search if the number of entries justify the additional complexity.
-	 */
-	for (i = 0; !!hikey960_pwr_idle_states[i]; i++) {
-		if (power_state == hikey960_pwr_idle_states[i])
-			break;
-	}
-
-	/* Return error if entry not found in the idle state array */
-	if (!hikey960_pwr_idle_states[i])
+	if (pwr_lvl > PLAT_MAX_PWR_LVL)
 		return PSCI_E_INVALID_PARAMS;
 
-	i = 0;
-	state_id = psci_get_pstate_id(power_state);
+	/* Sanity check the requested state */
+	if (pstate == PSTATE_TYPE_STANDBY) {
+		/*
+		 * It's possible to enter standby only on power level 0
+		 * Ignore any other power level.
+		 */
+		if (pwr_lvl != MPIDR_AFFLVL0)
+			return PSCI_E_INVALID_PARAMS;
 
-	/* Parse the State ID and populate the state info parameter */
-	while (state_id) {
-		req_state->pwr_domain_state[i++] = state_id & PSTATE_MASK;
-		state_id >>= PSTATE_WIDTH;
+		req_state->pwr_domain_state[MPIDR_AFFLVL0] =
+					PLAT_MAX_RET_STATE;
+	} else {
+		for (i = MPIDR_AFFLVL0; i <= pwr_lvl; i++)
+			req_state->pwr_domain_state[i] =
+					PLAT_MAX_OFF_STATE;
 	}
 
+	/*
+	 * We expect the 'state id' to be zero.
+	 */
+	if (psci_get_pstate_id(power_state))
+		return PSCI_E_INVALID_PARAMS;
+
 	return PSCI_E_SUCCESS;
 }
 
diff --git a/plat/hisilicon/hikey960/include/platform_def.h b/plat/hisilicon/hikey960/include/platform_def.h
index 2ac7f2a..cb76090 100644
--- a/plat/hisilicon/hikey960/include/platform_def.h
+++ b/plat/hisilicon/hikey960/include/platform_def.h
@@ -31,10 +31,8 @@
 #define PLAT_NUM_PWR_DOMAINS		(PLATFORM_CORE_COUNT + \
 					 PLATFORM_CLUSTER_COUNT + 1)
 
-#define PLAT_MAX_RUN_STATE		0
-#define PLAT_MAX_STB_STATE		1
-#define PLAT_MAX_RET_STATE		2
-#define PLAT_MAX_OFF_STATE		3
+#define PLAT_MAX_RET_STATE		1
+#define PLAT_MAX_OFF_STATE		2
 
 #define MAX_IO_DEVICES			3
 #define MAX_IO_HANDLES			4