PIE: Position Independant Executable support for BL31

This patch introduces Position Independant Executable(PIE) support
in TF-A. As a initial prototype, only BL31 can support PIE. A trivial
dynamic linker is implemented which supports fixing up Global Offset
Table(GOT) and Dynamic relocations(.rela.dyn). The fixup_gdt_reloc()
helper function implements this linker and this needs to be called
early in the boot sequence prior to invoking C functions. The GOT is
placed in the RO section of BL31 binary for improved security and the
BL31 linker script is modified to export the appropriate symbols
required for the dynamic linker.

The C compiler always generates PC relative addresses to linker symbols
and hence referencing symbols exporting constants are a problem when
relocating the binary. Hence the reference to the
`__PERCPU_TIMESTAMP_SIZE__` symbol in PMF is removed and is now calculated
at runtime based on start and end addresses.

Change-Id: I1228583ff92cf432963b7cef052e95d995cca93d
Signed-off-by: Soby Mathew <soby.mathew@arm.com>
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index 3a45e53..7c116a2 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -7,6 +7,7 @@
 #include <arch.h>
 #include <bl_common.h>
 #include <el3_common_macros.S>
+#include <platform_def.h>
 #include <pmf_asm_macros.S>
 #include <runtime_instr.h>
 #include <xlat_mmu_helpers.h>
@@ -73,6 +74,18 @@
 	mov	x22, 0
 	mov	x23, 0
 #endif /* RESET_TO_BL31 */
+
+	/* --------------------------------------------------------------------
+	 * If PIE is enabled, fixup the Global descriptor Table and dynamic
+	 * relocations
+	 * --------------------------------------------------------------------
+	 */
+#if ENABLE_PIE
+	mov_imm	x0, BL31_BASE
+	mov_imm	x1, BL31_LIMIT
+	bl	fixup_gdt_reloc
+#endif /* ENABLE_PIE */
+
 	/* ---------------------------------------------
 	 * Perform platform specific early arch. setup
 	 * ---------------------------------------------
diff --git a/bl31/bl31.ld.S b/bl31/bl31.ld.S
index 81e7ba3..43d0ed4 100644
--- a/bl31/bl31.ld.S
+++ b/bl31/bl31.ld.S
@@ -26,6 +26,8 @@
     ASSERT(. == ALIGN(PAGE_SIZE),
            "BL31_BASE address is not aligned on a page boundary.")
 
+    __BL31_START__ = .;
+
 #if SEPARATE_CODE_AND_RODATA
     .text . : {
         __TEXT_START__ = .;
@@ -63,6 +65,16 @@
         KEEP(*(cpu_ops))
         __CPU_OPS_END__ = .;
 
+        /*
+         * Keep the .got section in the RO section as the it is patched
+         * prior to enabling the MMU and having the .got in RO is better for
+         * security.
+         */
+        . = ALIGN(16);
+        __GOT_START__ = .;
+        *(.got)
+        __GOT_END__ = .;
+
         /* Place pubsub sections for events */
         . = ALIGN(8);
 #include <pubsub_events.h>
@@ -153,6 +165,16 @@
         __DATA_END__ = .;
     } >RAM
 
+    . = ALIGN(16);
+    /*
+     * .rela.dyn needs to come after .data for the read-elf utility to parse
+     * this section correctly.
+     */
+    __RELA_START__ = .;
+    .rela.dyn . : {
+    } >RAM
+    __RELA_END__ = .;
+
 #ifdef BL31_PROGBITS_LIMIT
     ASSERT(. <= BL31_PROGBITS_LIMIT, "BL31 progbits has exceeded its limit.")
 #endif
@@ -265,11 +287,5 @@
     __RW_END__ = .;
     __BL31_END__ = .;
 
-    __BSS_SIZE__ = SIZEOF(.bss);
-#if USE_COHERENT_MEM
-    __COHERENT_RAM_UNALIGNED_SIZE__ =
-        __COHERENT_RAM_END_UNALIGNED__ - __COHERENT_RAM_START__;
-#endif
-
     ASSERT(. <= BL31_LIMIT, "BL31 image has exceeded its limit.")
 }
diff --git a/include/common/bl_common.h b/include/common/bl_common.h
index 2ecf281..6a79dc3 100644
--- a/include/common/bl_common.h
+++ b/include/common/bl_common.h
@@ -83,6 +83,7 @@
 #elif defined(IMAGE_BL2U)
 IMPORT_SYM(unsigned long, __BL2U_END__,		BL2U_END);
 #elif defined(IMAGE_BL31)
+IMPORT_SYM(unsigned long, __BL31_START__,	BL31_START);
 IMPORT_SYM(unsigned long, __BL31_END__,		BL31_END);
 #elif defined(IMAGE_BL32)
 IMPORT_SYM(unsigned long, __BL32_END__,		BL32_END);
diff --git a/lib/aarch64/misc_helpers.S b/lib/aarch64/misc_helpers.S
index 1a075aa..002942e 100644
--- a/lib/aarch64/misc_helpers.S
+++ b/lib/aarch64/misc_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2018, ARM Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -7,6 +7,7 @@
 #include <arch.h>
 #include <asm_macros.S>
 #include <assert_macros.S>
+#include <xlat_tables_defs.h>
 
 	.globl	get_afflvl_shift
 	.globl	mpidr_mask_lower_afflvls
@@ -23,6 +24,8 @@
 	.globl	disable_mmu_icache_el1
 	.globl	disable_mmu_icache_el3
 
+	.globl	fixup_gdt_reloc
+
 #if SUPPORT_VFP
 	.globl	enable_vfp
 #endif
@@ -497,3 +500,114 @@
 	ret
 endfunc enable_vfp
 #endif
+
+/* ---------------------------------------------------------------------------
+ * Helper to fixup Global Descriptor table (GDT) and dynamic relocations
+ * (.rela.dyn) at runtime.
+ *
+ * This function is meant to be used when the firmware is compiled with -fpie
+ * and linked with -pie options. We rely on the linker script exporting
+ * appropriate markers for start and end of the section. For GOT, we
+ * expect __GOT_START__ and __GOT_END__. Similarly for .rela.dyn, we expect
+ * __RELA_START__ and __RELA_END__.
+ *
+ * The function takes the limits of the memory to apply fixups to as
+ * arguments (which is usually the limits of the relocable BL image).
+ *   x0 -  the start of the fixup region
+ *   x1 -  the limit of the fixup region
+ * These addresses have to be page (4KB aligned).
+ * ---------------------------------------------------------------------------
+ */
+func fixup_gdt_reloc
+	mov	x6, x0
+	mov	x7, x1
+
+	/* Test if the limits are 4K aligned */
+#if ENABLE_ASSERTIONS
+	orr	x0, x0, x1
+	tst	x0, #(PAGE_SIZE - 1)
+	ASM_ASSERT(eq)
+#endif
+	/*
+	 * Calculate the offset based on return address in x30.
+	 * Assume that this funtion is called within a page of the start of
+	 * of fixup region.
+	 */
+	and	x2, x30, #~(PAGE_SIZE - 1)
+	sub	x0, x2, x6	/* Diff(S) = Current Address - Compiled Address */
+
+	adrp	x1, __GOT_START__
+	add	x1, x1, :lo12:__GOT_START__
+	adrp	x2, __GOT_END__
+	add	x2, x2, :lo12:__GOT_END__
+
+	/*
+	 * GOT is an array of 64_bit addresses which must be fixed up as
+	 * new_addr = old_addr + Diff(S).
+	 * The new_addr is the address currently the binary is executing from
+	 * and old_addr is the address at compile time.
+	 */
+1:
+	ldr	x3, [x1]
+	/* Skip adding offset if address is < lower limit */
+	cmp	x3, x6
+	b.lo	2f
+	/* Skip adding offset if address is >= upper limit */
+	cmp	x3, x7
+	b.ge	2f
+	add	x3, x3, x0
+	str	x3, [x1]
+2:
+	add	x1, x1, #8
+	cmp	x1, x2
+	b.lo	1b
+
+	/* Starting dynamic relocations. Use adrp/adr to get RELA_START and END */
+	adrp	x1, __RELA_START__
+	add	x1, x1, :lo12:__RELA_START__
+	adrp	x2, __RELA_END__
+	add	x2, x2, :lo12:__RELA_END__
+	/*
+	 * According to ELF-64 specification, the RELA data structure is as
+	 * follows:
+	 *	typedef struct
+	 * 	{
+	 *		Elf64_Addr r_offset;
+	 *		Elf64_Xword r_info;
+	 *		Elf64_Sxword r_addend;
+	 *	} Elf64_Rela;
+	 *
+	 * r_offset is address of reference
+	 * r_info is symbol index and type of relocation (in this case
+	 * 0x403 which corresponds to R_AARCH64_RELATIV).
+	 * r_addend is constant part of expression.
+	 *
+	 * Size of Elf64_Rela structure is 24 bytes.
+	 */
+1:
+	/* Assert that the relocation type is R_AARCH64_RELATIV */
+#if ENABLE_ASSERTIONS
+	ldr	x3, [x1, #8]
+	cmp	x3, #0x403
+	ASM_ASSERT(eq)
+#endif
+	ldr	x3, [x1]	/* r_offset */
+	add	x3, x0, x3
+	ldr	x4, [x1, #16]	/* r_addend */
+
+	/* Skip adding offset if r_addend is < lower limit */
+	cmp	x4, x6
+	b.lo	2f
+	/* Skip adding offset if r_addend entry is >= upper limit */
+	cmp	x4, x7
+	b.ge	2f
+
+	add	x4, x0, x4	/* Diff(S) + r_addend */
+	str	x4, [x3]
+
+2:	add	x1, x1, #24
+	cmp	x1, x2
+	b.lo	1b
+
+	ret
+endfunc fixup_gdt_reloc
diff --git a/lib/pmf/pmf_main.c b/lib/pmf/pmf_main.c
index a020860..25513c1 100644
--- a/lib/pmf/pmf_main.c
+++ b/lib/pmf/pmf_main.c
@@ -25,9 +25,10 @@
 
 IMPORT_SYM(uintptr_t, __PMF_SVC_DESCS_START__,		PMF_SVC_DESCS_START);
 IMPORT_SYM(uintptr_t, __PMF_SVC_DESCS_END__,		PMF_SVC_DESCS_END);
-IMPORT_SYM(uintptr_t, __PERCPU_TIMESTAMP_SIZE__,	PMF_PERCPU_TIMESTAMP_SIZE);
+IMPORT_SYM(uintptr_t, __PMF_PERCPU_TIMESTAMP_END__,	PMF_PERCPU_TIMESTAMP_END);
 IMPORT_SYM(intptr_t,  __PMF_TIMESTAMP_START__,		PMF_TIMESTAMP_ARRAY_START);
-IMPORT_SYM(uintptr_t, __PMF_TIMESTAMP_END__,		PMF_TIMESTAMP_ARRAY_END);
+
+#define PMF_PERCPU_TIMESTAMP_SIZE	(PMF_PERCPU_TIMESTAMP_END - PMF_TIMESTAMP_ARRAY_START)
 
 #define PMF_SVC_DESCS_MAX		10