armv8: Reduce exception handling code size

The arm64 exception handling code is quite big, mostly due to
architectural alignment requirements. Each exception entry spans 32
instructions, which sounds generous, but is too small to fit all of the
save/branch/restore code in there. So at the moment we use only four
instructions, branching into shared save and restore routines.
To avoid wasting the space of the remaining 28 instruction slots per entry,
split the save and restore routines in half and place them into those gaps.
This saves about 250 bytes of code, which is helpful for those tight
SPLs.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
diff --git a/arch/arm/cpu/armv8/exceptions.S b/arch/arm/cpu/armv8/exceptions.S
index 1a78a5d..a15af72 100644
--- a/arch/arm/cpu/armv8/exceptions.S
+++ b/arch/arm/cpu/armv8/exceptions.S
@@ -11,7 +11,26 @@
 #include <linux/linkage.h>
 
 /*
- * Exception vectors.
+ * AArch64 exception vectors:
+ * We have four types of exceptions:
+ * - synchronous: traps, data aborts, undefined instructions, ...
+ * - IRQ: group 1 (normal) interrupts
+ * - FIQ: group 0 or secure interrupts
+ * - SError: fatal system errors
+ * There are entries for all four of those for different contexts:
+ * - from same exception level, when using the SP_EL0 stack pointer
+ * - from same exception level, when using the SP_ELx stack pointer
+ * - from lower exception level, when this is AArch64
+ * - from lower exception level, when this is AArch32
+ * Each of those 16 entries has space for 32 instructions; each entry must
+ * be 128-byte aligned, and the whole table must be 2K aligned.
+ * The 32 instructions are not enough to save and restore all registers and
+ * to branch to the actual handler, so we split this up:
+ * Each entry saves the LR, branches to the save routine, then to the actual
+ * handler, then to the restore routine. The save and restore routines are
+ * each split in half and stuffed in the unused gap between the entries.
+ * Also as we do not run anything in a lower exception level, we just provide
+ * the first 8 entries for exceptions from the same EL.
  */
 	.align	11
 	.globl	vectors
@@ -22,52 +41,9 @@
 	bl	do_bad_sync
 	b	exception_exit
 
-	.align	7		/* Current EL IRQ Thread */
-	stp	x29, x30, [sp, #-16]!
-	bl	_exception_entry
-	bl	do_bad_irq
-	b	exception_exit
-
-	.align	7		/* Current EL FIQ Thread */
-	stp	x29, x30, [sp, #-16]!
-	bl	_exception_entry
-	bl	do_bad_fiq
-	b	exception_exit
-
-	.align	7		/* Current EL Error Thread */
-	stp	x29, x30, [sp, #-16]!
-	bl	_exception_entry
-	bl	do_bad_error
-	b	exception_exit
-
-	.align	7		 /* Current EL Synchronous Handler */
-	stp	x29, x30, [sp, #-16]!
-	bl	_exception_entry
-	bl	do_sync
-	b	exception_exit
-
-	.align	7		 /* Current EL IRQ Handler */
-	stp	x29, x30, [sp, #-16]!
-	bl	_exception_entry
-	bl	do_irq
-	b	exception_exit
-
-	.align	7		 /* Current EL FIQ Handler */
-	stp	x29, x30, [sp, #-16]!
-	bl	_exception_entry
-	bl	do_fiq
-	b	exception_exit
-
-	.align	7		 /* Current EL Error Handler */
-	stp	x29, x30, [sp, #-16]!
-	bl	_exception_entry
-	bl	do_error
-	b	exception_exit
-
 /*
- * Enter Exception.
- * This will save the processor state that is ELR/X0~X30
- * to the stack frame.
+ * Save (most of) the GP registers to the stack frame.
+ * This is the first part of the shared routine called into from all entries.
  */
 _exception_entry:
 	stp	x27, x28, [sp, #-16]!
@@ -84,7 +60,19 @@
 	stp	x5, x6, [sp, #-16]!
 	stp	x3, x4, [sp, #-16]!
 	stp	x1, x2, [sp, #-16]!
+	b	_save_el_regs			/* jump to the second part */
 
+	.align	7		/* Current EL IRQ Thread */
+	stp	x29, x30, [sp, #-16]!
+	bl	_exception_entry
+	bl	do_bad_irq
+	b	exception_exit
+
+/*
+ * Save exception specific context: ESR and ELR, for all exception levels.
+ * This is the second part of the shared routine called into from all entries.
+ */
+_save_el_regs:
 	/* Could be running at EL3/EL2/EL1 */
 	switch_el x11, 3f, 2f, 1f
 3:	mrs	x1, esr_el3
@@ -100,16 +88,36 @@
 	mov	x0, sp
 	ret
 
-
+	.align	7		/* Current EL FIQ Thread */
+	stp	x29, x30, [sp, #-16]!
+	bl	_exception_entry
+	bl	do_bad_fiq
+				/* falling through to exception_exit */
+/*
+ * Restore the exception return address, for all exception levels.
+ * This is the first part of the shared routine called into from all entries.
+ */
 exception_exit:
 	ldp	x2, x0, [sp],#16
 	switch_el x11, 3f, 2f, 1f
 3:	msr	elr_el3, x2
-	b	0f
+	b	_restore_regs
 2:	msr	elr_el2, x2
-	b	0f
+	b	_restore_regs
 1:	msr	elr_el1, x2
-0:
+	b	_restore_regs		/* jump to the second part */
+
+	.align	7		/* Current EL Error Thread */
+	stp	x29, x30, [sp, #-16]!
+	bl	_exception_entry
+	bl	do_bad_error
+	b	exception_exit
+
+/*
+ * Restore the general purpose registers from the exception stack, then return.
+ * This is the second part of the shared routine called into from all entries.
+ */
+_restore_regs:
 	ldp	x1, x2, [sp],#16
 	ldp	x3, x4, [sp],#16
 	ldp	x5, x6, [sp],#16
@@ -126,3 +134,27 @@
 	ldp	x27, x28, [sp],#16
 	ldp	x29, x30, [sp],#16
 	eret
+
+	.align	7		 /* Current EL (SP_ELx) Synchronous Handler */
+	stp	x29, x30, [sp, #-16]!
+	bl	_exception_entry
+	bl	do_sync
+	b	exception_exit
+
+	.align	7		 /* Current EL (SP_ELx) IRQ Handler */
+	stp	x29, x30, [sp, #-16]!
+	bl	_exception_entry
+	bl	do_irq
+	b	exception_exit
+
+	.align	7		 /* Current EL (SP_ELx) FIQ Handler */
+	stp	x29, x30, [sp, #-16]!
+	bl	_exception_entry
+	bl	do_fiq
+	b	exception_exit
+
+	.align	7		 /* Current EL (SP_ELx) Error Handler */
+	stp	x29, x30, [sp, #-16]!
+	bl	_exception_entry
+	bl	do_error
+	b	exception_exit