feat(simd): add routines to save/restore SVE state

This adds assembly routines to save and restore the SVE registers. To
share the FPCR and FPSR save/restore code between the FPU and SVE
paths, the patch converts that code into macros. Since simd_ctx_t will
also be used to save and restore the FPU state, the macros use
simd_ctx_t offsets for FPSR and FPCR. Because simd_ctx_t begins with
the same layout as fp_regs_t, those offsets should match the CTX_FP_*
offsets when SVE is not enabled. Note that the code also saves and
restores the FPEXC32 register along with FPSR and FPCR.
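
For illustration, the offset assumption could be expressed as a
compile-time check along the following lines. This is only a sketch and
not part of this patch; it assumes the CTX_FP_* and CTX_SIMD_* offset
macros are both visible in the same translation unit:

    #include <context.h>
    #include <lib/cassert.h>

    #if CTX_INCLUDE_FPREGS && !CTX_INCLUDE_SVE_REGS
    /* With SVE disabled, simd_ctx_t starts with the same layout as fp_regs_t */
    CASSERT(CTX_SIMD_FPSR == CTX_FP_FPSR, assert_simd_fpsr_offset_mismatch);
    CASSERT(CTX_SIMD_FPCR == CTX_FP_FPCR, assert_simd_fpcr_offset_mismatch);
    #endif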

Signed-off-by: Madhukar Pappireddy <madhukar.pappireddy@arm.com>
Signed-off-by: Okash Khawaja <okash@google.com>
Change-Id: I120c02359794aa6bb6376a464a9afe98bd84ae60
diff --git a/lib/el3_runtime/aarch64/context.S b/lib/el3_runtime/aarch64/context.S
index 62895ff..5977c92 100644
--- a/lib/el3_runtime/aarch64/context.S
+++ b/lib/el3_runtime/aarch64/context.S
@@ -9,12 +9,18 @@
 #include <assert_macros.S>
 #include <context.h>
 #include <el3_common_macros.S>
+#include <platform_def.h>
 
 #if CTX_INCLUDE_FPREGS
 	.global	fpregs_context_save
 	.global	fpregs_context_restore
 #endif /* CTX_INCLUDE_FPREGS */
 
+#if CTX_INCLUDE_SVE_REGS
+	.global sve_context_save
+	.global sve_context_restore
+#endif /* CTX_INCLUDE_SVE_REGS */
+
 #if ERRATA_SPECULATIVE_AT
 	.global save_and_update_ptw_el1_sys_regs
 #endif /* ERRATA_SPECULATIVE_AT */
@@ -23,6 +29,36 @@
 	.global	restore_gp_pmcr_pauth_regs
 	.global	el3_exit
 
+/*
+ * The following macros are used when either CTX_INCLUDE_FPREGS or
+ * CTX_INCLUDE_SVE_REGS is enabled.
+ */
+#if CTX_INCLUDE_FPREGS || CTX_INCLUDE_SVE_REGS
+.macro fpregs_state_save base:req hold:req
+	mrs	\hold, fpsr
+	str	\hold, [\base, #CTX_SIMD_FPSR]
+
+	mrs	\hold, fpcr
+	str	\hold, [\base, #CTX_SIMD_FPCR]
+
+#if CTX_INCLUDE_AARCH32_REGS && CTX_INCLUDE_FPREGS
+	mrs	\hold, fpexc32_el2
+	str	\hold, [\base, #CTX_SIMD_FPEXC32]
+#endif /* CTX_INCLUDE_AARCH32_REGS && CTX_INCLUDE_FPREGS */
+.endm
+
+.macro fpregs_state_restore base:req hold:req
+	ldr	\hold, [\base, #CTX_SIMD_FPSR]
+	msr	fpsr, \hold
+
+	ldr	\hold, [\base, #CTX_SIMD_FPCR]
+	msr	fpcr, \hold
+
+#if CTX_INCLUDE_AARCH32_REGS && CTX_INCLUDE_FPREGS
+	ldr	\hold, [\base, #CTX_SIMD_FPEXC32]
+	msr	fpexc32_el2, \hold
+#endif /* CTX_INCLUDE_AARCH32_REGS && CTX_INCLUDE_FPREGS */
+.endm
+
+#endif /* CTX_INCLUDE_FPREGS || CTX_INCLUDE_SVE_REGS */
+
 /* ------------------------------------------------------------------
  * The following function follows the aapcs_64 strictly to use
  * x9-x17 (temporary caller-saved registers according to AArch64 PCS)
@@ -39,33 +75,25 @@
  */
 #if CTX_INCLUDE_FPREGS
 func fpregs_context_save
-	stp	q0, q1, [x0, #CTX_FP_Q0]
-	stp	q2, q3, [x0, #CTX_FP_Q2]
-	stp	q4, q5, [x0, #CTX_FP_Q4]
-	stp	q6, q7, [x0, #CTX_FP_Q6]
-	stp	q8, q9, [x0, #CTX_FP_Q8]
-	stp	q10, q11, [x0, #CTX_FP_Q10]
-	stp	q12, q13, [x0, #CTX_FP_Q12]
-	stp	q14, q15, [x0, #CTX_FP_Q14]
-	stp	q16, q17, [x0, #CTX_FP_Q16]
-	stp	q18, q19, [x0, #CTX_FP_Q18]
-	stp	q20, q21, [x0, #CTX_FP_Q20]
-	stp	q22, q23, [x0, #CTX_FP_Q22]
-	stp	q24, q25, [x0, #CTX_FP_Q24]
-	stp	q26, q27, [x0, #CTX_FP_Q26]
-	stp	q28, q29, [x0, #CTX_FP_Q28]
-	stp	q30, q31, [x0, #CTX_FP_Q30]
-
-	mrs	x9, fpsr
-	str	x9, [x0, #CTX_FP_FPSR]
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	stp	q8, q9, [x0], #32
+	stp	q10, q11, [x0], #32
+	stp	q12, q13, [x0], #32
+	stp	q14, q15, [x0], #32
+	stp	q16, q17, [x0], #32
+	stp	q18, q19, [x0], #32
+	stp	q20, q21, [x0], #32
+	stp	q22, q23, [x0], #32
+	stp	q24, q25, [x0], #32
+	stp	q26, q27, [x0], #32
+	stp	q28, q29, [x0], #32
+	stp	q30, q31, [x0], #32
 
-	mrs	x10, fpcr
-	str	x10, [x0, #CTX_FP_FPCR]
+	fpregs_state_save x0, x9
 
-#if CTX_INCLUDE_AARCH32_REGS
-	mrs	x11, fpexc32_el2
-	str	x11, [x0, #CTX_FP_FPEXC32_EL2]
-#endif /* CTX_INCLUDE_AARCH32_REGS */
 	ret
 endfunc fpregs_context_save
 
@@ -84,51 +112,196 @@
  * ------------------------------------------------------------------
  */
 func fpregs_context_restore
-	ldp	q0, q1, [x0, #CTX_FP_Q0]
-	ldp	q2, q3, [x0, #CTX_FP_Q2]
-	ldp	q4, q5, [x0, #CTX_FP_Q4]
-	ldp	q6, q7, [x0, #CTX_FP_Q6]
-	ldp	q8, q9, [x0, #CTX_FP_Q8]
-	ldp	q10, q11, [x0, #CTX_FP_Q10]
-	ldp	q12, q13, [x0, #CTX_FP_Q12]
-	ldp	q14, q15, [x0, #CTX_FP_Q14]
-	ldp	q16, q17, [x0, #CTX_FP_Q16]
-	ldp	q18, q19, [x0, #CTX_FP_Q18]
-	ldp	q20, q21, [x0, #CTX_FP_Q20]
-	ldp	q22, q23, [x0, #CTX_FP_Q22]
-	ldp	q24, q25, [x0, #CTX_FP_Q24]
-	ldp	q26, q27, [x0, #CTX_FP_Q26]
-	ldp	q28, q29, [x0, #CTX_FP_Q28]
-	ldp	q30, q31, [x0, #CTX_FP_Q30]
+	ldp	q0, q1, [x0], #32
+	ldp	q2, q3, [x0], #32
+	ldp	q4, q5, [x0], #32
+	ldp	q6, q7, [x0], #32
+	ldp	q8, q9, [x0], #32
+	ldp	q10, q11, [x0], #32
+	ldp	q12, q13, [x0], #32
+	ldp	q14, q15, [x0], #32
+	ldp	q16, q17, [x0], #32
+	ldp	q18, q19, [x0], #32
+	ldp	q20, q21, [x0], #32
+	ldp	q22, q23, [x0], #32
+	ldp	q24, q25, [x0], #32
+	ldp	q26, q27, [x0], #32
+	ldp	q28, q29, [x0], #32
+	ldp	q30, q31, [x0], #32
 
-	ldr	x9, [x0, #CTX_FP_FPSR]
-	msr	fpsr, x9
+	fpregs_state_restore x0, x9
 
-	ldr	x10, [x0, #CTX_FP_FPCR]
-	msr	fpcr, x10
+	ret
+endfunc fpregs_context_restore
+#endif /* CTX_INCLUDE_FPREGS */
 
-#if CTX_INCLUDE_AARCH32_REGS
-	ldr	x11, [x0, #CTX_FP_FPEXC32_EL2]
-	msr	fpexc32_el2, x11
-#endif /* CTX_INCLUDE_AARCH32_REGS */
+#if CTX_INCLUDE_SVE_REGS
+/*
+ * Helper macros for SVE predicate and vector register save/restore
+ * operations.
+ */
+.macro sve_predicate_op op:req reg:req
+	\op p0, [\reg, #0, MUL VL]
+	\op p1, [\reg, #1, MUL VL]
+	\op p2, [\reg, #2, MUL VL]
+	\op p3, [\reg, #3, MUL VL]
+	\op p4, [\reg, #4, MUL VL]
+	\op p5, [\reg, #5, MUL VL]
+	\op p6, [\reg, #6, MUL VL]
+	\op p7, [\reg, #7, MUL VL]
+	\op p8, [\reg, #8, MUL VL]
+	\op p9, [\reg, #9, MUL VL]
+	\op p10, [\reg, #10, MUL VL]
+	\op p11, [\reg, #11, MUL VL]
+	\op p12, [\reg, #12, MUL VL]
+	\op p13, [\reg, #13, MUL VL]
+	\op p14, [\reg, #14, MUL VL]
+	\op p15, [\reg, #15, MUL VL]
+.endm
 
-	/*
-	 * No explict ISB required here as ERET to
-	 * switch to secure EL1 or non-secure world
-	 * covers it
-	 */
+.macro sve_vectors_op op:req reg:req
+	\op z0, [\reg, #0, MUL VL]
+	\op z1, [\reg, #1, MUL VL]
+	\op z2, [\reg, #2, MUL VL]
+	\op z3, [\reg, #3, MUL VL]
+	\op z4, [\reg, #4, MUL VL]
+	\op z5, [\reg, #5, MUL VL]
+	\op z6, [\reg, #6, MUL VL]
+	\op z7, [\reg, #7, MUL VL]
+	\op z8, [\reg, #8, MUL VL]
+	\op z9, [\reg, #9, MUL VL]
+	\op z10, [\reg, #10, MUL VL]
+	\op z11, [\reg, #11, MUL VL]
+	\op z12, [\reg, #12, MUL VL]
+	\op z13, [\reg, #13, MUL VL]
+	\op z14, [\reg, #14, MUL VL]
+	\op z15, [\reg, #15, MUL VL]
+	\op z16, [\reg, #16, MUL VL]
+	\op z17, [\reg, #17, MUL VL]
+	\op z18, [\reg, #18, MUL VL]
+	\op z19, [\reg, #19, MUL VL]
+	\op z20, [\reg, #20, MUL VL]
+	\op z21, [\reg, #21, MUL VL]
+	\op z22, [\reg, #22, MUL VL]
+	\op z23, [\reg, #23, MUL VL]
+	\op z24, [\reg, #24, MUL VL]
+	\op z25, [\reg, #25, MUL VL]
+	\op z26, [\reg, #26, MUL VL]
+	\op z27, [\reg, #27, MUL VL]
+	\op z28, [\reg, #28, MUL VL]
+	\op z29, [\reg, #29, MUL VL]
+	\op z30, [\reg, #30, MUL VL]
+	\op z31, [\reg, #31, MUL VL]
+.endm
+
+/* ------------------------------------------------------------------
+ * The following function follows the aapcs_64 strictly to use x9-x17
+ * (temporary caller-saved registers according to AArch64 PCS) to
+ * save SVE register context. It assumes that 'x0' is
+ * pointing to a 'sve_regs_t' structure to which the register context
+ * will be saved.
+ * ------------------------------------------------------------------
+ */
+func sve_context_save
+.arch_extension sve
+	/* Temporarily enable SVE for EL3 */
+	mrs	x10, cptr_el3
+	orr	x11, x10, #CPTR_EZ_BIT
+	bic	x11, x11, #TFP_BIT
+	msr	cptr_el3, x11
+	isb
+
+	/* Save zcr_el3 and set the vector length to SVE_VECTOR_LEN */
+	mrs	x12, S3_6_C1_C2_0	/* zcr_el3 */
+	mov	x13, #((SVE_VECTOR_LEN >> 7) - 1)
+	msr	S3_6_C1_C2_0, x13
+	isb
+
+	/* Save predicate registers */
+	mov	x13, #CTX_SIMD_PREDICATES
+	add	x9, x0, x13
+	sve_predicate_op str, x9
+
+	/* Save FFR after predicates */
+	mov	x13, #CTX_SIMD_FFR
+	add	x9, x0, x13
+	rdffr	p0.b
+	str	p0, [x9]
+
+	/* Save vector registers */
+	mov	x13, #CTX_SIMD_VECTORS
+	add	x9, x0, x13
+	sve_vectors_op str, x9
+
+	/* Restore SVE enablement */
+	msr	S3_6_C1_C2_0, x12 /* zcr_el3 */
+	msr	cptr_el3, x10
+	isb
+.arch_extension nosve
+
+	/* Save FPSR, FPCR and FPEXC32 */
+	fpregs_state_save x0, x9
 
 	ret
-endfunc fpregs_context_restore
-#endif /* CTX_INCLUDE_FPREGS */
+endfunc sve_context_save
+
+/* ------------------------------------------------------------------
+ * The following function follows the aapcs_64 strictly to use x9-x17
+ * (temporary caller-saved registers according to AArch64 PCS) to
+ * restore SVE register context. It assumes that 'x0' is pointing to
+ * a 'sve_regs_t' structure from where the register context will be
+ * restored.
+ * ------------------------------------------------------------------
+ */
+func sve_context_restore
+.arch_extension sve
+	/* Temporarily enable SVE for EL3 */
+	mrs	x10, cptr_el3
+	orr	x11, x10, #CPTR_EZ_BIT
+	bic	x11, x11, #TFP_BIT
+	msr	cptr_el3, x11
+	isb
+
+	/* Save zcr_el3 and set the vector length to SVE_VECTOR_LEN */
+	mrs	x12, S3_6_C1_C2_0	/* zcr_el3 */
+	mov	x13, #((SVE_VECTOR_LEN >> 7) - 1)
+	msr	S3_6_C1_C2_0, x13
+	isb
+
+	/* Restore FFR register before predicates */
+	mov	x13, #CTX_SIMD_FFR
+	add	x9, x0, x13
+	ldr	p0, [x9]
+	wrffr	p0.b
+
+	/* Restore predicate registers */
+	mov	x13, #CTX_SIMD_PREDICATES
+	add	x9, x0, x13
+	sve_predicate_op ldr, x9
+
+	/* Restore vector registers */
+	mov	x13, #CTX_SIMD_VECTORS
+	add	x9, x0, x13
+	sve_vectors_op ldr, x9
+
+	/* Restore SVE enablement */
+	msr	S3_6_C1_C2_0, x12 /* zcr_el3 */
+	msr	cptr_el3, x10
+	isb
+.arch_extension nosve
+
+	/* Restore FPSR, FPCR and FPEXC32 */
+	fpregs_state_restore x0, x9
+	ret
+endfunc sve_context_restore
+#endif /* CTX_INCLUDE_SVE_REGS */
 
 	/*
 	 * Set SCR_EL3.EA bit to enable SErrors at EL3
 	 */
 	.macro enable_serror_at_el3
-	mrs     x8, scr_el3
-	orr     x8, x8, #SCR_EA_BIT
-	msr     scr_el3, x8
+	mrs	x8, scr_el3
+	orr	x8, x8, #SCR_EA_BIT
+	msr	scr_el3, x8
 	.endm
 
 	/*
@@ -147,8 +320,8 @@
 	and	x8, x8, #(ID_AA64PFR0_DIT_MASK << ID_AA64PFR0_DIT_SHIFT)
 	cbz	x8, 1f
 #endif
-	mov     x8, #DIT_BIT
-	msr     DIT, x8
+	mov	x8, #DIT_BIT
+	msr	DIT, x8
 1:
 #endif /* ENABLE_FEAT_DIT */
 	.endm /* set_unset_pstate_bits */