riscv: add support for multi-hart systems

On RISC-V, all harts boot independently. To be able to run on a
multi-hart system, U-Boot must be extended with the functionality to
manage all harts in the system. All harts entering U-Boot are registered
in the available_harts mask stored in global data. A hart lottery system
as used in the Linux kernel selects the hart U-Boot runs on. All other
harts are halted. U-Boot can delegate functions to them using
smp_call_function().

Every hart has a valid pointer to the global data structure and a 8KiB
stack by default. The stack size is set with CONFIG_STACK_SIZE_SHIFT.

Signed-off-by: Lukas Auer <lukas.auer@aisec.fraunhofer.de>
Reviewed-by: Anup Patel <anup.patel@wdc.com>
Reviewed-by: Bin Meng <bmeng.cn@gmail.com>
Tested-by: Bin Meng <bmeng.cn@gmail.com>
diff --git a/arch/riscv/cpu/start.S b/arch/riscv/cpu/start.S
index bcc0ff6..f55b8cb 100644
--- a/arch/riscv/cpu/start.S
+++ b/arch/riscv/cpu/start.S
@@ -13,6 +13,7 @@
 #include <config.h>
 #include <common.h>
 #include <elf.h>
+#include <asm/csr.h>
 #include <asm/encoding.h>
 #include <generated/asm-offsets.h>
 
@@ -45,6 +46,23 @@
 	/* mask all interrupts */
 	csrw	MODE_PREFIX(ie), zero
 
+#ifdef CONFIG_SMP
+	/* check if hart is within range */
+	/* tp: hart id */
+	li	t0, CONFIG_NR_CPUS
+	bge	tp, t0, hart_out_of_bounds_loop
+#endif
+
+#ifdef CONFIG_SMP
+	/* set xSIE bit to receive IPIs */
+#ifdef CONFIG_RISCV_MMODE
+	li	t0, MIE_MSIE
+#else
+	li	t0, SIE_SSIE
+#endif
+	csrs	MODE_PREFIX(ie), t0
+#endif
+
 /*
  * Set stackpointer in internal/ex RAM to call board_init_f
  */
@@ -56,7 +74,30 @@
 call_board_init_f_0:
 	mv	a0, sp
 	jal	board_init_f_alloc_reserve
+
+	/*
+	 * Set global data pointer here for all harts, uninitialized at this
+	 * point.
+	 */
+	mv	gp, a0
+
+	/* setup stack */
+#ifdef CONFIG_SMP
+	/* tp: hart id */
+	slli	t0, tp, CONFIG_STACK_SIZE_SHIFT
+	sub	sp, a0, t0
+#else
 	mv	sp, a0
+#endif
+
+	/*
+	 * Pick hart to initialize global data and run U-Boot. The other harts
+	 * wait for initialization to complete.
+	 */
+	la	t0, hart_lottery
+	li	s2, 1
+	amoswap.w s2, t1, 0(t0)
+	bnez	s2, wait_for_gd_init
 
 	la	t0, prior_stage_fdt_address
 	SREG	s1, 0(t0)
@@ -66,6 +107,33 @@
 	/* save the boot hart id to global_data */
 	SREG	tp, GD_BOOT_HART(gp)
 
+	la	t0, available_harts_lock
+	fence	rw, w
+	amoswap.w zero, zero, 0(t0)
+
+wait_for_gd_init:
+	la	t0, available_harts_lock
+	li	t1, 1
+1:	amoswap.w t1, t1, 0(t0)
+	fence	r, rw
+	bnez	t1, 1b
+
+	/* register available harts in the available_harts mask */
+	li	t1, 1
+	sll	t1, t1, tp
+	LREG	t2, GD_AVAILABLE_HARTS(gp)
+	or	t2, t2, t1
+	SREG	t2, GD_AVAILABLE_HARTS(gp)
+
+	fence	rw, w
+	amoswap.w zero, zero, 0(t0)
+
+	/*
+	 * Continue on hart lottery winner, others branch to
+	 * secondary_hart_loop.
+	 */
+	bnez	s2, secondary_hart_loop
+
 	/* Enable cache */
 	jal	icache_enable
 	jal	dcache_enable
@@ -95,7 +163,14 @@
  *Set up the stack
  */
 stack_setup:
+#ifdef CONFIG_SMP
+	/* tp: hart id */
+	slli	t0, tp, CONFIG_STACK_SIZE_SHIFT
+	sub	sp, s2, t0
+#else
 	mv	sp, s2
+#endif
+
 	la	t0, _start
 	sub	t6, s4, t0		/* t6 <- relocation offset */
 	beq	t0, s4, clear_bss	/* skip relocation */
@@ -175,13 +250,30 @@
 	add	t0, t0, t6		/* t0 <- rel __bss_start in RAM */
 	la	t1, __bss_end		/* t1 <- rel __bss_end in FLASH */
 	add	t1, t1, t6		/* t1 <- rel __bss_end in RAM */
-	beq	t0, t1, call_board_init_r
+	beq	t0, t1, relocate_secondary_harts
 
 clbss_l:
 	SREG	zero, 0(t0)		/* clear loop... */
 	addi	t0, t0, REGBYTES
 	bne	t0, t1, clbss_l
 
+relocate_secondary_harts:
+#ifdef CONFIG_SMP
+	/* send relocation IPI */
+	la	t0, secondary_hart_relocate
+	add	a0, t0, t6
+
+	/* store relocation offset */
+	mv	s5, t6
+
+	mv	a1, s2
+	mv	a2, s3
+	jal	smp_call_function
+
+	/* restore relocation offset */
+	mv	t6, s5
+#endif
+
 /*
  * We are done. Do not return, instead branch to second part of board
  * initialization, now running from RAM.
@@ -202,3 +294,43 @@
  * jump to it ...
  */
 	jr	t4			/* jump to board_init_r() */
+
+#ifdef CONFIG_SMP
+hart_out_of_bounds_loop:
+	/* Harts in this loop are out of bounds, increase CONFIG_NR_CPUS. */
+	wfi
+	j	hart_out_of_bounds_loop
+#endif
+
+#ifdef CONFIG_SMP
+/* SMP relocation entry */
+secondary_hart_relocate:
+	/* a1: new sp */
+	/* a2: new gd */
+	/* tp: hart id */
+
+	/* setup stack */
+	slli	t0, tp, CONFIG_STACK_SIZE_SHIFT
+	sub	sp, a1, t0
+
+	/* update global data pointer */
+	mv	gp, a2
+#endif
+
+secondary_hart_loop:
+	wfi
+
+#ifdef CONFIG_SMP
+	csrr	t0, MODE_PREFIX(ip)
+#ifdef CONFIG_RISCV_MMODE
+	andi	t0, t0, MIE_MSIE
+#else
+	andi	t0, t0, SIE_SSIE
+#endif
+	beqz	t0, secondary_hart_loop
+
+	mv	a0, tp
+	jal	handle_ipi
+#endif
+
+	j	secondary_hart_loop