allwinner: Use the arisc to turn off ARM cores

PSCI requires a core to turn itself off, which we can't do properly by
just executing an algorithm on that very core. As a consequence we just
put a core into WFI on CPU_OFF right now.
To fix this let's task the "arisc" management processor (an OpenRISC
core) with that task of asserting reset and turning off the core's power
domain. We use a handcrafted sequence of OpenRISC instructions to
achieve this, and hand this data over to the new sunxi_execute_arisc_code()
routine.
The commented source code for this routine is provided in a separate file,
but the ATF code contains the already encoded instructions as data.
The H6 uses the same algorithm, but differs in the MMIO addresses, so
provide a SoC (family) specific copy of that code.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
diff --git a/plat/allwinner/common/arisc_off.S b/plat/allwinner/common/arisc_off.S
new file mode 100644
index 0000000..ed10832
--- /dev/null
+++ b/plat/allwinner/common/arisc_off.S
@@ -0,0 +1,115 @@
+# turn_off_core.S
+#
+# Copyright (c) 2018, Andre Przywara <osp@andrep.de>
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# OpenRISC assembly to turn off an ARM core on an Allwinner SoC from
+# the arisc management controller.
+# Generate a binary representation with:
+# $ or1k-elf-as -c -o turn_off_core.o turn_off_core.S
+# $ or1k-elf-objcopy -O binary --reverse-bytes=4 turn_off_core.o \
+#   turn_off_core.bin
+# The encoded instructions go into an array defined in
+# plat/allwinner/sun50i_*/include/core_off_arisc.h, to be handed off to
+# the arisc processor.
+#
+# This routine is meant to be called directly from arisc reset (put the
+# start address in the reset vector), to be actually triggered by that
+# very ARM core to be turned off.
+# It expects the core number presented as a mask in the upper half of
+# r3, so to be patched in the lower 16 bits of the first instruction,
+# overwriting the 0 in this code here.
+# The code will do the following:
+# - Read the C_CPU_STATUS register, which contains the status of the WFI
+#   lines of each of the four A53 cores.
+# - Loop until the core in question reaches WFI.
+# - Using that mask, activate the core output clamps by setting the
+#   respective core bit in CPUX_PWROFF_GATING_REG (0x1f01500).
+#   Note that the clamp for core 0 covers more than just the core, activating
+#   it hangs the whole system. So we skip this step for core 0.
+# - Using the negated mask, assert the core's reset line by clearing the
+#   respective bit in C_RST_CTRL (0x1f01c30).
+# - Finally turn off the core's power switch by writing 0xff to the
+#   respective CPUx_PWR_SWITCH_REG (0x1f01540 ff.)
+# - Assert the arisc's own reset to end execution.
+#   This also signals other arisc users that the chip is free again.
+# So in C this would look like:
+#	while (!(readl(0x1700030) & (1U << core_nr)))
+#		;
+#	if (core_nr != 0)
+#		writel(readl(0x1f01500) | (1U << core_nr), 0x1f01500);
+#	writel(readl(0x1f01c30) & ~(1U << core_nr), 0x1f01c30);
+#	writel(0xff, 0x1f01540 + (core_nr * 4));
+# (using A64/H5 addresses)
+
+.text
+_start:
+	l.movhi	r3, 0				# FIXUP! with core mask
+	l.movhi r0, 0				# clear r0
+	l.movhi	r13, 0x170			# r13: CPU_CFG_BASE=0x01700000
+wait_wfi:
+	l.lwz	r5, 0x30(r13)			# load C_CPU_STATUS
+	l.and	r5, r5, r3			# mask requested core
+	l.sfeq	r5, r0				# is it not yet in WFI?
+	l.bf	wait_wfi			# try again
+
+	l.srli	r6, r3, 16			# move mask to lower 16 bits
+	l.sfeqi	r6, 1				# core 0 is special
+	l.bf	1f				# don't touch the bit for core 0
+	l.movhi	r13, 0x1f0			# address of R_CPUCFG (delay)
+	l.lwz	r5, 0x1500(r13)			# core output clamps
+	l.or	r5, r5, r6			# set bit to ...
+	l.sw	0x1500(r13), r5			# ... activate for our core
+
+1:	l.lwz	r5, 0x1c30(r13)			# CPU power-on reset
+	l.xori	r6, r6, -1			# negate core mask
+	l.and	r5, r5, r6			# clear bit to ...
+	l.sw	0x1c30(r13), r5			# ... assert for our core
+
+	l.ff1	r6, r3				# get core number from high mask
+	l.addi	r6, r6, -17			# convert to 0-3
+	l.slli	r6, r6, 2			# r5: core number*4 (0-12)
+	l.add	r6, r6, r13			# add to base address
+	l.ori	r5, r0, 0xff			# 0xff means all switches off
+	l.sw	0x1540(r6), r5			# core power switch registers
+
+reset:	l.sw	0x1c00(r13),r0			# pull down our own reset line
+
+	l.j	reset				# just in case ....
+	l.nop	0x0				# (delay slot)
+
+# same as above, but with the MMIO addresses matching the H6 SoC
+_start_h6:
+	l.movhi	r3, 0				# FIXUP! with core mask
+	l.movhi r0, 0				# clear r0
+	l.movhi	r13, 0x901			# r13: CPU_CFG_BASE=0x09010000
+1:
+	l.lwz	r5, 0x80(r13)			# load C_CPU_STATUS
+	l.and	r5, r5, r3			# mask requested core
+	l.sfeq	r5, r0				# is it not yet in WFI?
+	l.bf	1b				# try again
+
+	l.srli	r6, r3, 16			# move mask to lower 16 bits(ds)
+	l.sfeqi	r6, 1				# core 0 is special
+	l.bf	1f				# don't touch the bit for core 0
+	l.movhi	r13, 0x700			# address of R_CPUCFG (ds)
+	l.lwz	r5, 0x0444(r13)			# core output clamps
+	l.or	r5, r5, r6			# set bit to ...
+	l.sw	0x0444(r13), r5			# ... activate for our core
+
+1:	l.lwz	r5, 0x0440(r13)			# CPU power-on reset
+	l.xori	r6, r6, -1			# negate core mask
+	l.and	r5, r5, r6			# clear bit to ...
+	l.sw	0x0440(r13), r5			# ... assert for our core
+
+	l.ff1	r6, r3				# get core number from high mask
+	l.addi	r6, r6, -17			# convert to 0-3
+	l.slli	r6, r6, 2			# r5: core number*4 (0-12)
+	l.add	r6, r6, r13			# add to base address
+	l.ori	r5, r0, 0xff			# 0xff means all switches off
+	l.sw	0x0450(r6), r5			# core power switch registers
+
+1:	l.sw	0x0400(r13),r0			# pull down our own reset line
+
+	l.j	1b				# just in case ...
+	l.nop	0x0				# (delay slot)
diff --git a/plat/allwinner/common/sunxi_cpu_ops.c b/plat/allwinner/common/sunxi_cpu_ops.c
index 2db2697..3b732b5 100644
--- a/plat/allwinner/common/sunxi_cpu_ops.c
+++ b/plat/allwinner/common/sunxi_cpu_ops.c
@@ -4,11 +4,16 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 
+#include <arch_helpers.h>
+#include <assert.h>
+#include <core_off_arisc.h>
 #include <debug.h>
+#include <delay_timer.h>
 #include <mmio.h>
+#include <platform.h>
 #include <platform_def.h>
-#include <sunxi_mmap.h>
 #include <sunxi_cpucfg.h>
+#include <sunxi_mmap.h>
 #include <sunxi_private.h>
 #include <utils_def.h>
 
@@ -39,16 +44,37 @@
 
 void sunxi_cpu_off(unsigned int cluster, unsigned int core)
 {
+	int corenr = cluster * PLATFORM_MAX_CPUS_PER_CLUSTER + core;
+
 	VERBOSE("PSCI: Powering off cluster %d core %d\n", cluster, core);
 
 	/* Deassert DBGPWRDUP */
 	mmio_clrbits_32(SUNXI_CPUCFG_DBG_REG0, BIT(core));
-	/* Activate the core output clamps */
-	mmio_setbits_32(SUNXI_POWEROFF_GATING_REG(cluster), BIT(core));
-	/* Assert CPU power-on reset */
-	mmio_clrbits_32(SUNXI_POWERON_RST_REG(cluster), BIT(core));
-	/* Remove power from the CPU */
-	sunxi_cpu_disable_power(cluster, core);
+
+	/* We can't turn ourself off like this, but it works for other cores. */
+	if (plat_my_core_pos() != corenr) {
+		/* Activate the core output clamps, but not for core 0. */
+		if (corenr != 0)
+			mmio_setbits_32(SUNXI_POWEROFF_GATING_REG(cluster),
+					BIT(core));
+		/* Assert CPU power-on reset */
+		mmio_clrbits_32(SUNXI_POWERON_RST_REG(cluster), BIT(core));
+		/* Remove power from the CPU */
+		sunxi_cpu_disable_power(cluster, core);
+
+		return;
+	}
+
+	/* Simplifies assembly, all SoCs so far are single cluster anyway. */
+	assert(cluster == 0);
+
+	/*
+	 * If we are supposed to turn ourself off, tell the arisc SCP
+	 * to do that work for us. The code expects the core mask to be
+	 * patched into the first instruction.
+	 */
+	sunxi_execute_arisc_code(arisc_core_off, sizeof(arisc_core_off),
+				 0, BIT_32(core));
 }
 
 void sunxi_cpu_on(unsigned int cluster, unsigned int core)
diff --git a/plat/allwinner/common/sunxi_pm.c b/plat/allwinner/common/sunxi_pm.c
index 86336f0..7d13cda 100644
--- a/plat/allwinner/common/sunxi_pm.c
+++ b/plat/allwinner/common/sunxi_pm.c
@@ -42,6 +42,16 @@
 	gicv2_cpuif_disable();
 }
 
+static void __dead2 sunxi_pwr_down_wfi(const psci_power_state_t *target_state)
+{
+	u_register_t mpidr = read_mpidr();
+
+	sunxi_cpu_off(MPIDR_AFFLVL1_VAL(mpidr), MPIDR_AFFLVL0_VAL(mpidr));
+
+	while (1)
+		wfi();
+}
+
 static void sunxi_pwr_domain_on_finish(const psci_power_state_t *target_state)
 {
 	gicv2_pcpu_distif_init();
@@ -82,6 +92,7 @@
 static plat_psci_ops_t sunxi_psci_ops = {
 	.pwr_domain_on			= sunxi_pwr_domain_on,
 	.pwr_domain_off			= sunxi_pwr_domain_off,
+	.pwr_domain_pwr_down_wfi	= sunxi_pwr_down_wfi,
 	.pwr_domain_on_finish		= sunxi_pwr_domain_on_finish,
 	.system_off			= sunxi_system_off,
 	.system_reset			= sunxi_system_reset,
diff --git a/plat/allwinner/sun50i_a64/include/core_off_arisc.h b/plat/allwinner/sun50i_a64/include/core_off_arisc.h
new file mode 100644
index 0000000..ae436ca
--- /dev/null
+++ b/plat/allwinner/sun50i_a64/include/core_off_arisc.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+static uint32_t arisc_core_off[] = {
+	0x18600000, /* l.movhi	r3, <corenr>	*/
+	0x18000000, /* l.movhi	r0, 0x0		*/
+	0x19a00170, /* l.movhi	r13, 0x170	*/
+	0x84ad0030, /* l.lwz	r5, 0x30(r13)	*/
+	0xe0a51803, /* l.and	r5, r5, r3	*/
+	0xe4050000, /* l.sfeq	r5, r0		*/
+	0x13fffffd, /* l.bf	-12		*/
+
+	0xb8c30050, /* l.srli	r6, r3, 16	*/
+	0xbc060001, /* l.sfeqi	r6, 1		*/
+	0x10000005, /* l.bf	+20		*/
+	0x19a001f0, /* l.movhi	r13, 0x1f0	*/
+	0x84ad1500, /* l.lwz	r5, 0x1500(r13)	*/
+	0xe0a53004, /* l.or	r5, r5, r6	*/
+	0xd44d2d00, /* l.sw	0x1500(r13), r5	*/
+
+	0x84ad1c30, /* l.lwz	r5, 0x1c30(r13)	*/
+	0xacc6ffff, /* l.xori	r6, r6, -1	*/
+	0xe0a53003, /* l.and	r5, r5, r6	*/
+	0xd46d2c30, /* l.sw	0x1c30(r13), r5	*/
+
+	0xe0c3000f, /* l.ff1	r6, r3		*/
+	0x9cc6ffef, /* l.addi	r6, r6, -17	*/
+	0xb8c60002, /* l.slli	r6, r6, 2	*/
+	0xe0c66800, /* l.add	r6, r6, r13	*/
+	0xa8a000ff, /* l.ori	r5, r0, 0xff	*/
+	0xd4462d40, /* l.sw	0x1540(r6), r5	*/
+
+	0xd46d0400, /* l.sw	0x1c00(r13), r0	*/
+	0x03ffffff, /* l.j	-1		*/
+	0x15000000, /* l.nop			*/
+};
diff --git a/plat/allwinner/sun50i_h6/include/core_off_arisc.h b/plat/allwinner/sun50i_h6/include/core_off_arisc.h
new file mode 100644
index 0000000..63a5d8d
--- /dev/null
+++ b/plat/allwinner/sun50i_h6/include/core_off_arisc.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+static uint32_t arisc_core_off[] = {
+	0x18600000, /* l.movhi	r3, <corenr>	*/
+	0x18000000, /* l.movhi	r0, 0x0		*/
+	0x19a00901, /* l.movhi	r13, 0x901	*/
+	0x84ad0080, /* l.lwz	r5, 0x80(r13)	*/
+	0xe0a51803, /* l.and	r5, r5, r3	*/
+	0xe4050000, /* l.sfeq	r5, r0		*/
+	0x13fffffd, /* l.bf	-12		*/
+	0xb8c30050, /* l.srli	r6, r3, 16	*/
+
+	0xbc060001, /* l.sfeqi	r6, 1		*/
+	0x10000005, /* l.bf	+20		*/
+	0x19a00700, /* l.movhi	r13, 0x700	*/
+	0x84ad0444, /* l.lwz	r5, 0x0444(r13)	*/
+	0xe0a53004, /* l.or	r5, r5, r6	*/
+	0xd40d2c44, /* l.sw	0x0444(r13), r5	*/
+
+	0x84ad0440, /* l.lwz	r5, 0x0440(r13)	*/
+	0xacc6ffff, /* l.xori	r6, r6, -1	*/
+	0xe0a53003, /* l.and	r5, r5, r6	*/
+	0xd40d2c40, /* l.sw	0x0440(r13), r5	*/
+
+	0xe0c3000f, /* l.ff1	r6, r3		*/
+	0x9cc6ffef, /* l.addi	r6, r6, -17	*/
+	0xb8c60002, /* l.slli	r6, r6, 2	*/
+	0xe0c66800, /* l.add	r6, r6, r13	*/
+	0xa8a000ff, /* l.ori	r5, r0, 0xff	*/
+	0xd4062c50, /* l.sw	0x0450(r6), r5	*/
+
+	0xd40d0400, /* l.sw	0x0400(r13), r0	*/
+	0x03ffffff, /* l.j	-1		*/
+	0x15000000, /* l.nop			*/
+};