ARM: Factor out reusable psci_cpu_off_common

Move parts of sunxi's psci_cpu_off into psci_cpu_off_common, namely
cache disabling and flushing, clrex and the disabling of SMP for the
dying CPU. These steps are apparently generic for ARMv7 and will be
reused for Tegra124 support.

As the way of disabled SMP is not architectural, though commonly done
via ACLTR, the related function can be overloaded.

CC: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Reviewed-by: Tom Rini <trini@konsulko.com>
Reviewed-by: Thierry Reding <treding@nvidia.com>
Tested-by: Thierry Reding <treding@nvidia.com>
Tested-by: Ian Campbell <ijc@hellion.org.uk>
Signed-off-by: Tom Warren <twarren@nvidia.com>
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S
index bcc419d..05d047b 100644
--- a/arch/arm/cpu/armv7/sunxi/psci.S
+++ b/arch/arm/cpu/armv7/sunxi/psci.S
@@ -200,53 +200,6 @@
 _target_pc:
 	.word	0
 
-/* Imported from Linux kernel */
-v7_flush_dcache_all:
-	dmb					@ ensure ordering with previous memory accesses
-	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
-	ands	r3, r0, #0x7000000		@ extract loc from clidr
-	mov	r3, r3, lsr #23			@ left align loc bit field
-	beq	finished			@ if loc is 0, then no need to clean
-	mov	r10, #0				@ start clean at cache level 0
-flush_levels:
-	add	r2, r10, r10, lsr #1		@ work out 3x current cache level
-	mov	r1, r0, lsr r2			@ extract cache type bits from clidr
-	and	r1, r1, #7			@ mask of the bits for current cache only
-	cmp	r1, #2				@ see what cache we have at this level
-	blt	skip				@ skip if no cache, or just i-cache
-	mrs     r9, cpsr			@ make cssr&csidr read atomic
-	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
-	isb					@ isb to sych the new cssr&csidr
-	mrc	p15, 1, r1, c0, c0, 0		@ read the new csidr
-	msr     cpsr_c, r9
-	and	r2, r1, #7			@ extract the length of the cache lines
-	add	r2, r2, #4			@ add 4 (line length offset)
-	ldr	r4, =0x3ff
-	ands	r4, r4, r1, lsr #3		@ find maximum number on the way size
-	clz	r5, r4				@ find bit position of way size increment
-	ldr	r7, =0x7fff
-	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
-loop1:
-	mov	r9, r7				@ create working copy of max index
-loop2:
-	orr	r11, r10, r4, lsl r5		@ factor way and cache number into r11
-	orr	r11, r11, r9, lsl r2		@ factor index number into r11
-	mcr	p15, 0, r11, c7, c14, 2		@ clean & invalidate by set/way
-	subs	r9, r9, #1			@ decrement the index
-	bge	loop2
-	subs	r4, r4, #1			@ decrement the way
-	bge	loop1
-skip:
-	add	r10, r10, #2			@ increment cache number
-	cmp	r3, r10
-	bgt	flush_levels
-finished:
-	mov	r10, #0				@ swith back to cache level 0
-	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
-	dsb	st
-	isb
-	bx	lr
-
 _sunxi_cpu_entry:
 	@ Set SMP bit
 	mrc	p15, 0, r0, c1, c0, 1
@@ -262,21 +215,7 @@
 
 .globl	psci_cpu_off
 psci_cpu_off:
-	mrc	p15, 0, r0, c1, c0, 0		@ SCTLR
-	bic	r0, r0, #(1 << 2)		@ Clear C bit
-	mcr	p15, 0, r0, c1, c0, 0		@ SCTLR
-	isb
-	dsb
-
-	bl	v7_flush_dcache_all
-
-	clrex					@ Why???
-
-	mrc	p15, 0, r0, c1, c0, 1		@ ACTLR
-	bic	r0, r0, #(1 << 6)		@ Clear SMP bit
-	mcr	p15, 0, r0, c1, c0, 1		@ ACTLR
-	isb
-	dsb
+	bl	psci_cpu_off_common
 
 	@ Ask CPU0 to pull the rug...
 	movw	r0, #(GICD_BASE & 0xffff)