armv8/fsl-layerscape: Add loop to check L3 dcache status

Flushing the L3 cache takes a variable amount of time, depending on
cache line allocation.

Coming up with a proper timeout value would be best handled by
simulations under multiple scenarios in the actual system.
From a purely HN-F point of view, the flush would take ~15 cycles for
a clean line, and ~22 cycles for a dirty line.  For the dirty line case,
there are many variables outside the HN-F that will increase the
duration per line.  For example, a *DBIDResp from the SN-F/SBSX,
memory controller latency, SN-F/SBSX RetryAck responses, CCN ring
congestion, CCN ring hops, etc.  The worst-case timeout would
have to factor in all of these variables plus the HN-F cycles for
every line in the L3, assuming all lines are dirty.

If the L3 cache is not flushed properly, system behaviour will be
erratic, so remove the timeout and loop until the L3 cache reports
the expected status.

The system will hang in this polling loop if there is a problem with
L3 cache flushing.

Signed-off-by: Udit Kumar <udit.kumar@nxp.com>
Signed-off-by: Meenakshi Aggarwal <meenakshi.aggarwal@nxp.com>
Reviewed-by: Prabhakar Kushwaha <prabhakar.kushwaha@nxp.com>
diff --git a/arch/arm/cpu/armv8/fsl-layerscape/lowlevel.S b/arch/arm/cpu/armv8/fsl-layerscape/lowlevel.S
index 6721a57..711ab87 100644
--- a/arch/arm/cpu/armv8/fsl-layerscape/lowlevel.S
+++ b/arch/arm/cpu/armv8/fsl-layerscape/lowlevel.S
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0+ */
 /*
  * (C) Copyright 2014-2015 Freescale Semiconductor
+ * Copyright 2019 NXP
  *
  * Extracted from armv8/start.S
  */
@@ -356,31 +357,22 @@
 
 #if defined(CONFIG_SYS_FSL_HAS_CCN504) || defined(CONFIG_SYS_FSL_HAS_CCN508)
 hnf_pstate_poll:
-	/* x0 has the desired status, return 0 for success, 1 for timeout
-	 * clobber x1, x2, x3, x4, x6, x7
+	/* x0 has the desired status; return only when the operation succeeds
+	 * clobber x1, x2, x6
 	 */
 	mov	x1, x0
-	mov	x7, #0			/* flag for timeout */
-	mrs	x3, cntpct_el0		/* read timer */
-	add	x3, x3, #1200		/* timeout after 100 microseconds */
+	mov	w6, #8			/* HN-F node count */
 	mov	x0, #0x18
 	movk	x0, #0x420, lsl #16	/* HNF0_PSTATE_STATUS */
-	mov	w6, #8			/* HN-F node count */
 1:
 	ldr	x2, [x0]
 	cmp	x2, x1			/* check status */
 	b.eq	2f
-	mrs	x4, cntpct_el0
-	cmp	x4, x3
-	b.ls	1b
-	mov	x7, #1			/* timeout */
-	b	3f
+	b	1b
 2:
 	add	x0, x0, #0x10000	/* move to next node */
 	subs	w6, w6, #1
 	cbnz	w6, 1b
-3:
-	mov	x0, x7
 	ret
 
 hnf_set_pstate:
@@ -405,10 +397,8 @@
 	/*
 	 * Return status in x0
 	 *    success 0
-	 *    timeout 1 for setting SFONLY, 2 for FAM, 3 for both
 	 */
 	mov	x29, lr
-	mov	x8, #0
 
 	dsb	sy
 	mov	x0, #0x1		/* HNFPSTAT_SFONLY */
@@ -416,19 +406,15 @@
 
 	mov	x0, #0x4		/* SFONLY status */
 	bl	hnf_pstate_poll
-	cbz	x0, 1f
-	mov	x8, #1			/* timeout */
-1:
+
 	dsb	sy
 	mov	x0, #0x3		/* HNFPSTAT_FAM */
 	bl	hnf_set_pstate
 
 	mov	x0, #0xc		/* FAM status */
 	bl	hnf_pstate_poll
-	cbz	x0, 1f
-	add	x8, x8, #0x2
-1:
-	mov	x0, x8
+
+	mov	x0, #0
 	mov	lr, x29
 	ret
 ENDPROC(__asm_flush_l3_dcache)