cpus: higher performance non-cacheable load forwarding

The CPUACTLR_EL1 register on Cortex-A57 CPUs supports a bit to enable
non-cacheable streaming enhancement. Platforms can set this bit only
if their memory system meets the requirement that cache line fill
requests from the Cortex-A57 processor are atomic.

This patch adds support to enable higher performance non-cacheable load
forwarding for such platforms. Platforms must enable this support by
setting the 'A57_ENABLE_NONCACHEABLE_LOAD_FWD' flag from their
makefiles. This flag is disabled by default.

Change-Id: Ib27e55dd68d11a50962c0bbc5b89072208b4bac5
Signed-off-by: Varun Wadekar <vwadekar@nvidia.com>
diff --git a/docs/design/cpu-specific-build-macros.rst b/docs/design/cpu-specific-build-macros.rst
index f3096b4..258f73d 100644
--- a/docs/design/cpu-specific-build-macros.rst
+++ b/docs/design/cpu-specific-build-macros.rst
@@ -324,6 +324,13 @@
    as recommended in section "4.7 Non-Temporal Loads/Stores" of the
    `Cortex-A57 Software Optimization Guide`_.
 
+- ''A57_ENABLE_NON_CACHEABLE_LOAD_FWD'': This flag enables non-cacheable
+   streaming enhancement feature for Cortex-A57 CPUs. Platforms can set
+   this bit only if their memory system meets the requirement that cache
+   line fill requests from the Cortex-A57 processor are atomic. Each
+   Cortex-A57 based platform must make its own decision on whether to use
+   the optimization. This flag is disabled by default.
+
 -  ``NEOVERSE_N1_EXTERNAL_LLC``: This flag indicates that an external last
    level cache(LLC) is present in the system, and that the DataSource field
    on the master CHI interface indicates when data is returned from the LLC.
diff --git a/include/lib/cpus/aarch64/cortex_a57.h b/include/lib/cpus/aarch64/cortex_a57.h
index 102ff60..dc40e31 100644
--- a/include/lib/cpus/aarch64/cortex_a57.h
+++ b/include/lib/cpus/aarch64/cortex_a57.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2014-2019, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -54,6 +55,7 @@
 #define CORTEX_A57_CPUACTLR_EL1_FORCE_FPSCR_FLUSH	(ULL(1) << 38)
 #define CORTEX_A57_CPUACTLR_EL1_DIS_INSTR_PREFETCH	(ULL(1) << 32)
 #define CORTEX_A57_CPUACTLR_EL1_DIS_STREAMING		(ULL(3) << 27)
+#define CORTEX_A57_CPUACTLR_EL1_EN_NC_LOAD_FWD		(ULL(1) << 24)
 #define CORTEX_A57_CPUACTLR_EL1_DIS_L1_STREAMING	(ULL(3) << 25)
 #define CORTEX_A57_CPUACTLR_EL1_DIS_INDIRECT_PREDICTOR	(ULL(1) << 4)
 
diff --git a/lib/cpus/aarch64/cortex_a57.S b/lib/cpus/aarch64/cortex_a57.S
index dd03c0f..3fee470 100644
--- a/lib/cpus/aarch64/cortex_a57.S
+++ b/lib/cpus/aarch64/cortex_a57.S
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2014-2019, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -469,6 +470,17 @@
 	dsb	sy
 #endif
 
+#if A57_ENABLE_NONCACHEABLE_LOAD_FWD
+	/* ---------------------------------------------
+	 * Enable higher performance non-cacheable load
+	 * forwarding
+	 * ---------------------------------------------
+	 */
+	mrs	x0, CORTEX_A57_CPUACTLR_EL1
+	orr	x0, x0, #CORTEX_A57_CPUACTLR_EL1_EN_NC_LOAD_FWD
+	msr	CORTEX_A57_CPUACTLR_EL1, x0
+#endif
+
 	/* ---------------------------------------------
 	 * Enable the SMP bit.
 	 * ---------------------------------------------
diff --git a/lib/cpus/cpu-ops.mk b/lib/cpus/cpu-ops.mk
index e3bfc2f..3c0c9cd 100644
--- a/lib/cpus/cpu-ops.mk
+++ b/lib/cpus/cpu-ops.mk
@@ -1,5 +1,6 @@
 #
 # Copyright (c) 2014-2020, ARM Limited and Contributors. All rights reserved.
+# Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
 #
 # SPDX-License-Identifier: BSD-3-Clause
 #
@@ -16,6 +17,10 @@
 # It is enabled by default.
 A57_DISABLE_NON_TEMPORAL_HINT	?=1
 
+# Flag to enable higher performance non-cacheable load forwarding.
+# It is disabled by default.
+A57_ENABLE_NONCACHEABLE_LOAD_FWD	?= 0
+
 WORKAROUND_CVE_2017_5715	?=1
 WORKAROUND_CVE_2018_3639	?=1
 DYNAMIC_WORKAROUND_CVE_2018_3639	?=0
@@ -24,6 +29,10 @@
 # By default internal
 NEOVERSE_N1_EXTERNAL_LLC	?=0
 
+# Process A57_ENABLE_NONCACHEABLE_LOAD_FWD flag
+$(eval $(call assert_boolean,A57_ENABLE_NONCACHEABLE_LOAD_FWD))
+$(eval $(call add_define,A57_ENABLE_NONCACHEABLE_LOAD_FWD))
+
 # Process SKIP_A57_L1_FLUSH_PWR_DWN flag
 $(eval $(call assert_boolean,SKIP_A57_L1_FLUSH_PWR_DWN))
 $(eval $(call add_define,SKIP_A57_L1_FLUSH_PWR_DWN))