Disable non-temporal hint on Cortex-A53/57

The LDNP/STNP instructions as implemented on Cortex-A53 and
Cortex-A57 do not behave in a way most programmers expect, and will
most probably result in a significant speed degradation to any code
that employs them. The ARMv8-A architecture (see Document ARM DDI
0487A.h, section D3.4.3) allows cores to ignore the non-temporal hint
and treat LDNP/STNP as LDP/STP instead.

This patch introduces 2 new build flags:
A53_DISABLE_NON_TEMPORAL_HINT and A57_DISABLE_NON_TEMPORAL_HINT
to enforce this behaviour on Cortex-A53 and Cortex-A57. They are
enabled by default.

The string printed in debug builds when a specific CPU errata
workaround is compiled in but skipped at runtime has been
generalised, so that it can be reused for the non-temporal hint use
case as well.

Change-Id: I3e354f4797fd5d3959872a678e160322b13867a1
diff --git a/lib/cpus/aarch64/cortex_a53.S b/lib/cpus/aarch64/cortex_a53.S
index e4b94e8..00ceadb 100644
--- a/lib/cpus/aarch64/cortex_a53.S
+++ b/lib/cpus/aarch64/cortex_a53.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2014-2016, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -86,33 +86,40 @@
 	ret
 endfunc errata_a53_826319_wa
 
-	/* --------------------------------------------------
-	 * Errata Workaround for Cortex A53 Errata #836870.
-	 * This applies only to revision <= r0p3 of Cortex A53.
-	 * From r0p4 and onwards, this errata is enabled by
-	 * default.
+	/* ---------------------------------------------------------------------
+	 * Disable the cache non-temporal hint.
+	 *
+	 * This ignores the Transient allocation hint in the MAIR and treats
+	 * allocations the same as non-transient allocation types. As a result,
+	 * the LDNP and STNP instructions in AArch64 behave the same as the
+	 * equivalent LDP and STP instructions.
+	 *
+	 * This is relevant only for revisions <= r0p3 of Cortex-A53.
+	 * From r0p4 and onwards, the bit to disable the hint is enabled by
+	 * default at reset.
+	 *
 	 * Inputs:
 	 * x0: variant[4:7] and revision[0:3] of current cpu.
 	 * Clobbers : x0 - x5
-	 * --------------------------------------------------
+	 * ---------------------------------------------------------------------
 	 */
-func errata_a53_836870_wa
+func a53_disable_non_temporal_hint
 	/*
 	 * Compare x0 against revision r0p3
 	 */
 	cmp	x0, #3
-	b.ls	apply_836870
+	b.ls	disable_hint
 #if DEBUG
 	b	print_revision_warning
 #else
 	ret
 #endif
-apply_836870:
+disable_hint:
 	mrs	x1, CPUACTLR_EL1
 	orr	x1, x1, #CPUACTLR_DTAH
 	msr	CPUACTLR_EL1, x1
 	ret
-endfunc errata_a53_836870_wa
+endfunc a53_disable_non_temporal_hint
 
 	/* -------------------------------------------------
 	 * The CPU Ops reset function for Cortex-A53.
@@ -138,9 +145,9 @@
 	bl	errata_a53_826319_wa
 #endif
 
-#if ERRATA_A53_836870
+#if ERRATA_A53_836870 || A53_DISABLE_NON_TEMPORAL_HINT
 	mov	x0, x15
-	bl	errata_a53_836870_wa
+	bl	a53_disable_non_temporal_hint
 #endif
 
 	/* ---------------------------------------------
diff --git a/lib/cpus/aarch64/cortex_a57.S b/lib/cpus/aarch64/cortex_a57.S
index 05799d6..8bcb5dd 100644
--- a/lib/cpus/aarch64/cortex_a57.S
+++ b/lib/cpus/aarch64/cortex_a57.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2014-2016, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -153,6 +153,35 @@
 	ret
 endfunc errata_a57_813420_wa
 
+	/* --------------------------------------------------------------------
+	 * Disable the over-read from the LDNP instruction.
+	 *
+	 * This applies to all revisions <= r1p2. The performance degradation
+	 * observed with LDNP/STNP has been fixed on r1p3 and onwards.
+	 *
+	 * Inputs:
+	 * x0: variant[4:7] and revision[0:3] of current cpu.
+	 * Clobbers : x0 - x5, x30
+	 * ---------------------------------------------------------------------
+	 */
+func a57_disable_ldnp_overread
+	/*
+	 * Compare x0 against revision r1p2
+	 */
+	cmp	x0, #0x12
+	b.ls	disable_hint
+#if DEBUG
+	b	print_revision_warning
+#else
+	ret
+#endif
+disable_hint:
+	mrs	x1, CPUACTLR_EL1
+	orr	x1, x1, #CPUACTLR_DIS_OVERREAD
+	msr	CPUACTLR_EL1, x1
+	ret
+endfunc a57_disable_ldnp_overread
+
 	/* -------------------------------------------------
 	 * The CPU Ops reset function for Cortex-A57.
 	 * Clobbers: x0-x5, x15, x19, x30
@@ -181,6 +210,11 @@
 	bl	errata_a57_813420_wa
 #endif
 
+#if A57_DISABLE_NON_TEMPORAL_HINT
+	mov	x0, x15
+	bl	a57_disable_ldnp_overread
+#endif
+
 	/* ---------------------------------------------
 	 * As a bare minimum enable the SMP bit if it is
 	 * not already set.
diff --git a/lib/cpus/aarch64/cpu_helpers.S b/lib/cpus/aarch64/cpu_helpers.S
index e8a1392..e41d95b 100644
--- a/lib/cpus/aarch64/cpu_helpers.S
+++ b/lib/cpus/aarch64/cpu_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2014-2016, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -205,16 +205,17 @@
 endfunc get_cpu_ops_ptr
 
 #if DEBUG
-	/*
-	 * This function prints a warning message to the crash console
-	 * if the CPU revision/part number does not match the errata
-	 * workaround enabled in the build.
-	 * Clobber: x30, x0 - x5
-	 */
 .section .rodata.rev_warn_str, "aS"
 rev_warn_str:
-	.asciz "Warning: Skipping Errata workaround for non matching CPU revision number.\n"
+	.asciz "Warning: Skipping CPU specific reset operation for non-matching CPU revision number.\n"
 
+	/*
+	 * This function prints the above warning message to the crash console.
+	 * It should be called when a CPU specific operation is enabled in the
+	 * build but doesn't apply to this CPU revision/part number.
+	 *
+	 * Clobber: x30, x0 - x5
+	 */
 	.globl	print_revision_warning
 func print_revision_warning
 	mov	x5, x30