REORG: atomic: reimplement pl_cpu_relax() from atomic-ops.h
There is some confusion here as we need to place some cpu_relax statements
in some loops where it's not easily possible to condition them on the use
of threads. That's what atomic.h already does. So let's take the various
pl_cpu_relax() implementations from there and place them in atomic.h under
the name __ha_cpu_relax() and let them adapt to the presence or absence of
threads and to the architecture (currently only x86 and aarch64 use a barrier
instruction), though it's very likely that arm would work well with a cache
flushing ISB instruction as well.
This time they were implemented as expressions returning 1 rather than
statements, in order to ease their placement as the loop condition or the
continuation expression inside "for" loops. We should probably do the same
with barriers and a few such other ones.
diff --git a/include/haproxy/atomic.h b/include/haproxy/atomic.h
index e21d7a3..ed55165 100644
--- a/include/haproxy/atomic.h
+++ b/include/haproxy/atomic.h
@@ -152,6 +152,7 @@
#define __ha_barrier_store() do { } while (0)
#define __ha_barrier_full() do { } while (0)
#define __ha_compiler_barrier() do { } while (0)
+#define __ha_cpu_relax() ({ 1; })
#else /* !USE_THREAD */
@@ -395,6 +396,9 @@
return (ret);
}
+/* short-lived CPU relaxation */
+#define __ha_cpu_relax() ({ asm volatile("rep;nop\n"); 1; })
+
#elif defined(__arm__) && (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__))
static __inline void
@@ -457,6 +461,9 @@
return (tmp);
}
+/* short-lived CPU relaxation */
+#define __ha_cpu_relax() ({ asm volatile(""); 1; })
+
#elif defined (__aarch64__)
static __inline void
@@ -498,6 +505,11 @@
__asm __volatile("dmb ish" ::: "memory");
}
+/* short-lived CPU relaxation; this was shown to improve fairness on
+ * modern ARMv8 cores such as Neoverse N1.
+ */
+#define __ha_cpu_relax() ({ asm volatile("isb" ::: "memory"); 1; })
+
static __inline int __ha_cas_dw(void *target, void *compare, void *set)
{
void *value[2];
@@ -534,6 +546,9 @@
#define __ha_barrier_full __sync_synchronize
/* Note: there is no generic DWCAS */
+/* short-lived CPU relaxation */
+#define __ha_cpu_relax() ({ asm volatile(""); 1; })
+
#endif /* end of arch-specific barrier/dwcas */
static inline void __ha_compiler_barrier(void)