BUG/MINOR: freq_ctr: use stricter barriers between updates and readings

update_freq_ctr_period() was using relaxed atomics without using barriers,
which usually works fine on x86 but not everywhere else. In addition, some
values were read without being enclosed by barriers, allowing the compiler
to possibly prefetch them a bit earlier. Finally, freq_ctr_total() was also
reading these without enough barriers. Let's make explicit use of atomic
loads and atomic stores to get rid of this situation. This required
slightly rearranging the freq_ctr_total() loop, which could possibly
slightly improve performance under extreme contention by avoiding
rereading all the fields.

A backport may be done to 2.4 if a problem is encountered, but the latest
tests on arm64 with LSE didn't show any issue, so this can possibly stay as-is.
diff --git a/include/haproxy/freq_ctr.h b/include/haproxy/freq_ctr.h
index a59a3f9..b2da7ad 100644
--- a/include/haproxy/freq_ctr.h
+++ b/include/haproxy/freq_ctr.h
@@ -46,12 +46,11 @@
 	 * a rotation is in progress (no big deal).
 	 */
 	for (;; __ha_cpu_relax()) {
-		__ha_barrier_load();
-		now_ms_tmp = global_now_ms;
-		curr_tick = ctr->curr_tick;
+		curr_tick  = HA_ATOMIC_LOAD(&ctr->curr_tick);
+		now_ms_tmp = HA_ATOMIC_LOAD(&global_now_ms);
 
 		if (now_ms_tmp - curr_tick < period)
-			return _HA_ATOMIC_ADD_FETCH(&ctr->curr_ctr, inc);
+			return HA_ATOMIC_ADD_FETCH(&ctr->curr_ctr, inc);
 
 		/* a rotation is needed */
 		if (!(curr_tick & 1) &&
@@ -60,19 +59,20 @@
 	}
 
 	/* atomically switch the new period into the old one without losing any
-	 * potential concurrent update.
+	 * potential concurrent update. We're the only one performing the rotate
+	 * (locked above), others are only adding positive values to curr_ctr.
 	 */
-	_HA_ATOMIC_STORE(&ctr->prev_ctr, _HA_ATOMIC_FETCH_SUB(&ctr->curr_ctr, ctr->curr_ctr - inc));
+	HA_ATOMIC_STORE(&ctr->prev_ctr, HA_ATOMIC_XCHG(&ctr->curr_ctr, inc));
 	curr_tick += period;
 	if (likely(now_ms_tmp - curr_tick >= period)) {
 		/* we missed at least two periods */
-		_HA_ATOMIC_STORE(&ctr->prev_ctr, 0);
+		HA_ATOMIC_STORE(&ctr->prev_ctr, 0);
 		curr_tick = now_ms_tmp;
 	}
 
 	/* release the lock and update the time in case of rotate. */
-	_HA_ATOMIC_STORE(&ctr->curr_tick, curr_tick & ~1);
-	return ctr->curr_ctr;
+	HA_ATOMIC_STORE(&ctr->curr_tick, curr_tick & ~1);
+	return inc;
 }
 
 /* Update a 1-sec frequency counter by <inc> incremental units. It is automatically
@@ -312,7 +312,7 @@
 	old_sum = *sum;
 	do {
 		new_sum = old_sum - (old_sum + n - 1) / n + v;
-	} while (!_HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
+	} while (!HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
 	return new_sum;
 }
 
@@ -330,7 +330,7 @@
 	old_sum = *sum;
 	do {
 		new_sum = old_sum - (n ? (old_sum + n - 1) / n : 0) + v;
-	} while (!_HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
+	} while (!HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
 	return new_sum;
 }
 
@@ -364,7 +364,7 @@
 	old_sum = *sum;
 	do {
 		new_sum = old_sum + v * s - div64_32((unsigned long long)(old_sum + n) * s, n);
-	} while (!_HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
+	} while (!HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
 	return new_sum;
 }
 
diff --git a/src/freq_ctr.c b/src/freq_ctr.c
index 747a4cc..edea699 100644
--- a/src/freq_ctr.c
+++ b/src/freq_ctr.c
@@ -27,36 +27,56 @@
  */
 ullong freq_ctr_total(const struct freq_ctr *ctr, uint period, int pend)
 {
-	ullong curr, past;
-	uint curr_tick;
+	ullong curr, past, old_curr, old_past;
+	uint tick, old_tick;
 	int remain;
 
-	for (;; __ha_cpu_relax()) {
-		curr = ctr->curr_ctr;
-		past = ctr->prev_ctr;
-		curr_tick = ctr->curr_tick;
+	tick = HA_ATOMIC_LOAD(&ctr->curr_tick);
+	curr = HA_ATOMIC_LOAD(&ctr->curr_ctr);
+	past = HA_ATOMIC_LOAD(&ctr->prev_ctr);
 
-		/* now let's make sure the second loads retrieve the most
-		 * up-to-date values. If no value changed after a load barrier,
-		 * we're certain the values we got were stable.
+	while (1) {
+		if (tick & 0x1) // change in progress
+			goto redo0;
+
+		old_tick = tick;
+		old_curr = curr;
+		old_past = past;
+
+		/* now let's load the values a second time and make sure they
+		 * did not change, which will indicate it was a stable reading.
 		 */
-		__ha_barrier_load();
 
-		if (curr_tick & 0x1)
-			continue;
+		tick = HA_ATOMIC_LOAD(&ctr->curr_tick);
+		if (tick & 0x1) // change in progress
+			goto redo0;
 
-		if (curr != ctr->curr_ctr)
-			continue;
+		if (tick != old_tick)
+			goto redo1;
 
-		if (past != ctr->prev_ctr)
-			continue;
+		curr = HA_ATOMIC_LOAD(&ctr->curr_ctr);
+		if (curr != old_curr)
+			goto redo2;
 
-		if (curr_tick != ctr->curr_tick)
-			continue;
+		past = HA_ATOMIC_LOAD(&ctr->prev_ctr);
+		if (past != old_past)
+			goto redo3;
+
+		/* all values match between two loads, they're stable, let's
+		 * quit now.
+		 */
 		break;
+	redo0:
+		tick = HA_ATOMIC_LOAD(&ctr->curr_tick);
+	redo1:
+		curr = HA_ATOMIC_LOAD(&ctr->curr_ctr);
+	redo2:
+		past = HA_ATOMIC_LOAD(&ctr->prev_ctr);
+	redo3:
+		__ha_cpu_relax();
 	};
 
-	remain = curr_tick + period - global_now_ms;
+	remain = tick + period - HA_ATOMIC_LOAD(&global_now_ms);
 	if (unlikely(remain < 0)) {
 		/* We're past the first period, check if we can still report a
 		 * part of last period or if we're too far away.