MINOR: freq_ctr: use the thread's local time whenever possible

Right now freq_ctr updates rely on the process-wide monotonic time,
and accessing it is expensive since every thread needs to update it,
which adds contention. However we don't need it all the time: the
thread's local time is strictly equal to the global time most of the
time, and may only be off by one millisecond when the global time is
switched to the next one by another thread. In that case we don't want
to use the local time because it would risk causing a rotation of the
counter. But that's precisely the condition we're already relying on
for the slow path!
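
To illustrate with concrete numbers (a hedged sketch, not part of the
patch; the dates below are made up):

    /* Hypothetical timeline with period = 1000ms:
     *   global_now_ms = 5000, local now_ms = 4999 (1ms behind)
     *
     * - curr_tick = 4000: now_ms - curr_tick = 999 < 1000
     *   -> fast path, a single atomic add on curr_ctr.
     *
     * - another thread just rotated and stored curr_tick = 5000
     *   (taken from global_now_ms): now_ms - curr_tick wraps to
     *   0xFFFFFFFF >= 1000 -> slow path, which re-reads global_now_ms,
     *   so the lagging local clock can never rotate the counter nor
     *   store a date in the past.
     */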

What this patch does is add a check of the period against the local
time before anything else, and immediately return after updating the
counter if still within the period, otherwise fall back to the
existing code. Given that the function was starting to inflate a bit,
it was split into a very short inline part that handles the hot path,
and a slower fallback placed in a cold function. It was measured on a
24-CPU machine that the cold function is only called ~0.003% of the
time.

The resulting improvement sits between 2 and 3% at 500k req/s tracking
an http_req_rate counter.
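
For reference, a minimal caller-side sketch (hypothetical names, not
part of the patch; real callers such as the http_req_rate tracking go
through the stick-table code):

    #include <haproxy/freq_ctr.h>

    static struct freq_ctr req_rate;   /* zeroed, i.e. a "null area" */

    /* hypothetical helper: count one event over a 10s period (in ms).
     * The inline fast path only touches ctr->curr_tick and
     * ctr->curr_ctr, and calls update_freq_ctr_period_slow() only
     * once the period has expired.
     */
    static void count_request(void)
    {
            update_freq_ctr_period(&req_rate, 10000, 1);
    }
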
diff --git a/include/haproxy/freq_ctr.h b/include/haproxy/freq_ctr.h
index be9c9d8..fbb7852 100644
--- a/include/haproxy/freq_ctr.h
+++ b/include/haproxy/freq_ctr.h
@@ -30,49 +30,29 @@
 /* exported functions from freq_ctr.c */
 ullong freq_ctr_total(const struct freq_ctr *ctr, uint period, int pend);
 int freq_ctr_overshoot_period(const struct freq_ctr *ctr, uint period, uint freq);
+uint update_freq_ctr_period_slow(struct freq_ctr *ctr, uint period, uint inc);
 
 /* Update a frequency counter by <inc> incremental units. It is automatically
  * rotated if the period is over. It is important that it correctly initializes
- * a null area. This one works on frequency counters which have a period
- * different from one second.
+ * a null area.
  */
-static inline unsigned int update_freq_ctr_period(struct freq_ctr *ctr,
-						  unsigned int period, unsigned int inc)
+static inline uint update_freq_ctr_period(struct freq_ctr *ctr, uint period, uint inc)
 {
-	unsigned int curr_tick;
-	uint32_t now_ms_tmp;
+	uint curr_tick;
 
-	/* atomically update the counter if still within the period, even if
-	 * a rotation is in progress (no big deal).
+	/* our local clock (now_ms) is most of the time strictly equal to
+	 * global_now_ms, and during the edge of the millisecond, global_now_ms
+	 * might have been pushed further by another thread. Given that
+	 * accessing this shared variable is extremely expensive, we first try
+	 * to use our local date, which will be good almost every time. And we
+	 * only switch to the global clock when we're out of the period so as
+	 * to never put a date in the past there.
 	 */
-	for (;; __ha_cpu_relax()) {
-		curr_tick  = HA_ATOMIC_LOAD(&ctr->curr_tick);
-		now_ms_tmp = HA_ATOMIC_LOAD(&global_now_ms);
-
-		if (now_ms_tmp - curr_tick < period)
-			return HA_ATOMIC_ADD_FETCH(&ctr->curr_ctr, inc);
-
-		/* a rotation is needed */
-		if (!(curr_tick & 1) &&
-		    HA_ATOMIC_CAS(&ctr->curr_tick, &curr_tick, curr_tick | 0x1))
-			break;
-	}
-
-	/* atomically switch the new period into the old one without losing any
-	 * potential concurrent update. We're the only one performing the rotate
-	 * (locked above), others are only adding positive values to curr_ctr.
-	 */
-	HA_ATOMIC_STORE(&ctr->prev_ctr, HA_ATOMIC_XCHG(&ctr->curr_ctr, inc));
-	curr_tick += period;
-	if (likely(now_ms_tmp - curr_tick >= period)) {
-		/* we missed at least two periods */
-		HA_ATOMIC_STORE(&ctr->prev_ctr, 0);
-		curr_tick = now_ms_tmp;
-	}
+	curr_tick  = HA_ATOMIC_LOAD(&ctr->curr_tick);
+	if (likely(now_ms - curr_tick < period))
+		return HA_ATOMIC_ADD_FETCH(&ctr->curr_ctr, inc);
 
-	/* release the lock and update the time in case of rotate. */
-	HA_ATOMIC_STORE(&ctr->curr_tick, curr_tick & ~1);
-	return inc;
+	return update_freq_ctr_period_slow(ctr, period, inc);
 }
 
 /* Update a 1-sec frequency counter by <inc> incremental units. It is automatically
diff --git a/src/freq_ctr.c b/src/freq_ctr.c
index 9d9ab3b..602ad9b 100644
--- a/src/freq_ctr.c
+++ b/src/freq_ctr.c
@@ -14,6 +14,56 @@
 #include <haproxy/freq_ctr.h>
 #include <haproxy/tools.h>
 
+/* Update a frequency counter by <inc> incremental units. It is automatically
+ * rotated if the period is over. It is important that it correctly initializes
+ * a null area. This one works on frequency counters which have a period
+ * different from one second. It relies on the process-wide clock that is
+ * guaranteed to be monotonic. It's important to avoid forced rotates between
+ * threads. A faster wrapper (update_freq_ctr_period) should be used instead,
+ * which uses the thread's local time whenever possible and falls back to this
+ * one when needed (less than 0.003% of the time).
+ */
+uint update_freq_ctr_period_slow(struct freq_ctr *ctr, uint period, uint inc)
+{
+	uint curr_tick;
+	uint32_t now_ms_tmp;
+
+	/* atomically update the counter if still within the period, even if
+	 * a rotation is in progress (no big deal).
+	 */
+	for (;; __ha_cpu_relax()) {
+		curr_tick  = HA_ATOMIC_LOAD(&ctr->curr_tick);
+		now_ms_tmp = HA_ATOMIC_LOAD(&global_now_ms);
+
+		if (now_ms_tmp - curr_tick < period)
+			return HA_ATOMIC_ADD_FETCH(&ctr->curr_ctr, inc);
+
+		/* a rotation is needed. While extremely rare, contention may
+		 * happen because it will be triggered on time, and all threads
+		 * see the time change simultaneously.
+		 */
+		if (!(curr_tick & 1) &&
+		    HA_ATOMIC_CAS(&ctr->curr_tick, &curr_tick, curr_tick | 0x1))
+			break;
+	}
+
+	/* atomically switch the new period into the old one without losing any
+	 * potential concurrent update. We're the only one performing the rotate
+	 * (locked above), others are only adding positive values to curr_ctr.
+	 */
+	HA_ATOMIC_STORE(&ctr->prev_ctr, HA_ATOMIC_XCHG(&ctr->curr_ctr, inc));
+	curr_tick += period;
+	if (likely(now_ms_tmp - curr_tick >= period)) {
+		/* we missed at least two periods */
+		HA_ATOMIC_STORE(&ctr->prev_ctr, 0);
+		curr_tick = now_ms_tmp;
+	}
+
+	/* release the lock and update the time in case of rotate. */
+	HA_ATOMIC_STORE(&ctr->curr_tick, curr_tick & ~1);
+	return inc;
+}
+
 /* Returns the total number of events over the current + last period, including
  * a number of already pending events <pend>. The average frequency will be
  * obtained by dividing the output by <period>. This is essentially made to