BUG/MEDIUM: random: implement a thread-safe and process-safe PRNG

This is the replacement of the failed attempt to add thread safety and
per-process sequences of random numbers initially tried with commit
1c306aa84d ("BUG/MEDIUM: random: implement per-thread and per-process
random sequences").

This new version takes a completely different approach and doesn't try
to work around the horrible OS-specific and non-portable random API
anymore. Instead it implements "xoroshiro128**", a reputedly high
quality random number generator, which is one of the many variants of
xorshift, which passes all quality tests and which is described here:

   http://prng.di.unimi.it/

While not cryptographically secure, it is fast and features a 2^128-1
period. It supports fast jumps allowing to cut the period into smaller
non-overlapping sequences, which we use here to support up to 2^32
processes each having their own, non-overlapping sequence of 2^96
numbers (~7*10^28). This is enough to provide 1 billion randoms per
second and per process for 2200 billion years.

The implementation was made thread-safe either by using a double 64-bit
CAS on platforms supporting it (x86_64, aarch64) or by using a local
lock for the time needed to perform the shift operations. This ensures
that all threads pick numbers from the same pool so that it is not
needed to assign per-thread ranges. For processes we use the fast jump
method to advance the sequence by 2^96 for each process.

Before this patch, the following config:
    global
        nbproc 8

    frontend f
        bind :4445
        mode http
        log stdout format raw daemon
        log-format "%[uuid] %pid"
        redirect location /

Would produce this output:
    a4d0ad64-2645-4b74-b894-48acce0669af 12987
    a4d0ad64-2645-4b74-b894-48acce0669af 12992
    a4d0ad64-2645-4b74-b894-48acce0669af 12986
    a4d0ad64-2645-4b74-b894-48acce0669af 12988
    a4d0ad64-2645-4b74-b894-48acce0669af 12991
    a4d0ad64-2645-4b74-b894-48acce0669af 12989
    a4d0ad64-2645-4b74-b894-48acce0669af 12990
    82d5f6cd-f6c1-4f85-a89c-36ae85d26fb9 12987
    82d5f6cd-f6c1-4f85-a89c-36ae85d26fb9 12992
    82d5f6cd-f6c1-4f85-a89c-36ae85d26fb9 12986
    (...)

And now produces:
    f94b29b3-da74-4e03-a0c5-a532c635bad9 13011
    47470c02-4862-4c33-80e7-a952899570e5 13014
    86332123-539a-47bf-853f-8c8ea8b2a2b5 13013
    8f9efa99-3143-47b2-83cf-d618c8dea711 13012
    3cc0f5c7-d790-496b-8d39-bec77647af5b 13015
    3ec64915-8f95-4374-9e66-e777dc8791e0 13009
    0f9bf894-dcde-408c-b094-6e0bb3255452 13011
    49c7bfde-3ffb-40e9-9a8d-8084d650ed8f 13014
    e23f6f2e-35c5-4433-a294-b790ab902653 13012

There are multiple benefits to using this method. First, it doesn't
depend anymore on a non-portable API. Second it's thread safe. Third it
is fast and more proven than any hack we could attempt to try to work
around the deficiencies of the various implementations around.

This commit depends on previous patches "MINOR: tools: add 64-bit rotate
operators" and "BUG/MEDIUM: random: initialize the random pool a bit
better", all of which will need to be backported at least as far as
version 2.0. It no longer requires backporting the build fixes for circular
include file dependencies.
diff --git a/src/standard.c b/src/standard.c
index 38997d5..5ccf447 100644
--- a/src/standard.c
+++ b/src/standard.c
@@ -4528,6 +4528,118 @@
 	return len;
 }
 
+
+/* 128-bit xoroshiro128** state shared by the ha_random_*() functions below */
+static uint64_t ha_random_state[2];
+
+/* This is a thread-safe implementation of xoroshiro128** described below:
+ *     http://prng.di.unimi.it/
+ * It features a 2^128-1 long sequence, returns 64 high-quality bits per call,
+ * supports fast jumps and passes all common quality tests. With USE_THREAD,
+ * the state is updated by a double-word CAS when both HA_CAS_IS_8B and
+ * HA_HAVE_CAS_DW are defined, and under a local spinlock otherwise.
+ */
+uint64_t ha_random64()
+{
+	uint64_t result;
+	uint64_t old[2];
+	uint64_t new[2];
+
+#if defined(USE_THREAD) && (!defined(HA_CAS_IS_8B) || !defined(HA_HAVE_CAS_DW))
+	static HA_SPINLOCK_T rand_lock;
+
+	HA_SPIN_LOCK(OTHER_LOCK, &rand_lock); /* held until the new state is stored */
+#endif /* lock-based fallback */
+
+	old[0] = ha_random_state[0];
+	old[1] = ha_random_state[1];
+
+#if defined(USE_THREAD) && defined(HA_CAS_IS_8B) && defined(HA_HAVE_CAS_DW)
+	do { /* NOTE(review): assumes a failed DWCAS refreshes old[] - confirm */
+#endif /* DWCAS retry loop start */
+		result = rotl64(old[0] * 5, 7) * 9; /* output depends on old[0] only */
+		new[1] = old[0] ^ old[1];
+		new[0] = rotl64(old[0], 24) ^ new[1] ^ (new[1] << 16); // a, b
+		new[1] = rotl64(new[1], 37); // c
+
+#if defined(USE_THREAD) && defined(HA_CAS_IS_8B) && defined(HA_HAVE_CAS_DW)
+	} while (unlikely(!_HA_ATOMIC_DWCAS(ha_random_state, old, new)));
+#else /* no DWCAS: plain stores, done under rand_lock when threaded */
+	ha_random_state[0] = new[0];
+	ha_random_state[1] = new[1];
+#if defined(USE_THREAD)
+	HA_SPIN_UNLOCK(OTHER_LOCK, &rand_lock);
+#endif /* USE_THREAD */
+#endif /* DWCAS vs. plain stores */
+	return result;
+}
+
+/* seeds the random state from <len> bytes at <seed>, starting at the first
+ * non-zero byte and using at most 16; other state bytes keep a 0x55 filler.
+ */
+void ha_random_seed(const unsigned char *seed, size_t len)
+{
+	size_t pos;
+
+	/* the seed must not be all zeroes, so we pre-fill it with alternating
+	 * bits and overwrite part of them with the block starting at the first
+	 * non-zero byte from the seed.
+	 */
+	memset(ha_random_state, 0x55, sizeof(ha_random_state));
+
+	for (pos = 0; pos < len; pos++)
+		if (seed[pos] != 0)
+			break;
+
+	if (pos == len)
+		return; /* empty or all-zero seed: keep the 0x55 pattern */
+
+	seed += pos;
+	len -= pos;
+
+	if (len > sizeof(ha_random_state))
+		len = sizeof(ha_random_state); /* never write past the state */
+
+	memcpy(ha_random_state, seed, len);
+}
+
+/* This causes a jump to (dist * 2^96) places in the pseudo-random sequence,
+ * and is equivalent to calling ha_random64() as many times. It is used to
+ * provide non-overlapping sequences of 2^96 numbers (~7*10^28) to up to 2^32
+ * different generators (i.e. different processes after a fork). The <dist>
+ * argument is the number of 2^96 jumps to perform; each one costs 128 calls to
+ * ha_random64(), so it should not be too large if processing time matters.
+ *
+ * BEWARE: this function is NOT thread-safe and must not be called during
+ * concurrent accesses to ha_random64().
+ */
+void ha_random_jump96(uint32_t dist)
+{
+	while (dist--) {
+		uint64_t s0 = 0; /* XOR accumulator for ha_random_state[0] */
+		uint64_t s1 = 0; /* XOR accumulator for ha_random_state[1] */
+		int b;
+
+		for (b = 0; b < 64; b++) {
+			if ((0xd2a98b26625eee7bULL >> b) & 1) { /* presumably word 0 of the reference long_jump() constants - verify */
+				s0 ^= ha_random_state[0];
+				s1 ^= ha_random_state[1];
+			}
+			ha_random64(); /* result ignored, only advances the state */
+		}
+
+		for (b = 0; b < 64; b++) {
+			if ((0xdddf9b1090aa7ac1ULL >> b) & 1) { /* presumably word 1 of the reference long_jump() constants - verify */
+				s0 ^= ha_random_state[0];
+				s1 ^= ha_random_state[1];
+			}
+			ha_random64(); /* result ignored, only advances the state */
+		}
+		ha_random_state[0] = s0; /* install the accumulated (jumped) state */
+		ha_random_state[1] = s1;
+	}
+}
+
 /*
  * Local variables:
  *  c-indent-level: 8