BUG/MEDIUM: random: implement a thread-safe and process-safe PRNG

This is the replacement of failed attempt to add thread safety and
per-process sequences of random numbers initally tried with commit
1c306aa84d ("BUG/MEDIUM: random: implement per-thread and per-process
random sequences").

This new version takes a completely different approach and doesn't try
to work around the horrible OS-specific and non-portable random API
anymore. Instead it implements "xoroshiro128**", a reputedly high
quality random number generator, which is one of the many variants of
xorshift, which passes all quality tests and which is described here:

   http://prng.di.unimi.it/

While not cryptographically secure, it is fast and features a 2^128-1
period. It supports fast jumps allowing to cut the period into smaller
non-overlapping sequences, which we use here to support up to 2^32
processes each having their own, non-overlapping sequence of 2^96
numbers (~7*10^28). This is enough to provide 1 billion randoms per
second and per process for 2200 billion years.

The implementation was made thread-safe either by using a double 64-bit
CAS on platforms supporting it (x86_64, aarch64) or by using a local
lock for the time needed to perform the shift operations. This ensures
that all threads pick numbers from the same pool so that it is not
needed to assign per-thread ranges. For processes we use the fast jump
method to advance the sequence by 2^96 for each process.

There are multiple benefits to using this method. First, it doesn't
depend anymore on a non-portable API. Second it's thread safe. Third it
is fast and more proven than any hack we could do to try to work around
the breakage of various implementations.

This commit depends on previous patch "MINOR: tools: add 64-bit rotate
operators", both of which will need to be backported at least as far as
version 2.0. It doesn't require to backport the build fixes for circular
include files dependecy anymore.

(cherry picked from commit 52bf839394e683eec2fa8aafff5a0dd51d2dd365)
[wt: minor context adjustments]
(cherry picked from commit d4ac067fe4708e5c7c77d8f356a53d7d376ced40)
[wt: context adjustments, random() appearing at other places]
Signed-off-by: Willy Tarreau <w@1wt.eu>
diff --git a/include/common/standard.h b/include/common/standard.h
index b646ff7..aaed6be 100644
--- a/include/common/standard.h
+++ b/include/common/standard.h
@@ -1475,6 +1475,21 @@
 
 int parse_dotted_uints(const char *s, unsigned int **nums, size_t *sz);
 
+/* PRNG */
+void ha_random_seed(const unsigned char *seed, size_t len);
+void ha_random_jump96(uint32_t dist);
+uint64_t ha_random64();
+
+static inline uint32_t ha_random32()
+{
+	return ha_random64() >> 32;
+}
+
+static inline int32_t ha_random()
+{
+	return ha_random32() >> 1;
+}
+
 /* HAP_STRING() makes a string from a literal while HAP_XSTRING() first
  * evaluates the argument and is suited to pass macros.
  *
diff --git a/src/51d.c b/src/51d.c
index 4c16a79..dc1acba 100644
--- a/src/51d.c
+++ b/src/51d.c
@@ -771,7 +771,7 @@
 	free(_51d_property_list);
 
 #ifdef FIFTYONEDEGREES_H_PATTERN_INCLUDED
-	_51d_lru_seed = random();
+	_51d_lru_seed = ha_random();
 	if (global_51degrees.cache_size) {
 		_51d_lru_tree = lru64_new(global_51degrees.cache_size);
 	}
diff --git a/src/backend.c b/src/backend.c
index 6570342..a493b49 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -575,7 +575,7 @@
 	do {
 		prev = curr;
 		/* ensure all 32 bits are covered as long as RAND_MAX >= 65535 */
-		hash = ((uint64_t)random() * ((uint64_t)RAND_MAX + 1)) ^ random();
+		hash = ((uint64_t)ha_random() * ((uint64_t)RAND_MAX + 1)) ^ ha_random();
 		curr = chash_get_server_hash(px, hash, avoid);
 		if (!curr)
 			break;
diff --git a/src/flt_spoe.c b/src/flt_spoe.c
index 4e3bc49..fbd834b 100644
--- a/src/flt_spoe.c
+++ b/src/flt_spoe.c
@@ -269,7 +269,7 @@
 
 	while (byte < 4) {
 		while (bits < 32) {
-			last |= (uint64_t)random() << bits;
+			last |= (uint64_t)ha_random() << bits;
 			bits += rand_max_bits;
 		}
 		rnd[byte++] = last;
@@ -3119,10 +3119,6 @@
 	struct spoe_config *conf = fconf->conf;
 	struct spoe_agent *agent = conf->agent;
 
-	/* Use a != seed per process */
-	if (relative_pid > 1 && tid == 0)
-		srandom(now_ms * pid);
-
 	agent->rt[tid].engine_id = generate_pseudo_uuid();
 	if (agent->rt[tid].engine_id == NULL)
 		return -1;
diff --git a/src/flt_trace.c b/src/flt_trace.c
index c0660ac..223a3ff 100644
--- a/src/flt_trace.c
+++ b/src/flt_trace.c
@@ -487,7 +487,7 @@
 			}
 		}
 		if (data)  {
-			ret = random() % (ret+1);
+			ret = ha_random() % (ret+1);
 			if (!ret || ret >= data)
 				ret = len;
 		}
@@ -516,7 +516,7 @@
 	int ret   = avail;
 
 	if (ret && conf->rand_parsing)
-		ret = random() % (ret+1);
+		ret = ha_random() % (ret+1);
 
 	STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - "
 		   "chunk_len=%llu - next=%u - fwd=%u - avail=%d - consume=%d",
@@ -582,7 +582,7 @@
 	int                  ret  = len;
 
 	if (ret && conf->rand_forwarding)
-		ret = random() % (ret+1);
+		ret = ha_random() % (ret+1);
 
 	STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - "
 		   "len=%u - nxt=%u - fwd=%u - forward=%d",
@@ -613,7 +613,7 @@
 	int                  ret  = avail;
 
 	if (ret && conf->rand_parsing)
-		ret = random() % (ret+1);
+		ret = ha_random() % (ret+1);
 
 	STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - next=%u - avail=%u - consume=%d",
 		   __FUNCTION__,
@@ -633,7 +633,7 @@
 	int                  ret  = len;
 
 	if (ret && conf->rand_forwarding)
-		ret = random() % (ret+1);
+		ret = ha_random() % (ret+1);
 
 	STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - len=%u - fwd=%u - forward=%d",
 		   __FUNCTION__,
diff --git a/src/haproxy.c b/src/haproxy.c
index 1947d0e..c878518 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -1244,6 +1244,10 @@
  * We initialize the current process with the first 32 bits before starting the
  * polling loop, where all this will be changed to have process specific and
  * thread specific sequences.
+ *
+ * Before starting threads, it's still possible to call random() as srandom()
+ * is initialized from this, but after threads and/or processes are started,
+ * only ha_random() is expected to be used to guarantee distinct sequences.
  */
 static void ha_random_boot(char *const *argv)
 {
@@ -1314,6 +1318,7 @@
 	blk_SHA1_Final(boot_seed, &ctx);
 
 	srandom(read_u32(boot_seed));
+	ha_random_seed(boot_seed, sizeof(boot_seed));
 }
 
 /* considers splicing proxies' maxconn, computes the ideal global.maxpipes
@@ -3100,8 +3105,10 @@
 					protocol_unbind_all();
 					exit(1); /* there has been an error */
 				}
-				else if (ret == 0) /* child breaks here */
+				else if (ret == 0) { /* child breaks here */
+					ha_random_jump96(relative_pid);
 					break;
+				}
 				if (pidfd >= 0 && !(global.mode & MODE_MWORKER)) {
 					char pidstr[100];
 					snprintf(pidstr, sizeof(pidstr), "%d\n", ret);
diff --git a/src/memory.c b/src/memory.c
index 409b4e5..fc16339 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -615,7 +615,7 @@
 	int n;
 
 	if (mem_fail_rate > 0 && !(global.mode & MODE_STARTING)) {
-		int randnb = random() % 100;
+		int randnb = ha_random() % 100;
 
 		if (mem_fail_rate > randnb)
 			ret = 1;
diff --git a/src/pattern.c b/src/pattern.c
index 90067cd..df915f5 100644
--- a/src/pattern.c
+++ b/src/pattern.c
@@ -2652,7 +2652,7 @@
 	struct pat_ref *ref, *ref2, *ref3;
 	struct list pr = LIST_HEAD_INIT(pr);
 
-	pat_lru_seed = random();
+	pat_lru_seed = ha_random();
 
 	list_for_each_entry(ref, &pattern_reference, list) {
 		if (ref->unique_id == -1) {
diff --git a/src/peers.c b/src/peers.c
index 4a28db4..3463671 100644
--- a/src/peers.c
+++ b/src/peers.c
@@ -2232,7 +2232,7 @@
 					 * retrying otherwise the other end will do the same and we can loop
 					 * for a while.
 					 */
-					curpeer->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + random() % 2000));
+					curpeer->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000));
 					peer_session_forceshutdown(curpeer);
 				}
 				if (maj_ver != (unsigned int)-1 && min_ver != (unsigned int)-1) {
@@ -2709,7 +2709,7 @@
 									ps->reconnect = tick_add(now_ms, MS_TO_TICKS(PEER_RECONNECT_TIMEOUT));
 								}
 								else  {
-									ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + random() % 2000));
+									ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000));
 									peer_session_forceshutdown(ps);
 									ps->no_hbt++;
 								}
@@ -2765,7 +2765,7 @@
 				 * retrying otherwise the other end will do the same and we can loop
 				 * for a while.
 				 */
-				ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + random() % 2000));
+				ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000));
 				if (ps->appctx) {
 					peer_session_forceshutdown(ps);
 				}
diff --git a/src/sample.c b/src/sample.c
index e708101..77b420c 100644
--- a/src/sample.c
+++ b/src/sample.c
@@ -2946,7 +2946,7 @@
 static int
 smp_fetch_rand(const struct arg *args, struct sample *smp, const char *kw, void *private)
 {
-	smp->data.u.sint = random();
+	smp->data.u.sint = ha_random();
 
 	/* reduce if needed. Don't do a modulo, use all bits! */
 	if (args && args[0].type == ARGT_SINT)
@@ -3158,7 +3158,7 @@
 
 		while (byte < 4) {
 			while (bits < 32) {
-				last |= (uint64_t)random() << bits;
+				last |= (uint64_t)ha_random() << bits;
 				bits += rand_max_bits;
 			}
 			rnd[byte++] = last;
diff --git a/src/standard.c b/src/standard.c
index 59c2914..66e7a61 100644
--- a/src/standard.c
+++ b/src/standard.c
@@ -4349,6 +4349,118 @@
 {
 }
 
+
+/* Random number generator state, see below */
+static uint64_t ha_random_state[2];
+
+/* This is a thread-safe implementation of xoroshiro128** described below:
+ *     http://prng.di.unimi.it/
+ * It features a 2^128 long sequence, returns 64 high-quality bits on each call,
+ * supports fast jumps and passes all common quality tests. It is thread-safe,
+ * uses a double-cas on 64-bit architectures supporting it, and falls back to a
+ * local lock on other ones.
+ */
+uint64_t ha_random64()
+{
+	uint64_t result;
+	uint64_t old[2];
+	uint64_t new[2];
+
+#if defined(USE_THREAD) && (!defined(HA_CAS_IS_8B) || !defined(HA_HAVE_CAS_DW))
+	static HA_SPINLOCK_T rand_lock;
+
+	HA_SPIN_LOCK(OTHER_LOCK, &rand_lock);
+#endif
+
+	old[0] = ha_random_state[0];
+	old[1] = ha_random_state[1];
+
+#if defined(USE_THREAD) && defined(HA_CAS_IS_8B) && defined(HA_HAVE_CAS_DW)
+	do {
+#endif
+		result = rotl64(old[0] * 5, 7) * 9;
+		new[1] = old[0] ^ old[1];
+		new[0] = rotl64(old[0], 24) ^ new[1] ^ (new[1] << 16); // a, b
+		new[1] = rotl64(new[1], 37); // c
+
+#if defined(USE_THREAD) && defined(HA_CAS_IS_8B) && defined(HA_HAVE_CAS_DW)
+	} while (unlikely(!_HA_ATOMIC_DWCAS(ha_random_state, old, new)));
+#else
+	ha_random_state[0] = new[0];
+	ha_random_state[1] = new[1];
+#if defined(USE_THREAD)
+	HA_SPIN_UNLOCK(OTHER_LOCK, &rand_lock);
+#endif
+#endif
+	return result;
+}
+
+/* seeds the random state using up to <len> bytes from <seed>, starting with
+ * the first non-zero byte.
+ */
+void ha_random_seed(const unsigned char *seed, size_t len)
+{
+	size_t pos;
+
+	/* the seed must not be all zeroes, so we pre-fill it with alternating
+	 * bits and overwrite part of them with the block starting at the first
+	 * non-zero byte from the seed.
+	 */
+	memset(ha_random_state, 0x55, sizeof(ha_random_state));
+
+	for (pos = 0; pos < len; pos++)
+		if (seed[pos] != 0)
+			break;
+
+	if (pos == len)
+		return;
+
+	seed += pos;
+	len -= pos;
+
+	if (len > sizeof(ha_random_state))
+		len = sizeof(ha_random_state);
+
+	memcpy(ha_random_state, seed, len);
+}
+
+/* This causes a jump to (dist * 2^96) places in the pseudo-random sequence,
+ * and is equivalent to calling ha_random64() as many times. It is used to
+ * provide non-overlapping sequences of 2^96 numbers (~7*10^28) to up to 2^32
+ * different generators (i.e. different processes after a fork). The <dist>
+ * argument is the distance to jump to and is used in a loop so it rather not
+ * be too large if the processing time is a concern.
+ *
+ * BEWARE: this function is NOT thread-safe and must not be called during
+ * concurrent accesses to ha_random64().
+ */
+void ha_random_jump96(uint32_t dist)
+{
+	while (dist--) {
+		uint64_t s0 = 0;
+		uint64_t s1 = 0;
+		int b;
+
+		for (b = 0; b < 64; b++) {
+			if ((0xd2a98b26625eee7bULL >> b) & 1) {
+				s0 ^= ha_random_state[0];
+				s1 ^= ha_random_state[1];
+			}
+			ha_random64();
+		}
+
+		for (b = 0; b < 64; b++) {
+			if ((0xdddf9b1090aa7ac1ULL >> b) & 1) {
+				s0 ^= ha_random_state[0];
+				s1 ^= ha_random_state[1];
+			}
+			ha_random64();
+		}
+		ha_random_state[0] = s0;
+		ha_random_state[1] = s1;
+	}
+}
+
 /*
  * Local variables:
  *  c-indent-level: 8