BUG/MEDIUM: random: implement per-thread and per-process random sequences

As mentioned in the previous patch, the random number generator was never
made thread-safe, which used not to be a problem for health checks
spreading, until the uuid sample fetch function appeared. Currently
it is possible for two threads or processes to produce exactly the
same UUID. In fact it's extremely likely that this will happen for
processes, as can be seen with this config:

    global
        nbproc 8

    frontend f
        bind :4445
        mode http
        log stdout daemon format raw
        log-format "%[uuid] %pid"
        redirect location /

It typically produces this log:

  551ce567-0bfb-4bbd-9b58-cdc7e9365325 30645
  551ce567-0bfb-4bbd-9b58-cdc7e9365325 30641
  551ce567-0bfb-4bbd-9b58-cdc7e9365325 30644
  551ce567-0bfb-4bbd-9b58-cdc7e9365325 30639
  551ce567-0bfb-4bbd-9b58-cdc7e9365325 30646
  07764439-c24d-4e6f-a5a6-0138be59e7a8 30645
  07764439-c24d-4e6f-a5a6-0138be59e7a8 30639
  551ce567-0bfb-4bbd-9b58-cdc7e9365325 30643
  07764439-c24d-4e6f-a5a6-0138be59e7a8 30646
  b6773fdd-678f-4d04-96f2-4fb11ad15d6b 30646
  551ce567-0bfb-4bbd-9b58-cdc7e9365325 30642
  07764439-c24d-4e6f-a5a6-0138be59e7a8 30642

What this patch does is to use a distinct per-thread and per-process
seed to make sure the same sequences will not appear, and will then
extend these seeds by "burning" a number of randoms that depends on
the global random seed, the thread ID and the process ID. This adds
roughly 20 extra bits of randomness, resulting in 52 bits total per
thread and per process.

It only takes a few milliseconds to burn these randoms and given
that threads start with a different seed, we know they will not
catch each other. So these random extra bits are essentially added
to ensure randomness between boots and cluster instances.

This replaces all uses of random() with ha_random() which uses the
thread-local state.

This must be backported as far as 2.0 or any version having the
UUID sample-fetch function since it's the main victim here.

It's important to note that this patch, in addition to depending on
the previous one "BUG/MEDIUM: init: initialize the random pool a bit
better", also depends on the preceding build fixes to address a
circular dependency issue in the include files that prevented it
from building. Part or all of these patches may need to be backported
or adapted as well.
diff --git a/include/common/standard.h b/include/common/standard.h
index a780833..56995f4 100644
--- a/include/common/standard.h
+++ b/include/common/standard.h
@@ -43,6 +43,7 @@
 #include <common/namespace.h>
 #include <eb32tree.h>
 #include <eb32sctree.h>
+#include <types/global.h>
 #include <types/protocol.h>
 
 /* size used for max length of decimal representation of long long int. */
@@ -1530,6 +1531,18 @@
 
 int parse_dotted_uints(const char *s, unsigned int **nums, size_t *sz);
 
+/* Returns a positive random number from the calling thread's own sequence,
+ * whose state (ha_rand_data) is set up by ha_random_init_per_thread(). This is
+ * only a thin wrapper on top of random_r(), so it is limited to 31 bits.
+ */
+static inline int32_t ha_random(void)
+{
+	int32_t r;
+
+	random_r(&ha_rand_data, &r); // no error since our buffer is OK.
+	return r;
+}
+
 /* HAP_STRING() makes a string from a literal while HAP_XSTRING() first
  * evaluates the argument and is suited to pass macros.
  *
diff --git a/include/types/global.h b/include/types/global.h
index 82f1011..994927b 100644
--- a/include/types/global.h
+++ b/include/types/global.h
@@ -240,6 +240,8 @@
 extern unsigned int rlim_fd_max_at_boot;
 extern int atexit_flag;
 extern unsigned char boot_seed[20];  // per-boot random seed (160 bits initially)
+extern THREAD_LOCAL char ha_rand_state[32];          /* opaque 256 bits of random state */
+extern THREAD_LOCAL struct random_data ha_rand_data; /* opaque internal random_r() data */
 
 /* bit values to go with "warned" above */
 /* unassigned : 0x00000001 (previously: WARN_BLOCK_DEPRECATED) */
diff --git a/src/51d.c b/src/51d.c
index b00f018..42d1929 100644
--- a/src/51d.c
+++ b/src/51d.c
@@ -700,7 +700,7 @@
 	free(_51d_property_list);
 
 #ifdef FIFTYONEDEGREES_H_PATTERN_INCLUDED
-	_51d_lru_seed = random();
+	_51d_lru_seed = ha_random();
 	if (global_51degrees.cache_size) {
 		_51d_lru_tree = lru64_new(global_51degrees.cache_size);
 	}
diff --git a/src/backend.c b/src/backend.c
index 251a7a5..87e3b9a 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -541,7 +541,7 @@
 	do {
 		prev = curr;
 		/* ensure all 32 bits are covered as long as RAND_MAX >= 65535 */
-		hash = ((uint64_t)random() * ((uint64_t)RAND_MAX + 1)) ^ random();
+		hash = ((uint64_t)ha_random() * ((uint64_t)RAND_MAX + 1)) ^ ha_random();
 		curr = chash_get_server_hash(px, hash, avoid);
 		if (!curr)
 			break;
diff --git a/src/flt_spoe.c b/src/flt_spoe.c
index d54fcd4..bcdec08 100644
--- a/src/flt_spoe.c
+++ b/src/flt_spoe.c
@@ -269,7 +269,7 @@
 
 	while (byte < 4) {
 		while (bits < 32) {
-			last |= (uint64_t)random() << bits;
+			last |= (uint64_t)ha_random() << bits;
 			bits += rand_max_bits;
 		}
 		rnd[byte++] = last;
@@ -3109,10 +3109,6 @@
 	struct spoe_config *conf = fconf->conf;
 	struct spoe_agent *agent = conf->agent;
 
-	/* Use a != seed per process */
-	if (relative_pid > 1 && tid == 0)
-		srandom(now_ms * pid);
-
 	agent->rt[tid].engine_id = generate_pseudo_uuid();
 	if (agent->rt[tid].engine_id == NULL)
 		return -1;
diff --git a/src/flt_trace.c b/src/flt_trace.c
index 5a26fab..b06ba15 100644
--- a/src/flt_trace.c
+++ b/src/flt_trace.c
@@ -468,7 +468,7 @@
 		unsigned int data = trace_get_htx_datalen(htxbuf(&msg->chn->buf), offset, len);
 
 		if (data) {
-			ret = random() % (ret+1);
+			ret = ha_random() % (ret+1);
 			if (!ret || ret >= data)
 				ret = len;
 		}
@@ -536,7 +536,7 @@
 			unsigned int data = trace_get_htx_datalen(htxbuf(&chn->buf), offset, len);
 
 			if (data) {
-				ret = random() % (ret+1);
+				ret = ha_random() % (ret+1);
 				if (!ret || ret >= data)
 					ret = len;
 			}
@@ -554,7 +554,7 @@
 	else {
 
 		if (ret && conf->rand_forwarding)
-			ret = random() % (ret+1);
+			ret = ha_random() % (ret+1);
 
 		FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - "
 			       "offset=%u - len=%u - forward=%d",
diff --git a/src/haproxy.c b/src/haproxy.c
index ef9010f..8b13d96 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -238,6 +238,8 @@
 
 /* per-boot randomness */
 unsigned char boot_seed[20];        /* per-boot random seed (160 bits initially) */
+THREAD_LOCAL char ha_rand_state[32];          /* opaque 256 bits of random state */
+THREAD_LOCAL struct random_data ha_rand_data; /* opaque internal random_r() data */
 
 struct mworker_proc *proc_self = NULL;
 
@@ -1363,6 +1365,59 @@
 }
 
 
+/* Initializes the per-thread, per-process random state used by ha_random()
+ * through random_r().
+ *
+ * A state cannot be shared between threads because initstate_r() must be
+ * called on each of them to set the per-thread pointer, which reinitializes
+ * the state. Instead, each thread of each process starts from a strictly
+ * different seed derived from boot_seed, relative_pid and tid, then burns a
+ * large, thread-specific number of randoms so that the internal states differ
+ * even more. The boot seed offers 4 billion possible *boot* sequences, and
+ * the burn count adds up to ~20 bits on top of that, i.e. about 2^52 possible
+ * initial sequences per thread.
+ */
+static void ha_random_init_per_thread(void)
+{
+	unsigned int seed;
+	unsigned int loops;
+	uint64_t u64;
+
+	/* recreate a distinct initial state for each process/thread */
+	seed = read_u32(boot_seed);
+
+	/* start with a strictly different seed per thread/process */
+	seed += (relative_pid * MAX_THREADS) + tid;
+
+	memset(&ha_rand_data, 0, sizeof(ha_rand_data));
+	initstate_r(seed, ha_rand_state, sizeof(ha_rand_state), &ha_rand_data);
+
+	/* Make sure all pids and tids get a different count: the boot
+	 * seed is rotated left by the pid then by the tid and folded
+	 * into less than ~1 million loops. Shift counts are masked to
+	 * 0..63 so that a count of 0 or 64 cannot trigger an undefined
+	 * shift. This only takes a few ms per thread and provides ~20
+	 * extra bits of randomness, i.e. ~52 bits per thread per boot.
+	 */
+	loops = read_u32(boot_seed);
+
+	u64 = read_u64(boot_seed + 4);
+	u64 = (u64 << (relative_pid & 63)) | (u64 >> ((64 - relative_pid) & 63));
+	loops ^= u64 ^ (u64 >> 32);
+
+	u64 = read_u64(boot_seed + 12);
+	u64 = (u64 << (tid & 63)) | (u64 >> ((64 - tid) & 63));
+	loops ^= u64 ^ (u64 >> 32);
+	loops %= 1048573;
+
+	/* burn some randoms to mix the internal state */
+	while (loops--) {
+		int32_t drop;
+
+		(void)random_r(&ha_rand_data, &drop);
+	}
+}
+
 /* Performs basic random seed initialization. The main issue with this is that
  * srandom_r() only takes 32 bits and purposely provides a reproducible sequence,
  * which means that there will only be 4 billion possible random sequences once
@@ -1374,6 +1429,10 @@
  * We initialize the current process with the first 32 bits before starting the
  * polling loop, where all this will be changed to have process specific and
  * thread specific sequences.
+ *
+ * Before starting threads, it's still possible to call random() as srandom()
+ * is initialized from this, but after threads and/or processes are started,
+ * only ha_random() is expected to be used to guarantee distinct sequences.
  */
 static void ha_random_boot(char *const *argv)
 {
@@ -1444,6 +1503,7 @@
 	blk_SHA1_Final(boot_seed, &ctx);
 
 	srandom(read_u32(boot_seed));
+	ha_random_init_per_thread();
 }
 
 /* considers splicing proxies' maxconn, computes the ideal global.maxpipes
@@ -2780,6 +2840,9 @@
 	ti->clock_id = CLOCK_THREAD_CPUTIME_ID;
 #endif
 #endif
+	/* assign per-process, per-thread randomness */
+	ha_random_init_per_thread();
+
 	/* Now, initialize one thread init at a time. This is better since
 	 * some init code is a bit tricky and may release global resources
 	 * after reallocating them locally. This will also ensure there is
diff --git a/src/memory.c b/src/memory.c
index d1aec59..0ff3ea8 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -628,7 +628,7 @@
 	int n;
 
 	if (mem_fail_rate > 0 && !(global.mode & MODE_STARTING)) {
-		int randnb = random() % 100;
+		int randnb = ha_random() % 100;
 
 		if (mem_fail_rate > randnb)
 			ret = 1;
diff --git a/src/pattern.c b/src/pattern.c
index 8dfe3cf..3ea1f33 100644
--- a/src/pattern.c
+++ b/src/pattern.c
@@ -2667,7 +2667,7 @@
 	struct pat_ref *ref, **arr;
 	struct list pr = LIST_HEAD_INIT(pr);
 
-	pat_lru_seed = random();
+	pat_lru_seed = ha_random();
 
 	/* Count pat_refs with user defined unique_id and totalt count */
 	list_for_each_entry(ref, &pattern_reference, list) {
diff --git a/src/peers.c b/src/peers.c
index f5a4f18..640a99f 100644
--- a/src/peers.c
+++ b/src/peers.c
@@ -2232,7 +2232,7 @@
 					 * retrying otherwise the other end will do the same and we can loop
 					 * for a while.
 					 */
-					curpeer->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + random() % 2000));
+					curpeer->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000));
 					peer_session_forceshutdown(curpeer);
 				}
 				if (maj_ver != (unsigned int)-1 && min_ver != (unsigned int)-1) {
@@ -2685,7 +2685,7 @@
 									ps->reconnect = tick_add(now_ms, MS_TO_TICKS(PEER_RECONNECT_TIMEOUT));
 								}
 								else  {
-									ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + random() % 2000));
+									ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000));
 									peer_session_forceshutdown(ps);
 									ps->no_hbt++;
 								}
@@ -2741,7 +2741,7 @@
 				 * retrying otherwise the other end will do the same and we can loop
 				 * for a while.
 				 */
-				ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + random() % 2000));
+				ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000));
 				if (ps->appctx) {
 					peer_session_forceshutdown(ps);
 				}
diff --git a/src/sample.c b/src/sample.c
index 3c61112..fd63902 100644
--- a/src/sample.c
+++ b/src/sample.c
@@ -3124,7 +3124,7 @@
 static int
 smp_fetch_rand(const struct arg *args, struct sample *smp, const char *kw, void *private)
 {
-	smp->data.u.sint = random();
+	smp->data.u.sint = ha_random();
 
 	/* reduce if needed. Don't do a modulo, use all bits! */
 	if (args && args[0].type == ARGT_SINT)
@@ -3336,7 +3336,7 @@
 
 		while (byte < 4) {
 			while (bits < 32) {
-				last |= (uint64_t)random() << bits;
+				last |= (uint64_t)ha_random() << bits;
 				bits += rand_max_bits;
 			}
 			rnd[byte++] = last;