MINOR: listener: support another thread dispatch mode: "fair"

This new algorithm for rebalancing incoming connections to multiple
threads is simpler and instead of considering the threads' load, it will
only cycle through all of them, offering a fair share of the traffic to
each thread. It may be well suited for short-lived connections but is
also convenient for very large thread counts where it's not always certain
that the least loaded thread will be found.
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 82721f4..a4c9672 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -3023,16 +3023,23 @@
   clicking). There should be no reason for changing this value. Please check
   tune.ssl.maxrecord below.
 
-tune.listener.multi-queue { on | off }
-  Enables ('on') or disables ('off') the listener's multi-queue accept which
-  spreads the incoming traffic to all threads a "bind" line is allowed to run
-  on instead of taking them for itself. This provides a smoother traffic
+tune.listener.multi-queue { on | fair | off }
+  Enables ('on' / 'fair') or disables ('off') the listener's multi-queue accept
+  which spreads the incoming traffic to all threads a "bind" line is allowed to
+  run on instead of taking them for itself. This provides a smoother traffic
   distribution and scales much better, especially in environments where threads
   may be unevenly loaded due to external activity (network interrupts colliding
-  with one thread for example). This option is enabled by default, but it may
-  be forcefully disabled for troubleshooting or for situations where it is
-  estimated that the operating system already provides a good enough
-  distribution and connections are extremely short-lived.
+  with one thread for example). The default mode, "on", optimizes the choice of
+  a thread by picking from a sample the one with the fewest connections. It is
+  often the best choice when connections are long-lived as it manages to keep
+  all threads busy. A second mode, "fair", instead cycles through all threads
+  regardless of their instant load level. It can be better suited for short-
+  lived connections, or on machines with very large numbers of threads where
+  the probability to find the least loaded thread with the first mode is low.
+  Finally it is possible to forcefully disable the redistribution mechanism
+  using "off" for troubleshooting, or for situations where connections are
+  short-lived and it is estimated that the operating system already provides a
+  good enough distribution. The default is "on".
 
 tune.lua.forced-yield <number>
   This directive forces the Lua engine to execute a yield each <number> of
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h
index b7b00ba..af7f26c 100644
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -66,7 +66,7 @@
 #define GTUNE_USE_SYSTEMD        (1<<10)
 
 #define GTUNE_BUSY_POLLING       (1<<11)
-#define GTUNE_LISTENER_MQ        (1<<12)
+/* unused: (1<<12) */
 #define GTUNE_SET_DUMPABLE       (1<<13)
 #define GTUNE_USE_EVPORTS        (1<<14)
 #define GTUNE_STRICT_LIMITS      (1<<15)
@@ -81,6 +81,9 @@
 #define GTUNE_QUIC_SOCK_PER_CONN (1<<24)
 #define GTUNE_NO_QUIC            (1<<25)
 #define GTUNE_USE_FAST_FWD       (1<<26)
+#define GTUNE_LISTENER_MQ_FAIR   (1<<27)
+#define GTUNE_LISTENER_MQ_OPT    (1<<28)
+#define GTUNE_LISTENER_MQ_ANY    (GTUNE_LISTENER_MQ_FAIR | GTUNE_LISTENER_MQ_OPT)
 
 /* SSL server verify mode */
 enum {
diff --git a/src/haproxy.c b/src/haproxy.c
index fd43c96..5306113 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -187,7 +187,7 @@
 		 }
 	},
 	.tune = {
-		.options = GTUNE_LISTENER_MQ,
+		.options = GTUNE_LISTENER_MQ_OPT,
 		.bufsize = (BUFSIZE + 2*sizeof(void *) - 1) & -(2*sizeof(void *)),
 		.maxrewrite = MAXREWRITE,
 		.reserved_bufs = RESERVED_BUFS,
diff --git a/src/listener.c b/src/listener.c
index d6e58ce..e441eff 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -1099,7 +1099,7 @@
 
 #if defined(USE_THREAD)
 		mask = l->rx.bind_thread & _HA_ATOMIC_LOAD(&tg->threads_enabled);
-		if (atleast2(mask) && (global.tune.options & GTUNE_LISTENER_MQ) && !stopping) {
+		if (atleast2(mask) && (global.tune.options & GTUNE_LISTENER_MQ_ANY) && !stopping) {
 			struct accept_queue_ring *ring;
 			unsigned int t, t0, t1, t2;
 			int base = tg->base;
@@ -1140,6 +1140,14 @@
 					t1 += my_ffsl(m1) - 1;
 				}
 
+				/* if running in round-robin mode ("fair"), we don't need
+				 * to go further.
+				 */
+				if ((global.tune.options & GTUNE_LISTENER_MQ_ANY) == GTUNE_LISTENER_MQ_FAIR) {
+					t = t1;
+					goto updt_t1;
+				}
+
 				if (unlikely(!(m2 & (1UL << t2)) || t1 == t2)) {
 					/* highest bit not set */
 					if (!m2)
@@ -1184,6 +1192,7 @@
 				}
 				else {
 					t = t1;
+				updt_t1:
 					t1++;
 					if (t1 >= LONGBITS)
 						t1 = 0;
@@ -1898,7 +1907,7 @@
 	return 0;
 }
 
-/* config parser for global "tune.listener.multi-queue", accepts "on" or "off" */
+/* config parser for global "tune.listener.multi-queue", accepts "on", "fair" or "off" */
 static int cfg_parse_tune_listener_mq(char **args, int section_type, struct proxy *curpx,
                                       const struct proxy *defpx, const char *file, int line,
                                       char **err)
@@ -1907,11 +1916,13 @@
 		return -1;
 
 	if (strcmp(args[1], "on") == 0)
-		global.tune.options |= GTUNE_LISTENER_MQ;
+		global.tune.options = (global.tune.options & ~GTUNE_LISTENER_MQ_ANY) | GTUNE_LISTENER_MQ_OPT;
+	else if (strcmp(args[1], "fair") == 0)
+		global.tune.options = (global.tune.options & ~GTUNE_LISTENER_MQ_ANY) | GTUNE_LISTENER_MQ_FAIR;
 	else if (strcmp(args[1], "off") == 0)
-		global.tune.options &= ~GTUNE_LISTENER_MQ;
+		global.tune.options &= ~GTUNE_LISTENER_MQ_ANY;
 	else {
-		memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]);
+		memprintf(err, "'%s' expects either 'on', 'fair', or 'off' but got '%s'.", args[0], args[1]);
 		return -1;
 	}
 	return 0;