MINOR: global: add option to disable numa detection

Make NUMA detection optional via the global configuration statement
'no numa-cpu-mapping'. This can be used when the affinity applied by the
detection algorithm is not optimal. Also update the documentation to
describe this new keyword.
diff --git a/doc/configuration.txt b/doc/configuration.txt
index c50700f..8827f9b 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -919,6 +919,7 @@
    - nbproc
    - nbthread
    - node
+   - numa-cpu-mapping
    - pidfile
    - pp2-never-send-local
    - presetenv
@@ -1540,6 +1541,17 @@
   like "taskset" or "cpuset". Otherwise, this value defaults to 1. The default
   value is reported in the output of "haproxy -vv". See also "nbproc".
 
+numa-cpu-mapping
+  By default, when running on Linux, haproxy inspects the CPU topology of the
+  machine on startup. If a multi-socket machine is detected, the affinity is
+  automatically calculated to run on the CPUs of a single node. This is done in
+  order to avoid the performance penalties caused by the inter-socket bus
+  latency. However, if the applied binding is not optimal on a particular
+  architecture, it can be disabled with the statement 'no numa-cpu-mapping'.
+  This automatic binding is also not applied if a nbthread statement is present
+  in the configuration, or if the affinity of the process is already specified,
+  for example via the 'cpu-map' directive or the taskset utility.
+
 pidfile <pidfile>
   Writes PIDs of all daemons into file <pidfile> when daemon mode or writes PID
   of master process into file <pidfile> when master-worker mode. This option is
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h
index bc734a6..03a6a50 100644
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -166,6 +166,7 @@
 		struct hap_cpuset thread[MAX_THREADS];  /* list of CPU masks for the 32/64 first threads of the 1st process */
 	} cpu_map;
 #endif
+	int numa_cpu_mapping;
 	/* The info above is config stuff, it doesn't change during the process' life */
 	/* A number of the elements below are updated by all threads in real time and
 	 * suffer high contention, so we need to put them in their own cache lines, if
diff --git a/src/cfgparse-global.c b/src/cfgparse-global.c
index 89e3b10..47de32a 100644
--- a/src/cfgparse-global.c
+++ b/src/cfgparse-global.c
@@ -42,7 +42,8 @@
 	"log-send-hostname", "server-state-base", "server-state-file",
 	"log-tag", "spread-checks", "max-spread-checks", "cpu-map", "setenv",
 	"presetenv", "unsetenv", "resetenv", "strict-limits", "localpeer",
-	"defaults", "listen", "frontend", "backend", "peers", "resolvers",
+	"numa-cpu-mapping", "defaults", "listen", "frontend", "backend",
+	"peers", "resolvers",
 	NULL /* must be last */
 };
 
@@ -1288,6 +1289,9 @@
 		}
 		setenv("HAPROXY_LOCALPEER", localpeer, 1);
 	}
+	else if (strcmp(args[0], "numa-cpu-mapping") == 0) {
+		global.numa_cpu_mapping = (kwm == KWM_NO) ? 0 : 1;
+	}
 	else {
 		struct cfg_kw_list *kwl;
 		const char *best;
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 25820bc..507d072 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -1846,10 +1846,12 @@
 		if (kwm != KWM_STD && strcmp(args[0], "option") != 0 &&
 		    strcmp(args[0], "log") != 0 && strcmp(args[0], "busy-polling") != 0 &&
 		    strcmp(args[0], "set-dumpable") != 0 && strcmp(args[0], "strict-limits") != 0 &&
-		    strcmp(args[0], "insecure-fork-wanted") != 0) {
+		    strcmp(args[0], "insecure-fork-wanted") != 0 &&
+		    strcmp(args[0], "numa-cpu-mapping") != 0) {
 			ha_alert("parsing [%s:%d]: negation/default currently "
 				 "supported only for options, log, busy-polling, "
-				 "set-dumpable, strict-limits, and insecure-fork-wanted.\n", file, linenum);
+				 "set-dumpable, strict-limits, insecure-fork-wanted "
+				 "and numa-cpu-mapping.\n", file, linenum);
 			err_code |= ERR_ALERT | ERR_FATAL;
 			fatal++;
 		}
@@ -2189,7 +2191,7 @@
 		if (global.nbproc == 1) {
 			int numa_cores = 0;
 #if defined(__linux__) && defined USE_CPU_AFFINITY
-			if (!thread_cpu_mask_forced())
+			if (global.numa_cpu_mapping && !thread_cpu_mask_forced())
 				numa_cores = numa_detect_topology();
 #endif
 			global.nbthread = numa_cores ? numa_cores :
diff --git a/src/haproxy.c b/src/haproxy.c
index 58e9e62..b89c517 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -160,6 +160,7 @@
 struct global global = {
 	.hard_stop_after = TICK_ETERNITY,
 	.nbproc = 1,
+	.numa_cpu_mapping = 1,
 	.nbthread = 0,
 	.req_count = 0,
 	.logsrvs = LIST_HEAD_INIT(global.logsrvs),