MINOR: pool: make the thread-local hot cache size configurable

Until now it was only possible to change the thread-local hot cache size
at build time using CONFIG_HAP_POOL_CACHE_SIZE. But during benchmarks,
huge contention was sometimes noticed in the lower-level memory
allocators, indicating that larger caches could be beneficial, especially
on machines whose CPUs have large L2 caches.

Given that the checks against this value are no longer on a hot path,
there was no reason to keep forcing it to be tuned at build time. This
patch therefore allows it to be set via the new "tune.memory.hot-size"
global setting.

It's worth noting that during the boot phase the value remains zero, so
it is possible to know whether it was set. This opens the possibility
of automatically adjusting it later based on the per-CPU L2 cache size
or the use of certain protocols (none of this is done yet).
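
For example (an illustrative value, assuming cores with 1 MB of private
L2 cache), setting the hot cache to once the L2 size would look like:

    global
        tune.memory.hot-size 1048576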
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 0cc2bde..b66f75d 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -1126,6 +1126,7 @@
    - tune.maxaccept
    - tune.maxpollevents
    - tune.maxrewrite
+   - tune.memory.hot-size
    - tune.pattern.cache-size
    - tune.peers.max-updates-at-once
    - tune.pipesize
@@ -2983,6 +2984,28 @@
   larger than that. This means you don't have to worry about it when changing
   bufsize.
 
+tune.memory.hot-size <number>
+  Sets the per-thread amount of memory that will be kept hot in the local cache
+  and will never be recoverable by other threads. Access to this memory is very
+  fast (lockless), and having enough of it is critical to maintaining a good
+  performance level under extreme thread contention. The value is expressed in
+  bytes, and the default is configured at build time via
+  CONFIG_HAP_POOL_CACHE_SIZE, which defaults to 524288 (512 kB). A larger value
+  may increase performance in some usage scenarios, especially when performance
+  profiles show that memory allocation is stressed a lot. Experience shows that
+  a good value sits between once and twice the per-core L2 cache size. Overly
+  large values will have a negative impact on performance by making inefficient
+  use of the CPUs' L3 caches, and will consume larger amounts of memory. It is
+  recommended not to change this value, or to proceed only in small increments.
+  In order to completely disable the per-thread memory caches, a very small
+  value could work, but it is better to use "-dMno-cache" on the command line.
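+
+  For example, assuming cores with 1 MB of private L2 cache each, a reasonable
+  starting point would be once that size, expressed in bytes:
+
+  Example:
+        tune.memory.hot-size 1048576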
+
 tune.pattern.cache-size <number>
   Sets the size of the pattern lookup cache to <number> entries. This is an LRU
   cache which reminds previous lookups and their results. It is used by ACLs
diff --git a/doc/internals/api/pools.txt b/doc/internals/api/pools.txt
index 4023dc3..480cf24 100644
--- a/doc/internals/api/pools.txt
+++ b/doc/internals/api/pools.txt
@@ -124,13 +124,20 @@
 "-dMfail". In this case the desired average rate of allocation failures can be
 fixed by global setting "tune.fail-alloc" expressed in percent.
 
-The thread-local caches contain the freshest objects whose total size amounts
-to CONFIG_HAP_POOL_CACHE_SIZE bytes, which is typically was 1MB before 2.6 and
-is 512kB after. The aim is to keep hot objects that still fit in the CPU core's
-private L2 cache. Once these objects do not fit into the cache anymore, there's
-no benefit keeping them local to the thread, so they'd rather be returned to
-the shared pool or the main allocator so that any other thread may make use of
-them.
+The thread-local caches contain the freshest objects. Their total size amounts
+to the number of bytes set in global.tune.pool_cache_size, which defaults to
+the build-time setting CONFIG_HAP_POOL_CACHE_SIZE (1 MB before 2.6, 512 kB
+since) and may be adjusted with the "tune.memory.hot-size" global option. The
+aim is to keep hot objects that still fit in the CPU core's private L2 cache.
+Once these objects no longer fit into the cache, there is no benefit in
+keeping them local to the thread, so they'd rather be returned to the shared
+pool or the main allocator so that any other thread may make use of them.
+Under extreme thread contention the cost of accessing shared structures in
+the global cache or in malloc() may still be significant and it may prove
+useful to increase the thread-local cache size.
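+
+For example, with the default 512 kB cache size and the default 16 kB buffer
+size, a thread's local cache can hold at most 32 buffers (524288 / 16384).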
 
 
 3. Storage in thread-local caches
@@ -563,14 +567,21 @@
         boot-time option "-dMno-global".
 
 CONFIG_HAP_POOL_CACHE_SIZE
-        This allows one to define the size of the per-thread cache, in bytes.
-        The default value is 512 kB (524288). Smaller values will use less
-        memory at the expense of a possibly higher CPU usage when using many
-        threads. Higher values will give diminishing returns on performance
-        while using much more memory. Usually there is no benefit in using
-        more than a per-core L2 cache size. It would be better not to set this
-        value lower than a few times the size of a buffer (bufsize, defaults to
-        16 kB).
+        This allows one to define the default size of the per-thread cache, in
+        bytes. The default value is 512 kB (524288). Smaller values will use
+        less memory at the expense of a possibly higher CPU usage when using
+        many threads. Higher values will give diminishing returns on
+        performance while using much more memory. Usually there is no benefit
+        in using more than a per-core L2 cache size. It would be better not to
+        set this value lower than a few times the size of a buffer (bufsize,
+        defaults to 16 kB). In addition, keep in mind that this default may be
+        overridden from the configuration using "tune.memory.hot-size".
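+
+        For example, assuming the usual DEFINE make variable is used to pass
+        extra build-time macros, the default could be doubled with:
+
+            $ make TARGET=linux-glibc \
+                   DEFINE="-DCONFIG_HAP_POOL_CACHE_SIZE=1048576"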
 
 CONFIG_HAP_POOL_CLUSTER_SIZE
         This allows one to define the maximum number of objects that will be
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h
index 2e9b61b..11f4b2c 100644
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -160,6 +160,7 @@
 		int pool_high_ratio;  /* max ratio of FDs used before we start killing idle connections when creating new connections */
 		int pool_low_count;   /* max number of opened fd before we stop using new idle connections */
 		int pool_high_count;  /* max number of opened fd before we start killing idle connections when creating new connections */
+		size_t pool_cache_size;    /* per-thread hot cache size (defaults to CONFIG_HAP_POOL_CACHE_SIZE) */
 		unsigned short idle_timer; /* how long before an empty buffer is considered idle (ms) */
 #ifdef USE_QUIC
 		unsigned int quic_backend_max_idle_timeout;
diff --git a/src/haproxy.c b/src/haproxy.c
index 178f274..68c7842 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -2670,6 +2670,14 @@
 
 	if (!hlua_post_init())
 		exit(1);
+
+	/* Set the per-thread pool cache size to the default value if not set.
+	 * This is the right place to decide to automatically adjust it (e.g.
+	 * based on the L2 cache size, the thread count, or the use of
+	 * certain expensive pools).
+	 */
+	if (!global.tune.pool_cache_size)
+		global.tune.pool_cache_size = CONFIG_HAP_POOL_CACHE_SIZE;
 }
 
 void deinit(void)
diff --git a/src/pool.c b/src/pool.c
index e225d21..df9d060 100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -517,7 +517,7 @@
 	while ((ph->count && full) ||
 	       (ph->count >= CONFIG_HAP_POOL_CLUSTER_SIZE &&
 	        ph->count >= 16 + pool_cache_count / 8 &&
-	        pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+	        pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
 		pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
 	}
 }
@@ -546,7 +546,7 @@
 		BUG_ON(pool != ph->pool);
 
 		pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
-	} while (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 7 / 8);
+	} while (pool_cache_bytes > global.tune.pool_cache_size * 7 / 8);
 }
 
 /* Frees an object to the local cache, possibly pushing oldest objects to the
@@ -572,10 +572,14 @@
 	pool_cache_count++;
 	pool_cache_bytes += pool->size;
 
-	if (unlikely(pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
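+	/* Beyond 3/4 of the configured hot size, start evicting from this
+	 * pool's cache, and beyond the full size evict from all of this
+	 * thread's local caches until usage falls back under 7/8 of it.
+	 */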
+	if (unlikely(pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
 		if (ph->count >= 16 + pool_cache_count / 8 + CONFIG_HAP_POOL_CLUSTER_SIZE)
 			pool_evict_from_local_cache(pool, 0);
-		if (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE)
+		if (pool_cache_bytes > global.tune.pool_cache_size)
 			pool_evict_from_local_caches();
 	}
 }
@@ -790,7 +790,8 @@
 	}
 #endif
 
-	if (unlikely(pool_debugging & POOL_DBG_NO_CACHE)) {
+	if (unlikely((pool_debugging & POOL_DBG_NO_CACHE) ||
+		     global.tune.pool_cache_size < pool->size)) {
 		pool_free_nocache(pool, ptr);
 		return;
 	}
@@ -1211,6 +1212,27 @@
 	return 0;
 }
 
+/* config parser for global "tune.memory.hot-size" */
+static int mem_parse_global_hot_size(char **args, int section_type, struct proxy *curpx,
+                                     const struct proxy *defpx, const char *file, int line,
+                                     char **err)
+{
+	long size;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
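+	/* the value is a raw number of bytes; unit suffixes are not parsed */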
+	size = atol(args[1]);
+	if (size <= 0) {
+		memprintf(err, "'%s' expects a strictly positive value.", args[0]);
+		return -1;
+	}
+
+	global.tune.pool_cache_size = size;
+	return 0;
+}
+
 /* config parser for global "no-memory-trimming" */
 static int mem_parse_global_no_mem_trim(char **args, int section_type, struct proxy *curpx,
                                        const struct proxy *defpx, const char *file, int line,
@@ -1225,6 +1246,7 @@
 /* register global config keywords */
 static struct cfg_kw_list mem_cfg_kws = {ILH, {
 	{ CFG_GLOBAL, "tune.fail-alloc", mem_parse_global_fail_alloc },
+	{ CFG_GLOBAL, "tune.memory.hot-size", mem_parse_global_hot_size },
 	{ CFG_GLOBAL, "no-memory-trimming", mem_parse_global_no_mem_trim },
 	{ 0, NULL, NULL }
 }};