MEDIUM: pools: move the cache into the pool header

Initially per-thread pool caches were stored in a fixed-size array. But
this was a bit ugly because pools registered once that array was full
could not benefit from the cache at all. As a workaround to preserve
performance, the default size was set to 64 cacheable pools (there
are 51 pools at the moment, excluding any addon and debugging code),
so that all in-tree pools were covered, at the expense of higher memory
usage.

In addition, an index had to be calculated for each pool, and was used
to access the pool's cache head in that array. This index was not even
stored in the pool itself, so it had to be recomputed in order to reach
the cache even when the pool was already known.
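
For reference, the old scheme looked roughly like this (simplified from
the code removed below); callers first had to check for a -1 index
before touching the cache:

    /* Old scheme (simplified): pools carved out of a static array,
     * with the per-thread caches in a parallel 2D array indexed by
     * thread ID and pool index.
     */
    struct pool_head pool_base_start[MAX_BASE_POOLS];
    struct pool_cache_head pool_cache[MAX_THREADS][MAX_BASE_POOLS];

    /* returns the index of pool <pool>, or -1 if it has no index */
    static inline ssize_t pool_get_index(const struct pool_head *pool)
    {
        size_t idx = pool - pool_base_start;

        return (idx < MAX_BASE_POOLS) ? (ssize_t)idx : -1;
    }

    /* cache access then required that index: */
    struct pool_cache_head *ph = &pool_cache[tid][idx];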

This patch changes this by moving the pool cache head into the pool
head itself. This way each pool is guaranteed to have its own cache,
and the index calculation is not needed anymore.
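
Concretely, the lookup reduces to a direct member access, and the
eviction path can recover the owning pool from a cache head by pointer
arithmetic. A sketch of the two access patterns introduced by this
patch (the "owner" name is purely illustrative):

    /* New scheme: the cache head is embedded, no index needed */
    struct pool_cache_head *ph = &pool->cache[tid];

    /* Reverse mapping used by pool_evict_from_cache(): since
     * ph == &pool->cache[tid], ph - tid == &pool->cache[0], and
     * container_of() walks back to the enclosing pool_head.
     */
    struct pool_head *owner = container_of(ph - tid, struct pool_head, cache);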

The pool cache head is 32 bytes long, so it was aligned to 64 bytes to
avoid false sharing between threads. The extra cost is not huge (~2kB
more per pool than before), and we'll make better use of that space
soon. The pool cache head still contains the size, which should
probably be removed since it is already present in the pool's head.
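
The estimate follows from the padding: each per-thread head grows from
32 to 64 bytes, i.e. 32 extra bytes per thread per pool. A quick
back-of-the-envelope check, assuming the default MAX_THREADS of 64:

    /* Standalone sketch of the per-pool memory cost (assumes
     * MAX_THREADS == 64, the build-time default).
     */
    #include <stdio.h>

    #define MAX_THREADS 64

    int main(void)
    {
        unsigned old_head = 32;  /* unpadded struct pool_cache_head    */
        unsigned new_head = 64;  /* THREAD_ALIGNED(64), one cache line */
        unsigned extra    = (new_head - old_head) * MAX_THREADS;

        printf("extra per pool: %u bytes (~2kB)\n", extra);  /* 2048 */
        return 0;
    }
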
diff --git a/include/haproxy/pool-t.h b/include/haproxy/pool-t.h
index bc54c2d..b6d4d31 100644
--- a/include/haproxy/pool-t.h
+++ b/include/haproxy/pool-t.h
@@ -75,10 +75,6 @@
 #define POOL_LINK(pool, item) ((void **)(item))
 #endif
 
-#ifndef MAX_BASE_POOLS
-#define MAX_BASE_POOLS 64
-#endif
-
 #define POOL_AVG_SAMPLES 1024
 
 /* possible flags for __pool_alloc() */
@@ -90,7 +86,7 @@
 	struct list list;    /* head of objects in this pool */
 	size_t size;         /* size of an object */
 	unsigned int count;  /* number of objects in this pool */
-};
+} THREAD_ALIGNED(64);
 
 struct pool_cache_item {
 	struct list by_pool; /* link to objects in this pool */
@@ -122,6 +118,9 @@
 	unsigned int failed;	/* failed allocations */
 	struct list list;	/* list of all known pools */
 	char name[12];		/* name of the pool */
+#ifdef CONFIG_HAP_LOCAL_POOLS
+	struct pool_cache_head cache[MAX_THREADS]; /* pool caches */
+#endif
 } __attribute__((aligned(64)));
 
 #endif /* _HAPROXY_POOL_T_H */
diff --git a/include/haproxy/pool.h b/include/haproxy/pool.h
index b9865a2..ee97135 100644
--- a/include/haproxy/pool.h
+++ b/include/haproxy/pool.h
@@ -75,39 +75,20 @@
 
 /****************** Thread-local cache management ******************/
 
-extern struct pool_head pool_base_start[MAX_BASE_POOLS];
-extern unsigned int pool_base_count;
-extern struct pool_cache_head pool_cache[][MAX_BASE_POOLS];
 extern THREAD_LOCAL size_t pool_cache_bytes;   /* total cache size */
 extern THREAD_LOCAL size_t pool_cache_count;   /* #cache objects   */
 
 void pool_evict_from_cache();
 
-/* returns the pool index for pool <pool>, or -1 if this pool has no index */
-static inline ssize_t pool_get_index(const struct pool_head *pool)
-{
-	size_t idx;
-
-	idx = pool - pool_base_start;
-	if (idx < MAX_BASE_POOLS)
-		return idx;
-	return -1;
-}
-
 /* Tries to retrieve an object from the local pool cache corresponding to pool
  * <pool>. Returns NULL if none is available.
  */
 static inline void *__pool_get_from_cache(struct pool_head *pool)
 {
-	ssize_t idx = pool_get_index(pool);
 	struct pool_cache_item *item;
 	struct pool_cache_head *ph;
 
-	/* pool not in cache */
-	if (idx < 0)
-		return NULL;
-
-	ph = &pool_cache[tid][idx];
+	ph = &pool->cache[tid];
 	if (LIST_ISEMPTY(&ph->list))
 		return NULL; // empty
 
@@ -127,10 +108,10 @@
 /* Frees an object to the local cache, possibly pushing oldest objects to the
  * global pool.
  */
-static inline void pool_put_to_cache(struct pool_head *pool, void *ptr, ssize_t idx)
+static inline void pool_put_to_cache(struct pool_head *pool, void *ptr)
 {
 	struct pool_cache_item *item = (struct pool_cache_item *)ptr;
-	struct pool_cache_head *ph = &pool_cache[tid][idx];
+	struct pool_cache_head *ph = &pool->cache[tid];
 
 	LIST_ADD(&ph->list, &item->by_pool);
 	LIST_ADD(&ti->pool_lru_head, &item->by_lru);
@@ -142,11 +123,6 @@
 		pool_evict_from_cache();
 }
 
-#else // CONFIG_HAP_LOCAL_POOLS
-
-/* always return index -1 when thread-local pools are disabled */
-#define pool_get_index(pool) ((ssize_t)-1)
-
 #endif // CONFIG_HAP_LOCAL_POOLS
 
 
@@ -346,8 +322,6 @@
 static inline void pool_free(struct pool_head *pool, void *ptr)
 {
         if (likely(ptr != NULL)) {
-		ssize_t idx __maybe_unused;
-
 #ifdef DEBUG_MEMORY_POOLS
 		/* we'll get late corruption if we refill to the wrong pool or double-free */
 		if (*POOL_LINK(pool, ptr) != (void *)pool)
@@ -361,11 +335,9 @@
 		 * many objects yet in this pool (no more than half of the cached
 		 * is used or this pool uses no more than 1/8 of the cache size).
 		 */
-		idx = pool_get_index(pool);
-		if (idx >= 0 &&
-		    (pool_cache_bytes <= CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4 ||
-		     pool_cache[tid][idx].count < 16 + pool_cache_count / 8)) {
-			pool_put_to_cache(pool, ptr, idx);
+		if ((pool_cache_bytes <= CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4 ||
+		     pool->cache[tid].count < 16 + pool_cache_count / 8)) {
+			pool_put_to_cache(pool, ptr);
 			return;
 		}
 #endif
diff --git a/src/pool.c b/src/pool.c
index cdb3eef..4f367d0 100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -28,14 +28,7 @@
 
 
 #ifdef CONFIG_HAP_LOCAL_POOLS
-/* These are the most common pools, expected to be initialized first. These
- * ones are allocated from an array, allowing to map them to an index.
- */
-struct pool_head pool_base_start[MAX_BASE_POOLS] = { };
-unsigned int pool_base_count = 0;
-
 /* These ones are initialized per-thread on startup by init_pools() */
-struct pool_cache_head pool_cache[MAX_THREADS][MAX_BASE_POOLS];
 THREAD_LOCAL size_t pool_cache_bytes = 0;                /* total cache size */
 THREAD_LOCAL size_t pool_cache_count = 0;                /* #cache objects   */
 #endif
@@ -60,7 +53,7 @@
 	struct pool_head *entry;
 	struct list *start;
 	unsigned int align;
-	int idx __maybe_unused;
+	int thr __maybe_unused;
 
 	/* We need to store a (void *) at the end of the chunks. Since we know
 	 * that the malloc() function will never return such a small size,
@@ -103,21 +96,6 @@
 	}
 
 	if (!pool) {
-#ifdef CONFIG_HAP_LOCAL_POOLS
-		if (pool_base_count < MAX_BASE_POOLS)
-			pool = &pool_base_start[pool_base_count++];
-
-		if (!pool) {
-			/* look for a freed entry */
-			for (entry = pool_base_start; entry != pool_base_start + MAX_BASE_POOLS; entry++) {
-				if (!entry->size) {
-					pool = entry;
-					break;
-				}
-			}
-		}
-#endif
-
 		if (!pool)
 			pool = calloc(1, sizeof(*pool));
 
@@ -131,12 +109,9 @@
 
 #ifdef CONFIG_HAP_LOCAL_POOLS
 		/* update per-thread pool cache if necessary */
-		idx = pool_get_index(pool);
-		if (idx >= 0) {
-			int thr;
-
-			for (thr = 0; thr < MAX_THREADS; thr++)
-				pool_cache[thr][idx].size = size;
+		for (thr = 0; thr < MAX_THREADS; thr++) {
+			LIST_INIT(&pool->cache[thr].list);
+			pool->cache[thr].size = size;
 		}
 #endif
 		HA_SPIN_INIT(&pool->lock);
@@ -153,6 +128,7 @@
 {
 	struct pool_cache_item *item;
 	struct pool_cache_head *ph;
+	struct pool_head *pool;
 
 	do {
 		item = LIST_PREV(&ti->pool_lru_head, struct pool_cache_item *, by_lru);
@@ -160,12 +136,13 @@
 		 * oldest in their own pools, thus their next is the pool's head.
 		 */
 		ph = LIST_NEXT(&item->by_pool, struct pool_cache_head *, list);
+		pool = container_of(ph - tid, struct pool_head, cache);
 		LIST_DEL(&item->by_pool);
 		LIST_DEL(&item->by_lru);
 		ph->count--;
 		pool_cache_count--;
 		pool_cache_bytes -= ph->size;
-		__pool_free(pool_base_start + (ph - pool_cache[tid]), item);
+		__pool_free(pool, item);
 	} while (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 7 / 8);
 }
 #endif
@@ -506,13 +483,8 @@
 #ifndef CONFIG_HAP_LOCKLESS_POOLS
 			HA_SPIN_DESTROY(&pool->lock);
 #endif
-
-#ifdef CONFIG_HAP_LOCAL_POOLS
-			if ((pool - pool_base_start) < MAX_BASE_POOLS)
-				memset(pool, 0, sizeof(*pool));
-			else
-#endif
-				free(pool);
+			/* note that if used == 0, the cache is empty */
+			free(pool);
 		}
 	}
 	return NULL;
@@ -540,11 +512,11 @@
 #ifndef CONFIG_HAP_LOCKLESS_POOLS
 		HA_SPIN_LOCK(POOL_LOCK, &entry->lock);
 #endif
-		chunk_appendf(&trash, "  - Pool %s (%u bytes) : %u allocated (%u bytes), %u used, needed_avg %u, %u failures, %u users, @%p=%02d%s\n",
+		chunk_appendf(&trash, "  - Pool %s (%u bytes) : %u allocated (%u bytes), %u used, needed_avg %u, %u failures, %u users, @%p%s\n",
 			 entry->name, entry->size, entry->allocated,
 		         entry->size * entry->allocated, entry->used,
 		         swrate_avg(entry->needed_avg, POOL_AVG_SAMPLES), entry->failed,
-			 entry->users, entry, (int)pool_get_index(entry),
+			 entry->users, entry,
 			 (entry->flags & MEM_F_SHARED) ? " [SHARED]" : "");
 
 		allocated += entry->allocated * entry->size;
@@ -632,13 +604,9 @@
 static void init_pools()
 {
 #ifdef CONFIG_HAP_LOCAL_POOLS
-	int thr, idx;
+	int thr;
 
 	for (thr = 0; thr < MAX_THREADS; thr++) {
-		for (idx = 0; idx < MAX_BASE_POOLS; idx++) {
-			LIST_INIT(&pool_cache[thr][idx].list);
-			pool_cache[thr][idx].size = 0;
-		}
 		LIST_INIT(&ha_thread_info[thr].pool_lru_head);
 	}
 #endif