OPTIM: pool: split the read_mostly from read_write parts in pool_head

Performance profiling on a 48-thread machine showed a lot of time spent
in pool_free(), precisely at the point where pool->limit was retrieved.
The reason is simple: some fields of the pool_head are heavily updated, but
only when facing a cache miss ("allocated", "used", "needed_avg"), while
others are read on every access ("limit", "flags", "size"). Because both
groups of fields were stored in the same cache line, every update of the
former invalidated that line for all other threads, making it very costly
for each thread to read this precious info even when working with its own
cache.

By just splitting the fields apart, a test on QUIC (which stresses pools
a lot) more than doubled performance from 42 Gbps to 96 Gbps!

Given that the patch only reorders fields and addresses such significant
contention, it should be backported to 2.7 and 2.6.
diff --git a/include/haproxy/pool-t.h b/include/haproxy/pool-t.h
index 523bbaf..ff6773c 100644
--- a/include/haproxy/pool-t.h
+++ b/include/haproxy/pool-t.h
@@ -107,20 +107,24 @@
  * alignment could be removed.
  */
 struct pool_head {
-	struct pool_item *free_list; /* list of free shared objects */
-	unsigned int used;	/* how many chunks are currently in use */
-	unsigned int needed_avg;/* floating indicator between used and allocated */
-	unsigned int allocated;	/* how many chunks have been allocated */
+	/* read-mostly part, purely configuration */
 	unsigned int limit;	/* hard limit on the number of chunks */
 	unsigned int minavail;	/* how many chunks are expected to be used */
 	unsigned int size;	/* chunk size */
 	unsigned int flags;	/* MEM_F_* */
 	unsigned int users;	/* number of pools sharing this zone */
-	unsigned int failed;	/* failed allocations */
 	unsigned int alloc_sz;	/* allocated size (includes hidden fields) */
 	struct list list;	/* list of all known pools */
 	void *base_addr;        /* allocation address, for free() */
 	char name[12];		/* name of the pool */
+
+	/* heavily read-write part */
+	THREAD_ALIGN(64);
+	struct pool_item *free_list; /* list of free shared objects */
+	unsigned int used;	/* how many chunks are currently in use */
+	unsigned int needed_avg;/* floating indicator between used and allocated */
+	unsigned int allocated;	/* how many chunks have been allocated */
+	unsigned int failed;	/* failed allocations */
 	struct pool_cache_head cache[MAX_THREADS] THREAD_ALIGNED(64); /* pool caches */
 } __attribute__((aligned(64)));