MEDIUM: server: improve estimate of the need for idle connections

Starting with commit 079cb9a ("MEDIUM: connections: Revamp the way idle
connections are killed") we started to improve the way to compute the
need for idle connections. But the condition to keep a connection idle
or drop it when releasing it was not updated. This often results in
storms of close() calls when certain thresholds are met, and in long
series of takeover() calls when there aren't enough connections left
for a thread on a server.

This patch tries to improve the situation this way:
  - it keeps an estimate of the number of connections needed for a server.
    This estimate is a copy of the max over the previous purge period, or
    the max of what is seen over the current period; it differs from
    max_used_conns in that the latter is a counter that's reset on each
    purge period;

  - when releasing, if the number of current idle+used connections is
    lower than this last estimate, then we'll keep the connection;

  - when releasing, if the current thread's idle conns head is empty,
    and the number of idle+used connections stays below the estimate
    plus the number of threads, then we'll keep the connection;

  - when cleaning up connections, we consider the max of the last two
    periods to avoid killing too many idle conns when facing bursty
    traffic.

Thanks to this we can better converge towards a situation where, provided
there are enough FDs, each active server keeps at least one idle connection
per thread all the time, with a total number close to what was needed over
the previous measurement period (as defined by pool-purge-delay).

On tests with large numbers of concurrent connections (30k) and many
servers (200), this has quite smoothed the CPU usage pattern, increased
the reuse rate and roughly halved the takeover rate.
diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h
index 48ed176..53938c0 100644
--- a/include/haproxy/server-t.h
+++ b/include/haproxy/server-t.h
@@ -232,6 +232,7 @@
 	unsigned int curr_safe_nb;              /* Current number of connections in the safe list */
 	unsigned int curr_used_conns;           /* Current number of used connections */
 	unsigned int max_used_conns;            /* Max number of used connections (the counter is reset at each connection purges */
+	unsigned int est_need_conns;            /* Estimate on the number of needed connections (max of curr and previous max_used) */
 	unsigned int *curr_idle_thr;            /* Current number of orphan idling connections per thread */
 	int max_reuse;                          /* Max number of requests on a same connection */
 	struct eb32_node idle_node;             /* When to next do cleanup in the idle connections */
diff --git a/include/haproxy/server.h b/include/haproxy/server.h
index 9c1672a..4d025ad 100644
--- a/include/haproxy/server.h
+++ b/include/haproxy/server.h
@@ -244,9 +244,16 @@
  */
 static inline int srv_add_to_idle_list(struct server *srv, struct connection *conn, int is_safe)
 {
+	/* we try to keep the connection in the server's idle list
+	 * if we don't have too many FD in use, and if the number of
+	 * idle+current conns is lower than what was observed before
+	 * last purge, or if we already don't have idle conns for the
+	 * current thread and we don't exceed last count by global.nbthread.
+	 */
 	if (srv && srv->pool_purge_delay > 0 &&
 	    (srv->max_idle_conns == -1 || srv->max_idle_conns > srv->curr_idle_conns) &&
-	    (srv->cur_sess + srv->curr_idle_conns <= srv->counters.cur_sess_max) &&
+	    (srv->curr_used_conns + srv->curr_idle_conns < MAX(srv->curr_used_conns, srv->est_need_conns) +
+	     (MT_LIST_ISEMPTY(&srv->safe_conns[tid]) && MT_LIST_ISEMPTY(&srv->idle_conns[tid])) ? global.nbthread : 0) &&
 	    !(conn->flags & CO_FL_PRIVATE) &&
 	    ((srv->proxy->options & PR_O_REUSE_MASK) != PR_O_REUSE_NEVR) &&
 	    !conn->mux->used_streams(conn) && conn->mux->avail_streams(conn) &&
diff --git a/src/backend.c b/src/backend.c
index a3ec782..f87e36e 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -1355,6 +1355,9 @@
 		 */
 		if (srv->max_used_conns < srv->curr_used_conns)
 			srv->max_used_conns = srv->curr_used_conns;
+
+		if (srv->est_need_conns < srv->curr_used_conns)
+			srv->est_need_conns = srv->curr_used_conns;
 	}
 	if (!srv_conn || !sockaddr_alloc(&srv_conn->dst)) {
 		if (srv_conn)
diff --git a/src/server.c b/src/server.c
index 8cc5bbf..258a627 100644
--- a/src/server.c
+++ b/src/server.c
@@ -5254,9 +5254,13 @@
 		curr_idle = srv->curr_idle_conns;
 		if (curr_idle == 0)
 			goto remove;
-		exceed_conns = srv->curr_used_conns + curr_idle -
-		               srv->max_used_conns;
+		exceed_conns = srv->curr_used_conns + curr_idle - MAX(srv->max_used_conns, srv->est_need_conns);
 		exceed_conns = to_kill = exceed_conns / 2 + (exceed_conns & 1);
+
+		srv->est_need_conns = (srv->est_need_conns + srv->max_used_conns + 1) / 2;
+		if (srv->est_need_conns < srv->max_used_conns)
+			srv->est_need_conns = srv->max_used_conns;
+
 		srv->max_used_conns = srv->curr_used_conns;
 
 		/* check all threads starting with ours */