BUG/MAJOR: tcpcheck: Allocate input and output buffers from the buffer pool

Historically, the input and output buffers of a check are allocated by hand
during the startup, with a specific size (not necessarily the same than
other buffers). But since the recent refactoring of the checks to rely
exclusively on the tcp-checks and to use the underlying mux layer, this part
is totally buggy. Indeed, because these buffers are now passed to a mux,
they maybe be swapped if a zero-copy is possible. In fact, for now it is
only possible in h2_rcv_buf(). Thus the bug concretely only exists if a h2
health-check is performed. But, it is a latent bug for other muxes.

Another problem is the size of these buffers. because it may differ for the
other buffer size, it might be source of bugs.

Finally, for configurations with hundreds of thousands of servers, having 2
buffers per check always allocated may be an issue.

To fix the bug, we now allocate these buffers when required using the buffer
pool. Thus not-running checks don't waste memory and muxes may swap them if
possible. The only drawback is the check buffers have now always the same
size than buffers used by the streams. This deprecates indirectly the
"tune.chksize" global option.

In addition, the http-check regtest have been update to perform some h2
health-checks.

Many thanks to @VigneshSP94 for its help on this bug.

This patch should solve the issue #936. It relies on the commit "MINOR:
tcpcheck: Don't handle anymore in-progress send rules in tcpcheck_main".
Both must be backport as far as 2.2.

bla

(cherry picked from commit b381a505c1010bb11abbe7b31e8d2307c4dab541)
Signed-off-by: William Lallemand <wlallemand@haproxy.org>
diff --git a/include/haproxy/check-t.h b/include/haproxy/check-t.h
index 727f837..274e162 100644
--- a/include/haproxy/check-t.h
+++ b/include/haproxy/check-t.h
@@ -20,6 +20,7 @@
 #include <haproxy/api-t.h>
 #include <haproxy/buf-t.h>
 #include <haproxy/connection-t.h>
+#include <haproxy/dynbuf-t.h>
 #include <haproxy/obj_type-t.h>
 #include <haproxy/vars-t.h>
 
@@ -49,6 +50,8 @@
 #define CHK_ST_PAUSED           0x0008  /* checks are paused because of maintenance (health only) */
 #define CHK_ST_AGENT            0x0010  /* check is an agent check (otherwise it's a health check) */
 #define CHK_ST_PORT_MISS        0x0020  /* check can't be send because no port is configured to run it */
+#define CHK_ST_IN_ALLOC         0x0040  /* check blocked waiting for input buffer allocation */
+#define CHK_ST_OUT_ALLOC        0x0080  /* check blocked waiting for output buffer allocation */
 
 /* check status */
 enum healthcheck_status {
@@ -145,6 +148,7 @@
 	struct xprt_ops *xprt;			/* transport layer operations for health checks */
 	struct conn_stream *cs;			/* conn_stream state for health checks */
 	struct buffer bi, bo;			/* input and output buffers to send/recv check */
+	struct buffer_wait buf_wait;            /* Wait list for buffer allocation */
 	struct task *task;			/* the task associated to the health check processing, NULL if disabled */
 	struct timeval start;			/* last health check start time */
 	long duration;				/* time in ms took to finish last health check */
diff --git a/include/haproxy/check.h b/include/haproxy/check.h
index a1bdb01..5e55da1 100644
--- a/include/haproxy/check.h
+++ b/include/haproxy/check.h
@@ -40,6 +40,9 @@
 void check_notify_success(struct check *check);
 struct task *process_chk(struct task *t, void *context, unsigned short state);
 
+int check_buf_available(void *target);
+struct buffer *check_get_buf(struct check *check, struct buffer *bptr);
+void check_release_buf(struct check *check, struct buffer *bptr);
 const char *init_check(struct check *check, int type);
 void free_check(struct check *check);
 
diff --git a/reg-tests/checks/http-check.vtc b/reg-tests/checks/http-check.vtc
index 1180c3d..9ece54b 100644
--- a/reg-tests/checks/http-check.vtc
+++ b/reg-tests/checks/http-check.vtc
@@ -82,6 +82,10 @@
     expect ~ "[^:\\[ ]\\[${h1_pid}\\]: Health check for server be[0-9]/srv succeeded.*code: 200"
     recv
     expect ~ "[^:\\[ ]\\[${h1_pid}\\]: Health check for server be[0-9]/srv succeeded.*code: 200"
+    recv
+    expect ~ "[^:\\[ ]\\[${h1_pid}\\]: Health check for server be[0-9]/srv succeeded.*code: 200"
+    recv
+    expect ~ "[^:\\[ ]\\[${h1_pid}\\]: Health check for server be[0-9]/srv succeeded.*code: 200"
 } -start
 
 haproxy h1 -conf {
@@ -133,6 +137,21 @@
         ## implicit expect rule
         server srv ${s1_addr}:${s1_port} check inter 100ms rise 1 fall 1
 
+    backend be5
+        log ${S1_addr}:${S1_port} len 2048 local0
+        option httpchk
+        server srv ${h1_li1_addr}:${h1_li1_port} proto h2 check inter 100ms rise 1 fall 1
+
+    backend be6
+        log ${S1_addr}:${S1_port} len 2048 local0
+        option httpchk GET /status HTTP/1.1
+        server srv ${h1_li1_addr}:${h1_li1_port} check check-proto h2 inter 100ms rise 1 fall 1
+
+    listen li1
+        mode http
+        bind "fd@${li1}" proto h2
+        http-request return status 200
+
 } -start
 
 syslog S1 -wait
diff --git a/src/check.c b/src/check.c
index 9836d36..337030f 100644
--- a/src/check.c
+++ b/src/check.c
@@ -37,6 +37,7 @@
 #include <haproxy/chunk.h>
 #include <haproxy/dgram.h>
 #include <haproxy/dns.h>
+#include <haproxy/dynbuf-t.h>
 #include <haproxy/extcheck.h>
 #include <haproxy/fd.h>
 #include <haproxy/global.h>
@@ -866,8 +867,6 @@
 		set_server_check_status(check, HCHK_STATUS_START, NULL);
 
 		check->state |= CHK_ST_INPROGRESS;
-		b_reset(&check->bi);
-		b_reset(&check->bo);
 
 		task_set_affinity(t, tid_bit);
 
@@ -936,7 +935,9 @@
 			}
 		}
 		task_set_affinity(t, MAX_THREADS_MASK);
-		check->state &= ~CHK_ST_INPROGRESS;
+		check_release_buf(check, &check->bi);
+		check_release_buf(check, &check->bo);
+		check->state &= ~(CHK_ST_INPROGRESS|CHK_ST_IN_ALLOC|CHK_ST_OUT_ALLOC);
 
 		if (check->server) {
 			rv = 0;
@@ -961,18 +962,65 @@
 /**************************************************************************/
 /************************** Init/deinit checks ****************************/
 /**************************************************************************/
-const char *init_check(struct check *check, int type)
+/*
+ * Tries to grab a buffer and to re-enables processing on check <target>. The
+ * check flags are used to figure what buffer was requested. It returns 1 if the
+ * allocation succeeds, in which case the I/O tasklet is woken up, or 0 if it's
+ * impossible to wake up and we prefer to be woken up later.
+ */
+int check_buf_available(void *target)
 {
-	check->type = type;
+	struct check *check = target;
+
+	if ((check->state & CHK_ST_IN_ALLOC) && b_alloc_margin(&check->bi, 0)) {
+		check->state &= ~CHK_ST_IN_ALLOC;
+		tasklet_wakeup(check->wait_list.tasklet);
+		return 1;
+	}
+	if ((check->state & CHK_ST_OUT_ALLOC) && b_alloc_margin(&check->bo, 0)) {
+		check->state &= ~CHK_ST_OUT_ALLOC;
+		tasklet_wakeup(check->wait_list.tasklet);
+		return 1;
+	}
+
+	return 0;
+}
 
-	b_reset(&check->bi); check->bi.size = global.tune.chksize;
-	b_reset(&check->bo); check->bo.size = global.tune.chksize;
+/*
+ * Allocate a buffer. If if fails, it adds the check in buffer wait queue.
+ */
+struct buffer *check_get_buf(struct check *check, struct buffer *bptr)
+{
+	struct buffer *buf = NULL;
 
-	check->bi.area = calloc(check->bi.size, sizeof(*check->bi.area));
-	check->bo.area = calloc(check->bo.size, sizeof(*check->bo.area));
+	if (likely(!MT_LIST_ADDED(&check->buf_wait.list)) &&
+	    unlikely((buf = b_alloc_margin(bptr, 0)) == NULL)) {
+		check->buf_wait.target = check;
+		check->buf_wait.wakeup_cb = check_buf_available;
+		MT_LIST_ADDQ(&buffer_wq, &check->buf_wait.list);
+	}
+	return buf;
+}
+
+/*
+ * Release a buffer, if any, and try to wake up entities waiting in the buffer
+ * wait queue.
+ */
+void check_release_buf(struct check *check, struct buffer *bptr)
+{
+	if (bptr->size) {
+		b_free(bptr);
+		offer_buffers(check->buf_wait.target, tasks_run_queue);
+	}
+}
+
+const char *init_check(struct check *check, int type)
+{
+	check->type = type;
 
-	if (!check->bi.area || !check->bo.area)
-		return "out of memory while allocating check buffer";
+	check->bi = BUF_NULL;
+	check->bo = BUF_NULL;
+	MT_LIST_INIT(&check->buf_wait.list);
 
 	check->wait_list.tasklet = tasklet_new();
 	if (!check->wait_list.tasklet)
@@ -989,8 +1037,8 @@
 	if (check->wait_list.tasklet)
 		tasklet_free(check->wait_list.tasklet);
 
-	free(check->bi.area);
-	free(check->bo.area);
+	check_release_buf(check, &check->bi);
+	check_release_buf(check, &check->bo);
 	if (check->cs) {
 		free(check->cs->conn);
 		check->cs->conn = NULL;
diff --git a/src/tcpcheck.c b/src/tcpcheck.c
index 964dcf8..accc699 100644
--- a/src/tcpcheck.c
+++ b/src/tcpcheck.c
@@ -993,6 +993,10 @@
 	 *   3: release and replace the old one on success
 	 */
 
+	/* Always release input and output buffer when a new connect is evaluated */
+	check_release_buf(check, &check->bi);
+	check_release_buf(check, &check->bo);
+
 	/* 2- prepare new connection */
 	cs = cs_new(NULL, (s ? &s->obj_type : &proxy->obj_type));
 	if (!cs) {
@@ -1222,13 +1226,23 @@
 	struct buffer *tmp = NULL;
 	struct htx *htx = NULL;
 
+	if (check->state & CHK_ST_OUT_ALLOC) {
+		ret = TCPCHK_EVAL_WAIT;
+		goto out;
+	}
+
+	if (!check_get_buf(check, &check->bo)) {
+		check->state |= CHK_ST_OUT_ALLOC;
+		ret = TCPCHK_EVAL_WAIT;
+		goto out;
+	}
+
 	/* Data already pending in the output buffer, send them now */
 	if (b_data(&check->bo))
 		goto do_send;
 
-	/* reset the read & write buffer */
-	b_reset(&check->bi);
-	b_reset(&check->bo);
+	/* Always release input buffer when a new send is evaluated */
+	check_release_buf(check, &check->bi);
 
 	switch (send->type) {
 	case TCPCHK_SEND_STRING:
@@ -1372,6 +1386,8 @@
 
   out:
 	free_trash_chunk(tmp);
+	if (!b_data(&check->bo) || ret == TCPCHK_EVAL_STOP)
+		check_release_buf(check, &check->bo);
 	return ret;
 
   error_htx:
@@ -1414,6 +1430,14 @@
 	if (cs->flags & CS_FL_EOS)
 		goto end_recv;
 
+	if (check->state & CHK_ST_IN_ALLOC)
+		goto wait_more_data;
+
+	if (!check_get_buf(check, &check->bi)) {
+		check->state |= CHK_ST_IN_ALLOC;
+		goto wait_more_data;
+	}
+
 	/* errors on the connection and the conn-stream were already checked */
 
 	/* prepare to detect if the mux needs more room */
@@ -1461,6 +1485,8 @@
 	}
 
   out:
+	if (!b_data(&check->bi) || ret == TCPCHK_EVAL_STOP)
+		check_release_buf(check, &check->bi);
 	return ret;
 
   stop:
@@ -2115,6 +2141,10 @@
 	if ((conn && conn->flags & CO_FL_ERROR) || (cs && cs->flags & CS_FL_ERROR))
 		chk_report_conn_err(check, errno, 0);
 
+	/* the tcpcheck is finished, release in/out buffer now */
+	check_release_buf(check, &check->bi);
+	check_release_buf(check, &check->bo);
+
   out:
 	return retcode;
 }