MEDIUM: fd: add experimental support for edge-triggered polling

Some of the recent optimizations around the polling to save a few
epoll_ctl() calls have shown that they could also cause some trouble.
However, over time our code base has become totally asynchronous with
I/Os always attempted from the upper layers and only retried at the
bottom, making it look like we're getting closer to EPOLLET support.

There are showstoppers there such as the listeners which cannot support
this. But given that most of the epoll_ctl() dance comes from the
connections, we can try to enable edge-triggered polling on connections.

This patch adds a new global tunable, "tune.fd.edge-triggered". When it is
enabled, fd_insert() automatically sets an et_possible bit on the FD if its
I/O callback is conn_fd_handler. When the epoll code sees an update for such
an FD, it immediately registers it in both directions the first time and
never updates it again.

On a few tests it proved quite useful with a 14% request rate increase in
a H2->H1 scenario, reducing the epoll_ctl() calls from 2 per request to
2 per connection.

The option is obviously disabled by default as bugs are still expected,
particularly around the subscribe() code where it is possible that some
layers do not always re-attempt reading data after being woken up.
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 0b9776c..f8c29b2 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -675,6 +675,7 @@
    - tune.bufsize
    - tune.chksize
    - tune.comp.maxlevel
+   - tune.fd.edge-triggered
    - tune.h2.header-table-size
    - tune.h2.initial-window-size
    - tune.h2.max-concurrent-streams
@@ -1874,6 +1875,13 @@
   success). This is useful to debug and make sure memory failures are handled
   gracefully.
 
+tune.fd.edge-triggered { on | off }  [ EXPERIMENTAL ]
+  Enables ('on') or disables ('off') the edge-triggered polling mode for FDs
+  that support it. This is currently only supported with epoll. It may
+  noticeably reduce the number of epoll_ctl() calls and slightly improve
+  performance in certain scenarios. This is still experimental, it may result
+  in frozen connections if bugs are still present, and is disabled by default.
+
 tune.h2.header-table-size <number>
   Sets the HTTP/2 dynamic header table size. It defaults to 4096 bytes and
   cannot be larger than 65536 bytes. A larger value may help certain clients
diff --git a/include/haproxy/fd-t.h b/include/haproxy/fd-t.h
index 5e17b6f..97b383c 100644
--- a/include/haproxy/fd-t.h
+++ b/include/haproxy/fd-t.h
@@ -133,6 +133,7 @@
 	unsigned char linger_risk:1;         /* 1 if we must kill lingering before closing */
 	unsigned char cloned:1;              /* 1 if a cloned socket, requires EPOLL_CTL_DEL on close */
 	unsigned char initialized:1;         /* 1 if init phase was done on this fd (e.g. set non-blocking) */
+	unsigned char et_possible:1;         /* 1 if edge-triggered is possible on this FD */
 } THREAD_ALIGNED(64);
 
 /* polled mask, one bit per thread and per direction for each FD */
diff --git a/include/haproxy/fd.h b/include/haproxy/fd.h
index 0f1799d..f7af4e1 100644
--- a/include/haproxy/fd.h
+++ b/include/haproxy/fd.h
@@ -30,6 +30,7 @@
 #include <haproxy/activity.h>
 #include <haproxy/api.h>
 #include <haproxy/fd-t.h>
+#include <haproxy/global.h>
 #include <haproxy/thread.h>
 #include <haproxy/ticks.h>
 #include <haproxy/time.h>
@@ -435,6 +436,7 @@
 static inline void fd_insert(int fd, void *owner, void (*iocb)(int fd), unsigned long thread_mask)
 {
 	int locked = fdtab[fd].running_mask != tid_bit;
+	extern void conn_fd_handler(int);
 
 	if (locked)
 		fd_set_running_excl(fd);
@@ -443,6 +445,12 @@
 	fdtab[fd].ev = 0;
 	fdtab[fd].linger_risk = 0;
 	fdtab[fd].cloned = 0;
+	fdtab[fd].et_possible = 0;
+
+	/* conn_fd_handler should support edge-triggered FDs */
+	if ((global.tune.options & GTUNE_FD_ET) && fdtab[fd].iocb == conn_fd_handler)
+		fdtab[fd].et_possible = 1;
+
 	fdtab[fd].thread_mask = thread_mask;
 	/* note: do not reset polled_mask here as it indicates which poller
 	 * still knows this FD from a possible previous round.
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h
index f5bf216..c7591b4 100644
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -66,6 +66,7 @@
 #define GTUNE_STRICT_LIMITS      (1<<15)
 #define GTUNE_INSECURE_FORK      (1<<16)
 #define GTUNE_INSECURE_SETUID    (1<<17)
+#define GTUNE_FD_ET              (1<<18)
 
 /* SSL server verify mode */
 enum {
diff --git a/src/ev_epoll.c b/src/ev_epoll.c
index 5102b10..92c000f 100644
--- a/src/ev_epoll.c
+++ b/src/ev_epoll.c
@@ -59,6 +59,20 @@
 
 	en = fdtab[fd].state;
 
+	/* Try to force EPOLLET on FDs that support it */
+	if (fdtab[fd].et_possible) {
+		/* already done ? */
+		if (polled_mask[fd].poll_recv & polled_mask[fd].poll_send & tid_bit)
+			return;
+
+		/* enable ET polling in both directions */
+		_HA_ATOMIC_OR(&polled_mask[fd].poll_recv, tid_bit);
+		_HA_ATOMIC_OR(&polled_mask[fd].poll_send, tid_bit);
+		opcode = EPOLL_CTL_ADD;
+		ev.events = EPOLLIN | EPOLLRDHUP | EPOLLOUT | EPOLLET;
+		goto done;
+	}
+
 	/* if we're already polling or are going to poll for this FD and it's
 	 * neither active nor ready, force it to be active so that we don't
 	 * needlessly unsubscribe then re-subscribe it.
@@ -120,6 +134,7 @@
 	if (en & FD_EV_ACTIVE_W)
 		ev.events |= EPOLLOUT;
 
+ done:
 	ev.data.fd = fd;
 	epoll_ctl(epoll_fd[tid], opcode, fd, &ev);
 }
diff --git a/src/fd.c b/src/fd.c
index 1e1c0cb..60ad699 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -88,9 +88,11 @@
 #endif
 
 #include <haproxy/api.h>
+#include <haproxy/cfgparse.h>
 #include <haproxy/fd.h>
 #include <haproxy/global.h>
 #include <haproxy/port_range.h>
+#include <haproxy/tools.h>
 
 
 struct fdtab *fdtab = NULL;     /* array of all the file descriptors */
@@ -807,6 +809,33 @@
 	return 1;
 }
 
+/* config parser for global "tune.fd.edge-triggered": "on" sets GTUNE_FD_ET in global.tune.options, "off" clears it, any other value reports an error in <err>. Returns 0 on success, -1 on error. */
+static int cfg_parse_tune_fd_edge_triggered(char **args, int section_type, struct proxy *curpx,
+                                      struct proxy *defpx, const char *file, int line,
+                                      char **err)
+{
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (strcmp(args[1], "on") == 0)
+		global.tune.options |= GTUNE_FD_ET;
+	else if (strcmp(args[1], "off") == 0)
+		global.tune.options &= ~GTUNE_FD_ET;
+	else {
+		memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]);
+		return -1;
+	}
+	return 0;
+}
+
+/* config keyword parsers: a single CFG_GLOBAL entry mapping "tune.fd.edge-triggered" to cfg_parse_tune_fd_edge_triggered */
+static struct cfg_kw_list cfg_kws = {ILH, {
+	{ CFG_GLOBAL, "tune.fd.edge-triggered", cfg_parse_tune_fd_edge_triggered },
+	{ 0, NULL, NULL }
+}};
+
+INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
+
 REGISTER_PER_THREAD_ALLOC(alloc_pollers_per_thread);
 REGISTER_PER_THREAD_INIT(init_pollers_per_thread);
 REGISTER_PER_THREAD_DEINIT(deinit_pollers_per_thread);