[MAJOR] complete support for linux 2.6 kernel splicing This code provides support for linux 2.6 kernel splicing. This feature appeared in kernel 2.6.25, but initial implementations were awkward and buggy. A kernel >= 2.6.29-rc1 is recommended, as well as some optimization patches. Using pipes, this code is able to pass network data directly between sockets. The pipes are a bit annoying to manage (fd creation, release, ...) but finally work quite well. Preliminary tests show that on high bandwidths, there's a substantial gain (approx +50%, only +20% with kernel workarounds for corruption bugs). With 2000 concurrent connections, with Myricom NICs, haproxy now more easily achieves 4.5 Gbps for 1 process and 6 Gbps for two processes. 8-9 Gbps are easily reached with smaller numbers of connections. We also try to splice out immediately after a splice in by taking advantage of the new ability for a data producer to notify the consumer that data are available. Doing this ensures that the data are immediately transferred between sockets without latency, and without having to re-poll. Performance on small packets has considerably increased due to this method. Earlier kernels return only one TCP segment at a time in non-blocking splice-in mode, while newer ones return as many segments as may fit in the pipe. To work around this limitation without hurting more recent kernels, we try to collect as much data as possible, but we stop when we believe we have read 16 segments, then we forward everything at once. It also ensures that even upon shutdown or EAGAIN the data will be forwarded. Some tricks were necessary because the splice() syscall does not make a difference between missing data and a pipe full, it always returns EAGAIN. The trick consists in stopping polling in case of EAGAIN and a non empty pipe. The receiver waits for the buffer to be empty before using the pipe. This is in order to avoid confusion between buffer data and pipe data. 
The BF_EMPTY flag now covers the pipe too. Right now the code is disabled by default. It needs to be built with CONFIG_HAP_LINUX_SPLICE, and the instances intended to use splice() must have "option splice-response" (or option splice-request) enabled. It is probably desirable to keep a pool of pre-allocated pipes to avoid having to create them for every session. This will be worked on later. Preliminary tests show very good results, even with the kernel workaround causing one memcpy(). At 3000 connections, performance has moved from 3.2 Gbps to 4.7 Gbps.

commit: 5bd8c376ad60ae96ce58704a608359d5e7f6ed9c [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Mon Jan 19 00:32:22 2009 +0100
committer: Willy Tarreau <w@1wt.eu> Mon Jan 19 00:32:22 2009 +0100
tree: b9fda81cbe17437e3cc5af1052a10e9293879e44
parent: 6b4aad4c1bd39c84e2d0eb8b5df0fce2a397fdfd [diff]
diff --git a/src/haproxy.c b/src/haproxy.c
index 771fa0b..5911b0c 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c

@@ -902,7 +902,7 @@
 	/* on very high loads, a sigpipe sometimes happen just between the
 	 * getsockopt() which tells "it's OK to write", and the following write :-(
 	 */
-#ifndef MSG_NOSIGNAL
+#if !defined(MSG_NOSIGNAL) || defined(CONFIG_HAP_LINUX_SPLICE)
 	signal(SIGPIPE, SIG_IGN);
 #endif
 

diff --git a/src/session.c b/src/session.c
index cd44199..3f18ad9 100644
--- a/src/session.c
+++ b/src/session.c

@@ -770,6 +770,14 @@
 	 */
 	if (!s->req->send_max && s->req->prod->state >= SI_ST_EST &&
 	    !s->req->analysers && !(s->req->flags & BF_HIJACK)) {
+		/* check if it is wise to enable kernel splicing on the request buffer */
+		if (!(s->req->flags & BF_KERN_SPLICING) &&
+		    (usedpipes < global.maxpipes) &&
+		    (((s->fe->options2|s->be->options2) & PR_O2_SPLIC_REQ) ||
+		     (((s->fe->options2|s->be->options2) & PR_O2_SPLIC_AUT) &&
+		      (s->req->flags & BF_STREAMER_FAST))))
+			s->req->flags |= BF_KERN_SPLICING;
+
 		if (s->req->to_forward < FORWARD_DEFAULT_SIZE)
 			buffer_forward(s->req, FORWARD_DEFAULT_SIZE);
 	}
@@ -885,6 +893,14 @@
 	 */
 	if (!s->rep->send_max && s->rep->prod->state >= SI_ST_EST &&
 	    !s->rep->analysers && !(s->rep->flags & BF_HIJACK)) {
+		/* check if it is wise to enable kernel splicing on the response buffer */
+		if (!(s->rep->flags & BF_KERN_SPLICING) &&
+		    (usedpipes < global.maxpipes) &&
+		    (((s->fe->options2|s->be->options2) & PR_O2_SPLIC_RTR) ||
+		     (((s->fe->options2|s->be->options2) & PR_O2_SPLIC_AUT) &&
+		      (s->rep->flags & BF_STREAMER_FAST))))
+			s->rep->flags |= BF_KERN_SPLICING;
+
 		if (s->rep->to_forward < FORWARD_DEFAULT_SIZE)
 			buffer_forward(s->rep, FORWARD_DEFAULT_SIZE);
 	}

diff --git a/src/stream_sock.c b/src/stream_sock.c
index 5504369..bcdb277 100644
--- a/src/stream_sock.c
+++ b/src/stream_sock.c

@@ -33,6 +33,7 @@
 #include <proto/stream_sock.h>
 #include <proto/task.h>
 
+#include <types/global.h>
 
 /* On recent Linux kernels, the splice() syscall may be used for faster data copy.
  * But it's not always defined on some OS versions, and it even happens that some
@@ -74,6 +75,171 @@
 _syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags)
 
 #endif /* __NR_splice */
+
+/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes
+ * because of timestamps. Use this as a hint for not looping on splice(): once
+ * at least that many bytes are pending in the pipe, stream_sock_splice_in()
+ * stops reading for the current call.
+ * NOTE(review): the expansion is not parenthesized. This is harmless in the
+ * ">=" comparison where it is used in this patch, but it would associate
+ * wrongly if the macro were ever placed inside a larger arithmetic expression.
+ */
+#define SPLICE_FULL_HINT	16*1448
+
+/* Tries to move pending data from the socket <si->fd> into the pipe attached
+ * to buffer <b> using splice(), so that it never transits through the buffer
+ * itself. The pipe is created on first use (b->splice.prod == -1) and is
+ * accounted in <usedpipes>, which is checked against global.maxpipes. Each
+ * successful splice() decreases b->to_forward and increases b->total and
+ * b->splice_len by the number of bytes moved.
+ *
+ * Returns :
+ *   -1 if splice is not possible or not possible anymore and we must switch to
+ *      user-land copy (eg: to_forward reached)
+ *    0 when we know that polling is required to get more data (EAGAIN)
+ *    1 for all other cases (we can safely try again, or if an activity has been
+ *      detected (DATA/NULL/ERR))
+ * Sets :
+ *   BF_READ_NULL
+ *   BF_READ_PARTIAL
+ *   BF_WRITE_PARTIAL (during copy)
+ *   BF_EMPTY (during copy)
+ *   SI_FL_ERR
+ *   SI_FL_WAIT_ROOM
+ *   (SI_FL_WAIT_RECV)
+ * Clears :
+ *   BF_KERN_SPLICING when no pipe can be obtained
+ *   BF_EMPTY as soon as some data have entered the pipe
+ *
+ * NOTE(review): in the body below, BF_WRITE_PARTIAL is never touched and
+ * BF_EMPTY is only cleared. Presumably the "(during copy)" entries refer to
+ * what the consumer's chk_snd() may do -- to be confirmed. SI_FL_WAIT_RECV is
+ * not used anywhere here; the inner comments only say such a flag would be
+ * needed.
+ */
+static int stream_sock_splice_in(struct buffer *b, struct stream_interface *si)
+{
+	int fd = si->fd;
+	int ret, max, total = 0; /* NOTE(review): <total> is accumulated below but never read */
+	int retval = 1;
+
+	if (!b->to_forward) /* nothing scheduled for forwarding: let the caller copy */
+		return -1;
+
+	if (!(b->flags & BF_KERN_SPLICING)) /* splicing not enabled on this buffer */
+		return -1;
+
+	if (b->l) {
+		/* We're embarrassed, there are already data pending in
+		 * the buffer and we don't want to have them at two
+		 * locations at a time. Let's indicate we need some
+		 * place and ask the consumer to hurry. Reading is
+		 * disabled on the fd and the read timeout is removed
+		 * in the mean time, and we report 1 so that the caller
+		 * does not fall back to the user-land copy.
+		 */
+		si->flags |= SI_FL_WAIT_ROOM;
+		EV_FD_CLR(fd, DIR_RD);
+		b->rex = TICK_ETERNITY;
+		b->cons->chk_snd(b->cons);
+		return 1;
+	}
+
+	if (unlikely(b->splice.prod == -1)) { /* no pipe yet for this buffer: create one */
+		int pipefd[2];
+		if (usedpipes >= global.maxpipes || pipe(pipefd) < 0) {
+			b->flags &= ~BF_KERN_SPLICING; /* give up splicing on this buffer */
+			return -1;
+		}
+		usedpipes++;
+		b->splice.prod = pipefd[1]; /* write end, filled from the socket below */
+		b->splice.cons = pipefd[0]; /* read end, not used in this function */
+	}
+
+	while (1) {
+		max = b->to_forward;
+		if (max <= 0) {
+			/* It looks like the buffer + the pipe already contain
+			 * the maximum amount of data to be transferred. Try to
+			 * send those data immediately on the other side if it
+			 * is currently waiting.
+			 * NOTE(review): nothing is sent from here, the loop
+			 * only stops and -1 is returned to the caller.
+			 */
+			retval = -1; /* end of forwarding */
+			break;
+		}
+
+		ret = splice(fd, NULL, b->splice.prod, NULL, max,
+			     SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
+
+		if (ret <= 0) {
+#ifdef SPLICE_OLD_KERNEL_WORKAROUND
+			/* This part contains a lot of tricks for kernels before 2.6.29-rc1
+			 * where splice() did erroneously return -EAGAIN upon shutdown.
+			 * Here a zero return is not taken as a close: we only wait for room.
+			 */
+			if (ret == 0) {
+				si->flags |= SI_FL_WAIT_ROOM;
+				retval = 1;
+				break;
+			}
+
+			if (errno == EAGAIN) {
+				char dummy;
+				/* it can mean either that the socket got a shutdown read,
+				 * or that it has no available data to read. A one-byte
+				 * non-blocking peek is used to tell them apart without
+				 * consuming anything.
+				 */
+				ret = recv(fd, &dummy, sizeof dummy,
+					   MSG_PEEK|MSG_DONTWAIT|MSG_NOSIGNAL);
+				if (!ret) {
+					/* connection closed */
+					b->flags |= BF_READ_NULL;
+					si->flags &= ~SI_FL_WAIT_ROOM;
+					retval = 1; /* no need for further polling */
+					break;
+				}
+				/* sometimes, splice() will return -1/EAGAIN while recv() will return 1.
+				 * Thus, it means we have to wait for more room to be left in the pipe
+				 * by the other end.
+				 */
+				if (ret > 0) {
+					si->flags |= SI_FL_WAIT_ROOM;
+					retval = 1;
+					break;
+				}
+
+				/* recv() failed too (its errno is not examined): ask for polling.
+				 * we need a new flag : SI_FL_WAIT_RECV
+				 */
+				retval = 0;
+				break;
+			}
+#else
+			/* this part is OK with kernel >= 2.6.29-rc1 */
+
+			if (ret == 0) {
+				/* connection closed */
+				b->flags |= BF_READ_NULL;
+				si->flags &= ~SI_FL_WAIT_ROOM;
+				retval = 1; /* no need for further polling */
+				break;
+			}
+
+			if (errno == EAGAIN) {
+				/* there are two reasons for EAGAIN :
+				 *   - nothing in the socket buffer (standard)
+				 *   - pipe is full
+				 * Since we don't know if pipe is full, we'll
+				 * stop if the pipe is not empty. Anyway, we
+				 * will almost always fill/empty the pipe.
+				 */
+
+				if (b->splice_len > 0) {
+					si->flags |= SI_FL_WAIT_ROOM;
+					retval = 1;
+					break;
+				}
+
+				/* pipe is empty, so the socket had nothing to read: poll.
+				 * note that we'd need a new flag : SI_FL_WAIT_RECV
+				 */
+				retval = 0;
+				break;
+			}
+#endif
+			/* here we have another error (any errno other than EAGAIN) */
+			si->flags |= SI_FL_ERR;
+			retval = 1;
+			break;
+		} /* ret <= 0 */
+
+		b->to_forward -= ret;
+		total += ret;
+		b->total += ret;
+		b->splice_len += ret;
+		b->flags |= BF_READ_PARTIAL;
+		b->flags &= ~BF_EMPTY; /* to prevent shutdowns */
+
+		if (b->splice_len >= SPLICE_FULL_HINT) {
+			/* We've read enough of it for this time: the pipe is
+			 * assumed to be about full, so stop calling splice().
+			 */
+			retval = 1;
+			break;
+		}
+	} /* while */
+
+	return retval;
+}
+
 #endif /* CONFIG_HAP_LINUX_SPLICE */
 
 
@@ -103,6 +269,20 @@
 	if ((fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP)) == FD_POLL_HUP)
 		goto out_shutdown_r;
 
+#if defined(CONFIG_HAP_LINUX_SPLICE)
+	if (b->to_forward && b->flags & BF_KERN_SPLICING) {
+		retval = stream_sock_splice_in(b, si);
+
+		if (retval >= 0) {
+			if (si->flags & SI_FL_ERR)
+				goto out_error;
+			if (b->flags & BF_READ_NULL)
+				goto out_shutdown_r;
+			goto out_wakeup;
+		}
+		/* splice not possible (anymore), let's go on on standard copy */
+	}
+#endif
 	cur_read = 0;
 	while (1) {
 		/*
@@ -342,6 +522,38 @@
 	int retval = 1;
 	int ret, max;
 
+#if defined(CONFIG_HAP_LINUX_SPLICE)
+	while (b->splice_len) {
+		ret = splice(b->splice.cons, NULL, si->fd, NULL, b->splice_len,
+			     SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
+		if (ret <= 0) {
+			if (ret == 0 || errno == EAGAIN) {
+				retval = 0;
+				return retval;
+			}
+			/* here we have another error */
+			retval = -1;
+			return retval;
+		}
+
+		b->flags |= BF_WRITE_PARTIAL;
+		b->splice_len -= ret;
+
+		if (!b->splice_len)
+			break;
+
+		if (--write_poll <= 0)
+			return retval;
+	}
+
+	/* At this point, the pipe is empty, but we may still have data pending
+	 * in the normal buffer.
+	 */
+	if (!b->l) {
+		b->flags |= BF_EMPTY;
+		return retval;
+	}
+#endif
 	if (!b->send_max)
 		return retval;
commit	5bd8c376ad60ae96ce58704a608359d5e7f6ed9c	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Mon Jan 19 00:32:22 2009 +0100
committer	Willy Tarreau <w@1wt.eu>	Mon Jan 19 00:32:22 2009 +0100
tree	b9fda81cbe17437e3cc5af1052a10e9293879e44
parent	6b4aad4c1bd39c84e2d0eb8b5df0fce2a397fdfd [diff]