[MAJOR] complete support for linux 2.6 kernel splicing
This code provides support for linux 2.6 kernel splicing. This feature
appeared in kernel 2.6.25, but initial implementations were awkward and
buggy. A kernel >= 2.6.29-rc1 is recommended, as well as some optimization
patches.
Using pipes, this code is able to pass network data directly between
sockets. The pipes are a bit annoying to manage (fd creation, release,
...) but finally work quite well.
Preliminary tests show that on high bandwidths, there's a substantial
gain (approx +50%, only +20% with kernel workarounds for corruption
bugs). With 2000 concurrent connections, with Myricom NICs, haproxy
now more easily achieves 4.5 Gbps for 1 process and 6 Gbps for two
processes buffers. 8-9 Gbps are easily reached with smaller numbers
of connections.
We also try to splice out immediately after a splice in by making
profit from the new ability for a data producer to notify the
consumer that data are available. Doing this ensures that the
data are immediately transferred between sockets without latency,
and without having to re-poll. Performance on small packets has
considerably increased due to this method.
Earlier kernels return only one TCP segment at a time in non-blocking
splice-in mode, while newer return as many segments as may fit in the
pipe. To work around this limitation without hurting more recent kernels,
we try to collect as much data as possible, but we stop when we believe
we have read 16 segments, then we forward everything at once. It also
ensures that even upon shutdown or EAGAIN the data will be forwarded.
Some tricks were necessary because the splice() syscall does not make
a difference between missing data and a pipe full, it always returns
EAGAIN. The trick consists in stop polling in case of EAGAIN and a non
empty pipe.
The receiver waits for the buffer to be empty before using the pipe.
This is in order to avoid confusion between buffer data and pipe data.
The BF_EMPTY flag now covers the pipe too.
Right now the code is disabled by default. It needs to be built with
CONFIG_HAP_LINUX_SPLICE, and the instances intented to use splice()
must have "option splice-response" (or option splice-request) enabled.
It is probably desirable to keep a pool of pre-allocated pipes to
avoid having to create them for every session. This will be worked
on later.
Preliminary tests show very good results, even with the kernel
workaround causing one memcpy(). At 3000 connections, performance
has moved from 3.2 Gbps to 4.7 Gbps.
diff --git a/src/haproxy.c b/src/haproxy.c
index 771fa0b..5911b0c 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -902,7 +902,7 @@
/* on very high loads, a sigpipe sometimes happen just between the
* getsockopt() which tells "it's OK to write", and the following write :-(
*/
-#ifndef MSG_NOSIGNAL
+#if !defined(MSG_NOSIGNAL) || defined(CONFIG_HAP_LINUX_SPLICE)
signal(SIGPIPE, SIG_IGN);
#endif
diff --git a/src/session.c b/src/session.c
index cd44199..3f18ad9 100644
--- a/src/session.c
+++ b/src/session.c
@@ -770,6 +770,14 @@
*/
if (!s->req->send_max && s->req->prod->state >= SI_ST_EST &&
!s->req->analysers && !(s->req->flags & BF_HIJACK)) {
+ /* check if it is wise to enable kernel splicing on the request buffer */
+ if (!(s->req->flags & BF_KERN_SPLICING) &&
+ (usedpipes < global.maxpipes) &&
+ (((s->fe->options2|s->be->options2) & PR_O2_SPLIC_REQ) ||
+ (((s->fe->options2|s->be->options2) & PR_O2_SPLIC_AUT) &&
+ (s->req->flags & BF_STREAMER_FAST))))
+ s->req->flags |= BF_KERN_SPLICING;
+
if (s->req->to_forward < FORWARD_DEFAULT_SIZE)
buffer_forward(s->req, FORWARD_DEFAULT_SIZE);
}
@@ -885,6 +893,14 @@
*/
if (!s->rep->send_max && s->rep->prod->state >= SI_ST_EST &&
!s->rep->analysers && !(s->rep->flags & BF_HIJACK)) {
+ /* check if it is wise to enable kernel splicing on the response buffer */
+ if (!(s->rep->flags & BF_KERN_SPLICING) &&
+ (usedpipes < global.maxpipes) &&
+ (((s->fe->options2|s->be->options2) & PR_O2_SPLIC_RTR) ||
+ (((s->fe->options2|s->be->options2) & PR_O2_SPLIC_AUT) &&
+ (s->rep->flags & BF_STREAMER_FAST))))
+ s->rep->flags |= BF_KERN_SPLICING;
+
if (s->rep->to_forward < FORWARD_DEFAULT_SIZE)
buffer_forward(s->rep, FORWARD_DEFAULT_SIZE);
}
diff --git a/src/stream_sock.c b/src/stream_sock.c
index 5504369..bcdb277 100644
--- a/src/stream_sock.c
+++ b/src/stream_sock.c
@@ -33,6 +33,7 @@
#include <proto/stream_sock.h>
#include <proto/task.h>
+#include <types/global.h>
/* On recent Linux kernels, the splice() syscall may be used for faster data copy.
* But it's not always defined on some OS versions, and it even happens that some
@@ -74,6 +75,171 @@
_syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags)
#endif /* __NR_splice */
+
+/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes
+ * because of timestamps. Use this as a hint for not looping on splice().
+ */
+#define SPLICE_FULL_HINT 16*1448
+
+/* Returns :
+ * -1 if splice is not possible or not possible anymore and we must switch to
+ * user-land copy (eg: to_forward reached)
+ * 0 when we know that polling is required to get more data (EAGAIN)
+ * 1 for all other cases (we can safely try again, or if an activity has been
+ * detected (DATA/NULL/ERR))
+ * Sets :
+ * BF_READ_NULL
+ * BF_READ_PARTIAL
+ * BF_WRITE_PARTIAL (during copy)
+ * BF_EMPTY (during copy)
+ * SI_FL_ERR
+ * SI_FL_WAIT_ROOM
+ * (SI_FL_WAIT_RECV)
+ */
+static int stream_sock_splice_in(struct buffer *b, struct stream_interface *si)
+{
+ int fd = si->fd;
+ int ret, max, total = 0;
+ int retval = 1;
+
+ if (!b->to_forward)
+ return -1;
+
+ if (!(b->flags & BF_KERN_SPLICING))
+ return -1;
+
+ if (b->l) {
+ /* We're embarrassed, there are already data pending in
+ * the buffer and we don't want to have them at two
+ * locations at a time. Let's indicate we need some
+ * place and ask the consumer to hurry.
+ */
+ si->flags |= SI_FL_WAIT_ROOM;
+ EV_FD_CLR(fd, DIR_RD);
+ b->rex = TICK_ETERNITY;
+ b->cons->chk_snd(b->cons);
+ return 1;
+ }
+
+ if (unlikely(b->splice.prod == -1)) {
+ int pipefd[2];
+ if (usedpipes >= global.maxpipes || pipe(pipefd) < 0) {
+ b->flags &= ~BF_KERN_SPLICING;
+ return -1;
+ }
+ usedpipes++;
+ b->splice.prod = pipefd[1];
+ b->splice.cons = pipefd[0];
+ }
+
+ while (1) {
+ max = b->to_forward;
+ if (max <= 0) {
+ /* It looks like the buffer + the pipe already contain
+ * the maximum amount of data to be transferred. Try to
+ * send those data immediately on the other side if it
+ * is currently waiting.
+ */
+ retval = -1; /* end of forwarding */
+ break;
+ }
+
+ ret = splice(fd, NULL, b->splice.prod, NULL, max,
+ SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
+
+ if (ret <= 0) {
+#ifdef SPLICE_OLD_KERNEL_WORKAROUND
+ /* This part contains a lot of tricks for kernels before 2.6.29-rc1
+ * where splice() did erroneously return -EAGAIN upon shutdown.
+ */
+ if (ret == 0) {
+ si->flags |= SI_FL_WAIT_ROOM;
+ retval = 1;
+ break;
+ }
+
+ if (errno == EAGAIN) {
+ char dummy;
+ /* it can mean either that the socket got a shutdown read,
+ * or that it has no available data to read.
+ */
+ ret = recv(fd, &dummy, sizeof dummy,
+ MSG_PEEK|MSG_DONTWAIT|MSG_NOSIGNAL);
+ if (!ret) {
+ /* connection closed */
+ b->flags |= BF_READ_NULL;
+ si->flags &= ~SI_FL_WAIT_ROOM;
+ retval = 1; /* no need for further polling */
+ break;
+ }
+ /* sometimes, splice() will return -1/EAGAIN while recv() will return 1.
+ * Thus, it means we have to wait for more room to be left in the pipe
+ * by the other end.
+ */
+ if (ret > 0) {
+ si->flags |= SI_FL_WAIT_ROOM;
+ retval = 1;
+ break;
+ }
+
+ /* we need a new flag : SI_FL_WAIT_RECV */
+ retval = 0;
+ break;
+ }
+#else
+ /* this part is OK with kernel >= 2.6.29-rc1 */
+
+ if (ret == 0) {
+ /* connection closed */
+ b->flags |= BF_READ_NULL;
+ si->flags &= ~SI_FL_WAIT_ROOM;
+ retval = 1; /* no need for further polling */
+ break;
+ }
+
+ if (errno == EAGAIN) {
+ /* there are two reasons for EAGAIN :
+ * - nothing in the socket buffer (standard)
+ * - pipe is full
+ * Since we don't know if pipe is full, we'll
+ * stop if the pipe is not empty. Anyway, we
+ * will almost always fill/empty the pipe.
+ */
+
+ if (b->splice_len > 0) {
+ si->flags |= SI_FL_WAIT_ROOM;
+ retval = 1;
+ break;
+ }
+
+ /* note that we'd need a new flag : SI_FL_WAIT_RECV */
+ retval = 0;
+ break;
+ }
+#endif
+ /* here we have another error */
+ si->flags |= SI_FL_ERR;
+ retval = 1;
+ break;
+ } /* ret <= 0 */
+
+ b->to_forward -= ret;
+ total += ret;
+ b->total += ret;
+ b->splice_len += ret;
+ b->flags |= BF_READ_PARTIAL;
+ b->flags &= ~BF_EMPTY; /* to prevent shutdowns */
+
+ if (b->splice_len >= SPLICE_FULL_HINT) {
+ /* We've read enough of it for this time. */
+ retval = 1;
+ break;
+ }
+ } /* while */
+
+ return retval;
+}
+
#endif /* CONFIG_HAP_LINUX_SPLICE */
@@ -103,6 +269,20 @@
if ((fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP)) == FD_POLL_HUP)
goto out_shutdown_r;
+#if defined(CONFIG_HAP_LINUX_SPLICE)
+ if (b->to_forward && b->flags & BF_KERN_SPLICING) {
+ retval = stream_sock_splice_in(b, si);
+
+ if (retval >= 0) {
+ if (si->flags & SI_FL_ERR)
+ goto out_error;
+ if (b->flags & BF_READ_NULL)
+ goto out_shutdown_r;
+ goto out_wakeup;
+ }
+ /* splice not possible (anymore), let's go on on standard copy */
+ }
+#endif
cur_read = 0;
while (1) {
/*
@@ -342,6 +522,38 @@
int retval = 1;
int ret, max;
+#if defined(CONFIG_HAP_LINUX_SPLICE)
+ while (b->splice_len) {
+ ret = splice(b->splice.cons, NULL, si->fd, NULL, b->splice_len,
+ SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
+ if (ret <= 0) {
+ if (ret == 0 || errno == EAGAIN) {
+ retval = 0;
+ return retval;
+ }
+ /* here we have another error */
+ retval = -1;
+ return retval;
+ }
+
+ b->flags |= BF_WRITE_PARTIAL;
+ b->splice_len -= ret;
+
+ if (!b->splice_len)
+ break;
+
+ if (--write_poll <= 0)
+ return retval;
+ }
+
+ /* At this point, the pipe is empty, but we may still have data pending
+ * in the normal buffer.
+ */
+ if (!b->l) {
+ b->flags |= BF_EMPTY;
+ return retval;
+ }
+#endif
if (!b->send_max)
return retval;