[MEDIUM] stream_sock: implement tcp-cork for use during shutdowns on Linux
Setting TCP_CORK on a socket before sending the last segment enables
automatic merging of this segment with the FIN from the shutdown()
call. Handling TCP_CORK is not trivial, though, because we have to track
the status of the TCP_NODELAY flag: the two options are mutually exclusive.
Doing so saves one more packet per session and offers about 5% more
performance.
There is no reason not to do it, so there is no associated option.
diff --git a/include/types/fd.h b/include/types/fd.h
index 8ae0f2b..2bc258f 100644
--- a/include/types/fd.h
+++ b/include/types/fd.h
@@ -59,6 +59,13 @@
#define FD_POLL_DATA (FD_POLL_IN | FD_POLL_OUT)
#define FD_POLL_STICKY (FD_POLL_ERR | FD_POLL_HUP)
+/* Bit values for fdtab[fd].flags. Most of them record the state that results
+ * from a behaviour change applied to the fd.
+ */
+#define FD_FL_TCP 0x0001 /* socket is TCP */
+#define FD_FL_TCP_NODELAY 0x0002 /* TCP_NODELAY is tracked as set on the socket */
+#define FD_FL_TCP_CORK 0x0004 /* TCP_CORK is tracked as set on the socket */
+
/* info about one given fd */
struct fdtab {
struct {
@@ -66,6 +73,7 @@
struct buffer *b; /* read/write buffer */
} cb[DIR_SIZE];
void *owner; /* the session (or proxy) associated with this fd */
+ unsigned short flags; /* FD_FL_* flags describing the exact status of this fd */
unsigned char state; /* the state of this fd */
unsigned char ev; /* event seen in return of poll() : FD_POLL_* */
struct sockaddr *peeraddr; /* pointer to peer's network address, or NULL if unset */
diff --git a/src/backend.c b/src/backend.c
index b830bdb..5e78fd8 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -1948,6 +1948,7 @@
fdtab[fd].owner = s->req->cons;
fdtab[fd].state = FD_STCONN; /* connection in progress */
+ fdtab[fd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY;
fdtab[fd].cb[DIR_RD].f = &stream_sock_read;
fdtab[fd].cb[DIR_RD].b = s->rep;
fdtab[fd].cb[DIR_WR].f = &stream_sock_write;
diff --git a/src/checks.c b/src/checks.c
index b4e9857..4022cad 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -692,6 +692,7 @@
fdtab[fd].peeraddr = (struct sockaddr *)&sa;
fdtab[fd].peerlen = sizeof(sa);
fdtab[fd].state = FD_STCONN; /* connection in progress */
+ fdtab[fd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY;
EV_FD_SET(fd, DIR_WR); /* for connect status */
#ifdef DEBUG_FULL
assert (!EV_FD_ISSET(fd, DIR_RD));
diff --git a/src/client.c b/src/client.c
index 3e156eb..346adc6 100644
--- a/src/client.c
+++ b/src/client.c
@@ -417,6 +417,7 @@
fd_insert(cfd);
fdtab[cfd].owner = &s->si[0];
fdtab[cfd].state = FD_STREADY;
+ fdtab[cfd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY;
fdtab[cfd].cb[DIR_RD].f = l->proto->read;
fdtab[cfd].cb[DIR_RD].b = s->req;
fdtab[cfd].cb[DIR_WR].f = l->proto->write;
diff --git a/src/proto_tcp.c b/src/proto_tcp.c
index ed0812c..e9b3ae3 100644
--- a/src/proto_tcp.c
+++ b/src/proto_tcp.c
@@ -212,9 +212,7 @@
goto tcp_close_return;
}
- if ((fcntl(fd, F_SETFL, O_NONBLOCK) == -1) ||
- (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
- (char *) &one, sizeof(one)) == -1)) {
+ if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
err |= ERR_FATAL | ERR_ALERT;
msg = "cannot make socket non-blocking";
goto tcp_close_return;
@@ -281,6 +279,7 @@
fdtab[fd].cb[DIR_RD].b = fdtab[fd].cb[DIR_WR].b = NULL;
fdtab[fd].owner = listener; /* reference the listener instead of a task */
fdtab[fd].state = FD_STLISTEN;
+ fdtab[fd].flags = FD_FL_TCP;
fdtab[fd].peeraddr = NULL;
fdtab[fd].peerlen = 0;
tcp_return:
diff --git a/src/stream_sock.c b/src/stream_sock.c
index 82bf8ca..a3ef269 100644
--- a/src/stream_sock.c
+++ b/src/stream_sock.c
@@ -16,6 +16,8 @@
#include <stdio.h>
#include <stdlib.h>
+#include <netinet/tcp.h>
+
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
@@ -569,6 +571,26 @@
if (max > b->send_max)
max = b->send_max;
+
+#ifdef TCP_CORK
+ /*
+ * Cork the output before sending when all buffered data is pending for
+ * sending (send_max == l) and, among the flags tested below, only
+ * BF_WRITE_ENA and BF_SHUTR are set: we know we will close, so we try
+ * to merge the upcoming FIN with the last data segment.
+ */
+ if ((fdtab[si->fd].flags & (FD_FL_TCP|FD_FL_TCP_CORK)) == FD_FL_TCP) { /* TCP fd, not corked yet */
+ if (unlikely((b->send_max == b->l &&
+ (b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_HIJACK|BF_WRITE_ENA|BF_SHUTR)) ==
+ (BF_WRITE_ENA|BF_SHUTR)))) {
+ /* we have to unconditionally reset TCP_NODELAY for CORK */
+ setsockopt(si->fd, IPPROTO_TCP, TCP_NODELAY, (char *) &zero, sizeof(zero));
+ setsockopt(si->fd, SOL_TCP, TCP_CORK, (char *) &one, sizeof(one));
+ fdtab[si->fd].flags = (fdtab[si->fd].flags & ~FD_FL_TCP_NODELAY) | FD_FL_TCP_CORK;
+ }
+ }
+#endif
+
#ifndef MSG_NOSIGNAL
{
int skerr;
@@ -628,6 +650,21 @@
}
} /* while (1) */
+ /* With an empty buffer and TCP_NODELAY off, uncork and restore TCP_NODELAY
+ * unless the flags still match the cork condition (WRITE_ENA|SHUTR only).
+ */
+ if (unlikely((fdtab[si->fd].flags & (FD_FL_TCP|FD_FL_TCP_NODELAY)) == FD_FL_TCP && (b->flags & BF_EMPTY))) {
+ if ((b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_HIJACK|BF_WRITE_ENA|BF_SHUTR)) != (BF_WRITE_ENA|BF_SHUTR)) {
+#ifdef TCP_CORK
+ if (fdtab[si->fd].flags & FD_FL_TCP_CORK)
+ setsockopt(si->fd, SOL_TCP, TCP_CORK, (char *) &zero, sizeof(zero));
+#endif
+ setsockopt(si->fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one));
+ fdtab[si->fd].flags = (fdtab[si->fd].flags & ~FD_FL_TCP_CORK) | FD_FL_TCP_NODELAY;
+ }
+ }
+
+
return retval;
}