[MEDIUM] stream_sock: implement tcp-cork for use during shutdowns on Linux

Setting TCP_CORK on a socket before sending the last segment lets the
kernel merge that segment with the FIN generated by the shutdown()
call. Handling TCP_CORK is not trivial, though, because we have to track
the state of the TCP_NODELAY flag: the two options are mutually exclusive.
Doing so saves one more packet per session and yields about 5% more
performance.

There is no reason not to do it, so there is no associated option.
diff --git a/include/types/fd.h b/include/types/fd.h
index 8ae0f2b..2bc258f 100644
--- a/include/types/fd.h
+++ b/include/types/fd.h
@@ -59,6 +59,13 @@
 #define FD_POLL_DATA    (FD_POLL_IN  | FD_POLL_OUT)
 #define FD_POLL_STICKY  (FD_POLL_ERR | FD_POLL_HUP)
 
+/* bit values for fdtab[fd].flags. Most of them are used to record a state
+ * resulting from a behaviour change.
+ */
+#define FD_FL_TCP               0x0001       /* socket is TCP */
+#define FD_FL_TCP_NODELAY       0x0002
+#define FD_FL_TCP_CORK          0x0004
+
 /* info about one given fd */
 struct fdtab {
 	struct {
@@ -66,6 +73,7 @@
 		struct buffer *b;            /* read/write buffer */
 	} cb[DIR_SIZE];
 	void *owner;                         /* the session (or proxy) associated with this fd */
+	unsigned short flags;                /* various flags precising the exact status of this fd */
 	unsigned char state;                 /* the state of this fd */
 	unsigned char ev;                    /* event seen in return of poll() : FD_POLL_* */
 	struct sockaddr *peeraddr;           /* pointer to peer's network address, or NULL if unset */
diff --git a/src/backend.c b/src/backend.c
index b830bdb..5e78fd8 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -1948,6 +1948,7 @@
 
 	fdtab[fd].owner = s->req->cons;
 	fdtab[fd].state = FD_STCONN; /* connection in progress */
+	fdtab[fd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY;
 	fdtab[fd].cb[DIR_RD].f = &stream_sock_read;
 	fdtab[fd].cb[DIR_RD].b = s->rep;
 	fdtab[fd].cb[DIR_WR].f = &stream_sock_write;
diff --git a/src/checks.c b/src/checks.c
index b4e9857..4022cad 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -692,6 +692,7 @@
 						fdtab[fd].peeraddr = (struct sockaddr *)&sa;
 						fdtab[fd].peerlen = sizeof(sa);
 						fdtab[fd].state = FD_STCONN; /* connection in progress */
+						fdtab[fd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY;
 						EV_FD_SET(fd, DIR_WR);  /* for connect status */
 #ifdef DEBUG_FULL
 						assert (!EV_FD_ISSET(fd, DIR_RD));
diff --git a/src/client.c b/src/client.c
index 3e156eb..346adc6 100644
--- a/src/client.c
+++ b/src/client.c
@@ -417,6 +417,7 @@
 		fd_insert(cfd);
 		fdtab[cfd].owner = &s->si[0];
 		fdtab[cfd].state = FD_STREADY;
+		fdtab[cfd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY;
 		fdtab[cfd].cb[DIR_RD].f = l->proto->read;
 		fdtab[cfd].cb[DIR_RD].b = s->req;
 		fdtab[cfd].cb[DIR_WR].f = l->proto->write;
diff --git a/src/proto_tcp.c b/src/proto_tcp.c
index ed0812c..e9b3ae3 100644
--- a/src/proto_tcp.c
+++ b/src/proto_tcp.c
@@ -212,9 +212,7 @@
 		goto tcp_close_return;
 	}
 
-	if ((fcntl(fd, F_SETFL, O_NONBLOCK) == -1) ||
-	    (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
-			(char *) &one, sizeof(one)) == -1)) {
+	if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
 		err |= ERR_FATAL | ERR_ALERT;
 		msg = "cannot make socket non-blocking";
 		goto tcp_close_return;
@@ -281,6 +279,7 @@
 	fdtab[fd].cb[DIR_RD].b = fdtab[fd].cb[DIR_WR].b = NULL;
 	fdtab[fd].owner = listener; /* reference the listener instead of a task */
 	fdtab[fd].state = FD_STLISTEN;
+	fdtab[fd].flags = FD_FL_TCP;
 	fdtab[fd].peeraddr = NULL;
 	fdtab[fd].peerlen = 0;
  tcp_return:
diff --git a/src/stream_sock.c b/src/stream_sock.c
index 82bf8ca..a3ef269 100644
--- a/src/stream_sock.c
+++ b/src/stream_sock.c
@@ -16,6 +16,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include <netinet/tcp.h>
+
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -569,6 +571,26 @@
 		if (max > b->send_max)
 			max = b->send_max;
 
+
+#ifdef TCP_CORK
+		/*
+		 * Check if we want to cork output before sending. This typically occurs
+		 * when there are data left in the buffer, or when we reached the end of
+		 * buffer but we know we will close, so we try to merge the ongoing FIN
+		 * with the last data segment.
+		 */
+		if ((fdtab[si->fd].flags & (FD_FL_TCP|FD_FL_TCP_CORK)) == FD_FL_TCP) {
+			if (unlikely((b->send_max == b->l && 
+				      (b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_HIJACK|BF_WRITE_ENA|BF_SHUTR)) ==
+				      (BF_WRITE_ENA|BF_SHUTR)))) {
+				/* we have to unconditionally reset TCP_NODELAY for CORK */
+				setsockopt(si->fd, IPPROTO_TCP, TCP_NODELAY, (char *) &zero, sizeof(zero));
+				setsockopt(si->fd, SOL_TCP, TCP_CORK, (char *) &one, sizeof(one));
+				fdtab[si->fd].flags = (fdtab[si->fd].flags & ~FD_FL_TCP_NODELAY) | FD_FL_TCP_CORK;
+			}
+		}
+#endif
+
 #ifndef MSG_NOSIGNAL
 		{
 			int skerr;
@@ -628,6 +650,21 @@
 		}
 	} /* while (1) */
 
+	/* check if we need to uncork the output, for instance when the
+	 * output buffer is empty but not shutr().
+	 */
+	if (unlikely((fdtab[si->fd].flags & (FD_FL_TCP|FD_FL_TCP_NODELAY)) == FD_FL_TCP && (b->flags & BF_EMPTY))) {
+		if ((b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_HIJACK|BF_WRITE_ENA|BF_SHUTR)) != (BF_WRITE_ENA|BF_SHUTR)) {
+#ifdef TCP_CORK
+			if (fdtab[si->fd].flags & FD_FL_TCP_CORK)
+				setsockopt(si->fd, SOL_TCP, TCP_CORK, (char *) &zero, sizeof(zero));
+#endif
+			setsockopt(si->fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one));
+			fdtab[si->fd].flags = (fdtab[si->fd].flags & ~FD_FL_TCP_CORK) | FD_FL_TCP_NODELAY;
+		}
+	}
+
+
 	return retval;
 }