MAJOR: fd: remove the need for the socket layer to recheck the connection

Up to now, if an outgoing connection had no data to send, the socket layer had to perform a connect() again to check for establishment. This is not acceptable for SSL, and will cause problems with socketpair(). Some socket layers will also need an initializer before sending data (e.g. SSL).

The solution consists in moving the connect() test to the protocol layer (e.g. TCP) and making it hold the fd->write callback until the connection is validated. At this point, it will switch the write callback to the socket layer's write function. In fact we need to hold both the read and write callbacks to ensure the socket layer is never called before being initialized.

This intermediate callback is used only if there is a socket init function or if there is no data to send.

The socket layer no longer has any code to check for connection establishment, which makes sense.

commit: eeda90e68c19d2184e644a631212983757db3249 [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Fri May 11 19:53:32 2012 +0200
committer: Willy Tarreau <w@1wt.eu> Fri May 11 20:18:26 2012 +0200
tree: ce22811af225c8539baee6e3095da947210f1ecf
parent: d02394b5a1468618cf708fb240a8b0930c077663 [diff] [blame]
diff --git a/src/proto_tcp.c b/src/proto_tcp.c
index 5d5ede6..eee65bc 100644
--- a/src/proto_tcp.c
+++ b/src/proto_tcp.c

@@ -60,6 +60,8 @@
 
 static int tcp_bind_listeners(struct protocol *proto, char *errmsg, int errlen);
 static int tcp_bind_listener(struct listener *listener, char *errmsg, int errlen);
+static int tcp_connect_write(int fd);
+static int tcp_connect_read(int fd);
 
 /* Note: must not be declared <const> as its list will be overwritten */
 static struct protocol proto_tcpv4 = {
@@ -449,11 +451,22 @@
 	fdtab[fd].owner = si;
 	fdtab[fd].state = FD_STCONN; /* connection in progress */
 	fdtab[fd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY;
-	fdtab[fd].cb[DIR_RD].f = si->sock.read;
 	fdtab[fd].cb[DIR_RD].b = si->ib;
-	fdtab[fd].cb[DIR_WR].f = si->sock.write;
 	fdtab[fd].cb[DIR_WR].b = si->ob;
 
+	/* If we have nothing to send or if we want to initialize the sock layer,
+	 * we want to confirm that the TCP connection is established before doing
+	 * so, so we use our own write callback then switch to the sock layer.
+	 */
+	if (si->sock.init || ((si->ob->flags & BF_OUT_EMPTY) && !si->send_proxy_ofs)) {
+		fdtab[fd].cb[DIR_RD].f = tcp_connect_read;
+		fdtab[fd].cb[DIR_WR].f = tcp_connect_write;
+	}
+	else {
+		fdtab[fd].cb[DIR_RD].f = si->sock.read;
+		fdtab[fd].cb[DIR_WR].f = si->sock.write;
+	}
+
 	fdinfo[fd].peeraddr = (struct sockaddr *)&si->addr.to;
 	fdinfo[fd].peerlen = get_addr_len(&si->addr.to);
 
@@ -502,6 +515,122 @@
 		return getsockname(fd, sa, &salen);
 }
 
+/* Write callback set while a connection is pending and we have nothing to send
+ * or the sock layer has an init function. It probes <fd> using connect(), then
+ * gives both callbacks to the sock layer once connected. Returns 0 if the call
+ * was ignored or the connection is still in progress, otherwise 1. */
+static int tcp_connect_write(int fd)
+{
+	struct stream_interface *si = fdtab[fd].owner;
+	struct buffer *b = si->ob;
+	int retval = 1;
+
+	if (fdtab[fd].state == FD_STERROR)
+		goto out_error;
+
+	if (fdtab[fd].state != FD_STCONN) {
+		retval = 0;
+		goto out_ignore; /* strange: called while not connecting; skip the wakeup */
+	}
+
+	/* we might have been called just after an asynchronous shutw: only wake the task */
+	if (b->flags & BF_SHUTW)
+		goto out_wakeup;
+
+	/* We have no data to send to check the connection, and
+	 * getsockopt() will not inform us whether the connection
+	 * is still pending. So we reuse connect() to check the
+	 * state of the socket. This has the advantage of giving us
+	 * the following information through errno :
+	 *  - error (any other non-zero errno)
+	 *  - connecting (EALREADY, EINPROGRESS)
+	 *  - connected (EISCONN, 0)
+	 */
+	if ((connect(fd, fdinfo[fd].peeraddr, fdinfo[fd].peerlen) == 0))
+		errno = 0; /* connected: make sure the errno tests below see no error */
+
+	if (errno == EALREADY || errno == EINPROGRESS) {
+		retval = 0;
+		goto out_ignore; /* still connecting: no wakeup, callbacks stay in place */
+	}
+
+	if (errno && errno != EISCONN)
+		goto out_error;
+
+	/* The connection is established. We only need to report that
+	 * event and the fact that nothing was written (BF_WRITE_NULL).
+	 */
+	b->flags |= BF_WRITE_NULL;
+
+	/* The FD is ready now, we can hand both handlers to the socket layer */
+	fdtab[fd].cb[DIR_RD].f = si->sock.read;
+	fdtab[fd].cb[DIR_WR].f = si->sock.write;
+	fdtab[fd].state = FD_STREADY;
+
+ out_wakeup:
+	task_wakeup(si->owner, TASK_WOKEN_IO);
+
+ out_ignore:
+	fdtab[fd].ev &= ~FD_POLL_OUT; /* drop FD_POLL_OUT from this FD's event bits */
+	return retval;
+
+ out_error:
+	/* Write error on the file descriptor. We mark the FD as STERROR so
+	 * that we don't use it anymore and remove it with EV_FD_REM(). The
+	 * error is reported to the stream interface through SI_FL_ERR, which
+	 * will take proper action. We must not disturb the buffer because the
+	 * stream interface wants to ensure transparent connection retries.
+	 */
+
+	fdtab[fd].state = FD_STERROR;
+	fdtab[fd].ev &= ~FD_POLL_STICKY;
+	EV_FD_REM(fd);
+	si->flags |= SI_FL_ERR;
+	goto out_wakeup; /* wake the owner task, then return retval (still 1 here) */
+}
+
+
+/* Read callback held while connecting: it only detects errors, never reads. */
+static int tcp_connect_read(int fd)
+{
+	struct stream_interface *si = fdtab[fd].owner;
+	int retval;
+
+	retval = 1; /* 0 is returned only if the FD is neither in error nor connecting */
+
+	if (fdtab[fd].state == FD_STERROR)
+		goto out_error;
+
+	if (fdtab[fd].state != FD_STCONN) {
+		retval = 0;
+		goto out_ignore; /* strange: called while not connecting; skip the wakeup */
+	}
+
+	/* FD_POLL_HUP reported without FD_POLL_IN is handled as an error */
+	if ((fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP)) == FD_POLL_HUP)
+		goto out_error;
+
+ out_wakeup:
+	task_wakeup(si->owner, TASK_WOKEN_IO); /* also reached by fall-through when no error is seen */
+ out_ignore:
+	fdtab[fd].ev &= ~FD_POLL_IN; /* drop FD_POLL_IN from this FD's event bits */
+	return retval;
+
+ out_error:
+	/* Read error on the file descriptor. We mark the FD as STERROR so
+	 * that we don't use it anymore and remove it with EV_FD_REM(). The
+	 * error is reported to the stream interface through SI_FL_ERR, which
+	 * will take proper action. We must not disturb the buffer because the
+	 * stream interface wants to ensure transparent connection retries.
+	 */
+
+	fdtab[fd].state = FD_STERROR;
+	fdtab[fd].ev &= ~FD_POLL_STICKY;
+	EV_FD_REM(fd);
+	si->flags |= SI_FL_ERR;
+	goto out_wakeup; /* wake the owner task, then return retval (still 1 here) */
+}
+
 
 /* This function tries to bind a TCPv4/v6 listener. It may return a warning or
  * an error message in <err> if the message is at most <errlen> bytes long
commit	eeda90e68c19d2184e644a631212983757db3249	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Fri May 11 19:53:32 2012 +0200
committer	Willy Tarreau <w@1wt.eu>	Fri May 11 20:18:26 2012 +0200
tree	ce22811af225c8539baee6e3095da947210f1ecf
parent	d02394b5a1468618cf708fb240a8b0930c077663 [diff] [blame]