BUG/MEDIUM: stream-int: do not rely on the connection error once established Historically the stream-interface code used to check for connection errors by itself. Later this was partially deferred to muxes, but only once the mux is installed or the connection is at least in the established state. But probably as a safety practice the connection error tests remained. The problem is that they are causing trouble on when a response received from a mux is mixed with an error report. The typical case is an upload that is interrupted by the server sending an error or redirect without draining all data, causing an RST to be queued just after the data. In this case the mux has the data, the CO_FL_ERROR flag is present on the connection, and unfortunately the stream-interface refuses to retrieve the data due to this flag, and return an error to the client. It's about time to only rely on CS_FL_ERROR which is set by the mux, but the stream-interface is still responsible for the connection during its setup. However everywhere the CO_FL_ERROR is checked, CS_FL_ERROR is also checked. This commit addresses this by: - adding a new function si_is_conn_error() that checks the SI state and only reports the status of CO_FL_ERROR for states before SI_ST_EST. - eliminating all checks for CO_FL_ERORR in places where CS_FL_ERROR is already checked and either the presence of a mux was already validated or the stream-int's state was already checked as being SI_ST_EST or higher. CO_FL_ERROR tests on the send() direction are also inappropriate as they may cause the loss of pending data. Now this doesn't happen anymore and such events are only converted to CS_FL_ERROR by the mux once notified of the problem. As such, this must not cause the loss of any error event. Now an early error reported on a backend mux doesn't prevent the queued response from being read and forwarded to the client (the list of syscalls below was trimmed and epoll_ctl is not represented): recvfrom(10, "POST / HTTP/1.1\r\nConnection: clo"..., 16320, 0, NULL, NULL) = 66 sendto(11, "POST / HTTP/1.1\r\ntransfer-encodi"..., 47, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 47 epoll_wait(3, [{events=EPOLLIN|EPOLLERR|EPOLLHUP|EPOLLRDHUP, data={u32=11, u64=11}}], 200, 15001) = 1 recvfrom(11, "HTTP/1.1 200 OK\r\ncontent-length:"..., 16320, 0, NULL, NULL) = 57 sendto(10, "HTTP/1.1 200 OK\r\ncontent-length:"..., 57, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 57 epoll_wait(3, [{events=EPOLLIN|EPOLLERR|EPOLLHUP|EPOLLRDHUP, data={u32=11, u64=11}}], 200, 13001) = 1 epoll_wait(3, [{events=EPOLLIN, data={u32=10, u64=10}}], 200, 13001) = 1 recvfrom(10, "A\n0123456789\r\n0\r\n\r\n", 16320, 0, NULL, NULL) = 19 shutdown(10, SHUT_WR) = 0 close(11) = 0 close(10) = 0 Above the server is an haproxy configured with the following: listen blah bind :8002 mode http timeout connect 5s timeout client 5s timeout server 5s option httpclose option nolinger http-request return status 200 hdr connection close And the client takes care of sending requests and data in two distinct parts: while :; do ./dev/tcploop/tcploop 8001 C T S:"POST / HTTP/1.1\r\nConnection: close\r\nTransfer-encoding: chunked\r\n\r\n" P1 S:"A\n0123456789\r\n0\r\n\r\n" P R F; done With this, a small percentage of the requests will reproduce the behavior above. Note that this fix requires the following patch to be applied for the test above to work: BUG/MEDIUM: mux-h1: only turn CO_FL_ERROR to CS_FL_ERROR with empty ibuf This should be backported with after a few weeks of observation, and likely one version at a time. During the backports, the patch might need to be adjusted at each check of CO_FL_ERORR to follow the principles explained above. (cherry picked from commit d1480cc8a44395c27c715c0dd431d3e37a83a4c8) Signed-off-by: Christopher Faulet <cfaulet@haproxy.com> (cherry picked from commit ba25116c1e0f5c37d5141cf38cc1ede26de21d98) Signed-off-by: Christopher Faulet <cfaulet@haproxy.com>

commit: b3666ef018a94a5cbcc3a143702a7b3d0b108f4c [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Thu Mar 17 16:19:09 2022 +0100
committer: Christopher Faulet <cfaulet@haproxy.com> Mon Mar 28 22:09:43 2022 +0200
tree: 5e438d8ae67b597e97799c488a5a6290cea5e91f
parent: a1d43d1fad324b53b75887216e6c44be357c8bf3 [diff]
diff --git a/src/stream_interface.c b/src/stream_interface.c
index afcf3c0..e4f8bd3 100644
--- a/src/stream_interface.c
+++ b/src/stream_interface.c

@@ -554,6 +554,23 @@
 		ic->flags &= ~CF_READ_DONTWAIT;
 }
 
+/* The stream interface is only responsible for the connection during the early
+ * states, before plugging a mux. Thus it should only care about CO_FL_ERROR
+ * before SI_ST_EST, and after that it must absolutely ignore it since the mux
+ * may hold pending data. This function returns true if such an error was
+ * reported. Both the CS and the CONN must be valid.
+ */
+static inline int si_is_conn_error(const struct stream_interface *si)
+{
+	struct connection *conn;
+
+	if (si->state >= SI_ST_EST)
+		return 0;
+
+	conn = __objt_cs(si->end)->conn;
+	BUG_ON(!conn);
+	return !!(conn->flags & CO_FL_ERROR);
+}
 
 /* Called by I/O handlers after completion.. It propagates
  * connection flags to the stream interface, updates the stream (which may or
@@ -584,9 +601,11 @@
 	 *       wake callback. Otherwise si_cs_recv()/si_cs_send() already take
 	 *       care of it.
 	 */
-	if (si->state >= SI_ST_CON &&
-	    (conn->flags & CO_FL_ERROR || cs->flags & CS_FL_ERROR))
-		si->flags |= SI_FL_ERR;
+
+	if (si->state >= SI_ST_CON) {
+		if ((cs->flags & CS_FL_ERROR) || si_is_conn_error(si))
+			si->flags |= SI_FL_ERR;
+	}
 
 	/* If we had early data, and the handshake ended, then
 	 * we can remove the flag, and attempt to wake the task up,
@@ -655,7 +674,7 @@
 	int ret;
 	int did_send = 0;
 
-	if (conn->flags & CO_FL_ERROR || cs->flags & (CS_FL_ERROR|CS_FL_ERR_PENDING)) {
+	if (cs->flags & (CS_FL_ERROR|CS_FL_ERR_PENDING) || si_is_conn_error(si)) {
 		/* We're probably there because the tasklet was woken up,
 		 * but process_stream() ran before, detected there were an
 		 * error and put the si back to SI_ST_TAR. There's still
@@ -778,7 +797,7 @@
 		si_rx_room_rdy(si_opposite(si));
 	}
 
-	if (conn->flags & CO_FL_ERROR || cs->flags & (CS_FL_ERROR|CS_FL_ERR_PENDING)) {
+	if (cs->flags & (CS_FL_ERROR|CS_FL_ERR_PENDING)) {
 		si->flags |= SI_FL_ERR;
 		return 1;
 	}
@@ -1133,7 +1152,7 @@
 	if (!(si->wait_event.events & SUB_RETRY_SEND) && !channel_is_empty(si_oc(si)))
 		si_cs_send(cs);
 
-	if (cs->flags & (CS_FL_ERROR|CS_FL_ERR_PENDING) || cs->conn->flags & CO_FL_ERROR) {
+	if (cs->flags & (CS_FL_ERROR|CS_FL_ERR_PENDING) || si_is_conn_error(si)) {
 		/* Write error on the file descriptor */
 		if (si->state >= SI_ST_CON)
 			si->flags |= SI_FL_ERR;
@@ -1248,7 +1267,7 @@
 	if (!(cs->flags & CS_FL_RCV_MORE)) {
 		if (!conn_xprt_ready(conn))
 			return 0;
-		if (conn->flags & CO_FL_ERROR || cs->flags & CS_FL_ERROR)
+		if (cs->flags & CS_FL_ERROR)
 			goto end_recv;
 	}
 
@@ -1305,7 +1324,7 @@
 			ic->flags |= CF_READ_PARTIAL;
 		}
 
-		if (conn->flags & CO_FL_ERROR || cs->flags & (CS_FL_EOS|CS_FL_ERROR))
+		if (cs->flags & (CS_FL_EOS|CS_FL_ERROR))
 			goto end_recv;
 
 		if (conn->flags & CO_FL_WAIT_ROOM) {
@@ -1361,7 +1380,7 @@
 	 * recv().
 	 */
 	while ((cs->flags & CS_FL_RCV_MORE) ||
-	    (!(conn->flags & (CO_FL_ERROR | CO_FL_HANDSHAKE)) &&
+	       (!(conn->flags & CO_FL_HANDSHAKE) &&
 	       (!(cs->flags & (CS_FL_ERROR|CS_FL_EOS))) && !(ic->flags & CF_SHUTR))) {
 		int cur_flags = flags;
 
@@ -1507,8 +1526,7 @@
 		ret = 1;
 	}
 
-	if (conn->flags & CO_FL_ERROR || cs->flags & CS_FL_ERROR) {
-		cs->flags |= CS_FL_ERROR;
+	if (cs->flags & CS_FL_ERROR) {
 		si->flags |= SI_FL_ERR;
 		ret = 1;
 	}
commit	b3666ef018a94a5cbcc3a143702a7b3d0b108f4c	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Thu Mar 17 16:19:09 2022 +0100
committer	Christopher Faulet <cfaulet@haproxy.com>	Mon Mar 28 22:09:43 2022 +0200
tree	5e438d8ae67b597e97799c488a5a6290cea5e91f
parent	a1d43d1fad324b53b75887216e6c44be357c8bf3 [diff]