MEDIUM: h1: reimplement the http/1 response parser for the gateway The HTTP/2->HTTP/1 gateway will need to process HTTP/1 responses. We cannot sanely rely on the HTTP/1 txn to parse a response because : 1) responses generated by haproxy such as error messages, redirects, stats or Lua are neither parsed nor indexed ; this could be addressed over the long term but will take time. 2) the http txn is useless to parse the body : the states present there are only meaningful to received bytes (ie next bytes to parse) and not at all to sent bytes. Thus chunks cannot be followed at all. Even when implementing this later, it's unsure whether it will be possible when dealing with compression. So using the HTTP txn is now out of the equation and the only remaining solution is to call an HTTP/1 message parser. We already have one, it was slightly modified to avoid keeping states by benefitting from the fact that the response was produced by haproxy and this is entirely available. It assumes the following rules are true, or that incuring an extra cost to work around them is acceptable : - the response buffer is read-write and supports modifications in place - headers sent through / by haproxy are not folded. Folding is still implemented by replacing CR/LF/tabs/spaces with spaces if encountered - HTTP/0.9 responses are never sent by haproxy and have never been supported at all - haproxy will not send partial responses, the whole headers block will be sent at once ; this means that we don't need to keep expensive states and can afford to restart the parsing from the beginning when facing a partial response ; - response is contiguous (does not wrap). This was already the case with the original parser and ensures we can safely dereference all fields with (ptr,len) The parser replaces all of the http_msg fields that were necessary with local variables. The parser is not called on an http_msg but on a string with a start and an end. The HTTP/1 states were reused for ease of use, though the request-specific ones have not been implemented for now. The error position and error state are supported and optional ; these ones may be used later for bug hunting. The parser issues the list of all the headers into a caller-allocated array of struct ist. The content-length/transfer-encoding header are checked and the relevant info fed the h1 message state (flags + body_len).

commit: 794f9af8949d0d9edfdfe2453167325e75cac2e4 [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Wed Jul 26 09:07:47 2017 +0200
committer: Willy Tarreau <w@1wt.eu> Sun Oct 22 09:54:15 2017 +0200
tree: 197df7c272157d9904db9c30843d2f627c57c854
parent: 306924ecb828a01edf1e8012deb02299284d3cb8 [diff]
diff --git a/include/proto/h1.h b/include/proto/h1.h
index 6bd7537..e7d364d 100644
--- a/include/proto/h1.h
+++ b/include/proto/h1.h

@@ -25,6 +25,7 @@
 #include <common/buffer.h>
 #include <common/compiler.h>
 #include <common/config.h>
+#include <common/http-hdr.h>
 #include <common/standard.h>
 #include <types/h1.h>
 #include <types/proto_http.h>
@@ -39,6 +40,9 @@
                                unsigned int *ret_ptr, enum h1_state *ret_state);
 void http_msg_analyzer(struct http_msg *msg, struct hdr_idx *idx);
 int http_forward_trailers(struct http_msg *msg);
+int h1_headers_to_hdr_list(char *start, const char *stop,
+                           struct http_hdr *hdr, unsigned int hdr_num,
+                           struct h1m *h1m);
 
 #define H1_FLG_CTL  0x01
 #define H1_FLG_SEP  0x02

diff --git a/src/h1.c b/src/h1.c
index bca820c..7e8f19d 100644
--- a/src/h1.c
+++ b/src/h1.c

@@ -10,7 +10,9 @@
  *
  */
 
+#include <ctype.h>
 #include <common/config.h>
+#include <common/http-hdr.h>
 
 #include <proto/h1.h>
 #include <proto/hdr_idx.h>
@@ -795,6 +797,402 @@
 	return;
 }
 
+/* This function parses a contiguous HTTP/1 headers block starting at <start>
+ * and ending before <stop>, at once, and converts it a list of (name,value)
+ * pairs representing header fields into the array <hdr> of size <hdr_num>,
+ * whose last entry will have an empty name and an empty value. If <hdr_num> is
+ * too small to represent the whole message, an error is returned. If <h1m> is
+ * not NULL, some protocol elements such as content-length and transfer-encoding
+ * will be parsed and stored there as well.
+ *
+ * For now it's limited to the response. If the header block is incomplete,
+ * 0 is returned, waiting to be called again with more data to try it again.
+ *
+ * The code derived from the main HTTP/1 parser above but was simplified and
+ * optimized to process responses produced or forwarded by haproxy. The caller
+ * is responsible for ensuring that the message doesn't wrap, and should ensure
+ * it is complete to avoid having to retry the operation after a failed
+ * attempt. The message is not supposed to be invalid, which is why a few
+ * properties such as the character set used in the header field names are not
+ * checked. In case of an unparsable response message, a negative value will be
+ * returned with h1m->err_pos and h1m->err_state matching the location and
+ * state where the error was met. Leading blank likes are tolerated but not
+ * recommended.
+ *
+ * This function returns :
+ *    -1 in case of error. In this case, h1m->err_state is filled (if h1m is
+ *       set) with the state the error occurred in and h2-m>err_pos with the
+ *       the position relative to <start>
+ *    -2 if the output is full (hdr_num reached). err_state and err_pos also
+ *       indicate where it failed.
+ *     0 in case of missing data.
+ *   > 0 on success, it then corresponds to the number of bytes read since
+ *       <start> so that the caller can go on with the payload.
+ */
+int h1_headers_to_hdr_list(char *start, const char *stop,
+                           struct http_hdr *hdr, unsigned int hdr_num,
+                           struct h1m *h1m)
+{
+	enum h1_state state = HTTP_MSG_RPBEFORE;
+	register char *ptr  = start;
+	register const char *end  = stop;
+	unsigned int hdr_count = 0;
+	unsigned int code = 0; /* status code, ASCII form */
+	unsigned int st_c;     /* beginning of status code, relative to msg_start */
+	unsigned int st_c_l;   /* length of status code */
+	unsigned int sol = 0;  /* start of line */
+	unsigned int col = 0;  /* position of the colon */
+	unsigned int eol = 0;  /* end of line */
+	unsigned int sov = 0;  /* start of value */
+	unsigned int skip = 0; /* number of bytes skipped at the beginning */
+	struct ist n, v;       /* header name and value during parsing */
+
+	if (unlikely(ptr >= end))
+		goto http_msg_ood;
+
+	switch (state)	{
+	case HTTP_MSG_RPBEFORE:
+	http_msg_rpbefore:
+		if (likely(HTTP_IS_TOKEN(*ptr))) {
+			/* we have a start of message, we may have skipped some
+			 * heading CRLF. Skip them now.
+			 */
+			skip += ptr - start;
+			start = ptr;
+
+			sol = 0;
+			hdr_count = 0;
+			state = HTTP_MSG_RPVER;
+			goto http_msg_rpver;
+		}
+
+		if (unlikely(!HTTP_IS_CRLF(*ptr))) {
+			state = HTTP_MSG_RPBEFORE;
+			goto http_msg_invalid;
+		}
+
+		if (unlikely(*ptr == '\n'))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpbefore, http_msg_ood, state, HTTP_MSG_RPBEFORE);
+		EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpbefore_cr, http_msg_ood, state, HTTP_MSG_RPBEFORE_CR);
+		/* stop here */
+
+	case HTTP_MSG_RPBEFORE_CR:
+	http_msg_rpbefore_cr:
+		EXPECT_LF_HERE(ptr, http_msg_invalid, state, HTTP_MSG_RPBEFORE_CR);
+		EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpbefore, http_msg_ood, state, HTTP_MSG_RPBEFORE);
+		/* stop here */
+
+	case HTTP_MSG_RPVER:
+	http_msg_rpver:
+		if (likely(HTTP_IS_VER_TOKEN(*ptr)))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpver, http_msg_ood, state, HTTP_MSG_RPVER);
+
+		if (likely(HTTP_IS_SPHT(*ptr))) {
+			/* version length = ptr - start */
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpver_sp, http_msg_ood, state, HTTP_MSG_RPVER_SP);
+		}
+		state = HTTP_MSG_RPVER;
+		goto http_msg_invalid;
+
+	case HTTP_MSG_RPVER_SP:
+	http_msg_rpver_sp:
+		if (likely(!HTTP_IS_LWS(*ptr))) {
+			code = 0;
+			st_c = ptr - start;
+			goto http_msg_rpcode;
+		}
+		if (likely(HTTP_IS_SPHT(*ptr)))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpver_sp, http_msg_ood, state, HTTP_MSG_RPVER_SP);
+		/* so it's a CR/LF, this is invalid */
+		state = HTTP_MSG_RPVER_SP;
+		goto http_msg_invalid;
+
+	case HTTP_MSG_RPCODE:
+	http_msg_rpcode:
+		if (likely(!HTTP_IS_LWS(*ptr))) {
+			code = (code << 8) + *ptr;
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpcode, http_msg_ood, state, HTTP_MSG_RPCODE);
+		}
+
+		if (likely(HTTP_IS_SPHT(*ptr))) {
+			st_c_l = ptr - start - st_c;
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpcode_sp, http_msg_ood, state, HTTP_MSG_RPCODE_SP);
+		}
+
+		/* so it's a CR/LF, so there is no reason phrase */
+		st_c_l = ptr - start - st_c;
+
+	http_msg_rsp_reason:
+		/* reason = ptr - start; */
+		/* reason length = 0 */
+		goto http_msg_rpline_eol;
+
+	case HTTP_MSG_RPCODE_SP:
+	http_msg_rpcode_sp:
+		if (likely(!HTTP_IS_LWS(*ptr))) {
+			/* reason = ptr - start */
+			goto http_msg_rpreason;
+		}
+		if (likely(HTTP_IS_SPHT(*ptr)))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpcode_sp, http_msg_ood, state, HTTP_MSG_RPCODE_SP);
+		/* so it's a CR/LF, so there is no reason phrase */
+		goto http_msg_rsp_reason;
+
+	case HTTP_MSG_RPREASON:
+	http_msg_rpreason:
+		if (likely(!HTTP_IS_CRLF(*ptr)))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpreason, http_msg_ood, state, HTTP_MSG_RPREASON);
+		/* reason length = ptr - start - reason */
+	http_msg_rpline_eol:
+		/* We have seen the end of line. Note that we do not
+		 * necessarily have the \n yet, but at least we know that we
+		 * have EITHER \r OR \n, otherwise the response would not be
+		 * complete. We can then record the response length and return
+		 * to the caller which will be able to register it.
+		 */
+
+		if (unlikely(hdr_count >= hdr_num)) {
+			state = HTTP_MSG_RPREASON;
+			goto http_output_full;
+		}
+		http_set_hdr(&hdr[hdr_count++], ist(":status"), ist2(start + st_c, st_c_l));
+
+		sol = ptr - start;
+		if (likely(*ptr == '\r'))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpline_end, http_msg_ood, state, HTTP_MSG_RPLINE_END);
+		goto http_msg_rpline_end;
+
+	case HTTP_MSG_RPLINE_END:
+	http_msg_rpline_end:
+		/* sol must point to the first of CR or LF. */
+		EXPECT_LF_HERE(ptr, http_msg_invalid, state, HTTP_MSG_RPLINE_END);
+		EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_first, http_msg_ood, state, HTTP_MSG_HDR_FIRST);
+		/* stop here */
+
+	case HTTP_MSG_HDR_FIRST:
+	http_msg_hdr_first:
+		sol = ptr - start;
+		if (likely(!HTTP_IS_CRLF(*ptr))) {
+			goto http_msg_hdr_name;
+		}
+
+		if (likely(*ptr == '\r'))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_last_lf, http_msg_ood, state, HTTP_MSG_LAST_LF);
+		goto http_msg_last_lf;
+
+	case HTTP_MSG_HDR_NAME:
+	http_msg_hdr_name:
+		/* assumes sol points to the first char */
+		if (likely(HTTP_IS_TOKEN(*ptr))) {
+			/* turn it to lower case if needed */
+			if (isupper((unsigned char)*ptr))
+				*ptr = tolower(*ptr);
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_name, http_msg_ood, state, HTTP_MSG_HDR_NAME);
+		}
+
+		if (likely(*ptr == ':')) {
+			col = ptr - start;
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_sp, http_msg_ood, state, HTTP_MSG_HDR_L1_SP);
+		}
+
+		if (HTTP_IS_LWS(*ptr)) {
+			state = HTTP_MSG_HDR_NAME;
+			goto http_msg_invalid;
+		}
+
+		/* now we have a non-token character in the header field name,
+		 * it's up to the H1 layer to have decided whether or not it
+		 * was acceptable. If we find it here, it was considered
+		 * acceptable due to configuration rules so we obey.
+		 */
+		EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_name, http_msg_ood, state, HTTP_MSG_HDR_NAME);
+
+	case HTTP_MSG_HDR_L1_SP:
+	http_msg_hdr_l1_sp:
+		/* assumes sol points to the first char */
+		if (likely(HTTP_IS_SPHT(*ptr)))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_sp, http_msg_ood, state, HTTP_MSG_HDR_L1_SP);
+
+		/* header value can be basically anything except CR/LF */
+		sov = ptr - start;
+
+		if (likely(!HTTP_IS_CRLF(*ptr))) {
+			goto http_msg_hdr_val;
+		}
+
+		if (likely(*ptr == '\r'))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_lf, http_msg_ood, state, HTTP_MSG_HDR_L1_LF);
+		goto http_msg_hdr_l1_lf;
+
+	case HTTP_MSG_HDR_L1_LF:
+	http_msg_hdr_l1_lf:
+		EXPECT_LF_HERE(ptr, http_msg_invalid, state, HTTP_MSG_HDR_L1_LF);
+		EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_lws, http_msg_ood, state, HTTP_MSG_HDR_L1_LWS);
+
+	case HTTP_MSG_HDR_L1_LWS:
+	http_msg_hdr_l1_lws:
+		if (likely(HTTP_IS_SPHT(*ptr))) {
+			/* replace HT,CR,LF with spaces */
+			for (; start + sov < ptr; sov++)
+				start[sov] = ' ';
+			goto http_msg_hdr_l1_sp;
+		}
+		/* we had a header consisting only in spaces ! */
+		eol = sov;
+		goto http_msg_complete_header;
+
+	case HTTP_MSG_HDR_VAL:
+	http_msg_hdr_val:
+		/* assumes sol points to the first char, and sov
+		 * points to the first character of the value.
+		 */
+
+		/* speedup: we'll skip packs of 4 or 8 bytes not containing bytes 0x0D
+		 * and lower. In fact since most of the time is spent in the loop, we
+		 * also remove the sign bit test so that bytes 0x8e..0x0d break the
+		 * loop, but we don't care since they're very rare in header values.
+		 */
+#if defined(__x86_64__)
+		while (ptr <= end - sizeof(long)) {
+			if ((*(long *)ptr - 0x0e0e0e0e0e0e0e0eULL) & 0x8080808080808080ULL)
+				goto http_msg_hdr_val2;
+			ptr += sizeof(long);
+		}
+#endif
+#if defined(__x86_64__) || \
+    defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || \
+    defined(__ARM_ARCH_7A__)
+		while (ptr <= end - sizeof(int)) {
+			if ((*(int*)ptr - 0x0e0e0e0e) & 0x80808080)
+				goto http_msg_hdr_val2;
+			ptr += sizeof(int);
+		}
+#endif
+		if (ptr >= end) {
+			state = HTTP_MSG_HDR_VAL;
+			goto http_msg_ood;
+		}
+	http_msg_hdr_val2:
+		if (likely(!HTTP_IS_CRLF(*ptr)))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_val2, http_msg_ood, state, HTTP_MSG_HDR_VAL);
+
+		eol = ptr - start;
+		/* Note: we could also copy eol into ->eoh so that we have the
+		 * real header end in case it ends with lots of LWS, but is this
+		 * really needed ?
+		 */
+		if (likely(*ptr == '\r'))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l2_lf, http_msg_ood, state, HTTP_MSG_HDR_L2_LF);
+		goto http_msg_hdr_l2_lf;
+
+	case HTTP_MSG_HDR_L2_LF:
+	http_msg_hdr_l2_lf:
+		EXPECT_LF_HERE(ptr, http_msg_invalid, state, HTTP_MSG_HDR_L2_LF);
+		EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l2_lws, http_msg_ood, state, HTTP_MSG_HDR_L2_LWS);
+
+	case HTTP_MSG_HDR_L2_LWS:
+	http_msg_hdr_l2_lws:
+		if (unlikely(HTTP_IS_SPHT(*ptr))) {
+			/* LWS: replace HT,CR,LF with spaces */
+			for (; start + eol < ptr; eol++)
+				start[eol] = ' ';
+			goto http_msg_hdr_val;
+		}
+	http_msg_complete_header:
+		/*
+		 * It was a new header, so the last one is finished. Assumes
+		 * <sol> points to the first char of the name, <col> to the
+		 * colon, <sov> points to the first character of the value and
+		 * <eol> to the first CR or LF so we know how the line ends. We
+		 * will trim spaces around the value. It's possible to do it by
+		 * adjusting <eol> and <sov> which are no more used after this.
+		 * We can add the header field to the list.
+		 */
+		while (sov < eol && HTTP_IS_LWS(start[sov]))
+			sov++;
+
+		while (eol - 1 > sov && HTTP_IS_LWS(start[eol - 1]))
+			eol--;
+
+
+		n = ist2(start + sol, col - sol);
+		v = ist2(start + sov, eol - sov);
+
+		if (unlikely(hdr_count >= hdr_num)) {
+			state = HTTP_MSG_HDR_L2_LWS;
+			goto http_output_full;
+		}
+		http_set_hdr(&hdr[hdr_count++], n, v);
+
+		if (h1m) {
+			long long cl;
+
+			if (isteq(n, ist("transfer-encoding"))) {
+				h1m->flags &= ~H1_MF_CLEN;
+				h1m->flags |= H1_MF_CHNK;
+			}
+			else if (isteq(n, ist("content-length")) && !(h1m->flags & H1_MF_CHNK)) {
+				h1m->flags |= H1_MF_CLEN;
+				strl2llrc(v.ptr, v.len, &cl);
+				h1m->curr_len = h1m->body_len = cl;
+			}
+		}
+
+		sol = ptr - start;
+		if (likely(!HTTP_IS_CRLF(*ptr)))
+			goto http_msg_hdr_name;
+
+		if (likely(*ptr == '\r'))
+			EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_last_lf, http_msg_ood, state, HTTP_MSG_LAST_LF);
+		goto http_msg_last_lf;
+
+	case HTTP_MSG_LAST_LF:
+	http_msg_last_lf:
+		EXPECT_LF_HERE(ptr, http_msg_invalid, state, HTTP_MSG_LAST_LF);
+		ptr++;
+		/* <ptr> now points to the first byte of payload. If needed sol
+		 * still points to the first of either CR or LF of the empty
+		 * line ending the headers block.
+		 */
+		if (unlikely(hdr_count >= hdr_num)) {
+			state = HTTP_MSG_LAST_LF;
+			goto http_output_full;
+		}
+		http_set_hdr(&hdr[hdr_count++], ist(""), ist(""));
+		state = HTTP_MSG_BODY;
+		break;
+
+	default:
+		/* impossible states */
+		goto http_msg_invalid;
+	}
+
+	/* reaching here, we've parsed the whole message and the state is
+	 * HTTP_MSG_BODY.
+	 */
+	return ptr - start + skip;
+
+ http_msg_ood:
+	/* out of data at <ptr> during state <state> */
+	return 0;
+
+ http_msg_invalid:
+	/* invalid message, error at <ptr> */
+	if (h1m) {
+		h1m->err_state = state;
+		h1m->err_pos = ptr - start + skip;
+	}
+	return -1;
+
+ http_output_full:
+	/* no more room to store the current header, error at <ptr> */
+	if (h1m) {
+		h1m->err_state = state;
+		h1m->err_pos = ptr - start + skip;
+	}
+	return -2;
+}
+
 /* This function skips trailers in the buffer associated with HTTP message
  * <msg>. The first visited position is msg->next. If the end of the trailers is
  * found, the function returns >0. So, the caller can automatically schedul it
commit	794f9af8949d0d9edfdfe2453167325e75cac2e4	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Wed Jul 26 09:07:47 2017 +0200
committer	Willy Tarreau <w@1wt.eu>	Sun Oct 22 09:54:15 2017 +0200
tree	197df7c272157d9904db9c30843d2f627c57c854
parent	306924ecb828a01edf1e8012deb02299284d3cb8 [diff]