OPTIM: http: move all http character class tables into a single one We used to have 7 different character classes, each of which was 256 bytes long, resulting in almost 2kB being used in the L1 cache. It's as cheap to test a bit as it is to check that the byte is not null, so let's store a 7-bit composite value and check for the respective bits there instead. The executable is now 4 kB smaller and the performance on small objects increased by about 1% to 222k requests/second with a config involving 4 http-request rules including 1 header lookup, one header replacement, and 2 variable assignments.

commit: 2235b261b6163cf20325e519c13c3f93f16b4d3a [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Sat Nov 05 15:50:20 2016 +0100
committer: Willy Tarreau <w@1wt.eu> Sat Nov 05 15:58:08 2016 +0100
tree: e09e62f60f45b2f9f3789f67ca073e7360699c40
parent: dc3a9e830c3f91774f67693cf766f293dc673ec9 [diff]
diff --git a/src/flt_http_comp.c b/src/flt_http_comp.c
index 249ccdf..f1e7397 100644
--- a/src/flt_http_comp.c
+++ b/src/flt_http_comp.c

@@ -343,12 +343,12 @@
 
 			/* try to isolate the token from the optional q-value */
 			toklen = 0;
-			while (toklen < ctx.vlen && http_is_token[(unsigned char)*(ctx.line + ctx.val + toklen)])
+			while (toklen < ctx.vlen && HTTP_IS_TOKEN(*(ctx.line + ctx.val + toklen)))
 				toklen++;
 
 			qval = ctx.line + ctx.val + toklen;
 			while (1) {
-				while (qval < ctx.line + ctx.val + ctx.vlen && http_is_lws[(unsigned char)*qval])
+				while (qval < ctx.line + ctx.val + ctx.vlen && HTTP_IS_LWS(*qval))
 					qval++;
 
 				if (qval >= ctx.line + ctx.val + ctx.vlen || *qval != ';') {
@@ -357,7 +357,7 @@
 				}
 				qval++;
 
-				while (qval < ctx.line + ctx.val + ctx.vlen && http_is_lws[(unsigned char)*qval])
+				while (qval < ctx.line + ctx.val + ctx.vlen && HTTP_IS_LWS(*qval))
 					qval++;
 
 				if (qval >= ctx.line + ctx.val + ctx.vlen) {

diff --git a/src/proto_http.c b/src/proto_http.c
index 50e3d48..07c53ee 100644
--- a/src/proto_http.c
+++ b/src/proto_http.c

@@ -491,80 +491,144 @@
 
 /* It is about twice as fast on recent architectures to lookup a byte in a
  * table than to perform a boolean AND or OR between two tests. Refer to
- * RFC2616 for those chars.
+ * RFC2616/RFC5234/RFC7230 for those chars. A token is any ASCII char that is
+ * neither a separator nor a CTL char. An http ver_token is any ASCII which can
+ * be found in an HTTP version, which includes 'H', 'T', 'P', '/', '.' and any
+ * digit. Note: please do not overwrite values in assignment since gcc-2.95
+ * will not handle them correctly. It's worth noting that chars 128..255 are
+ * nothing, not even control chars.
  */
-
-const char http_is_spht[256] = {
-	[' '] = 1, ['\t'] = 1,
-};
-
-const char http_is_crlf[256] = {
-	['\r'] = 1, ['\n'] = 1,
-};
-
-const char http_is_lws[256] = {
-	[' '] = 1, ['\t'] = 1,
-	['\r'] = 1, ['\n'] = 1,
+const unsigned char http_char_classes[256] = {
+	[  0] = HTTP_FLG_CTL,
+	[  1] = HTTP_FLG_CTL,
+	[  2] = HTTP_FLG_CTL,
+	[  3] = HTTP_FLG_CTL,
+	[  4] = HTTP_FLG_CTL,
+	[  5] = HTTP_FLG_CTL,
+	[  6] = HTTP_FLG_CTL,
+	[  7] = HTTP_FLG_CTL,
+	[  8] = HTTP_FLG_CTL,
+	[  9] = HTTP_FLG_SPHT | HTTP_FLG_LWS | HTTP_FLG_SEP | HTTP_FLG_CTL,
+	[ 10] = HTTP_FLG_CRLF | HTTP_FLG_LWS | HTTP_FLG_CTL,
+	[ 11] = HTTP_FLG_CTL,
+	[ 12] = HTTP_FLG_CTL,
+	[ 13] = HTTP_FLG_CRLF | HTTP_FLG_LWS | HTTP_FLG_CTL,
+	[ 14] = HTTP_FLG_CTL,
+	[ 15] = HTTP_FLG_CTL,
+	[ 16] = HTTP_FLG_CTL,
+	[ 17] = HTTP_FLG_CTL,
+	[ 18] = HTTP_FLG_CTL,
+	[ 19] = HTTP_FLG_CTL,
+	[ 20] = HTTP_FLG_CTL,
+	[ 21] = HTTP_FLG_CTL,
+	[ 22] = HTTP_FLG_CTL,
+	[ 23] = HTTP_FLG_CTL,
+	[ 24] = HTTP_FLG_CTL,
+	[ 25] = HTTP_FLG_CTL,
+	[ 26] = HTTP_FLG_CTL,
+	[ 27] = HTTP_FLG_CTL,
+	[ 28] = HTTP_FLG_CTL,
+	[ 29] = HTTP_FLG_CTL,
+	[ 30] = HTTP_FLG_CTL,
+	[ 31] = HTTP_FLG_CTL,
+	[' '] = HTTP_FLG_SPHT | HTTP_FLG_LWS | HTTP_FLG_SEP,
+	['!'] = HTTP_FLG_TOK,
+	['"'] = HTTP_FLG_SEP,
+	['#'] = HTTP_FLG_TOK,
+	['$'] = HTTP_FLG_TOK,
+	['%'] = HTTP_FLG_TOK,
+	['&'] = HTTP_FLG_TOK,
+	[ 39] = HTTP_FLG_TOK,
+	['('] = HTTP_FLG_SEP,
+	[')'] = HTTP_FLG_SEP,
+	['*'] = HTTP_FLG_TOK,
+	['+'] = HTTP_FLG_TOK,
+	[','] = HTTP_FLG_SEP,
+	['-'] = HTTP_FLG_TOK,
+	['.'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['/'] = HTTP_FLG_SEP | HTTP_FLG_VER,
+	['0'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['1'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['2'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['3'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['4'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['5'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['6'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['7'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['8'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['9'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	[':'] = HTTP_FLG_SEP,
+	[';'] = HTTP_FLG_SEP,
+	['<'] = HTTP_FLG_SEP,
+	['='] = HTTP_FLG_SEP,
+	['>'] = HTTP_FLG_SEP,
+	['?'] = HTTP_FLG_SEP,
+	['@'] = HTTP_FLG_SEP,
+	['A'] = HTTP_FLG_TOK,
+	['B'] = HTTP_FLG_TOK,
+	['C'] = HTTP_FLG_TOK,
+	['D'] = HTTP_FLG_TOK,
+	['E'] = HTTP_FLG_TOK,
+	['F'] = HTTP_FLG_TOK,
+	['G'] = HTTP_FLG_TOK,
+	['H'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['I'] = HTTP_FLG_TOK,
+	['J'] = HTTP_FLG_TOK,
+	['K'] = HTTP_FLG_TOK,
+	['L'] = HTTP_FLG_TOK,
+	['M'] = HTTP_FLG_TOK,
+	['N'] = HTTP_FLG_TOK,
+	['O'] = HTTP_FLG_TOK,
+	['P'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['Q'] = HTTP_FLG_TOK,
+	['R'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['S'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['T'] = HTTP_FLG_TOK | HTTP_FLG_VER,
+	['U'] = HTTP_FLG_TOK,
+	['V'] = HTTP_FLG_TOK,
+	['W'] = HTTP_FLG_TOK,
+	['X'] = HTTP_FLG_TOK,
+	['Y'] = HTTP_FLG_TOK,
+	['Z'] = HTTP_FLG_TOK,
+	['['] = HTTP_FLG_SEP,
+	[ 92] = HTTP_FLG_SEP,
+	[']'] = HTTP_FLG_SEP,
+	['^'] = HTTP_FLG_TOK,
+	['_'] = HTTP_FLG_TOK,
+	['`'] = HTTP_FLG_TOK,
+	['a'] = HTTP_FLG_TOK,
+	['b'] = HTTP_FLG_TOK,
+	['c'] = HTTP_FLG_TOK,
+	['d'] = HTTP_FLG_TOK,
+	['e'] = HTTP_FLG_TOK,
+	['f'] = HTTP_FLG_TOK,
+	['g'] = HTTP_FLG_TOK,
+	['h'] = HTTP_FLG_TOK,
+	['i'] = HTTP_FLG_TOK,
+	['j'] = HTTP_FLG_TOK,
+	['k'] = HTTP_FLG_TOK,
+	['l'] = HTTP_FLG_TOK,
+	['m'] = HTTP_FLG_TOK,
+	['n'] = HTTP_FLG_TOK,
+	['o'] = HTTP_FLG_TOK,
+	['p'] = HTTP_FLG_TOK,
+	['q'] = HTTP_FLG_TOK,
+	['r'] = HTTP_FLG_TOK,
+	['s'] = HTTP_FLG_TOK,
+	['t'] = HTTP_FLG_TOK,
+	['u'] = HTTP_FLG_TOK,
+	['v'] = HTTP_FLG_TOK,
+	['w'] = HTTP_FLG_TOK,
+	['x'] = HTTP_FLG_TOK,
+	['y'] = HTTP_FLG_TOK,
+	['z'] = HTTP_FLG_TOK,
+	['{'] = HTTP_FLG_SEP,
+	['|'] = HTTP_FLG_TOK,
+	['}'] = HTTP_FLG_SEP,
+	['~'] = HTTP_FLG_TOK,
+	[127] = HTTP_FLG_CTL,
 };
 
-const char http_is_sep[256] = {
-	['('] = 1, [')']  = 1, ['<']  = 1, ['>'] = 1,
-	['@'] = 1, [',']  = 1, [';']  = 1, [':'] = 1,
-	['"'] = 1, ['/']  = 1, ['[']  = 1, [']'] = 1,
-	['{'] = 1, ['}']  = 1, ['?']  = 1, ['='] = 1,
-	[' '] = 1, ['\t'] = 1, ['\\'] = 1,
-};
-
-const char http_is_ctl[256] = {
-	[0 ... 31] = 1,
-	[127] = 1,
-};
-
-/*
- * A token is any ASCII char that is neither a separator nor a CTL char.
- * Do not overwrite values in assignment since gcc-2.95 will not handle
- * them correctly. Instead, define every non-CTL char's status.
- */
-const char http_is_token[256] = {
-	[' '] = 0, ['!'] = 1, ['"'] = 0, ['#'] = 1,
-	['$'] = 1, ['%'] = 1, ['&'] = 1, ['\''] = 1,
-	['('] = 0, [')'] = 0, ['*'] = 1, ['+'] = 1,
-	[','] = 0, ['-'] = 1, ['.'] = 1, ['/'] = 0,
-	['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1,
-	['4'] = 1, ['5'] = 1, ['6'] = 1, ['7'] = 1,
-	['8'] = 1, ['9'] = 1, [':'] = 0, [';'] = 0,
-	['<'] = 0, ['='] = 0, ['>'] = 0, ['?'] = 0,
-	['@'] = 0, ['A'] = 1, ['B'] = 1, ['C'] = 1,
-	['D'] = 1, ['E'] = 1, ['F'] = 1, ['G'] = 1,
-	['H'] = 1, ['I'] = 1, ['J'] = 1, ['K'] = 1,
-	['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1,
-	['P'] = 1, ['Q'] = 1, ['R'] = 1, ['S'] = 1,
-	['T'] = 1, ['U'] = 1, ['V'] = 1, ['W'] = 1,
-	['X'] = 1, ['Y'] = 1, ['Z'] = 1, ['['] = 0,
-	['\\'] = 0, [']'] = 0, ['^'] = 1, ['_'] = 1,
-	['`'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1,
-	['d'] = 1, ['e'] = 1, ['f'] = 1, ['g'] = 1,
-	['h'] = 1, ['i'] = 1, ['j'] = 1, ['k'] = 1,
-	['l'] = 1, ['m'] = 1, ['n'] = 1, ['o'] = 1,
-	['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1,
-	['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1,
-	['x'] = 1, ['y'] = 1, ['z'] = 1, ['{'] = 0,
-	['|'] = 1, ['}'] = 0, ['~'] = 1, 
-};
-
-
-/*
- * An http ver_token is any ASCII which can be found in an HTTP version,
- * which includes 'H', 'T', 'P', '/', '.' and any digit.
- */
-const char http_is_ver_token[256] = {
-	['.'] = 1, ['/'] = 1,
-	['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
-	['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
-	['H'] = 1, ['P'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1,
-};
-
-
 /*
  * Adds a header and its CRLF at the tail of the message's buffer, just before
  * the last CRLF. Text length is measured first, so it cannot be NULL.
@@ -676,7 +740,7 @@
 		    (strncasecmp(sol, name, len) == 0)) {
 			ctx->del = len;
 			sov = sol + len + 1;
-			while (sov < eol && http_is_lws[(unsigned char)*sov])
+			while (sov < eol && HTTP_IS_LWS(*sov))
 				sov++;
 
 			ctx->line = sol;
@@ -684,7 +748,7 @@
 			ctx->idx  = cur_idx;
 			ctx->val  = sov - sol;
 			ctx->tws = 0;
-			while (eol > sov && http_is_lws[(unsigned char)*(eol - 1)]) {
+			while (eol > sov && HTTP_IS_LWS(*(eol - 1))) {
 				eol--;
 				ctx->tws++;
 			}
@@ -739,7 +803,7 @@
 
 		ctx->del = len;
 		sov = sol + len + 1;
-		while (sov < eol && http_is_lws[(unsigned char)*sov])
+		while (sov < eol && HTTP_IS_LWS(*sov))
 			sov++;
 
 		ctx->line = sol;
@@ -748,7 +812,7 @@
 		ctx->val  = sov - sol;
 		ctx->tws = 0;
 
-		while (eol > sov && http_is_lws[(unsigned char)*(eol - 1)]) {
+		while (eol > sov && HTTP_IS_LWS(*(eol - 1))) {
 			eol--;
 			ctx->tws++;
 		}
@@ -819,7 +883,7 @@
 		 * for later use (eg: for header deletion).
 		 */
 		sov++;
-		while (sov < eol && http_is_lws[(unsigned char)*sov])
+		while (sov < eol && HTTP_IS_LWS((*sov)))
 			sov++;
 
 		goto return_hdr;
@@ -845,7 +909,7 @@
 		    (strncasecmp(sol, name, len) == 0)) {
 			ctx->del = len;
 			sov = sol + len + 1;
-			while (sov < eol && http_is_lws[(unsigned char)*sov])
+			while (sov < eol && HTTP_IS_LWS(*sov))
 				sov++;
 
 			ctx->line = sol;
@@ -856,7 +920,7 @@
 
 			eol = find_hdr_value_end(sov, eol);
 			ctx->tws = 0;
-			while (eol > sov && http_is_lws[(unsigned char)*(eol - 1)]) {
+			while (eol > sov && HTTP_IS_LWS(*(eol - 1))) {
 				eol--;
 				ctx->tws++;
 			}
@@ -1233,7 +1297,7 @@
 			col++;
 
 		sov = col + 1;
-		while (sov < eol && http_is_lws[(unsigned char)*sov])
+		while (sov < eol && HTTP_IS_LWS(*sov))
 			sov++;
 				
 		for (h = cap_hdr; h; h = h->next) {
@@ -2136,7 +2200,7 @@
 	if (unlikely(ptr == ptr_old))
 		goto error;
 
-	while (http_is_spht[(unsigned char)*ptr]) {
+	while (HTTP_IS_SPHT(*ptr)) {
 		if (++ptr >= end)
 			ptr = buf->data;
 		if (unlikely(ptr == stop))
@@ -7350,28 +7414,28 @@
 		/* We're removing the first value, preserve the colon and add a
 		 * space if possible.
 		 */
-		if (!http_is_crlf[(unsigned char)*next])
+		if (!HTTP_IS_CRLF(*next))
 			next++;
 		prev++;
 		if (prev < next)
 			*prev++ = ' ';
 
-		while (http_is_spht[(unsigned char)*next])
+		while (HTTP_IS_SPHT(*next))
 			next++;
 	} else {
 		/* Remove useless spaces before the old delimiter. */
-		while (http_is_spht[(unsigned char)*(prev-1)])
+		while (HTTP_IS_SPHT(*(prev-1)))
 			prev--;
 		*from = prev;
 
 		/* copy the delimiter and if possible a space if we're
 		 * not at the end of the line.
 		 */
-		if (!http_is_crlf[(unsigned char)*next]) {
+		if (!HTTP_IS_CRLF(*next)) {
 			*prev++ = *next++;
 			if (prev + 1 < next)
 				*prev++ = ' ';
-			while (http_is_spht[(unsigned char)*next])
+			while (HTTP_IS_SPHT(*next))
 				next++;
 		}
 	}
@@ -7470,7 +7534,7 @@
 
 			/* find att_beg */
 			att_beg = prev + 1;
-			while (att_beg < hdr_end && http_is_spht[(unsigned char)*att_beg])
+			while (att_beg < hdr_end && HTTP_IS_SPHT(*att_beg))
 				att_beg++;
 
 			/* find att_end : this is the first character after the last non
@@ -7481,7 +7545,7 @@
 			while (equal < hdr_end) {
 				if (*equal == '=' || *equal == ',' || *equal == ';')
 					break;
-				if (http_is_spht[(unsigned char)*equal++])
+				if (HTTP_IS_SPHT(*equal++))
 					continue;
 				att_end = equal;
 			}
@@ -7494,7 +7558,7 @@
 			if (equal < hdr_end && *equal == '=') {
 				/* look for the beginning of the value */
 				val_beg = equal + 1;
-				while (val_beg < hdr_end && http_is_spht[(unsigned char)*val_beg])
+				while (val_beg < hdr_end && HTTP_IS_SPHT(*val_beg))
 					val_beg++;
 
 				/* find the end of the value, respecting quotes */
@@ -7502,7 +7566,7 @@
 
 				/* make val_end point to the first white space or delimitor after the value */
 				val_end = next;
-				while (val_end > val_beg && http_is_spht[(unsigned char)*(val_end - 1)])
+				while (val_end > val_beg && HTTP_IS_SPHT(*(val_end - 1)))
 					val_end--;
 			} else {
 				val_beg = val_end = next = equal;
@@ -8132,7 +8196,7 @@
 
 			/* find att_beg */
 			att_beg = prev + 1;
-			while (att_beg < hdr_end && http_is_spht[(unsigned char)*att_beg])
+			while (att_beg < hdr_end && HTTP_IS_SPHT(*att_beg))
 				att_beg++;
 
 			/* find att_end : this is the first character after the last non
@@ -8143,7 +8207,7 @@
 			while (equal < hdr_end) {
 				if (*equal == '=' || *equal == ';' || (is_cookie2 && *equal == ','))
 					break;
-				if (http_is_spht[(unsigned char)*equal++])
+				if (HTTP_IS_SPHT(*equal++))
 					continue;
 				att_end = equal;
 			}
@@ -8156,7 +8220,7 @@
 			if (equal < hdr_end && *equal == '=') {
 				/* look for the beginning of the value */
 				val_beg = equal + 1;
-				while (val_beg < hdr_end && http_is_spht[(unsigned char)*val_beg])
+				while (val_beg < hdr_end && HTTP_IS_SPHT(*val_beg))
 					val_beg++;
 
 				/* find the end of the value, respecting quotes */
@@ -8164,7 +8228,7 @@
 
 				/* make val_end point to the first white space or delimitor after the value */
 				val_end = next;
-				while (val_end > val_beg && http_is_spht[(unsigned char)*(val_end - 1)])
+				while (val_end > val_beg && HTTP_IS_SPHT(*(val_end - 1)))
 					val_end--;
 			} else {
 				/* <equal> points to next comma, semi-colon or EOL */
@@ -10878,7 +10942,7 @@
 	for (att_beg = hdr; att_beg + cookie_name_l + 1 < hdr_end; att_beg = next + 1) {
 		/* Iterate through all cookies on this line */
 
-		while (att_beg < hdr_end && http_is_spht[(unsigned char)*att_beg])
+		while (att_beg < hdr_end && HTTP_IS_SPHT(*att_beg))
 			att_beg++;
 
 		/* find att_end : this is the first character after the last non
@@ -10889,7 +10953,7 @@
 		while (equal < hdr_end) {
 			if (*equal == '=' || *equal == ';' || (list && *equal == ','))
 				break;
-			if (http_is_spht[(unsigned char)*equal++])
+			if (HTTP_IS_SPHT(*equal++))
 				continue;
 			att_end = equal;
 		}
@@ -10902,7 +10966,7 @@
 		if (equal < hdr_end && *equal == '=') {
 			/* look for the beginning of the value */
 			val_beg = equal + 1;
-			while (val_beg < hdr_end && http_is_spht[(unsigned char)*val_beg])
+			while (val_beg < hdr_end && HTTP_IS_SPHT(*val_beg))
 				val_beg++;
 
 			/* find the end of the value, respecting quotes */
@@ -10910,7 +10974,7 @@
 
 			/* make val_end point to the first white space or delimitor after the value */
 			val_end = next;
-			while (val_end > val_beg && http_is_spht[(unsigned char)*(val_end - 1)])
+			while (val_end > val_beg && HTTP_IS_SPHT(*(val_end - 1)))
 				val_end--;
 		} else {
 			val_beg = val_end = next = equal;

diff --git a/src/sample.c b/src/sample.c
index 77cbd1b..1438ca1 100644
--- a/src/sample.c
+++ b/src/sample.c

@@ -2570,7 +2570,7 @@
 		 * token = 1*tchar
 		 */
 		for (i = 0; i < args[0].data.str.len; i++) {
-			if (!http_is_token[(unsigned char)args[0].data.str.str[i]]) {
+			if (!HTTP_IS_TOKEN(args[0].data.str.str[i])) {
 				memprintf(err, "expects valid method.");
 				return 0;
 			}
commit	2235b261b6163cf20325e519c13c3f93f16b4d3a	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Sat Nov 05 15:50:20 2016 +0100
committer	Willy Tarreau <w@1wt.eu>	Sat Nov 05 15:58:08 2016 +0100
tree	e09e62f60f45b2f9f3789f67ca073e7360699c40
parent	dc3a9e830c3f91774f67693cf766f293dc673ec9 [diff]