OPTIM: http: improve parsing performance of long header lines

A performance test with 1kB cookies was capping at 194k req/s. After
implementing multi-byte skipping, the performance increased to 212k req/s,
or 9.2% faster. This patch implements this for architectures supporting
unaligned accesses (x86_64, x86, armv7a). Maybe other architectures can
benefit from this but they were not tested yet.
diff --git a/src/proto_http.c b/src/proto_http.c
index 07c53ee..c9f38b1 100644
--- a/src/proto_http.c
+++ b/src/proto_http.c
@@ -1922,8 +1922,31 @@
 		/* assumes msg->sol points to the first char, and msg->sov
 		 * points to the first character of the value.
 		 */
+
+		/* speedup: we'll skip packs of 4 or 8 bytes not containing bytes 0x0D
+		 * and lower. In fact since most of the time is spent in the loop, we
+		 * also remove the sign bit test so that bytes 0x8e..0x0d break the
+		 * loop, but we don't care since they're very rare in header values.
+		 */
+#if defined(__x86_64__)
+		while (ptr <= end - sizeof(long)) {
+			if ((*(long *)ptr - 0x0e0e0e0e0e0e0e0eULL) & 0x8080808080808080ULL)
+				goto http_msg_hdr_val2;
+			ptr += sizeof(long);
+		}
+#endif
+#if defined(__x86_64__) || \
+    defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || \
+    defined(__ARM_ARCH_7A__)
+		while (ptr <= end - sizeof(int)) {
+			if ((*(int*)ptr - 0x0e0e0e0e) & 0x80808080)
+				goto http_msg_hdr_val2;
+			ptr += sizeof(int);
+		}
+#endif
+	http_msg_hdr_val2:
 		if (likely(!HTTP_IS_CRLF(*ptr)))
-			EAT_AND_JUMP_OR_RETURN(http_msg_hdr_val, HTTP_MSG_HDR_VAL);
+			EAT_AND_JUMP_OR_RETURN(http_msg_hdr_val2, HTTP_MSG_HDR_VAL);
 
 		msg->eol = ptr - buf->p;
 		/* Note: we could also copy eol into ->eoh so that we have the