MEDIUM: h1: deduplicate the content-length header
Just like we used to do in proto_http, we now check that each and every
occurrence of the content-length header field and each of its values are
exactly identical, and we normalize the header to return the last value
of the first header with spaces trimmed.
diff --git a/src/h1.c b/src/h1.c
index d947661..f8231c5 100644
--- a/src/h1.c
+++ b/src/h1.c
@@ -660,6 +660,78 @@
}
+/* Parse the Content-Length header field of an HTTP/1 request. The function
+ * checks all possible occurrences of a comma-delimited value, and verifies
+ * if any of them doesn't match a previous value. It returns <0 if a value
+ * is malformed, overflows or differs, 0 if the whole header can be dropped
+ * (i.e. already known), or >0 if the value can be indexed (first one). On
+ * success <value> is trimmed to the digits of the last parsed value and the
+ * caller must only add this updated value. */
+int h1_parse_cont_len_header(struct h1m *h1m, struct ist *value)
+{
+	char *e, *n;
+	unsigned long long cl; /* unsigned: the wrap checks below rely on modular arithmetic */
+	int not_first = !!(h1m->flags & H1_MF_CLEN);
+	struct ist word;
+
+	word.ptr = value->ptr - 1; // -1 for next loop's pre-increment
+	e = value->ptr + value->len;
+
+	while (++word.ptr < e) {
+		/* skip leading delimiter and blanks */
+		if (unlikely(HTTP_IS_LWS(*word.ptr)))
+			continue;
+
+		/* digits only now */
+		for (cl = 0, n = word.ptr; n < e; n++) {
+			unsigned int c = *n - '0';
+			if (unlikely(c > 9)) {
+				/* non-digit */
+				if (unlikely(n == word.ptr)) // no digit at all (e.g. empty list element)
+					goto fail;
+				break;
+			}
+			if (unlikely(cl > ULLONG_MAX / 10ULL))
+				goto fail; /* multiply overflow */
+			cl = cl * 10ULL;
+			if (unlikely(cl + c < cl))
+				goto fail; /* addition overflow */
+			cl = cl + c;
+		}
+
+		/* keep a copy of the exact cleaned value */
+		word.len = n - word.ptr;
+
+		/* skip trailing LWS till next comma or EOL */
+		for (; n < e; n++) {
+			if (!HTTP_IS_LWS(*n)) {
+				if (unlikely(*n != ','))
+					goto fail;
+				break;
+			}
+		}
+
+		/* if duplicate, must be equal */
+		if (h1m->flags & H1_MF_CLEN && cl != h1m->body_len)
+			goto fail;
+
+		/* OK, store this result as the one to be indexed */
+		h1m->flags |= H1_MF_CLEN;
+		h1m->curr_len = h1m->body_len = cl;
+		*value = word;
+		word.ptr = n; /* on the comma (or <e>); the loop's ++ steps past it */
+	}
+	/* here we've reached the end with one value or a series of identical
+	 * values, all matching previous ones if any, and the last one was sent
+	 * back into <value>. Index it if it's the first occurrence, otherwise
+	 * skip it. NOTE(review): an empty or blank-only <value> also gets here
+	 * without setting H1_MF_CLEN nor touching <value>; confirm it's wanted.
+	 */
+	return !not_first;
+ fail:
+	return -1;
+}
+
/* Parse the Transfer-Encoding: header field of an HTTP/1 request, looking for
* "chunked" being the last value, and setting H1_MF_CHNK in h1m->flags only in
* this case. Any other token found or any empty header field found will reset
@@ -1301,8 +1373,8 @@
n = ist2(start + sol, col - sol);
v = ist2(start + sov, eol - sov);
- if (likely(!skip_update)) {
- long long cl;
+ if (likely(!skip_update)) do {
+ int ret;
if (unlikely(hdr_count >= hdr_num)) {
state = H1_MSG_HDR_L2_LWS;
@@ -1312,17 +1384,24 @@
if (isteqi(n, ist("transfer-encoding"))) {
h1_parse_xfer_enc_header(h1m, v);
}
- else if (isteqi(n, ist("content-length")) && !(h1m->flags & H1_MF_CHNK)) {
- h1m->flags |= H1_MF_CLEN;
- strl2llrc(v.ptr, v.len, &cl);
- h1m->curr_len = h1m->body_len = cl;
+ else if (isteqi(n, ist("content-length"))) {
+ ret = h1_parse_cont_len_header(h1m, &v);
+
+ if (ret < 0) {
+ state = H1_MSG_HDR_L2_LWS;
+ goto http_msg_invalid;
+ }
+ else if (ret == 0) {
+ /* skip it */
+ break;
+ }
}
else if (isteqi(n, ist("connection"))) {
h1_parse_connection_header(h1m, v);
}
http_set_hdr(&hdr[hdr_count++], n, v);
- }
+ } while (0);
sol = ptr - start;
if (likely(!HTTP_IS_CRLF(*ptr)))