MINOR: uri_normalizer: Add a `percent-decode-unreserved` normalizer

This normalizer decodes percent-encoded characters within the RFC 3986 unreserved set. See GitHub Issue #714.

commit: 2e4a18e04a866bbe0e12745d6475d85243eafe5a [log] [tgz]
author: Tim Duesterhus <tim@bastelstu.be> Wed Apr 21 21:20:36 2021 +0200
committer: Christopher Faulet <cfaulet@haproxy.com> Fri Apr 23 19:43:45 2021 +0200
tree: db7e2d55e45c85e1a1e34c40dcc5b7f7ce1ec1d7
parent: d6d33deaea973d36827cce12f49dfc8a38179dd0 [diff]
diff --git a/src/http_act.c b/src/http_act.c
index df2bbe4..58e1bb8 100644
--- a/src/http_act.c
+++ b/src/http_act.c

@@ -302,6 +302,24 @@
 
 			break;
 		}
+		case ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED:
+		case ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT: {
+			const struct ist path = http_get_path(uri);
+			struct ist newpath = ist2(replace->area, replace->size);
+
+			if (!isttest(path))
+				goto leave;
+
+			err = uri_normalizer_percent_decode_unreserved(path, rule->action == ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT, &newpath);
+
+			if (err != URI_NORMALIZER_ERR_NONE)
+				break;
+
+			if (!http_replace_req_path(htx, newpath, 1))
+				goto fail_rewrite;
+
+			break;
+		}
 	}
 
 	switch (err) {
@@ -407,6 +425,21 @@
 			return ACT_RET_PRS_ERR;
 		}
 	}
+	else if (strcmp(args[cur_arg], "percent-decode-unreserved") == 0) {
+		cur_arg++;
+
+		if (strcmp(args[cur_arg], "strict") == 0) {
+			cur_arg++;
+			rule->action = ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT;
+		}
+		else if (!*args[cur_arg]) {
+			rule->action = ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED;
+		}
+		else if (strcmp(args[cur_arg], "if") != 0 && strcmp(args[cur_arg], "unless") != 0) {
+			memprintf(err, "unknown argument '%s' for 'percent-decode-unreserved' normalizer", args[cur_arg]);
+			return ACT_RET_PRS_ERR;
+		}
+	}
 	else {
 		memprintf(err, "unknown normalizer '%s'", args[cur_arg]);
 		return ACT_RET_PRS_ERR;

diff --git a/src/uri_normalizer.c b/src/uri_normalizer.c
index 8d95936..4fd783d 100644
--- a/src/uri_normalizer.c
+++ b/src/uri_normalizer.c

@@ -18,6 +18,101 @@
 #include <haproxy/tools.h>
 #include <haproxy/uri_normalizer.h>
 
+/* Returns 1 if the given character is part of the 'unreserved' set in the
+ * RFC 3986 ABNF, i.e. an ASCII letter, an ASCII digit, or one of the four
+ * marks '-', '.', '_' and '~'.
+ * Returns 0 if not.
+ *
+ * <c> is the octet to classify. Every value that is not explicitly listed
+ * below falls through to the default branch and yields 0.
+ */
+static int is_unreserved_character(unsigned char c)
+{
+	switch (c) {
+	case 'A'...'Z': /* ALPHA, upper case (case ranges are a GNU C extension) */
+	case 'a'...'z': /* ALPHA, lower case */
+	case '0'...'9': /* DIGIT */
+	case '-': /* the four remaining members of the set follow */
+	case '.':
+	case '_':
+	case '~':
+		return 1;
+	default: /* anything else is not unreserved */
+		return 0;
+	}
+}
+
+/* Decodes percent encoded characters that are part of the 'unreserved' set.
+ *
+ * RFC 3986, section 2.3:
+ * >  URIs that differ in the replacement of an unreserved character with
+ * >  its corresponding percent-encoded US-ASCII octet are equivalent [...]
+ * >  when found in a URI, should be decoded to their corresponding unreserved
+ * >  characters by URI normalizers.
+ *
+ * If `strict` is set to 0 then percent characters that are not followed by
+ * two hexadecimal digits are returned as-is without performing any decoding.
+ * If `strict` is set to 1 then `URI_NORMALIZER_ERR_INVALID_INPUT` is returned
+ * for invalid sequences.
+ *
+ * Valid escapes whose decoded octet is NOT in the unreserved set are copied
+ * to the output unchanged, using the very same three input characters.
+ *
+ * `input` is the text to normalize and is only read. `dst` designates the
+ * destination; the value obtained from istclear(dst) is used as the available
+ * capacity and must be at least istlen(input).
+ * NOTE(review): this presumes istclear() resets the length of `dst` and
+ * returns its previous length - confirm against ist.h.
+ *
+ * Returns `URI_NORMALIZER_ERR_NONE` on success, in which case `*dst` is set to
+ * the normalized output. Returns `URI_NORMALIZER_ERR_ALLOC` if the capacity is
+ * smaller than the input length, or `URI_NORMALIZER_ERR_INVALID_INPUT` as
+ * described above. On both error paths the final assignment to `*dst` is
+ * skipped.
+ */
+enum uri_normalizer_err uri_normalizer_percent_decode_unreserved(const struct ist input, int strict, struct ist *dst)
+{
+	enum uri_normalizer_err err;
+
+	const size_t size = istclear(dst); /* capacity offered by the caller */
+	struct ist output = *dst; /* local copy, grown one character at a time */
+
+	struct ist scanner = input; /* read cursor, consumed from the front */
+
+	/* The output will either be shortened or have the same length. Checking
+	 * the capacity once against the input length therefore covers every
+	 * append performed below.
+	 */
+	if (size < istlen(input)) {
+		err = URI_NORMALIZER_ERR_ALLOC;
+		goto fail;
+	}
+
+	while (istlen(scanner)) {
+		const char current = istshift(&scanner); /* take the next input character */
+
+		if (current == '%') {
+			if (istlen(scanner) >= 2) { /* a complete escape needs two more characters */
+				if (ishex(istptr(scanner)[0]) && ishex(istptr(scanner)[1])) {
+					char hex1, hex2, c;
+
+					hex1 = istshift(&scanner);
+					hex2 = istshift(&scanner);
+					c = (hex2i(hex1) << 4) + hex2i(hex2); /* high nibble, then low nibble */
+
+					if (is_unreserved_character(c)) {
+						output = __istappend(output, c); /* emit the decoded character */
+					}
+					else {
+						/* not unreserved: keep the escape exactly as it was written */
+						output = __istappend(output, current);
+						output = __istappend(output, hex1);
+						output = __istappend(output, hex2);
+					}
+
+					continue; /* escape fully handled */
+				}
+			}
+
+			/* reached only for a '%' that is not followed by two hex digits */
+			if (strict) {
+				err = URI_NORMALIZER_ERR_INVALID_INPUT;
+				goto fail;
+			}
+			else {
+				output = __istappend(output, current); /* lenient mode: copy the lone '%' */
+			}
+		}
+		else {
+			output = __istappend(output, current); /* ordinary character, copied as-is */
+		}
+	}
+
+	*dst = output; /* publish the result only on success */
+
+	return URI_NORMALIZER_ERR_NONE;
+
+  fail:
+
+	return err;
+}
+
 /* Uppercases letters used in percent encoding.
  *
  * If `strict` is set to 0 then percent characters that are not followed by a
commit	2e4a18e04a866bbe0e12745d6475d85243eafe5a	[log] [tgz]
author	Tim Duesterhus <tim@bastelstu.be>	Wed Apr 21 21:20:36 2021 +0200
committer	Christopher Faulet <cfaulet@haproxy.com>	Fri Apr 23 19:43:45 2021 +0200
tree	db7e2d55e45c85e1a1e34c40dcc5b7f7ce1ec1d7
parent	d6d33deaea973d36827cce12f49dfc8a38179dd0 [diff]