BUG/MINOR: http: make url_decode() optionally convert '+' to SP

The url_decode() function used by the url_dec converter and a few other call points is ambiguous on its processing of the '+' character which itself isn't stable in the spec. This one belongs to the reserved characters for the query string but not for the path nor the scheme, in which it must be left as-is. It's only in argument strings that follow the application/x-www-form-urlencoded encoding that it must be turned into a space, that is, in query strings and POST arguments.

The problem is that the function is used to process full URLs and paths in various configs, and to process query strings from the stats page for example.

This patch updates the function to differentiate between the situation where it's parsing a path and the one where it's parsing a query string. A new argument indicates if a query string should be assumed, otherwise it's only assumed after seeing a question mark.

The various locations in the code making use of this function were updated to take care of this (most call places were using it to decode POST arguments).

The url_dec converter is usually called on path or url samples, so it needs to remain compatible with this and will default to parsing a path and turning the '+' to a space only after a question mark. However in situations where it would explicitly be extracted from a POST or a query string, it now becomes possible to enforce the decoding by passing a non-null value in argument.

It seems to be what was reported in issue #585. This fix may be backported to older stable releases.

commit: 62ba9ba6ca259f45f7bd8de436b743b3ad9ac04a [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Thu Apr 23 17:54:47 2020 +0200
committer: Willy Tarreau <w@1wt.eu> Thu Apr 23 20:03:27 2020 +0200
tree: c3f6c28b5baf9fc4f0a5e70da06ad8bfb7be6dbe
parent: bf5b49189561c74c26ee58ec79e2b859b1c4f1c1 [diff]
diff --git a/contrib/prometheus-exporter/service-prometheus.c b/contrib/prometheus-exporter/service-prometheus.c
index 8ffeb82..54770ef 100644
--- a/contrib/prometheus-exporter/service-prometheus.c
+++ b/contrib/prometheus-exporter/service-prometheus.c

@@ -2292,7 +2292,7 @@
 			*(p++) = 0;
 		else if (*p == '#')
 			*p = 0;
-		len = url_decode(key);
+		len = url_decode(key, 1);
 		if (len == -1)
 			goto error;
 
@@ -2306,7 +2306,7 @@
 				*(p++) = 0;
 			else if (*p == '#')
 				*p = 0;
-			len = url_decode(value);
+			len = url_decode(value, 1);
 			if (len == -1)
 				goto error;
 		}

diff --git a/doc/configuration.txt b/doc/configuration.txt
index f3e6aa1..2dbb893 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt

@@ -14540,9 +14540,13 @@
   sample fetch function or after a transformation keyword returning a string
   type. The result is of type string.
 
-url_dec
-  Takes an url-encoded string provided as input and returns the decoded
-  version as output. The input and the output are of type string.
+url_dec([<in_form>])
+  Takes an url-encoded string provided as input and returns the decoded version
+  as output. The input and the output are of type string. If the <in_form>
+  argument is set to a non-zero integer value, the input string is assumed to
+  be part of a form or query string and the '+' character will be turned into a
+  space (' '). Otherwise this will only happen after a question mark indicating
+  a query string ('?').
 
 ungrpc(<field_number>,[<field_type>])
   This extracts the protocol buffers message field in raw mode of an input binary

diff --git a/include/common/standard.h b/include/common/standard.h
index 81d6093..a6c3910 100644
--- a/include/common/standard.h
+++ b/include/common/standard.h

@@ -609,8 +609,12 @@
  * be shorter. If some forbidden characters are found, the conversion is
  * aborted, the string is truncated before the issue and non-zero is returned,
  * otherwise the operation returns non-zero indicating success.
+ * If the 'in_form' argument is non-zero, the string is assumed to be part of
+ * an "application/x-www-form-urlencoded" encoded string, and the '+' will be
+ * turned to a space. If it's zero, this will only be done after a question
+ * mark ('?').
  */
-int url_decode(char *string);
+int url_decode(char *string, int in_form);
 
 /* This one is 6 times faster than strtoul() on athlon, but does
  * no check at all.

diff --git a/src/http_conv.c b/src/http_conv.c
index cd93aa9..f496c56 100644
--- a/src/http_conv.c
+++ b/src/http_conv.c

@@ -246,6 +246,7 @@
 /* This fetch url-decode any input string. */
 static int sample_conv_url_dec(const struct arg *args, struct sample *smp, void *private)
 {
+	int in_form = 0;
 	int len;
 
 	/* If the constant flag is set or if not size is available at
@@ -262,7 +263,11 @@
 
 	/* Add final \0 required by url_decode(), and convert the input string. */
 	smp->data.u.str.area[smp->data.u.str.data] = '\0';
-	len = url_decode(smp->data.u.str.area);
+
+	if (args && (args[0].type == ARGT_SINT))
+		in_form = !!args[0].data.sint;
+
+	len = url_decode(smp->data.u.str.area, in_form);
 	if (len < 0)
 		return 0;
 	smp->data.u.str.data = len;
@@ -361,7 +366,7 @@
 	{ "language",       sample_conv_q_preferred,  ARG2(1,STR,STR),  NULL,   SMP_T_STR,  SMP_T_STR},
 	{ "capture-req",    smp_conv_req_capture,     ARG1(1,SINT),     NULL,   SMP_T_STR,  SMP_T_STR},
 	{ "capture-res",    smp_conv_res_capture,     ARG1(1,SINT),     NULL,   SMP_T_STR,  SMP_T_STR},
-	{ "url_dec",        sample_conv_url_dec,      0,                NULL,   SMP_T_STR,  SMP_T_STR},
+	{ "url_dec",        sample_conv_url_dec,      ARG1(0,SINT),     NULL,   SMP_T_STR,  SMP_T_STR},
 	{ NULL, NULL, 0, 0, 0 },
 }};
 

diff --git a/src/mux_fcgi.c b/src/mux_fcgi.c
index 2f712f3..63be2d6 100644
--- a/src/mux_fcgi.c
+++ b/src/mux_fcgi.c

@@ -1306,7 +1306,7 @@
 		chunk_memcat(params->p, path.ptr, path.len);
 		path.ptr = b_tail(params->p) - path.len;
 		path.ptr[path.len] = '\0';
-		len = url_decode(path.ptr);
+		len = url_decode(path.ptr, 0);
 		if (len < 0)
 			goto error;
 		path.len = len;

diff --git a/src/standard.c b/src/standard.c
index e3e81ba..cf27248 100644
--- a/src/standard.c
+++ b/src/standard.c

@@ -1749,8 +1749,12 @@
  * be shorter. If some forbidden characters are found, the conversion is
  * aborted, the string is truncated before the issue and a negative value is
  * returned, otherwise the operation returns the length of the decoded string.
+ * If the 'in_form' argument is non-zero, the string is assumed to be part of
+ * an "application/x-www-form-urlencoded" encoded string, and the '+' will be
+ * turned to a space. If it's zero, this will only be done after a question
+ * mark ('?').
  */
-int url_decode(char *string)
+int url_decode(char *string, int in_form)
 {
 	char *in, *out;
 	int ret = -1;
@@ -1760,7 +1764,7 @@
 	while (*in) {
 		switch (*in) {
 		case '+' :
-			*out++ = ' ';
+			*out++ = in_form ? ' ' : *in;
 			break;
 		case '%' :
 			if (!ishex(in[1]) || !ishex(in[2]))
@@ -1768,6 +1772,9 @@
 			*out++ = (hex2i(in[1]) << 4) + hex2i(in[2]);
 			in += 2;
 			break;
+		case '?':
+			in_form = 1;
+			/* fall through */
 		default:
 			*out++ = *in;
 			break;

diff --git a/src/stats.c b/src/stats.c
index 46475cc..f76fd37 100644
--- a/src/stats.c
+++ b/src/stats.c

@@ -2902,7 +2902,7 @@
 				/* Ok, a value is found, we can mark the end of the key */
 				*value++ = '\0';
 			}
-			if (url_decode(key) < 0 || url_decode(value) < 0)
+			if (url_decode(key, 1) < 0 || url_decode(value, 1) < 0)
 				break;
 
 			/* Now we can check the key to see what to do */
commit	62ba9ba6ca259f45f7bd8de436b743b3ad9ac04a	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Thu Apr 23 17:54:47 2020 +0200
committer	Willy Tarreau <w@1wt.eu>	Thu Apr 23 20:03:27 2020 +0200
tree	c3f6c28b5baf9fc4f0a5e70da06ad8bfb7be6dbe
parent	bf5b49189561c74c26ee58ec79e2b859b1c4f1c1 [diff]