MEDIUM: samples: add a regsub converter to perform regex-based transformations

We can now replace matching regex parts with a string, a la sed. Note
that there are at least 3 different behaviours for existing sed
implementations when matching 0-length strings. Here is the result
of the following operation on each implementation tested :

  echo 'xzxyz' | sed -e 's/x*y*/A/g'

  GNU sed 4.2.1       => AzAzA
  Perl's sed 5.16.1   => AAzAAzA
  Busybox v1.11.2 sed => AzAz

The psed (Perl sed) behaviour was adopted because it causes the fewest exceptions
in the code and seems logical from a certain perspective :

  - "x"  matches x*y*  => add "A" and skip "x"
  - "z"  matches x*y*  => add "A" and keep "z", not part of the match
  - "xy" matches x*y*  => add "A" and skip "xy"
  - "z"  matches x*y*  => add "A" and keep "z", not part of the match
  - ""   matches x*y*  => add "A" and stop here

Anyway, given the incompatibilities between implementations, it's unlikely
that some processing will rely on this behaviour.

There currently is one big limitation : the configuration parser makes it
impossible to pass commas or closing parenthesis (or even closing brackets
in log formats). But that's still quite usable to replace certain characters
or character sequences. It will become more complete once the config parser
is reworked.
diff --git a/doc/configuration.txt b/doc/configuration.txt
index e899297..380fb83 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -10328,6 +10328,28 @@
       |       `---------------------------- key
       `------------------------------------ leading spaces ignored
 
+regsub(<regex>,<subst>[,<flags>])
+  Applies a regex-based substitution to the input string. It does the same
+  operation as the well-known "sed" utility with "s/<regex>/<subst>/". By
+  default it will replace in the input string the first occurrence of the
+  largest part matching the regular expression <regex> with the substitution
+  string <subst>. It is possible to replace all occurrences instead by adding
+  the flag "g" in the third argument <flags>. It is also possible to make the
+  regex case insensitive by adding the flag "i" in <flags>. Since <flags> is a
+  string, it is made up from the concatenation of all desired flags. Thus if
+  both "i" and "g" are desired, using "gi" or "ig" will have the same effect.
+  It is important to note that due to the current limitations of the
+  configuration parser, some characters such as closing parenthesis or comma
+  are not possible to use in the arguments. The first use of this converter is
+  to replace certain characters or sequence of characters with other ones.
+
+  Example :
+
+     # de-duplicate "/" in header "x-path".
+     # input:  x-path: /////a///b/c/xzxyz/
+     # output: x-path: /a/b/c/xzxyz/
+     http-request set-header x-path %[hdr(x-path),regsub(/+,/,g)]
+
 sdbm([<avalanche>])
   Hashes a binary input sample into an unsigned 32-bit quantity using the SDBM
   hash function. Optionally, it is possible to apply a full avalanche hash
diff --git a/src/sample.c b/src/sample.c
index 8fe341f..598bb53 100644
--- a/src/sample.c
+++ b/src/sample.c
@@ -1782,6 +1782,112 @@
 	return 1;
 }
 
+static int sample_conv_regsub_check(struct arg *args, struct sample_conv *conv,
+                                    const char *file, int line, char **err)
+{
+	struct arg *arg = args;
+	char *p;
+	int len;
+
+	/* translate the optional flags string (arg[2]) into type_flags bits on the regex (arg[0]) */
+	arg[0].type_flags = 0;
+
+	if (arg[2].type != ARGT_STR) /* third argument is not a string: no flags to parse */
+		return 1;
+
+	p = arg[2].data.str.str;
+	len = arg[2].data.str.len;
+	while (len) { /* one flag per character, accepted in any order */
+		if (*p == 'i') {
+			arg[0].type_flags |= ARGF_REG_ICASE;
+		}
+		else if (*p == 'g') {
+			arg[0].type_flags |= ARGF_REG_GLOB;
+		}
+		else {
+			memprintf(err, "invalid regex flag '%c', only 'i' and 'g' are supported", *p);
+			return 0; /* unknown flag: reject, error text was handed to memprintf() via err */
+		}
+		p++;
+		len--;
+	}
+	return 1; /* every flag character was recognized */
+}
+
+/* This sample function does the equivalent of s/match/replace/ on the input
+ * string. Arg 0 is the regex, arg 1 the replacement string. Only the first
+ * match is replaced unless ARGF_REG_GLOB is set in arg 0's type_flags. Output
+ * is built in a trash chunk and truncated once it is full. Always returns 1.
+ */
+static int sample_conv_regsub(const struct arg *arg_p, struct sample *smp)
+{
+	char *start, *end;
+	struct my_regex *reg = arg_p[0].data.reg;
+	regmatch_t pmatch[MAX_MATCH];
+	struct chunk *trash = get_trash_chunk();
+	int flag, max;
+	int found;
+
+	start = smp->data.str.str;
+	end = start + smp->data.str.len;
+
+	flag = 0; /* REG_NOTBOL gets added once a first match has been processed */
+	while (1) {
+		/* without the global flag, the regex only runs on the first round;
+		 * the next one (REG_NOTBOL set) just copies the rest of the input.
+		 */
+		found = 0;
+		if ((arg_p[0].type_flags & ARGF_REG_GLOB) || !(flag & REG_NOTBOL)) {
+			/* Note: we can have start == end on empty strings or at the end */
+			found = regex_exec_match2(reg, start, end - start, MAX_MATCH, pmatch, flag);
+		}
+
+		if (!found) /* no match: everything left counts as the non-matching head */
+			pmatch[0].rm_so = end - start;
+
+		/* copy the heading non-matching part (which may also be the tail if nothing matches) */
+		max = trash->size - trash->len; /* room left in the output chunk */
+		if (max && pmatch[0].rm_so > 0) {
+			if (max > pmatch[0].rm_so)
+				max = pmatch[0].rm_so;
+			memcpy(trash->str + trash->len, start, max);
+			trash->len += max;
+		}
+
+		if (!found)
+			break;
+
+		/* replace the matching part: arg 1 is copied verbatim, clipped to the room left */
+		max = trash->size - trash->len;
+		if (max) {
+			if (max > arg_p[1].data.str.len)
+				max = arg_p[1].data.str.len;
+			memcpy(trash->str + trash->len, arg_p[1].data.str.str, max);
+			trash->len += max;
+		}
+
+		/* stop here if we're done with this string (nothing left to skip over) */
+		if (start >= end)
+			break;
+
+		/* Matches ending at offset 0 are empty (eg: "x*y*") and sit in front of
+		 * a character: copy that character and skip it to make progress.
+		 * NOTE(review): empty matches at a later offset (rm_so == rm_eo > 0)
+		 * are not special-cased here - confirm this is intended. */
+		if (!pmatch[0].rm_eo) {
+			if (trash->len < trash->size)
+				trash->str[trash->len++] = start[pmatch[0].rm_eo];
+			pmatch[0].rm_eo++;
+		}
+
+		start += pmatch[0].rm_eo;
+		flag |= REG_NOTBOL;
+	}
+
+	smp->data.str = *trash; /* the sample's string now takes the trash chunk's fields */
+	return 1;
+}
+
 /************************************************************************/
 /*       All supported sample fetch functions must be declared here     */
 /************************************************************************/
@@ -1927,6 +2033,7 @@
 	{ "bytes",  sample_conv_bytes,     ARG2(1,UINT,UINT), NULL, SMP_T_BIN,  SMP_T_BIN },
 	{ "field",  sample_conv_field,     ARG2(2,UINT,STR), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
 	{ "word",   sample_conv_word,      ARG2(2,UINT,STR), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
+	{ "regsub", sample_conv_regsub,    ARG3(2,REG,STR,STR), sample_conv_regsub_check, SMP_T_STR, SMP_T_STR },
 	{ NULL, NULL, 0, 0, 0 },
 }};