MEDIUM: samples: add a regsub converter to perform regex-based transformations We can now replace matching regex parts with a string, a la sed. Note that there are at least 3 different behaviours for existing sed implementations when matching 0-length strings. Here is the result of the following operation on each implementation tested : echo 'xzxyz' | sed -e 's/x*y*/A/g' GNU sed 4.2.1 => AzAzA Perl's sed 5.16.1 => AAzAAzA Busybox v1.11.2 sed => AzAz The psed behaviour was adopted because it causes the fewest exceptions in the code and seems logical from a certain perspective : - "x" matches x*y* => add "A" and skip "x" - "z" matches x*y* => add "A" and keep "z", not part of the match - "xy" matches x*y* => add "A" and skip "xy" - "z" matches x*y* => add "A" and keep "z", not part of the match - "" matches x*y* => add "A" and stop here Anyway, given the incompatibilities between implementations, it's unlikely that some processing will rely on this behaviour. There currently is one big limitation : the configuration parser makes it impossible to pass commas or closing parentheses (or even closing brackets in log formats). But that's still quite usable to replace certain characters or character sequences. It will become more complete once the config parser is reworked.

commit: 7eda849dce7b93cda5ccf703edaed59b5e28b8bd [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Tue Jan 20 19:47:06 2015 +0100
committer: Willy Tarreau <w@1wt.eu> Thu Jan 22 14:24:53 2015 +0100
tree: 605a64789378fdd19518d7a184b9c33414c2c184
parent: 15a53a43846e25c99e37f210ec84349d3ea1c64d [diff]
diff --git a/doc/configuration.txt b/doc/configuration.txt
index e899297..380fb83 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt

@@ -10328,6 +10328,28 @@
       |       `---------------------------- key
       `------------------------------------ leading spaces ignored
 
+regsub(<regex>,<subst>[,<flags>])
+  Applies a regex-based substitution to the input string. It does the same
+  operation as the well-known "sed" utility with "s/<regex>/<subst>/". By
+  default it will replace in the input string the first occurrence of the
+  largest part matching the regular expression <regex> with the substitution
+  string <subst>. It is possible to replace all occurrences instead by adding
+  the flag "g" in the third argument <flags>. It is also possible to make the
+  regex case insensitive by adding the flag "i" in <flags>. Since <flags> is a
+  string, it is made up from the concatenation of all desired flags. Thus if
+  both "i" and "g" are desired, using "gi" or "ig" will have the same effect.
+  It is important to note that due to the current limitations of the
+  configuration parser, some characters such as closing parentheses or commas
+  cannot be used in the arguments. The first use of this converter is
+  to replace certain characters or sequences of characters with other ones.
+
+  Example :
+
+     # de-duplicate "/" in header "x-path".
+     # input:  x-path: /////a///b/c/xzxyz/
+     # output: x-path: /a/b/c/xzxyz/
+     http-request set-header x-path %[hdr(x-path),regsub(/+,/,g)]
+
 sdbm([<avalanche>])
   Hashes a binary input sample into an unsigned 32-bit quantity using the SDBM
   hash function. Optionally, it is possible to apply a full avalanche hash

diff --git a/src/sample.c b/src/sample.c
index 8fe341f..598bb53 100644
--- a/src/sample.c
+++ b/src/sample.c

@@ -1782,6 +1782,112 @@
 	return 1;
 }
 
+static int sample_conv_regsub_check(struct arg *args, struct sample_conv *conv,
+                                    const char *file, int line, char **err)
+{
+	struct arg *arg = args;
+	char *p;
+	int len;
+
+	/* arg0 is the regex: its type_flags store the ICASE/GLOB bits parsed from arg2 */
+	arg[0].type_flags = 0;
+
+	if (arg[2].type != ARGT_STR)
+		return 1;
+
+	p = arg[2].data.str.str;
+	len = arg[2].data.str.len;
+	while (len) {
+		if (*p == 'i') {
+			arg[0].type_flags |= ARGF_REG_ICASE;
+		}
+		else if (*p == 'g') {
+			arg[0].type_flags |= ARGF_REG_GLOB;
+		}
+		else {
+			memprintf(err, "invalid regex flag '%c', only 'i' and 'g' are supported", *p);
+			return 0;
+		}
+		p++;
+		len--;
+	}
+	return 1;
+}
+
+/* This sample function is designed to do the equivalent of s/match/replace/ on
+ * the input string. The regex (first arg) is applied once, or again after each
+ * match when ARGF_REG_GLOB is set. The second arg is the replacement string,
+ * copied verbatim. Output goes to a trash chunk and is truncated when full.
+ */
+static int sample_conv_regsub(const struct arg *arg_p, struct sample *smp)
+{
+	char *start, *end;
+	struct my_regex *reg = arg_p[0].data.reg;
+	regmatch_t pmatch[MAX_MATCH];
+	struct chunk *trash = get_trash_chunk();
+	int flag, max;
+	int found;
+
+	start = smp->data.str.str;
+	end = start + smp->data.str.len;
+
+	flag = 0;
+	while (1) {
+		/* without the global flag, only the first round (REG_NOTBOL unset)
+		 * runs the regex; the following one just copies the remaining part.
+		 */
+		found = 0;
+		if ((arg_p[0].type_flags & ARGF_REG_GLOB) || !(flag & REG_NOTBOL)) {
+			/* Note: we can have start == end on empty strings or at the end */
+			found = regex_exec_match2(reg, start, end - start, MAX_MATCH, pmatch, flag);
+		}
+
+		if (!found)
+			pmatch[0].rm_so = end - start;
+
+		/* copy the heading non-matching part (the whole tail if nothing matches), clamped to the room left */
+		max = trash->size - trash->len;
+		if (max && pmatch[0].rm_so > 0) {
+			if (max > pmatch[0].rm_so)
+				max = pmatch[0].rm_so;
+			memcpy(trash->str + trash->len, start, max);
+			trash->len += max;
+		}
+
+		if (!found)
+			break;
+
+		/* append the replacement string instead of the matching part, clamped to the room left */
+		max = trash->size - trash->len;
+		if (max) {
+			if (max > arg_p[1].data.str.len)
+				max = arg_p[1].data.str.len;
+			memcpy(trash->str + trash->len, arg_p[1].data.str.str, max);
+			trash->len += max;
+		}
+
+		/* stop here if we're done with this string */
+		if (start >= end)
+			break;
+
+		/* Special case for matches of length 0 (eg: "x*y*"), seen as a
+		 * null end offset: they are considered to match in front of a
+		 * character, so we copy that character and skip to the next one.
+		 */
+		if (!pmatch[0].rm_eo) {
+			if (trash->len < trash->size)
+				trash->str[trash->len++] = start[pmatch[0].rm_eo];
+			pmatch[0].rm_eo++;
+		}
+
+		start += pmatch[0].rm_eo;
+		flag |= REG_NOTBOL;
+	}
+
+	smp->data.str = *trash;
+	return 1;
+}
+
 /************************************************************************/
 /*       All supported sample fetch functions must be declared here     */
 /************************************************************************/
@@ -1927,6 +2033,7 @@
 	{ "bytes",  sample_conv_bytes,     ARG2(1,UINT,UINT), NULL, SMP_T_BIN,  SMP_T_BIN },
 	{ "field",  sample_conv_field,     ARG2(2,UINT,STR), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
 	{ "word",   sample_conv_word,      ARG2(2,UINT,STR), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
+	{ "regsub", sample_conv_regsub,    ARG3(2,REG,STR,STR), sample_conv_regsub_check, SMP_T_STR, SMP_T_STR },
 	{ NULL, NULL, 0, 0, 0 },
 }};
commit	7eda849dce7b93cda5ccf703edaed59b5e28b8bd	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Tue Jan 20 19:47:06 2015 +0100
committer	Willy Tarreau <w@1wt.eu>	Thu Jan 22 14:24:53 2015 +0100
tree	605a64789378fdd19518d7a184b9c33414c2c184
parent	15a53a43846e25c99e37f210ec84349d3ea1c64d [diff]