setexpr: add regex substring matching and substitution

Add "setexpr name gsub r s [t]" and "setexpr name sub r s [t]"
commands which implement substring matching for the regular
expression <r> in the string <t>, and substitution of the string <s>.
The result is assigned to the environment variable <name>.  If <t> is
not supplied, the previous value of <name> is used instead.  "gsub"
performs global substitution, while "sub" will replace only the first
substring.

Both commands are closely modeled after the gawk functions with the
same names.

Examples:

- Generate broadcast address by substituting the last two numbers of
  the IP address by "255.255":

  	=> print ipaddr
	ipaddr=192.168.1.104
	=> setexpr broadcast sub "(.*\\.).*\\..*" "\\1255.255" $ipaddr
	broadcast=192.168.255.255

- Depending on keyboard configuration (German vs. US keyboard) a
  barcode scanner may initialize the MAC address as C0:E5:4E:02:06:DC
  or as C0>E5>4E>02>06>DC.  Make sure we always have a correct value:

	=> print ethaddr
	ethaddr=C0>E5>4E>02>06>DC
	=> setexpr ethaddr gsub > :
	ethaddr=C0:E5:4E:02:06:DC

- Do the same, but substitute one step at a time in a loop until no
  futher matches:

	=> setenv ethaddr C0>E5>4E>02>06>DC
	=> while setexpr ethaddr sub > :
	> do
	> echo -----
	> done
	ethaddr=C0:E5>4E>02>06>DC
	-----
	ethaddr=C0:E5:4E>02>06>DC
	-----
	ethaddr=C0:E5:4E:02>06>DC
	-----
	ethaddr=C0:E5:4E:02:06>DC
	-----
	ethaddr=C0:E5:4E:02:06:DC
	-----
	C0:E5:4E:02:06:DC: No match
	=> print ethaddr
	ethaddr=C0:E5:4E:02:06:DC

etc.

To enable this feature, the CONFIG_REGEX option has to be defined in
the board config file.

Signed-off-by: Wolfgang Denk <wd@denx.de>
diff --git a/common/cmd_setexpr.c b/common/cmd_setexpr.c
index ccd87f4..93cb255 100644
--- a/common/cmd_setexpr.c
+++ b/common/cmd_setexpr.c
@@ -1,5 +1,6 @@
 /*
  * Copyright 2008 Freescale Semiconductor, Inc.
+ * Copyright 2013 Wolfgang Denk <wd@denx.de>
  *
  * See file CREDITS for list of people who contributed to this
  * project.
@@ -50,13 +51,263 @@
 	}
 }
 
+#ifdef CONFIG_REGEX
+
+#include <slre.h>
+
+#define SLRE_BUFSZ	16384
+#define SLRE_PATSZ	4096
+
+/*
+ * memstr - Find the first substring in memory
+ * @s1: The string to be searched
+ * @s2: The string to search for
+ *
+ * Similar to and based on strstr(),
+ * but strings do not need to be NUL terminated.
+ */
+static char *memstr(const char *s1, int l1, const char *s2, int l2)
+{
+	if (!l2)
+		return (char *)s1;
+
+	while (l1 >= l2) {
+		l1--;
+		if (!memcmp(s1, s2, l2))
+			return (char *)s1;
+		s1++;
+	}
+	return NULL;
+}
+
+static char *substitute(char *string,	/* string buffer */
+			int *slen,	/* current string length */
+			int ssize,	/* string bufer size */
+			const char *old,/* old (replaced) string */
+			int olen,	/* length of old string */
+			const char *new,/* new (replacement) string */
+			int nlen)	/* length of new string */
+{
+	char *p = memstr(string, *slen, old, olen);
+
+	if (p == NULL)
+		return NULL;
+
+	debug("## Match at pos %ld: match len %d, subst len %d\n",
+		(long)(p - string), olen, nlen);
+
+	/* make sure replacement matches */
+	if (*slen + nlen - olen > ssize) {
+		printf("## error: substitution buffer overflow\n");
+		return NULL;
+	}
+
+	/* move tail if needed */
+	if (olen != nlen) {
+		int tail, len;
+
+		len = (olen > nlen) ? olen : nlen;
+
+		tail = ssize - (p + len - string);
+
+		debug("## tail len %d\n", tail);
+
+		memmove(p + nlen, p + olen, tail);
+	}
+
+	/* insert substitue */
+	memcpy(p, new, nlen);
+
+	*slen += nlen - olen;
+
+	return p + nlen;
+}
+
+/*
+ * Perform regex operations on a environment variable
+ *
+ * Returns 0 if OK, 1 in case of errors.
+ */
+static int regex_sub(const char *name,
+	const char *r, const char *s, const char *t,
+	int global)
+{
+	struct slre slre;
+	char data[SLRE_BUFSZ];
+	char *datap = data;
+	const char *value;
+	int res, len, nlen, loop;
+
+	if (name == NULL)
+		return 1;
+
+	if (slre_compile(&slre, r) == 0) {
+		printf("Error compiling regex: %s\n", slre.err_str);
+		return 1;
+	}
+
+	if (t == NULL) {
+		value = getenv(name);
+
+		if (value == NULL) {
+			printf("## Error: variable \"%s\" not defined\n", name);
+			return 1;
+		}
+		t = value;
+	}
+
+	debug("REGEX on %s=%s\n", name, t);
+	debug("REGEX=\"%s\", SUBST=\"%s\", GLOBAL=%d\n",
+		r, s ? s : "<NULL>", global);
+
+	len = strlen(t);
+	if (len + 1 > SLRE_BUFSZ) {
+		printf("## error: subst buffer overflow: have %d, need %d\n",
+			SLRE_BUFSZ, len + 1);
+		return 1;
+	}
+
+	strcpy(data, t);
+
+	if (s == NULL)
+		nlen = 0;
+	else
+		nlen = strlen(s);
+
+	for (loop = 0;; loop++) {
+		struct cap caps[slre.num_caps + 2];
+		char nbuf[SLRE_PATSZ];
+		const char *old;
+		char *np;
+		int i, olen;
+
+		(void) memset(caps, 0, sizeof(caps));
+
+		res = slre_match(&slre, datap, len, caps);
+
+		debug("Result: %d\n", res);
+
+		for (i = 0; i < slre.num_caps; i++) {
+			if (caps[i].len > 0) {
+				debug("Substring %d: [%.*s]\n", i,
+					caps[i].len, caps[i].ptr);
+			}
+		}
+
+		if (res == 0) {
+			if (loop == 0) {
+				printf("%s: No match\n", t);
+				return 1;
+			} else {
+				break;
+			}
+		}
+
+		debug("## MATCH ## %s\n", data);
+
+		if (s == NULL) {
+			printf("%s=%s\n", name, t);
+			return 1;
+		}
+
+		old = caps[0].ptr;
+		olen = caps[0].len;
+
+		if (nlen + 1 >= SLRE_PATSZ) {
+			printf("## error: pattern buffer overflow: have %d, need %d\n",
+				SLRE_BUFSZ, nlen + 1);
+			return 1;
+		}
+		strcpy(nbuf, s);
+
+		debug("## SUBST(1) ## %s\n", nbuf);
+
+		/*
+		 * Handle back references
+		 *
+		 * Support for \0 ... \9, where \0 is the
+		 * whole matched pattern (similar to &).
+		 *
+		 * Implementation is a bit simpleminded as
+		 * backrefs are substituted sequentially, one
+		 * by one.  This will lead to somewhat
+		 * unexpected results if the replacement
+		 * strings contain any \N strings then then
+		 * may get substitued, too.  We accept this
+		 * restriction for the sake of simplicity.
+		 */
+		for (i = 0; i < 10; ++i) {
+			char backref[2] = {
+				'\\',
+				'0',
+			};
+
+			if (caps[i].len == 0)
+				break;
+
+			backref[1] += i;
+
+			debug("## BACKREF %d: replace \"%.*s\" by \"%.*s\" in \"%s\"\n",
+				i,
+				2, backref,
+				caps[i].len, caps[i].ptr,
+				nbuf);
+
+			for (np = nbuf;;) {
+				char *p = memstr(np, nlen, backref, 2);
+
+				if (p == NULL)
+					break;
+
+				np = substitute(np, &nlen,
+					SLRE_PATSZ,
+					backref, 2,
+					caps[i].ptr, caps[i].len);
+
+				if (np == NULL)
+					return 1;
+			}
+		}
+		debug("## SUBST(2) ## %s\n", nbuf);
+
+		datap = substitute(datap, &len, SLRE_BUFSZ,
+				old, olen,
+				nbuf, nlen);
+
+		if (datap == NULL)
+			return 1;
+
+		debug("## REMAINDER: %s\n", datap);
+
+		debug("## RESULT: %s\n", data);
+
+		if (!global)
+			break;
+	}
+	debug("## FINAL (now setenv()) :  %s\n", data);
+
+	printf("%s=%s\n", name, data);
+
+	return setenv(name, data);
+}
+#endif
+
 static int do_setexpr(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[])
 {
 	ulong a, b;
 	ulong value;
 	int w;
 
+	/*
+	 * We take 3, 5, or 6 arguments:
+	 * 3 : setexpr name value
+	 * 5 : setexpr name val1 op val2
+	 *     setexpr name [g]sub r s
+	 * 6 : setexpr name [g]sub r s t
+	 */
+
-	if (argc < 3)
+	/* > 6 already tested by max command args */
+	if ((argc < 3) || (argc == 4))
 		return CMD_RET_USAGE;
 
 	w = cmd_get_data_size(argv[0], 4);
@@ -69,6 +320,19 @@
 		return 0;
 	}
 
+	/* 5 or 6 args (6 args only with [g]sub) */
+#ifdef CONFIG_REGEX
+	/*
+	 * rexep handling: "setexpr name [g]sub r s [t]"
+	 * with 5 args, "t" will be NULL
+	 */
+	if (strcmp(argv[2], "gsub") == 0)
+		return regex_sub(argv[1], argv[3], argv[4], argv[5], 1);
+
+	if (strcmp(argv[2], "sub") == 0)
+		return regex_sub(argv[1], argv[3], argv[4], argv[5], 0);
+#endif
+
 	/* standard operators: "setexpr name val1 op val2" */
 	if (argc != 5)
 		return CMD_RET_USAGE;
@@ -114,13 +378,23 @@
 }
 
 U_BOOT_CMD(
-	setexpr, 5, 0, do_setexpr,
+	setexpr, 6, 0, do_setexpr,
 	"set environment variable as the result of eval expression",
 	"[.b, .w, .l] name [*]value1 <op> [*]value2\n"
 	"    - set environment variable 'name' to the result of the evaluated\n"
-	"      express specified by <op>.  <op> can be &, |, ^, +, -, *, /, %\n"
+	"      expression specified by <op>.  <op> can be &, |, ^, +, -, *, /, %\n"
 	"      size argument is only meaningful if value1 and/or value2 are\n"
 	"      memory addresses (*)\n"
 	"setexpr[.b, .w, .l] name [*]value\n"
 	"    - load a value into a variable"
+#ifdef CONFIG_REGEX
+	"\n"
+	"setexpr name gsub r s [t]\n"
+	"    - For each substring matching the regular expression <r> in the\n"
+	"      string <t>, substitute the string <s>.  The result is\n"
+	"      assigned to <name>.  If <t> is not supplied, use the old\n"
+	"      value of <name>\n"
+	"setexpr name sub r s [t]\n"
+	"    - Just like gsub(), but replace only the first matching substring"
+#endif
 );