MEDIUM: log-format: make the format parser more robust and more extensible

The log-format parser reached a limit making it hard to add new features.
It also suffers from a weak handling of certain incorrect corner cases,
for example "%{foo}" is emitted as a litteral while syntactically it's an
argument to no variable. Also the argument parser had to redo some of the
job with some cases causing minor memory leaks (eg: ignored args).

This work aims at improving the situation so that slightly better reporting
is possible and that it becomes possible to extend the log format. The code
has a few more states but looks significantly simpler. The parser is now
capable of reporting ignored arguments and truncated lines.
diff --git a/include/proto/log.h b/include/proto/log.h
index a6c59ff..5d1215a 100644
--- a/include/proto/log.h
+++ b/include/proto/log.h
@@ -55,10 +55,10 @@
 int parse_logformat_var_args(char *args, struct logformat_node *node);
 
 /*
- * Parse a variable '%varname' or '%{args}varname' in logformat
+ * Parse a variable '%varname' or '%{args}varname' in log-format
  *
  */
-int parse_logformat_var(char *str, size_t len, struct proxy *curproxy, struct list *list_format, int *defoptions);
+int parse_logformat_var(char *arg, int arg_len, char *var, int var_len, struct proxy *curproxy, struct list *list_format, int *defoptions);
 
 /*
  * add to the logformat linked list
diff --git a/include/types/log.h b/include/types/log.h
index c81ab0d..0ed79d4 100644
--- a/include/types/log.h
+++ b/include/types/log.h
@@ -97,15 +97,16 @@
 	LOG_FMT_SSL_VERSION,
 };
 
-/* enum for parse_logformat */
+/* enum for parse_logformat_string */
 enum {
-	LF_TEXT = 0,
-	LF_SEPARATOR,
-	LF_VAR, // after %
-
-	LF_STARTVAR,   // %
-	LF_STARG, // { and within { }
-	LF_EDARG, // end arg }
+	LF_INIT = 0,   // before first character
+	LF_TEXT,       // normal text
+	LF_SEPARATOR,  // a single separator
+	LF_VAR,        // variable name, after '%' or '%{..}'
+	LF_STARTVAR,   // % in text
+	LF_STARG,      // after '%{' and berore '}'
+	LF_EDARG,      // '}' after '%{'
+	LF_END,        // \0 found
 };
 
 
diff --git a/src/log.c b/src/log.c
index 0d63ba6..a586002 100644
--- a/src/log.c
+++ b/src/log.c
@@ -10,6 +10,7 @@
  *
  */
 
+#include <ctype.h>
 #include <fcntl.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -221,69 +222,54 @@
 }
 
 /*
- * Parse a variable '%varname' or '%{args}varname' in logformat
- *
+ * Parse a variable '%varname' or '%{args}varname' in log-format. The caller
+ * must pass the args part in the <arg> pointer with its length in <arg_len>,
+ * and varname with its length in <var> and <var_len> respectively. <arg> is
+ * ignored when arg_len is 0. Neither <var> nor <var_len> may be null.
  */
-int parse_logformat_var(char *str, size_t len, struct proxy *curproxy, struct list *list_format, int *defoptions)
+int parse_logformat_var(char *arg, int arg_len, char *var, int var_len, struct proxy *curproxy, struct list *list_format, int *defoptions)
 {
-	int i, j;
-	char *arg = NULL; // arguments
-	int fparam = 0;
-	char *name = NULL;
-	struct logformat_node *node = NULL;
-	char varname[255] = { 0 }; // variable name
+	int j;
+	struct logformat_node *node;
 
-	for (i = 1; i < len; i++) { // escape first char %
-		if (!arg && str[i] == '{') {
-			arg = str + i;
-			fparam = 1;
-		} else if (arg && str[i] == '}') {
-			char *tmp = arg;
-			arg = calloc(str + i - tmp, 1); // without {}
-			strncpy(arg, tmp + 1, str + i - tmp - 1); // copy without { and }
-			arg[str + i - tmp - 1] = '\0';
-			fparam = 0;
-		} else if (!name && !fparam) {
-			strncpy(varname, str + i, len - i + 1);
-			varname[len - i] = '\0';
-			for (j = 0; logformat_keywords[j].name; j++) { // search a log type
-				if (strcmp(varname, logformat_keywords[j].name) == 0) {
-					if (!((logformat_keywords[j].mode == PR_MODE_HTTP) && (curproxy->mode == PR_MODE_TCP))) {
-						node = calloc(1, sizeof(struct logformat_node));
-						node->type = logformat_keywords[j].type;
-						node->options = *defoptions;
-						node->arg = arg;
-						parse_logformat_var_args(node->arg, node);
-						if (node->type == LOG_FMT_GLOBAL) {
-							*defoptions = node->options;
-							free(node);
-						} else {
-							if (logformat_keywords[j].config_callback != NULL) {
-								if (logformat_keywords[j].config_callback(node, curproxy) != 0) {
-									return -1;
-								 }
-							}
-							curproxy->to_log |= logformat_keywords[j].lw;
-							LIST_ADDQ(list_format, &node->list);
-						}
-						if (logformat_keywords[j].replace_by)
-							Warning("Warning: deprecated variable '%s' in log-format, please replace it with '%s'\n",
-								varname, logformat_keywords[j].replace_by);
-						return 0;
-					} else {
-						Warning("Warning: No such variable name '%s' in this log mode\n", varname);
-						if (arg)
-							free(arg);
+	for (j = 0; logformat_keywords[j].name; j++) { // search a log type
+		if (strlen(logformat_keywords[j].name) == var_len &&
+		    strncmp(var, logformat_keywords[j].name, var_len) == 0) {
+			if (logformat_keywords[j].mode != PR_MODE_HTTP || curproxy->mode == PR_MODE_HTTP) {
+				node = calloc(1, sizeof(struct logformat_node));
+				node->type = logformat_keywords[j].type;
+				node->options = *defoptions;
+				if (arg_len) {
+					node->arg = my_strndup(arg, arg_len);
+					parse_logformat_var_args(node->arg, node);
+				}
+				if (node->type == LOG_FMT_GLOBAL) {
+					*defoptions = node->options;
+					free(node->arg);
+					free(node);
+				} else {
+					if (logformat_keywords[j].config_callback &&
+					    logformat_keywords[j].config_callback(node, curproxy) != 0) {
 						return -1;
 					}
+					curproxy->to_log |= logformat_keywords[j].lw;
+					LIST_ADDQ(list_format, &node->list);
 				}
+				if (logformat_keywords[j].replace_by)
+					Warning("Warning: deprecated variable '%s' in log-format, please replace it with '%s'\n",
+					        logformat_keywords[j].name, logformat_keywords[j].replace_by);
+				return 0;
+			} else {
+				Warning("Warning: log-format variable name '%s' is not suited to HTTP mode\n", logformat_keywords[j].name);
+				return -1;
 			}
-			Warning("Warning: No such variable name '%s' in log-format\n", varname);
-			if (arg)
-				free(arg);
-			return -1;
 		}
 	}
+
+	j = var[var_len];
+	var[var_len] = 0;
+	Warning("Warning: no such variable name '%s' in log-format\n", var);
+	var[var_len] = j;
 	return -1;
 }
 
@@ -329,9 +315,13 @@
  */
 void parse_logformat_string(char *str, struct proxy *curproxy, struct list *list_format, int capabilities)
 {
-	char *sp = str; /* start pointer */
-	int cformat = -1; /* current token format : LOG_TEXT, LOG_SEPARATOR, LOG_VARIABLE */
-	int pformat = -1; /* previous token format */
+	char *sp = str; /* start pointer for text parts */
+	char *arg = NULL; /* start pointer for args */
+	char *var = NULL; /* start pointer for vars */
+	int arg_len = 0;
+	int var_len = 0;
+	int cformat; /* current token format */
+	int pformat; /* previous token format */
 	struct logformat_node *tmplf, *back;
 	int options = 0;
 
@@ -343,87 +333,89 @@
 		free(tmplf);
 	}
 
-	while (1) {
-
-		// push the variable only if formats are different, not
-		// within a variable, and not the first iteration
-		if ((cformat != pformat && cformat != -1 && pformat != -1) || *str == '\0') {
-			if (((pformat != LF_STARTVAR && cformat != LF_VAR) &&
-			    (pformat != LF_STARTVAR && cformat != LF_STARG) &&
-			    (pformat != LF_STARG && cformat !=  LF_VAR)) || *str == '\0') {
-				if (pformat > LF_VAR) // unfinished string
-					pformat = LF_TEXT;
-				if (pformat == LF_VAR)
-					parse_logformat_var(sp, str - sp, curproxy, list_format, &options);
-				else
-					add_to_logformat_list(sp, str, pformat, list_format);
-				sp = str;
-				if (*str == '\0')
-					break;
-			    }
-		}
-
-		if (cformat != -1)
-			str++; // consume the string, except on the first tour
-
+	for (cformat = LF_INIT; cformat != LF_END; str++) {
 		pformat = cformat;
 
-		if (*str == '\0') {
-			cformat = LF_STARTVAR; // for breaking in all cases
-			continue;
-		}
+		if (!*str)
+			cformat = LF_END;              // preset it to save all states from doing this
 
-		if (pformat == LF_STARTVAR) { // after a %
-			if ( (*str >= 'a' && *str <= 'z') || // parse varname
-			     (*str >= 'A' && *str <= 'Z') ||
-			     (*str >= '0' && *str <= '9')) {
-				cformat = LF_VAR; // varname
-				continue;
-			} else if (*str == '{') {
-				cformat = LF_STARG; // variable arguments
-				continue;
-			} else { // another unexpected token
-				pformat = LF_TEXT; // redefine the format of the previous token to TEXT
-				cformat = LF_TEXT;
-				continue;
+		/* The prinicple of the two-step state machine below is to first detect a change, and
+		 * second have all common paths processed at one place. The common paths are the ones
+		 * encountered in text areas (LF_INIT, LF_TEXT, LF_SEPARATOR) and at the end (LF_END).
+		 * We use the common LF_INIT state to dispatch to the different final states.
+		 */
+		switch (pformat) {
+		case LF_STARTVAR:                      // text immediately following a '%'
+			arg = NULL;
+			arg_len = var_len = 0;
+			if (*str == '{') {             // optional argument
+				cformat = LF_STARG;
+				arg = str + 1;
 			}
-
-		} else if (pformat == LF_VAR) { // after a varname
-			if ( (*str >= 'a' && *str <= 'z') || // parse varname
-			     (*str >= 'A' && *str <= 'Z') ||
-			     (*str >= '0' && *str <= '9')) {
+			else if (isalnum((int)*str)) { // variable name
 				cformat = LF_VAR;
-				continue;
+				var = str;
 			}
-		} else if (pformat  == LF_STARG) { // inside variable arguments
-			if (*str == '}') { // end of varname
+			else if (*str == '%')
+				cformat = LF_TEXT;     // convert this character to a litteral (useful for '%')
+			else
+				cformat = LF_INIT;     // handle other cases of litterals
+			break;
+
+		case LF_STARG:                         // text immediately following '%{'
+			if (*str == '}') {             // end of arg
 				cformat = LF_EDARG;
-				continue;
-			} else { // all tokens are acceptable within { }
-				cformat = LF_STARG;
-				continue;
+				arg_len = str - arg;
+				*str = 0;              // used for reporting errors
 			}
-		} else if (pformat == LF_EDARG) { //  after arguments
-			if ( (*str >= 'a' && *str <= 'z') || // parse a varname
-			     (*str >= 'A' && *str <= 'Z') ||
-			     (*str >= '0' && *str <= '9')) {
+			break;
+
+		case LF_EDARG:                         // text immediately following '%{arg}'
+			if (isalnum((int)*str)) {      // variable name
 				cformat = LF_VAR;
-				continue;
-			} else { // if no varname after arguments, transform in TEXT
-				pformat = LF_TEXT;
-				cformat = LF_TEXT;
+				var = str;
+				break;
+			}
+
+			Warning("Skipping isolated argument in log-format line : '%%{%s}'\n", arg);
+			cformat = LF_INIT;
+			break;
+
+		case LF_VAR:                           // text part of a variable name
+			var_len = str - var;
+			if (!isalnum((int)*str))
+				cformat = LF_INIT;     // not variable name anymore
+			break;
+
+		default:                               // LF_INIT, LF_TEXT, LF_SEPARATOR, LF_END
+			cformat = LF_INIT;
+		}
+
+		if (cformat == LF_INIT) { /* resynchronize state to text/sep/startvar */
+			switch (*str) {
+			case '%': cformat = LF_STARTVAR;  break;
+			case ' ': cformat = LF_SEPARATOR; break;
+			case  0 : cformat = LF_END;       break;
+			default : cformat = LF_TEXT;      break;
 			}
 		}
 
-		// others tokens that don't match previous conditions
-		if (*str == '%') {
-			cformat = LF_STARTVAR;
-		} else if (*str == ' ') {
-			cformat = LF_SEPARATOR;
-		} else {
-			cformat = LF_TEXT;
+		if (cformat != pformat || pformat == LF_SEPARATOR) {
+			switch (pformat) {
+			case LF_VAR:
+				parse_logformat_var(arg, arg_len, var, var_len, curproxy, list_format, &options);
+				break;
+			case LF_TEXT:
+			case LF_SEPARATOR:
+				add_to_logformat_list(sp, str, pformat, list_format);
+				break;
+			}
+			sp = str; /* new start of text at every state switch and at every separator */
 		}
 	}
+
+	if (pformat == LF_STARTVAR || pformat == LF_STARG)
+		Warning("Ignoring end of truncated log-format line after '%s'\n", arg ? arg : "%");
 }
 
 /*