MINOR: sample: add "json" converter This converter escapes a string so that it can be used as a json/ascii escaped string. It can read UTF-8 with different behaviors on errors and encode it in json/ascii. json([<input-code>]) Escapes the input string and produces an ASCII output string ready to use as a JSON string. The converter tries to decode the input string according to the <input-code> parameter. It can be "ascii", "utf8", "utf8s", "utf8p" or "utf8ps". The "ascii" decoder never fails. The "utf8" decoder detects 3 types of errors: - bad UTF-8 sequence (lone continuation byte, bad number of continuation bytes, ...) - invalid range (the decoded value is within a UTF-8 prohibited range), - code overlong (the value is encoded with more bytes than necessary). The UTF-8 JSON encoding can produce a "too long value" error when the UTF-8 character is greater than 0xffff because the JSON string escape specification only authorizes 4 hex digits for the value encoding. The UTF-8 decoder exists in 4 variants designated by a combination of two suffix letters : "p" for "permissive" and "s" for "silently ignore". The behaviors of the decoders are : - "ascii" : never fails ; - "utf8" : fails on any detected errors ; - "utf8s" : never fails, but removes characters corresponding to errors ; - "utf8p" : accepts and fixes the overlong errors, but fails on any other error ; - "utf8ps" : never fails, accepts and fixes the overlong errors, but removes characters corresponding to the other errors. This converter is particularly useful for building properly escaped JSON for logging to servers which consume JSON-formatted traffic logs. Example: capture request header user-agent len 150 capture request header Host len 15 log-format {"ip":"%[src]","user-agent":"%[capture.req.hdr(1),json]"} Input request from client 127.0.0.1: GET / HTTP/1.0 User-Agent: Very "Ugly" UA 1/2 Output log: {"ip":"127.0.0.1","user-agent":"Very \"Ugly\" UA 1\/2"}

commit: 317e1c4f1e00ff1de2451c3a14b8082b1a7198f9 [log] [tgz]
author: Thierry FOURNIER <tfournier@exceliance.fr> Tue Aug 12 10:20:47 2014 +0200
committer: Willy Tarreau <w@1wt.eu> Sun Oct 26 06:41:12 2014 +0100
tree: 7185be8c3871c110b520e2b6619a36a0489abc7e
parent: fb20e4668d0d3fbdcb97eacb9695659aa7ac1a54 [diff]
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 58aaaf0..ad6defb 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt

@@ -10043,6 +10043,46 @@
   table entries and as such use the same server. The mask can be passed in
   dotted form (eg: 255.255.255.0) or in CIDR form (eg: 24).
 
+json([<input-code>])
+  Escapes the input string and produces an ASCII output string ready to use as a
+  JSON string. The converter tries to decode the input string according to the
+  <input-code> parameter. It can be "ascii", "utf8", "utf8s", "utf8p" or
+  "utf8ps". The "ascii" decoder never fails. The "utf8" decoder detects 3 types
+  of errors:
+   - bad UTF-8 sequence (lone continuation byte, bad number of continuation
+     bytes, ...)
+   - invalid range (the decoded value is within a UTF-8 prohibited range),
+   - code overlong (the value is encoded with more bytes than necessary).
+
+  The UTF-8 JSON encoding can produce a "too long value" error when the UTF-8
+  character is greater than 0xffff because the JSON string escape specification
+  only authorizes 4 hex digits for the value encoding. The UTF-8 decoder exists
+  in 4 variants designated by a combination of two suffix letters : "p" for
+  "permissive" and "s" for "silently ignore". The behaviors of the decoders
+  are :
+   - "ascii"  : never fails ;
+   - "utf8"   : fails on any detected errors ;
+   - "utf8s"  : never fails, but removes characters corresponding to errors ;
+   - "utf8p"  : accepts and fixes the overlong errors, but fails on any other
+                error ;
+   - "utf8ps" : never fails, accepts and fixes the overlong errors, but removes
+                characters corresponding to the other errors.
+
+  This converter is particularly useful for building properly escaped JSON for
+  logging to servers which consume JSON-formatted traffic logs.
+
+  Example:
+     capture request header user-agent len 150
+     capture request header Host len 15
+     log-format {"ip":"%[src]","user-agent":"%[capture.req.hdr(1),json]"}
+
+  Input request from client 127.0.0.1:
+     GET / HTTP/1.0
+     User-Agent: Very "Ugly" UA 1/2
+
+  Output log:
+     {"ip":"127.0.0.1","user-agent":"Very \"Ugly\" UA 1\/2"}
+
 language(<value>[,<default>])
   Returns the value with the highest q-factor from a list as extracted from the
   "accept-language" header using "req.fhdr". Values with no q-factor have a

diff --git a/include/common/standard.h b/include/common/standard.h
index 8811c6f..e9900d5 100644
--- a/include/common/standard.h
+++ b/include/common/standard.h

@@ -914,4 +914,22 @@
 	return caddr & ~(unsigned long)(data & 3);
 }
 
+/* UTF-8 decoder status. These flags occupy the 4 most significant bits of the
+ * value returned by utf8_next(); the 4 least significant bits of that value
+ * hold a byte count. The error flags are distinct bits and may be combined.
+ */
+#define UTF8_CODE_OK       0x00
+#define UTF8_CODE_OVERLONG 0x10 /* encoded with more bytes than necessary */
+#define UTF8_CODE_INVRANGE 0x20 /* decoded value is in a prohibited range */
+#define UTF8_CODE_BADSEQ   0x40 /* malformed or truncated byte sequence */
+
+/* Decodes one UTF-8 character from the <len> bytes starting at <s> and stores
+ * it in <c>. The returned byte combines a UTF8_CODE_* status (4 msb) with the
+ * number of bytes read (4 lsb).
+ */
+unsigned char utf8_next(const char *s, int len, unsigned int *c);
+
+/* Extracts the UTF8_CODE_* status flags (the 4 most significant bits of the
+ * low byte) from a value returned by utf8_next().
+ */
+static inline unsigned char utf8_return_code(unsigned int code)
+{
+	return code & 0xf0;
+}
+
+/* Extracts the byte count (the 4 least significant bits) from a value
+ * returned by utf8_next().
+ */
+static inline unsigned char utf8_return_length(unsigned char code)
+{
+	return code & 0x0f;
+}
+
 #endif /* _COMMON_STANDARD_H */

diff --git a/src/sample.c b/src/sample.c
index 3343739..70f47bb 100644
--- a/src/sample.c
+++ b/src/sample.c

@@ -11,6 +11,7 @@
  *
  */
 
+#include <ctype.h>
 #include <string.h>
 #include <arpa/inet.h>
 #include <stdio.h>
@@ -1386,6 +1387,188 @@
 	return 1;
 }
 
+/* The "json" converter escapes the special JSON characters. The returned
+ * string can be safely set between two '"' and used as a JSON string. The
+ * JSON string is defined like this:
+ *
+ *    any Unicode character except '"' or '\' or control character
+ *    \", \\, \/, \b, \f, \n, \r, \t, \u + four-hex-digits
+ *
+ * The enum input_type contains all the allowed modes for decoding the input
+ * string.
+ */
+enum input_type {
+	IT_ASCII = 0, /* every input byte is taken as one character */
+	IT_UTF8,      /* UTF-8, any decoding error fails the conversion */
+	IT_UTF8S,     /* UTF-8, characters with a decoding error are skipped */
+	IT_UTF8P,     /* UTF-8, overlong accepted, other errors fail */
+	IT_UTF8PS,    /* UTF-8, overlong accepted, other errors are skipped */
+};
+/* Validates the argument of the "json" converter and turns it into its
+ * numeric form. <arg> must be a string argument holding one of the supported
+ * input codes; an empty string selects the ASCII decoder. On success the
+ * argument is rewritten in place as an ARGT_UINT carrying the matching
+ * enum input_type value and 1 is returned. On failure 0 is returned and
+ * <err> is filled with a message; <file> and <line> are only used to build
+ * that message. <conv> is not used.
+ */
+static int sample_conv_json_check(struct arg *arg, struct sample_conv *conv,
+                                  const char *file, int line, char **err)
+{
+	/* Every accepted spelling and the decoder it selects. */
+	static const struct {
+		const char *name;
+		enum input_type type;
+	} input_codes[] = {
+		{ "",       IT_ASCII  }, /* no input code given: default to ASCII */
+		{ "ascii",  IT_ASCII  },
+		{ "utf8",   IT_UTF8   },
+		{ "utf8s",  IT_UTF8S  },
+		{ "utf8p",  IT_UTF8P  },
+		{ "utf8ps", IT_UTF8PS },
+	};
+	unsigned int i;
+
+	if (!arg) {
+		memprintf(err, "Unexpected empty arg list");
+		return 0;
+	}
+
+	if (arg->type != ARGT_STR) {
+		memprintf(err, "Unexpected arg type");
+		return 0;
+	}
+
+	for (i = 0; i < sizeof(input_codes) / sizeof(input_codes[0]); i++) {
+		if (strcmp(arg->data.str.str, input_codes[i].name) != 0)
+			continue;
+
+		/* The string is compared before the argument is overwritten
+		 * with its numeric form.
+		 */
+		arg->type = ARGT_UINT;
+		arg->data.uint = input_codes[i].type;
+		return 1;
+	}
+
+	memprintf(err, "Unexpected input code type at file '%s', line %d. "
+	               "Allowed values are 'ascii', 'utf8', 'utf8s', 'utf8p' and 'utf8ps'", file, line);
+	return 0;
+}
+
+/* Sample converter "json": rewrites the string held in <smp> as the content
+ * of a JSON string. <arg_p>, when set, carries the enum input_type selecting
+ * how the input bytes are decoded; without it the input is read as ASCII.
+ *
+ * '"', '\', '/', \b, \f, \r, \n and \t are emitted as two-character escapes,
+ * every other character above 0xff or reported as non printable by isprint()
+ * is emitted as \uXXXX, and the remaining characters are copied as-is.
+ *
+ * Returns 1 on success, with <smp> pointing to the trash chunk holding the
+ * result. Returns 0 when a decoding error is met in a failing mode ("utf8",
+ * "utf8p"), when a character above 0xffff is met in one of these modes, or
+ * when the result does not fit in the trash chunk.
+ */
+static int sample_conv_json(const struct arg *arg_p, struct sample *smp)
+{
+	struct chunk *temp;
+	char esc[7]; /* \u + 4 hex digits + null char for snprintf. */
+	const char *str;
+	int len;
+	enum input_type input_type = IT_ASCII;
+	unsigned int c;
+	unsigned int ret;
+	char *p;
+
+	if (arg_p)
+		input_type = arg_p->data.uint;
+
+	temp = get_trash_chunk();
+	temp->len = 0;
+
+	p = smp->data.str.str;
+	while (p < smp->data.str.str + smp->data.str.len) {
+
+		if (input_type == IT_ASCII) {
+			/* Read input as ASCII. */
+			c = *(unsigned char *)p;
+			p++;
+		}
+		else {
+			/* Read input as UTF8. The input pointer is advanced before
+			 * the status is checked so that a skipped character is
+			 * never read twice.
+			 */
+			ret = utf8_next(p, smp->data.str.len - ( p - smp->data.str.str ), &c);
+			p += utf8_return_length(ret);
+
+			if (input_type == IT_UTF8 && utf8_return_code(ret) != UTF8_CODE_OK)
+				return 0;
+			if (input_type == IT_UTF8S && utf8_return_code(ret) != UTF8_CODE_OK)
+				continue;
+			if (input_type == IT_UTF8P &&
+			    (utf8_return_code(ret) & (UTF8_CODE_INVRANGE|UTF8_CODE_BADSEQ)))
+				return 0;
+			if (input_type == IT_UTF8PS &&
+			    (utf8_return_code(ret) & (UTF8_CODE_INVRANGE|UTF8_CODE_BADSEQ)))
+				continue;
+
+			/* Check too big values: \u only carries 4 hex digits. */
+			if (c > 0xffff) {
+				if (input_type == IT_UTF8 || input_type == IT_UTF8P)
+					return 0;
+				continue;
+			}
+		}
+
+		/* Convert character. */
+		switch (c) {
+		case '"':
+			len = 2;
+			str = "\\\"";
+			break;
+		case '\\':
+			len = 2;
+			str = "\\\\";
+			break;
+		case '/':
+			len = 2;
+			str = "\\/";
+			break;
+		case '\b':
+			len = 2;
+			str = "\\b";
+			break;
+		case '\f':
+			len = 2;
+			str = "\\f";
+			break;
+		case '\r':
+			len = 2;
+			str = "\\r";
+			break;
+		case '\n':
+			len = 2;
+			str = "\\n";
+			break;
+		case '\t':
+			len = 2;
+			str = "\\t";
+			break;
+		default:
+			/* isprint() is only defined for values representable as an
+			 * unsigned char (or EOF), so bigger values are filtered out
+			 * before it is called.
+			 */
+			if (c > 0xff || !isprint(c)) {
+				len = 6;
+				esc[0] = '\\';
+				esc[1] = 'u';
+				snprintf(&esc[2], 5, "%04x", c & 0xffff);
+			}
+			else {
+				/* Copy the character through a char buffer: reading
+				 * the first byte of <c> itself would pick the most
+				 * significant byte on big-endian machines.
+				 */
+				len = 1;
+				esc[0] = (char)c;
+			}
+			str = esc;
+			break;
+		}
+
+		/* Check length */
+		if (temp->len + len > temp->size)
+			return 0;
+
+		/* Copy string. */
+		memcpy(temp->str + temp->len, str, len);
+		temp->len += len;
+	}
+
+	smp->flags &= ~SMP_F_CONST;
+	smp->data.str = *temp;
+	smp->type = SMP_T_STR;
+
+	return 1;
+}
+
 /************************************************************************/
 /*       All supported sample fetch functions must be declared here     */
 /************************************************************************/
@@ -1493,6 +1676,7 @@
 	{ "djb2",   sample_conv_djb2,      ARG1(0,UINT), NULL, SMP_T_BIN,  SMP_T_UINT },
 	{ "sdbm",   sample_conv_sdbm,      ARG1(0,UINT), NULL, SMP_T_BIN,  SMP_T_UINT },
 	{ "wt6",    sample_conv_wt6,       ARG1(0,UINT), NULL, SMP_T_BIN,  SMP_T_UINT },
+	{ "json",   sample_conv_json,      ARG1(1,STR),  sample_conv_json_check, SMP_T_STR,  SMP_T_STR },
 	{ NULL, NULL, 0, 0, 0 },
 }};
 

diff --git a/src/standard.c b/src/standard.c
index f57724c..00e672a 100644
--- a/src/standard.c
+++ b/src/standard.c

@@ -2533,6 +2533,126 @@
 	return NULL;
 }
 
+/* This function reads the next UTF-8 character.
+ * <s> is the byte array to be decoded, <len> is its length.
+ * The returned byte is encoded like this: the 4 msb are the return code
+ * (UTF8_CODE_*), the 4 lsb are the number of bytes read. The decoded
+ * character is stored in <c>.
+ *
+ * When the first byte cannot start a sequence, or when fewer than the
+ * announced number of bytes are available, UTF8_CODE_BADSEQ is returned with
+ * a length of 1 and <c> holds that first byte. When a continuation byte is
+ * malformed, UTF8_CODE_BADSEQ is returned with the number of bytes that were
+ * valid before it. UTF8_CODE_OVERLONG and UTF8_CODE_INVRANGE may both be set
+ * on a fully read sequence. If <len> is lower than 1, UTF8_CODE_OK is
+ * returned with a length of 0 and <c> is left untouched.
+ */
+unsigned char utf8_next(const char *s, int len, unsigned int *c)
+{
+	const unsigned char *p = (const unsigned char *)s;
+	unsigned char code = UTF8_CODE_OK;
+	unsigned int lead_mask;
+	int seq_len;
+	int dec;
+
+	if (len < 1)
+		return UTF8_CODE_OK;
+
+	/* Check the type of UTF8 sequence
+	 *
+	 * 0... ....  0x00 <= x <= 0x7f : 1 byte: ascii char
+	 * 10.. ....  0x80 <= x <= 0xbf : invalid sequence
+	 * 110. ....  0xc0 <= x <= 0xdf : 2 bytes
+	 * 1110 ....  0xe0 <= x <= 0xef : 3 bytes
+	 * 1111 0...  0xf0 <= x <= 0xf7 : 4 bytes
+	 * 1111 10..  0xf8 <= x <= 0xfb : 5 bytes
+	 * 1111 110.  0xfc <= x <= 0xfd : 6 bytes
+	 * 1111 111.  0xfe <= x <= 0xff : invalid sequence
+	 */
+	if (*p <= 0x7f) {
+		*c = *p;
+		return UTF8_CODE_OK | 1;
+	}
+	else if (*p <= 0xbf || *p >= 0xfe) {
+		/* lone continuation byte, or byte never used by UTF-8 */
+		*c = *p;
+		return UTF8_CODE_BADSEQ | 1;
+	}
+	else if (*p <= 0xdf) {
+		seq_len = 2;
+		lead_mask = 0x1f;
+	}
+	else if (*p <= 0xef) {
+		seq_len = 3;
+		lead_mask = 0x0f;
+	}
+	else if (*p <= 0xf7) {
+		seq_len = 4;
+		lead_mask = 0x07;
+	}
+	else if (*p <= 0xfb) {
+		seq_len = 5;
+		lead_mask = 0x03;
+	}
+	else {
+		seq_len = 6;
+		lead_mask = 0x01;
+	}
+
+	/* The sequence is truncated: only the first byte is consumed. */
+	if (len < seq_len) {
+		*c = *p;
+		return UTF8_CODE_BADSEQ | 1;
+	}
+
+	*c = *p & lead_mask;
+	p++;
+
+	for (dec = seq_len - 1; dec > 0; dec--) {
+
+		/* a continuation byte must start with the two bits "10" */
+		if ( ( *p & 0xc0 ) != 0x80 )
+			return UTF8_CODE_BADSEQ | ((p - (const unsigned char *)s) & 0x0f);
+
+		/* add data at char */
+		*c = ( *c << 6 ) | ( *p & 0x3f );
+
+		p++;
+	}
+
+	/* Check overlong encoding: the value would have fit in a shorter
+	 * sequence. Largest value per sequence length:
+	 * 1 byte  : 7                 :  7 bits : 0x7f
+	 * 2 bytes : 5 + 6             : 11 bits : 0x7ff
+	 * 3 bytes : 4 + 6 + 6         : 16 bits : 0xffff
+	 * 4 bytes : 3 + 6 + 6 + 6     : 21 bits : 0x1fffff
+	 * 5 bytes : 2 + 6 + 6 + 6 + 6 : 26 bits : 0x3ffffff
+	 */
+	if ((*c <= 0x7f      && seq_len > 1) ||
+	    (*c <= 0x7ff     && seq_len > 2) ||
+	    (*c <= 0xffff    && seq_len > 3) ||
+	    (*c <= 0x1fffff  && seq_len > 4) ||
+	    (*c <= 0x3ffffff && seq_len > 5))
+		code |= UTF8_CODE_OVERLONG;
+
+	/* Check invalid UTF8 range. */
+	if ((*c >= 0xd800 && *c <= 0xdfff) ||
+	    (*c >= 0xfffe && *c <= 0xffff))
+		code |= UTF8_CODE_INVRANGE;
+
+	return code | (seq_len & 0x0f);
+}
+
 /*
  * Local variables:
  *  c-indent-level: 8
commit	317e1c4f1e00ff1de2451c3a14b8082b1a7198f9	[log] [tgz]
author	Thierry FOURNIER <tfournier@exceliance.fr>	Tue Aug 12 10:20:47 2014 +0200
committer	Willy Tarreau <w@1wt.eu>	Sun Oct 26 06:41:12 2014 +0100
tree	7185be8c3871c110b520e2b6619a36a0489abc7e
parent	fb20e4668d0d3fbdcb97eacb9695659aa7ac1a54 [diff]