MINOR: sample: add "json" converter

This converter escapes a string so that it can be used as a json/ascii
escaped string. It can read UTF-8 with different behaviors on errors and
encodes the result in json/ascii.

json([<input-code>])
  Escapes the input string and produces an ASCII output string ready to use as a
  JSON string. The converter tries to decode the input string according to the
  <input-code> parameter. It can be "ascii", "utf8", "utf8s", "utf8p" or
  "utf8ps". The "ascii" decoder never fails. The "utf8" decoder detects 3 types
  of errors:
   - bad UTF-8 sequence (lone continuation byte, bad number of continuation
     bytes, ...)
   - invalid range (the decoded value is within a UTF-8 prohibited range),
   - code overlong (the value is encoded with more bytes than necessary).

  The UTF-8 JSON encoding can produce a "too long value" error when the UTF-8
  character is greater than 0xffff because the JSON string escape specification
  only authorizes 4 hex digits for the value encoding. The UTF-8 decoder exists
  in 4 variants designated by a combination of two suffix letters : "p" for
  "permissive" and "s" for "silently ignore". The behaviors of the decoders
  are :
   - "ascii"  : never fails ;
   - "utf8"   : fails on any detected errors ;
   - "utf8s"  : never fails, but removes characters corresponding to errors ;
   - "utf8p"  : accepts and fixes the overlong errors, but fails on any other
                error ;
   - "utf8ps" : never fails, accepts and fixes the overlong errors, but removes
                characters corresponding to the other errors.

  This converter is particularly useful for building properly escaped JSON for
  logging to servers which consume JSON-formatted traffic logs.

  Example:
     capture request header user-agent len 150
     capture request header Host len 15
     log-format {"ip":"%[src]","user-agent":"%[capture.req.hdr(0),json]"}

  Input request from client 127.0.0.1:
     GET / HTTP/1.0
     User-Agent: Very "Ugly" UA 1/2

  Output log:
     {"ip":"127.0.0.1","user-agent":"Very \"Ugly\" UA 1\/2"}
diff --git a/src/standard.c b/src/standard.c
index f57724c..00e672a 100644
--- a/src/standard.c
+++ b/src/standard.c
@@ -2533,6 +2533,126 @@
 	return NULL;
 }
 
+/* Decodes the first UTF-8 sequence found in the byte array <s>, which is
+ * <len> bytes long, and stores the decoded value in <c>. The returned byte
+ * packs the status (UTF8_CODE_* flags) in its 4 msb and the number of bytes
+ * read in its 4 lsb. When <len> is lower than 1, UTF8_CODE_OK is returned
+ * alone and <c> is left untouched.
+ */
+unsigned char utf8_next(const char *s, int len, unsigned int *c)
+{
+	const unsigned char *p = (unsigned char *)s;
+	int dec; /* number of continuation bytes still expected */
+	unsigned char code = UTF8_CODE_OK;
+
+	if (len < 1)
+		return UTF8_CODE_OK;
+
+	/* Determine the sequence length from the value of the leading byte.
+	 *
+	 * 0... ....  0x00 <= x <= 0x7f : 1 byte: ascii char
+	 * 10.. ....  0x80 <= x <= 0xbf : invalid sequence
+	 * 110. ....  0xc0 <= x <= 0xdf : 2 bytes
+	 * 1110 ....  0xe0 <= x <= 0xef : 3 bytes
+	 * 1111 0...  0xf0 <= x <= 0xf7 : 4 bytes
+	 * 1111 10..  0xf8 <= x <= 0xfb : 5 bytes
+	 * 1111 110.  0xfc <= x <= 0xfd : 6 bytes
+	 * 1111 111.  0xfe <= x <= 0xff : invalid sequence
+	 */
+	switch (*p) {
+	case 0x00 ... 0x7f:
+		*c = *p;
+		return UTF8_CODE_OK | 1;
+
+	case 0x80 ... 0xbf:
+		*c = *p;
+		return UTF8_CODE_BADSEQ | 1;
+
+	case 0xc0 ... 0xdf:
+		if (len < 2) { /* input too short: report the leading byte alone */
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x1f;
+		dec = 1;
+		break;
+
+	case 0xe0 ... 0xef:
+		if (len < 3) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x0f;
+		dec = 2;
+		break;
+
+	case 0xf0 ... 0xf7:
+		if (len < 4) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x07;
+		dec = 3;
+		break;
+
+	case 0xf8 ... 0xfb:
+		if (len < 5) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x03;
+		dec = 4;
+		break;
+
+	case 0xfc ... 0xfd:
+		if (len < 6) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x01;
+		dec = 5;
+		break;
+
+	case 0xfe ... 0xff:
+	default:
+		*c = *p;
+		return UTF8_CODE_BADSEQ | 1;
+	}
+
+	p++; /* skip the leading byte, its payload bits are already in <c> */
+
+	while (dec > 0) {
+
+		/* a continuation byte must look like 10.. .... (0x80 to 0xbf) */
+		if ( ( *p & 0xc0 ) != 0x80 ) /* NOTE(review): mask is 0xffff here but 0x0f in the final return */
+			return UTF8_CODE_BADSEQ | ((p-(unsigned char *)s)&0xffff);
+
+		/* append the 6 payload bits of this byte to the decoded value */
+		*c = ( *c << 6 ) | ( *p & 0x3f );
+
+		dec--;
+		p++;
+	}
+
+	/* Check overlong encoding (bytes needed : payload bits : value range).
+	 * 2 bytes : 5 + 6         : 11 : 0x80    ... 0x7ff
+	 * 3 bytes : 4 + 6 + 6     : 16 : 0x800   ... 0xffff
+	 * 4 bytes : 3 + 6 + 6 + 6 : 21 : 0x10000 ... 0x1fffff
+	 */
+	if ((*c >= 0x00    && *c <= 0x7f     && (p-(unsigned char *)s) > 1) ||
+	    (*c >= 0x80    && *c <= 0x7ff    && (p-(unsigned char *)s) > 2) ||
+	    (*c >= 0x800   && *c <= 0xffff   && (p-(unsigned char *)s) > 3) ||
+	    (*c >= 0x10000 && *c <= 0x1fffff && (p-(unsigned char *)s) > 4))
+		code |= UTF8_CODE_OVERLONG;
+
+	/* Check invalid UTF8 range: 0xd800 to 0xdfff, and 0xfffe to 0xffff. */
+	if ((*c >= 0xd800 && *c <= 0xdfff) ||
+	    (*c >= 0xfffe && *c <= 0xffff))
+		code |= UTF8_CODE_INVRANGE;
+
+	return code | ((p-(unsigned char *)s)&0x0f);
+}
+
 /*
  * Local variables:
  *  c-indent-level: 8