MINOR: sample: add "json" converter This converter escapes a string so that it can be used as a JSON/ASCII escaped string. It can read UTF-8 with different behaviors on errors and encode it in JSON/ASCII. json([<input-code>]) Escapes the input string and produces an ASCII output string ready to use as a JSON string. The converter tries to decode the input string according to the <input-code> parameter. It can be "ascii", "utf8", "utf8s", "utf8p" or "utf8ps". The "ascii" decoder never fails. The "utf8" decoder detects 3 types of errors: - bad UTF-8 sequence (lone continuation byte, bad number of continuation bytes, ...) - invalid range (the decoded value is within a UTF-8 prohibited range), - code overlong (the value is encoded with more bytes than necessary). The UTF-8 JSON encoding can produce a "too long value" error when the UTF-8 character is greater than 0xffff because the JSON string escape specification only authorizes 4 hex digits for the value encoding. The UTF-8 decoder exists in 4 variants designated by a combination of two suffix letters : "p" for "permissive" and "s" for "silently ignore". The behaviors of the decoders are : - "ascii" : never fails ; - "utf8" : fails on any detected errors ; - "utf8s" : never fails, but removes characters corresponding to errors ; - "utf8p" : accepts and fixes the overlong errors, but fails on any other error ; - "utf8ps" : never fails, accepts and fixes the overlong errors, but removes characters corresponding to the other errors. This converter is particularly useful for building properly escaped JSON for logging to servers which consume JSON-formatted traffic logs. Example: capture request header user-agent len 150 capture request header Host len 15 log-format {"ip":"%[src]","user-agent":"%[capture.req.hdr(1),json]"} Input request from client 127.0.0.1: GET / HTTP/1.0 User-Agent: Very "Ugly" UA 1/2 Output log: {"ip":"127.0.0.1","user-agent":"Very \"Ugly\" UA 1\/2"}

commit: 317e1c4f1e00ff1de2451c3a14b8082b1a7198f9 [log] [tgz]
author: Thierry FOURNIER <tfournier@exceliance.fr> Tue Aug 12 10:20:47 2014 +0200
committer: Willy Tarreau <w@1wt.eu> Sun Oct 26 06:41:12 2014 +0100
tree: 7185be8c3871c110b520e2b6619a36a0489abc7e
parent: fb20e4668d0d3fbdcb97eacb9695659aa7ac1a54 [diff] [blame]
diff --git a/src/standard.c b/src/standard.c
index f57724c..00e672a 100644
--- a/src/standard.c
+++ b/src/standard.c

@@ -2533,6 +2533,126 @@
 	return NULL;
 }
 
+/* Decodes the next UTF-8 character found in a byte array.
+ * <s> is the byte array to be decoded, <len> is the number of bytes available
+ * in it, and the decoded character is stored in <c>.
+ *
+ * The returned byte packs two fields: the 4 msb are the return code (a
+ * combination of UTF8_CODE_* flags) and the 4 lsb are the number of bytes
+ * read. When <len> is lower than 1, UTF8_CODE_OK is returned with a read
+ * length of 0 and <c> is left untouched.
+ *
+ * When the leading byte is invalid (lone continuation byte, 0xfe/0xff, or not
+ * enough bytes left for the announced sequence length), <c> receives the raw
+ * leading byte and a single byte is reported as read. When a continuation
+ * byte is invalid, the reported length is the number of bytes preceding the
+ * offending one and <c> holds the partially decoded value.
+ */
+unsigned char utf8_next(const char *s, int len, unsigned int *c)
+{
+	const unsigned char *start = (const unsigned char *)s;
+	const unsigned char *p = start;
+	int dec;
+	unsigned char code = UTF8_CODE_OK;
+
+	if (len < 1)
+		return UTF8_CODE_OK;
+
+	/* Check the type of UTF8 sequence
+	 *
+	 * 0... ....  0x00 <= x <= 0x7f : 1 byte: ascii char
+	 * 10.. ....  0x80 <= x <= 0xbf : invalid sequence
+	 * 110. ....  0xc0 <= x <= 0xdf : 2 bytes
+	 * 1110 ....  0xe0 <= x <= 0xef : 3 bytes
+	 * 1111 0...  0xf0 <= x <= 0xf7 : 4 bytes
+	 * 1111 10..  0xf8 <= x <= 0xfb : 5 bytes
+	 * 1111 110.  0xfc <= x <= 0xfd : 6 bytes
+	 * 1111 111.  0xfe <= x <= 0xff : invalid sequence
+	 *
+	 * For multi-byte sequences, <dec> is set to the number of expected
+	 * continuation bytes and <c> is seeded with the payload bits of the
+	 * leading byte.
+	 */
+	switch (*p) {
+	case 0x00 ... 0x7f:
+		*c = *p;
+		return UTF8_CODE_OK | 1;
+
+	case 0x80 ... 0xbf:
+		*c = *p;
+		return UTF8_CODE_BADSEQ | 1;
+
+	case 0xc0 ... 0xdf:
+		if (len < 2) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x1f;
+		dec = 1;
+		break;
+
+	case 0xe0 ... 0xef:
+		if (len < 3) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x0f;
+		dec = 2;
+		break;
+
+	case 0xf0 ... 0xf7:
+		if (len < 4) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x07;
+		dec = 3;
+		break;
+
+	case 0xf8 ... 0xfb:
+		if (len < 5) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x03;
+		dec = 4;
+		break;
+
+	case 0xfc ... 0xfd:
+		if (len < 6) {
+			*c = *p;
+			return UTF8_CODE_BADSEQ | 1;
+		}
+		*c = *p & 0x01;
+		dec = 5;
+		break;
+
+	case 0xfe ... 0xff:
+	default:
+		*c = *p;
+		return UTF8_CODE_BADSEQ | 1;
+	}
+
+	p++;
+
+	while (dec > 0) {
+
+		/* a continuation byte must start with the two bits "10" */
+		if ((*p & 0xc0) != 0x80)
+			return UTF8_CODE_BADSEQ | ((p - start) & 0x0f);
+
+		/* append the 6 payload bits to the character */
+		*c = (*c << 6) | (*p & 0x3f);
+
+		dec--;
+		p++;
+	}
+
+	/* Check overlong encoding: the value must not fit in a shorter
+	 * sequence than the one which was actually read.
+	 * 1 byte  : 7                 :  7 : 0x00     ... 0x7f
+	 * 2 bytes : 5 + 6             : 11 : 0x80     ... 0x7ff
+	 * 3 bytes : 4 + 6 + 6         : 16 : 0x800    ... 0xffff
+	 * 4 bytes : 3 + 6 + 6 + 6     : 21 : 0x10000  ... 0x1fffff
+	 * 5 bytes : 2 + 6 + 6 + 6 + 6 : 26 : 0x200000 ... 0x3ffffff
+	 * Larger values need the 6 bytes form, which cannot be overlong.
+	 * <c> is unsigned, so no lower bound is needed on the first range.
+	 */
+	if ((                    *c <= 0x7f      && (p - start) > 1) ||
+	    (*c >= 0x80       && *c <= 0x7ff     && (p - start) > 2) ||
+	    (*c >= 0x800      && *c <= 0xffff    && (p - start) > 3) ||
+	    (*c >= 0x10000    && *c <= 0x1fffff  && (p - start) > 4) ||
+	    (*c >= 0x200000   && *c <= 0x3ffffff && (p - start) > 5))
+		code |= UTF8_CODE_OVERLONG;
+
+	/* Check invalid UTF8 range: 0xd800..0xdfff and 0xfffe..0xffff. */
+	if ((*c >= 0xd800 && *c <= 0xdfff) ||
+	    (*c >= 0xfffe && *c <= 0xffff))
+		code |= UTF8_CODE_INVRANGE;
+
+	return code | ((p - start) & 0x0f);
+}
+
+
 /*
  * Local variables:
  *  c-indent-level: 8
commit	317e1c4f1e00ff1de2451c3a14b8082b1a7198f9	[log] [tgz]
author	Thierry FOURNIER <tfournier@exceliance.fr>	Tue Aug 12 10:20:47 2014 +0200
committer	Willy Tarreau <w@1wt.eu>	Sun Oct 26 06:41:12 2014 +0100
tree	7185be8c3871c110b520e2b6619a36a0489abc7e
parent	fb20e4668d0d3fbdcb97eacb9695659aa7ac1a54 [diff] [blame]