MAJOR: compression: integrate support for libslz

This library is designed to emit a zlib-compatible stream with no
memory usage and to favor resource savings over compression ratio.
While zlib requires 256 kB of RAM per compression context (and can only
support 4000 connections per GB of RAM), the stateless compression
offered by libslz does not need to retain buffers between subsequent
calls. In theory this slightly reduces the compression ratio but in
practice it does not have that much of an effect since the zlib
window is limited to 32kB.

Libslz is available at :

      http://git.1wt.eu/web?p=libslz.git

It was designed for web compression and provides a lot of savings
over zlib in haproxy. Here are the preliminary results on a single
core of a core2-quad 3.0 GHz in 32-bit for only 300 concurrent
sessions visiting the home page of www.haproxy.org (76 kB) with
the default 16kB buffers :

          BW In      BW Out     BW Saved   Ratio   memory VSZ/RSS
zlib      237 Mbps    92 Mbps   145 Mbps   2.58     84M /  69M
slz       733 Mbps   380 Mbps   353 Mbps   1.93    5.9M / 4.2M

So while the compression ratio is lower, the bandwidth savings are
much more important due to the significantly lower compression cost,
which makes it possible to consume even more data from the servers.
In the example above, zlib became the bottleneck at 24% of the output
bandwidth. Also the difference in memory usage is obvious.

More tests run on a single core of a core i5-3320M, with 500 concurrent
users and the default 16kB buffers :

At 100% CPU (no limit) :
          BW In      BW Out     BW Saved   Ratio   memory VSZ/RSS  hits/s
zlib      480 Mbps   188 Mbps   292 Mbps   2.55     130M / 101M     744
slz      1700 Mbps   810 Mbps   890 Mbps   2.10    23.7M / 9.7M    2382

At 85% CPU (limited) :
          BW In      BW Out     BW Saved   Ratio   memory VSZ/RSS  hits/s
zlib     1240 Mbps   976 Mbps   264 Mbps   1.27     130M / 100M    1738
slz      1600 Mbps   976 Mbps   624 Mbps   1.64    23.7M / 9.7M    2210

The most important benefit really happens when the CPU usage is
limited by "maxcompcpuusage" or the BW limited by "maxcomprate" :
in order to preserve resources, haproxy throttles the compression
ratio until usage is within limits. Since slz is much cheaper, the
average compression ratio is much higher and the input bandwidth
is noticeably higher for one Gbps output.

Other tests made with some reference files :

                           BW In     BW Out    BW Saved  Ratio  hits/s
daniels.html       zlib  1320 Mbps  163 Mbps  1157 Mbps   8.10    1925
                   slz   3600 Mbps  580 Mbps  3020 Mbps   6.20    5300

tv.com/listing     zlib   980 Mbps  124 Mbps   856 Mbps   7.90     310
                   slz   3300 Mbps  553 Mbps  2747 Mbps   5.97    1100

jquery.min.js      zlib   430 Mbps  180 Mbps   250 Mbps   2.39     547
                   slz   1470 Mbps  764 Mbps   706 Mbps   1.92    1815

bootstrap.min.css  zlib   790 Mbps  165 Mbps   625 Mbps   4.79     777
                   slz   2450 Mbps  650 Mbps  1800 Mbps   3.77    2400

So on top of saving a lot of memory, slz is consistently 2.5-3.5 times
faster than zlib and results in providing more savings for a fixed CPU
usage. For links smaller than 100 Mbps, zlib still provides a better
compression ratio, at the expense of a much higher CPU usage.

Larger input files provide slightly higher bandwidth for both libs, at
the expense of a bit more memory usage for zlib (it converges to 256kB
per connection).
diff --git a/Makefile b/Makefile
index e3efcc6..afe38a3 100644
--- a/Makefile
+++ b/Makefile
@@ -33,6 +33,7 @@
 #   USE_ACCEPT4          : enable use of accept4() on linux. Automatic.
 #   USE_MY_ACCEPT4       : use own implemention of accept4() if glibc < 2.10.
 #   USE_ZLIB             : enable zlib library support.
+#   USE_SLZ              : enable slz library instead of zlib (pick at most one).
 #   USE_CPU_AFFINITY     : enable pinning processes to CPU on Linux. Automatic.
 #   USE_TFO              : enable TCP fast open. Supported on Linux >= 3.7.
 #   USE_NS               : enable network namespace support. Supported on Linux >= 2.6.24.
@@ -448,6 +449,15 @@
 BUILD_OPTIONS   += $(call ignore_implicit,USE_GETADDRINFO)
 endif
 
+ifneq ($(USE_SLZ),)
+# Use SLZ_INC and SLZ_LIB to force path to slz.h and libslz.{a,so} if needed.
+SLZ_INC =
+SLZ_LIB =
+OPTIONS_CFLAGS  += -DUSE_SLZ $(if $(SLZ_INC),-I$(SLZ_INC))
+BUILD_OPTIONS   += $(call ignore_implicit,USE_SLZ)
+OPTIONS_LDFLAGS += $(if $(SLZ_LIB),-L$(SLZ_LIB)) -lslz
+endif
+
 ifneq ($(USE_ZLIB),)
 # Use ZLIB_INC and ZLIB_LIB to force path to zlib.h and libz.{a,so} if needed.
 ZLIB_INC =
diff --git a/README b/README
index 35ceab7..b98b919 100644
--- a/README
+++ b/README
@@ -118,7 +118,9 @@
 
 It is also possible to include native support for ZLIB to benefit from HTTP
 compression. For this, pass "USE_ZLIB=1" on the "make" command line and ensure
-that zlib is present on the system.
+that zlib is present on the system. Alternatively it is possible to use libslz
+for a faster, memory-less, but slightly less efficient compression, by passing
+"USE_SLZ=1".
 
 By default, the DEBUG variable is set to '-g' to enable debug symbols. It is
 not wise to disable it on uncommon systems, because it's often the only way to
diff --git a/include/types/compression.h b/include/types/compression.h
index d062d23..ecbd5a1 100644
--- a/include/types/compression.h
+++ b/include/types/compression.h
@@ -23,11 +23,11 @@
 #ifndef _TYPES_COMP_H
 #define _TYPES_COMP_H
 
-#ifdef USE_ZLIB
-
+#if defined(USE_SLZ)
+#include <slz.h>
+#elif defined(USE_ZLIB)
 #include <zlib.h>
-
-#endif /* USE_ZLIB */
+#endif
 
 struct comp {
 	struct comp_algo *algos;
@@ -36,14 +36,19 @@
 };
 
 struct comp_ctx {
-#ifdef USE_ZLIB
+#if defined(USE_SLZ)
+	struct slz_stream strm; /* slz stream */
+	const void *direct_ptr; /* NULL or pointer to beginning of pending input kept by reference */
+	int direct_len;         /* length of direct_ptr if not NULL */
+	struct buffer *queued;  /* if not NULL, buffer holding a copy of the pending input */
+#elif defined(USE_ZLIB)
 	z_stream strm; /* zlib stream */
 	void *zlib_deflate_state;
 	void *zlib_window;
 	void *zlib_prev;
 	void *zlib_pending_buf;
 	void *zlib_head;
-#endif /* USE_ZLIB */
+#endif
 	int cur_lvl;
 };
 
diff --git a/src/compression.c b/src/compression.c
index 004c5e2..9900d71 100644
--- a/src/compression.c
+++ b/src/compression.c
@@ -13,7 +13,9 @@
 
 #include <stdio.h>
 
-#ifdef USE_ZLIB
+#if defined(USE_SLZ)
+#include <slz.h>
+#elif defined(USE_ZLIB)
 /* Note: the crappy zlib and openssl libs both define the "free_func" type.
  * That's a very clever idea to use such a generic name in general purpose
  * libraries, really... The zlib one is easier to redefine than openssl's,
@@ -61,7 +63,17 @@
 static int identity_finish(struct comp_ctx *comp_ctx, struct buffer *out);
 static int identity_end(struct comp_ctx **comp_ctx);
 
-#ifdef USE_ZLIB
+#if defined(USE_SLZ)
+
+static int rfc1950_init(struct comp_ctx **comp_ctx, int level);
+static int rfc1951_init(struct comp_ctx **comp_ctx, int level);
+static int rfc1952_init(struct comp_ctx **comp_ctx, int level);
+static int rfc195x_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out);
+static int rfc195x_flush(struct comp_ctx *comp_ctx, struct buffer *out);
+static int rfc195x_finish(struct comp_ctx *comp_ctx, struct buffer *out);
+static int rfc195x_end(struct comp_ctx **comp_ctx);
+
+#elif defined(USE_ZLIB)
 
 static int gzip_init(struct comp_ctx **comp_ctx, int level);
 static int raw_def_init(struct comp_ctx **comp_ctx, int level);
@@ -77,7 +89,11 @@
 const struct comp_algo comp_algos[] =
 {
 	{ "identity",     8, "identity", 8, identity_init, identity_add_data, identity_flush, identity_finish, identity_end },
-#ifdef USE_ZLIB
+#if defined(USE_SLZ)
+	{ "deflate",      7, "deflate",  7, rfc1950_init,  rfc195x_add_data,  rfc195x_flush,  rfc195x_finish,  rfc195x_end },
+	{ "raw-deflate", 11, "deflate",  7, rfc1951_init,  rfc195x_add_data,  rfc195x_flush,  rfc195x_finish,  rfc195x_end },
+	{ "gzip",         4, "gzip",     4, rfc1952_init,  rfc195x_add_data,  rfc195x_flush,  rfc195x_finish,  rfc195x_end },
+#elif defined(USE_ZLIB)
 	{ "deflate",      7, "deflate",  7, deflate_init,  deflate_add_data,  deflate_flush,  deflate_finish,  deflate_end },
 	{ "raw-deflate", 11, "deflate",  7, raw_def_init,  deflate_add_data,  deflate_flush,  deflate_finish,  deflate_end },
 	{ "gzip",         4, "gzip",     4, gzip_init,     deflate_add_data,  deflate_flush,  deflate_finish,  deflate_end },
@@ -221,7 +237,7 @@
 	struct buffer *ib = *in, *ob = *out;
 	char *tail;
 
-#ifdef USE_ZLIB
+#if defined(USE_SLZ) || defined(USE_ZLIB)
 	int ret;
 
 	/* flush data here */
@@ -357,7 +373,11 @@
 	*comp_ctx = pool_alloc2(pool_comp_ctx);
 	if (*comp_ctx == NULL)
 		return -1;
-#ifdef USE_ZLIB
+#if defined(USE_SLZ)
+	(*comp_ctx)->direct_ptr = NULL;
+	(*comp_ctx)->direct_len = 0;
+	(*comp_ctx)->queued = NULL;
+#elif defined(USE_ZLIB)
 	zlib_used_memory += sizeof(struct comp_ctx);
 
 	strm = &(*comp_ctx)->strm;
@@ -427,21 +447,157 @@
 	return 0;
 }
 
-static int identity_reset(struct comp_ctx *comp_ctx)
+/*
+ * Deinit the algorithm: nothing is released for identity, always returns 0
+ */
+static int identity_end(struct comp_ctx **comp_ctx)
 {
 	return 0;
 }
 
-/*
- * Deinit the algorithm
+
+#ifdef USE_SLZ
+
+/* SLZ's gzip format (RFC1952). Any non-zero <level> is used as 1. Returns < 0 on error. */
+static int rfc1952_init(struct comp_ctx **comp_ctx, int level)
+{
+	if (init_comp_ctx(comp_ctx) < 0)
+		return -1;
+
+	(*comp_ctx)->cur_lvl = !!level; /* only 0 or 1 is stored and handed to slz */
+	return slz_rfc1952_init(&(*comp_ctx)->strm, !!level);
+}
+
+/* SLZ's raw deflate format (RFC1951). Any non-zero <level> is used as 1. Returns < 0 on error. */
+static int rfc1951_init(struct comp_ctx **comp_ctx, int level)
+{
+	if (init_comp_ctx(comp_ctx) < 0)
+		return -1;
+
+	(*comp_ctx)->cur_lvl = !!level; /* only 0 or 1 is stored and handed to slz */
+	return slz_rfc1951_init(&(*comp_ctx)->strm, !!level);
+}
+
+/* SLZ's zlib format (RFC1950). Any non-zero <level> is used as 1. Returns < 0 on error. */
+static int rfc1950_init(struct comp_ctx **comp_ctx, int level)
+{
+	if (init_comp_ctx(comp_ctx) < 0)
+		return -1;
+
+	(*comp_ctx)->cur_lvl = !!level; /* only 0 or 1 is stored and handed to slz */
+	return slz_rfc1950_init(&(*comp_ctx)->strm, !!level);
+}
+
+/* Records <in_data>/<in_len> for a later flush/finish and returns <in_len>, 0
+ * if <in_len> <= 0, or -1 if the copy buffer cannot be allocated. <out> is not
+ * used here: the first chunk is only referenced, later ones are copied.
  */
-static int identity_end(struct comp_ctx **comp_ctx)
+static int rfc195x_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out)
 {
+	static struct buffer *tmpbuf = &buf_empty;
+
+	if (in_len <= 0)
+		return 0;
+
+	if (comp_ctx->direct_ptr && !comp_ctx->queued) {
+		/* data already being pointed to, we're in front of fragmented
+		 * data and need a buffer now. We reuse the same buffer, as it's
+		 * not used out of the scope of a series of add_data()*, end().
+		 */
+		if (unlikely(!tmpbuf->size)) {
+			/* this is the first time we need the compression buffer */
+			if (b_alloc(&tmpbuf) == NULL)
+				return -1; /* no memory */
+		}
+		b_reset(tmpbuf);
+		memcpy(bi_end(tmpbuf), comp_ctx->direct_ptr, comp_ctx->direct_len);
+		tmpbuf->i += comp_ctx->direct_len;
+		comp_ctx->direct_ptr = NULL;
+		comp_ctx->direct_len = 0;
+		comp_ctx->queued = tmpbuf;
+		/* fall through buffer copy: the new chunk is appended right below */
+	}
+
+	if (comp_ctx->queued) {
+		/* data already pending: append. NOTE(review): no visible check that it fits in the buffer -- confirm callers bound it */
+		memcpy(bi_end(comp_ctx->queued), in_data, in_len);
+		comp_ctx->queued->i += in_len;
+		return in_len;
+	}
+
+	comp_ctx->direct_ptr = in_data; /* first chunk: keep a reference only, no copy */
+	comp_ctx->direct_len = in_len;
+	return in_len;
+}
+
+/* Compresses the data accumulated using add_data(), and optionally sends the
+ * format-specific trailer if <finish> is non-zero. <out> is expected to have a
+ * large enough free non-wrapping space as verified by http_comp_buffer_init().
+ * Returns the number of bytes appended to <out>.
+ */
+static int rfc195x_flush_or_finish(struct comp_ctx *comp_ctx, struct buffer *out, int finish)
+{
+	struct slz_stream *strm = &comp_ctx->strm;
+	const char *in_ptr;
+	int in_len;
+	int out_len;
+
+	in_ptr = comp_ctx->direct_ptr; /* by default use the referenced chunk, if any */
+	in_len = comp_ctx->direct_len;
+
+	if (comp_ctx->queued) { /* copied data takes precedence over the reference */
+		in_ptr = comp_ctx->queued->p;
+		in_len = comp_ctx->queued->i;
+	}
+
+	out_len = out->i; /* fill level before encoding, used to compute the delta below */
+
+	if (in_ptr) /* NOTE(review): last argument presumably means "more data follows" -- confirm in slz.h */
+		out->i += slz_encode(strm, bi_end(out), in_ptr, in_len, !finish);
+
+	if (finish)
+		out->i += slz_finish(strm, bi_end(out));
+
+	out_len = out->i - out_len;
+
+	/* very important, we must wipe the data we've just flushed */
+	comp_ctx->direct_len = 0;
+	comp_ctx->direct_ptr = NULL;
+	comp_ctx->queued     = NULL;
+
+	/* Verify compression rate limiting and CPU usage: lower the level when over a limit, otherwise raise it back (never above 1) */
+	if ((global.comp_rate_lim > 0 && (read_freq_ctr(&global.comp_bps_out) > global.comp_rate_lim)) ||    /* rate */
+	   (idle_pct < compress_min_idle)) {                                                                 /* idle */
+		if (comp_ctx->cur_lvl > 0)
+			strm->level = --comp_ctx->cur_lvl;
+	}
+	else if (comp_ctx->cur_lvl < global.tune.comp_maxlevel && comp_ctx->cur_lvl < 1) {
+		strm->level = ++comp_ctx->cur_lvl;
+	}
+
+	/* and that's all */
+	return out_len;
+}
+
+static int rfc195x_flush(struct comp_ctx *comp_ctx, struct buffer *out)
+{
+	return rfc195x_flush_or_finish(comp_ctx, out, 0); /* finish=0: compress pending data, no trailer */
+}
+
+static int rfc195x_finish(struct comp_ctx *comp_ctx, struct buffer *out)
+{
+	return rfc195x_flush_or_finish(comp_ctx, out, 1); /* finish=1: compress pending data, then emit the trailer */
+}
+
+/* we just need to free the comp_ctx here via deinit_comp_ctx(), nothing else was allocated; always returns 0 */
+static int rfc195x_end(struct comp_ctx **comp_ctx)
+{
+	deinit_comp_ctx(comp_ctx);
 	return 0;
 }
 
+#elif defined(USE_ZLIB)  /* ! USE_SLZ */
 
-#ifdef USE_ZLIB
 /*
  * This is a tricky allocation function using the zlib.
  * This is based on the allocation order in deflateInit2.
@@ -719,6 +875,10 @@
 __attribute__((constructor))
 static void __comp_fetch_init(void)
 {
+#ifdef USE_SLZ
+	slz_make_crc_table();     /* presumably precomputes slz's CRC table once at startup -- confirm in slz.h */
+	slz_prepare_dist_table(); /* presumably precomputes slz's distance table once at startup -- confirm in slz.h */
+#endif
 	acl_register_keywords(&acl_kws);
 	sample_register_fetches(&sample_fetch_keywords);
 }