Blame - src/slz.c - haproxy

blob: 1d6032ea2e9294858b33314d998081075fbfea11 [file] [log] [blame]

Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	1	/*
				2	* Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu>
				3	*
				4	* Permission is hereby granted, free of charge, to any person obtaining
				5	* a copy of this software and associated documentation files (the
				6	* "Software"), to deal in the Software without restriction, including
				7	* without limitation the rights to use, copy, modify, merge, publish,
				8	* distribute, sublicense, and/or sell copies of the Software, and to
				9	* permit persons to whom the Software is furnished to do so, subject to
				10	* the following conditions:
				11	*
				12	* The above copyright notice and this permission notice shall be
				13	* included in all copies or substantial portions of the Software.
				14	*
				15	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
				16	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
				17	* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
				18	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
				19	* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
				20	* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
				21	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
				22	* OTHER DEALINGS IN THE SOFTWARE.
				23	*/
				24
Willy Tarreau	388fc25	2021-05-14 08:44:52 +0200	[diff] [blame]	25	#include <inttypes.h>
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	26	#include <stdio.h>
				27	#include <string.h>
				28	#include <import/slz.h>
				29	#include <import/slz-tables.h>
				30
				31	/* First, RFC1951-specific declarations and extracts from the RFC.
				32	*
				33	* RFC1951 - deflate stream format
				34
				35
				36	* Data elements are packed into bytes in order of
				37	increasing bit number within the byte, i.e., starting
				38	with the least-significant bit of the byte.
				39	* Data elements other than Huffman codes are packed
				40	starting with the least-significant bit of the data
				41	element.
				42	* Huffman codes are packed starting with the most-
				43	significant bit of the code.
				44
				45	3.2.3. Details of block format
				46
				47	Each block of compressed data begins with 3 header bits
				48	containing the following data:
				49
				50	first bit BFINAL
				51	next 2 bits BTYPE
				52
				53	Note that the header bits do not necessarily begin on a byte
				54	boundary, since a block does not necessarily occupy an integral
				55	number of bytes.
				56
				57	BFINAL is set if and only if this is the last block of the data
				58	set.
				59
				60	BTYPE specifies how the data are compressed, as follows:
				61
				62	00 - no compression
				63	01 - compressed with fixed Huffman codes
				64	10 - compressed with dynamic Huffman codes
				65	11 - reserved (error)
				66
				67	3.2.4. Non-compressed blocks (BTYPE=00)
				68
				69	Any bits of input up to the next byte boundary are ignored.
				70	The rest of the block consists of the following information:
				71
				72	0 1 2 3 4...
				73	+---+---+---+---+================================+
				74	\| LEN \| NLEN \|... LEN bytes of literal data...\|
				75	+---+---+---+---+================================+
				76
				77	LEN is the number of data bytes in the block. NLEN is the
				78	one's complement of LEN.
				79
				80	3.2.5. Compressed blocks (length and distance codes)
				81
				82	As noted above, encoded data blocks in the "deflate" format
				83	consist of sequences of symbols drawn from three conceptually
				84	distinct alphabets: either literal bytes, from the alphabet of
				85	byte values (0..255), or <length, backward distance> pairs,
				86	where the length is drawn from (3..258) and the distance is
				87	drawn from (1..32,768). In fact, the literal and length
				88	alphabets are merged into a single alphabet (0..285), where
				89	values 0..255 represent literal bytes, the value 256 indicates
				90	end-of-block, and values 257..285 represent length codes
				91	(possibly in conjunction with extra bits following the symbol
				92	code) as follows:
				93
				94	Length encoding :
				95	Extra Extra Extra
				96	Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
				97	---- ---- ------ ---- ---- ------- ---- ---- -------
				98	257 0 3 267 1 15,16 277 4 67-82
				99	258 0 4 268 1 17,18 278 4 83-98
				100	259 0 5 269 2 19-22 279 4 99-114
				101	260 0 6 270 2 23-26 280 4 115-130
				102	261 0 7 271 2 27-30 281 5 131-162
				103	262 0 8 272 2 31-34 282 5 163-194
				104	263 0 9 273 3 35-42 283 5 195-226
				105	264 0 10 274 3 43-50 284 5 227-257
				106	265 1 11,12 275 3 51-58 285 0 258
				107	266 1 13,14 276 3 59-66
				108
				109	Distance encoding :
				110	Extra Extra Extra
				111	Code Bits Dist Code Bits Dist Code Bits Distance
				112	---- ---- ---- ---- ---- ------ ---- ---- --------
				113	0 0 1 10 4 33-48 20 9 1025-1536
				114	1 0 2 11 4 49-64 21 9 1537-2048
				115	2 0 3 12 5 65-96 22 10 2049-3072
				116	3 0 4 13 5 97-128 23 10 3073-4096
				117	4 1 5,6 14 6 129-192 24 11 4097-6144
				118	5 1 7,8 15 6 193-256 25 11 6145-8192
				119	6 2 9-12 16 7 257-384 26 12 8193-12288
				120	7 2 13-16 17 7 385-512 27 12 12289-16384
				121	8 3 17-24 18 8 513-768 28 13 16385-24576
				122	9 3 25-32 19 8 769-1024 29 13 24577-32768
				123
				124	3.2.6. Compression with fixed Huffman codes (BTYPE=01)
				125
				126	The Huffman codes for the two alphabets are fixed, and are not
				127	represented explicitly in the data. The Huffman code lengths
				128	for the literal/length alphabet are:
				129
				130	Lit Value Bits Codes
				131	--------- ---- -----
				132	0 - 143 8 00110000 through
				133	10111111
				134	144 - 255 9 110010000 through
				135	111111111
				136	256 - 279 7 0000000 through
				137	0010111
				138	280 - 287 8 11000000 through
				139	11000111
				140
				141	The code lengths are sufficient to generate the actual codes,
				142	as described above; we show the codes in the table for added
				143	clarity. Literal/length values 286-287 will never actually
				144	occur in the compressed data, but participate in the code
				145	construction.
				146
				147	Distance codes 0-31 are represented by (fixed-length) 5-bit
				148	codes, with possible additional bits as shown in the table
				149	shown in Paragraph 3.2.5, above. Note that distance codes 30-
				150	31 will never actually occur in the compressed data.
				151
				152	*/
				153
				154	/* back references, built in a way that is optimal for 32/64 bits */
				155	union ref {
				156	struct {
				157	uint32_t pos;
				158	uint32_t word;
				159	} by32;
				160	uint64_t by64;
				161	};
				162
				163	#if defined(USE_64BIT_QUEUE) && defined(UNALIGNED_LE_OK)
				164
				165	/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
				166	* 32-bit words into output buffer. X must not contain non-zero bits above
				167	* xbits.
				168	*/
				169	static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
				170	{
				171	uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits);
				172	uint32_t qbits = strm->qbits + xbits;
				173
				174	if (__builtin_expect(qbits >= 32, 1)) {
				175	(uint32_t )strm->outbuf = queue;
				176	queue >>= 32;
				177	qbits -= 32;
				178	strm->outbuf += 4;
				179	}
				180
				181	strm->queue = queue;
				182	strm->qbits = qbits;
				183	}
				184
				185	#define enqueue8 enqueue24
				186
				187	/* flush the queue and align to next byte */
				188	static inline void flush_bits(struct slz_stream *strm)
				189	{
				190	if (strm->qbits > 0)
				191	*strm->outbuf++ = strm->queue;
				192
				193	if (strm->qbits > 8)
				194	*strm->outbuf++ = strm->queue >> 8;
				195
				196	if (strm->qbits > 16)
				197	*strm->outbuf++ = strm->queue >> 16;
				198
				199	if (strm->qbits > 24)
				200	*strm->outbuf++ = strm->queue >> 24;
				201
				202	strm->queue = 0;
				203	strm->qbits = 0;
				204	}
				205
				206	#else /* non-64 bit or aligned or big endian */
				207
				208	/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
				209	* bytes into out buf. X must not contain non-zero bits above xbits. Prefer
				210	* enqueue8() when xbits is known for being 8 or less.
				211	*/
				212	static void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
				213	{
				214	uint32_t queue = strm->queue + (x << strm->qbits);
				215	uint32_t qbits = strm->qbits + xbits;
				216
				217	if (qbits >= 16) {
				218	#ifndef UNALIGNED_LE_OK
				219	strm->outbuf[0] = queue;
				220	strm->outbuf[1] = queue >> 8;
				221	#else
				222	(uint16_t )strm->outbuf = queue;
				223	#endif
				224	strm->outbuf += 2;
				225	queue >>= 16;
				226	qbits -= 16;
				227	}
				228
				229	if (qbits >= 8) {
				230	qbits -= 8;
				231	*strm->outbuf++ = queue;
				232	queue >>= 8;
				233	}
				234	strm->qbits = qbits;
				235	strm->queue = queue;
				236	return;
				237	}
				238
				239	/* enqueue code x of <xbits> bits (at most 8) and copy complete bytes into
				240	* out buf. X must not contain non-zero bits above xbits.
				241	*/
				242	static inline void enqueue8(struct slz_stream *strm, uint32_t x, uint32_t xbits)
				243	{
				244	uint32_t queue = strm->queue + (x << strm->qbits);
				245	uint32_t qbits = strm->qbits + xbits;
				246
				247	if (__builtin_expect((signed)(qbits - 8) >= 0, 1)) {
				248	qbits -= 8;
				249	*strm->outbuf++ = queue;
				250	queue >>= 8;
				251	}
				252
				253	strm->qbits = qbits;
				254	strm->queue = queue;
				255	}
				256
				257	/* align to next byte */
				258	static inline void flush_bits(struct slz_stream *strm)
				259	{
				260	if (strm->qbits > 0)
				261	*strm->outbuf++ = strm->queue;
				262
				263	if (strm->qbits > 8)
				264	*strm->outbuf++ = strm->queue >> 8;
				265
				266	strm->queue = 0;
				267	strm->qbits = 0;
				268	}
				269	#endif
				270
				271
				272	/* only valid if buffer is already aligned */
				273	static inline void copy_8b(struct slz_stream *strm, uint32_t x)
				274	{
				275	*strm->outbuf++ = x;
				276	}
				277
				278	/* only valid if buffer is already aligned */
				279	static inline void copy_16b(struct slz_stream *strm, uint32_t x)
				280	{
				281	strm->outbuf[0] = x;
				282	strm->outbuf[1] = x >> 8;
				283	strm->outbuf += 2;
				284	}
				285
				286	/* only valid if buffer is already aligned */
				287	static inline void copy_32b(struct slz_stream *strm, uint32_t x)
				288	{
				289	strm->outbuf[0] = x;
				290	strm->outbuf[1] = x >> 8;
				291	strm->outbuf[2] = x >> 16;
				292	strm->outbuf[3] = x >> 24;
				293	strm->outbuf += 4;
				294	}
				295
				296	static inline void send_huff(struct slz_stream *strm, uint32_t code)
				297	{
				298	uint32_t bits;
				299
				300	code = fixed_huff[code];
				301	bits = code & 15;
				302	code >>= 4;
				303	enqueue24(strm, code, bits);
				304	}
				305
				306	static inline void send_eob(struct slz_stream *strm)
				307	{
				308	enqueue8(strm, 0, 7); // direct encoding of 256 = EOB (cf RFC1951)
				309	}
				310
Ilya Shipitsin	b2be9a1	2021-04-24 13:25:42 +0500	[diff] [blame]	311	/* copies <len> literals from <buf>. <more> indicates that there are data past
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	312	* buf + <len>. <len> must not be null.
				313	*/
				314	static void copy_lit(struct slz_stream strm, const void buf, uint32_t len, int more)
				315	{
				316	uint32_t len2;
				317
				318	do {
				319	len2 = len;
				320	if (__builtin_expect(len2 > 65535, 0))
				321	len2 = 65535;
				322
				323	len -= len2;
				324
				325	if (strm->state != SLZ_ST_EOB)
				326	send_eob(strm);
				327
				328	strm->state = (more \|\| len) ? SLZ_ST_EOB : SLZ_ST_DONE;
				329
				330	enqueue8(strm, !(more \|\| len), 3); // BFINAL = !more ; BTYPE = 00
				331	flush_bits(strm);
				332	copy_16b(strm, len2); // len2
				333	copy_16b(strm, ~len2); // nlen2
				334	memcpy(strm->outbuf, buf, len2);
				335	buf += len2;
				336	strm->outbuf += len2;
				337	} while (len);
				338	}
				339
Ilya Shipitsin	b2be9a1	2021-04-24 13:25:42 +0500	[diff] [blame]	340	/* copies <len> literals from <buf>. <more> indicates that there are data past
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	341	* buf + <len>. <len> must not be null.
				342	*/
				343	static void copy_lit_huff(struct slz_stream strm, const unsigned char buf, uint32_t len, int more)
				344	{
				345	uint32_t pos;
				346
				347	/* This ugly construct limits the mount of tests and optimizes for the
				348	* most common case (more > 0).
				349	*/
				350	if (strm->state == SLZ_ST_EOB) {
				351	eob:
				352	strm->state = more ? SLZ_ST_FIXED : SLZ_ST_LAST;
				353	enqueue8(strm, 2 + !more, 3); // BFINAL = !more ; BTYPE = 01
				354	}
				355	else if (!more) {
				356	send_eob(strm);
				357	goto eob;
				358	}
				359
				360	pos = 0;
				361	do {
				362	send_huff(strm, buf[pos++]);
				363	} while (pos < len);
				364	}
				365
				366	/* format:
				367	* bit0..31 = word
				368	* bit32..63 = last position in buffer of similar content
				369	*/
				370
				371	/* This hash provides good average results on HTML contents, and is among the
				372	* few which provide almost optimal results on various different pages.
				373	*/
				374	static inline uint32_t slz_hash(uint32_t a)
				375	{
				376	#if defined(__ARM_FEATURE_CRC32)
Willy Tarreau	b154422	2021-12-03 17:38:42 +0100	[diff] [blame]	377	# if defined(__ARM_ARCH_ISA_A64)
				378	// 64 bit mode
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	379	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(a) : "r"(0));
Willy Tarreau	b154422	2021-12-03 17:38:42 +0100	[diff] [blame]	380	# else
				381	// 32 bit mode (e.g. armv7 compiler building for armv8
				382	__asm__ volatile("crc32w %0,%0,%1" : "+r"(a) : "r"(0));
				383	# endif
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	384	return a >> (32 - HASH_BITS);
				385	#else
				386	return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
				387	#endif
				388	}
				389
				390	/* This function compares buffers <a> and <b> and reads 32 or 64 bits at a time
				391	* during the approach. It makes us of unaligned little endian memory accesses
				392	* on capable architectures. <max> is the maximum number of bytes that can be
				393	* read, so both <a> and <b> must have at least <max> bytes ahead. <max> may
				394	* safely be null or negative if that simplifies computations in the caller.
				395	*/
				396	static inline long memmatch(const unsigned char a, const unsigned char b, long max)
				397	{
				398	long len = 0;
				399
				400	#ifdef UNALIGNED_LE_OK
				401	unsigned long xor;
				402
				403	while (1) {
				404	if ((long)(len + 2 * sizeof(long)) > max) {
				405	while (len < max) {
				406	if (a[len] != b[len])
				407	break;
				408	len++;
				409	}
				410	return len;
				411	}
				412
				413	xor = (long )&a[len] ^ (long )&b[len];
				414	if (xor)
				415	break;
				416	len += sizeof(long);
				417
				418	xor = (long )&a[len] ^ (long )&b[len];
				419	if (xor)
				420	break;
				421	len += sizeof(long);
				422	}
				423
				424	#if defined(__x86_64__) \|\| defined(__i386__) \|\| defined(__i486__) \|\| defined(__i586__) \|\| defined(__i686__)
				425	/* x86 has bsf. We know that xor is non-null here */
				426	asm("bsf %1,%0\n" : "=r"(xor) : "0" (xor));
				427	return len + xor / 8;
				428	#else
				429	if (sizeof(long) > 4 && !(xor & 0xffffffff)) {
				430	/* This code is optimized out on 32-bit archs, but we still
				431	* need to shift in two passes to avoid a warning. It is
				432	* properly optimized out as a single shift.
				433	*/
				434	xor >>= 16; xor >>= 16;
				435	if (xor & 0xffff) {
				436	if (xor & 0xff)
				437	return len + 4;
				438	return len + 5;
				439	}
				440	if (xor & 0xffffff)
				441	return len + 6;
				442	return len + 7;
				443	}
				444
				445	if (xor & 0xffff) {
				446	if (xor & 0xff)
				447	return len;
				448	return len + 1;
				449	}
				450	if (xor & 0xffffff)
				451	return len + 2;
				452	return len + 3;
				453	#endif // x86
				454
				455	#else // UNALIGNED_LE_OK
				456	/* This is the generic version for big endian or unaligned-incompatible
				457	* architectures.
				458	*/
				459	while (len < max) {
				460	if (a[len] != b[len])
				461	break;
				462	len++;
				463	}
				464	return len;
				465
				466	#endif
				467	}
				468
				469	/* sets <count> BYTES to -32769 in <refs> so that any uninitialized entry will
				470	* verify (pos-last-1 >= 32768) and be ignored. <count> must be a multiple of
				471	* 128 bytes and <refs> must be at least one count in length. It's supposed to
				472	* be applied to 64-bit aligned data exclusively, which makes it slightly
				473	* faster than the regular memset() since no alignment check is performed.
				474	*/
Tim Duesterhus	eaf16fc	2021-09-20 19:59:42 +0200	[diff] [blame]	475	static void reset_refs(union ref *refs, long count)
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	476	{
				477	/* avoid a shift/mask by casting to void* */
				478	union ref end = (void )refs + count;
				479
				480	do {
				481	refs[ 0].by64 = -32769;
				482	refs[ 1].by64 = -32769;
				483	refs[ 2].by64 = -32769;
				484	refs[ 3].by64 = -32769;
				485	refs[ 4].by64 = -32769;
				486	refs[ 5].by64 = -32769;
				487	refs[ 6].by64 = -32769;
				488	refs[ 7].by64 = -32769;
				489	refs[ 8].by64 = -32769;
				490	refs[ 9].by64 = -32769;
				491	refs[10].by64 = -32769;
				492	refs[11].by64 = -32769;
				493	refs[12].by64 = -32769;
				494	refs[13].by64 = -32769;
				495	refs[14].by64 = -32769;
				496	refs[15].by64 = -32769;
				497	refs += 16;
				498	} while (refs < end);
				499	}
				500
				501	/* Compresses <ilen> bytes from <in> into <out> according to RFC1951. The
				502	* output result may be up to 5 bytes larger than the input, to which 2 extra
				503	* bytes may be added to send the last chunk due to BFINAL+EOB encoding (10
				504	* bits) when <more> is not set. The caller is responsible for ensuring there
				505	* is enough room in the output buffer for this. The amount of output bytes is
				506	* returned, and no CRC is computed.
				507	*/
				508	long slz_rfc1951_encode(struct slz_stream strm, unsigned char out, const unsigned char *in, long ilen, int more)
				509	{
				510	long rem = ilen;
				511	unsigned long pos = 0;
				512	unsigned long last;
				513	uint32_t word = 0;
				514	long mlen;
				515	uint32_t h;
				516	uint64_t ent;
				517
				518	uint32_t plit = 0;
				519	uint32_t bit9 = 0;
				520	uint32_t dist, code;
				521	union ref refs[1 << HASH_BITS];
				522
				523	if (!strm->level) {
				524	/* force to send as literals (eg to preserve CPU) */
				525	strm->outbuf = out;
				526	plit = pos = ilen;
				527	bit9 = 52; /* force literal dump */
				528	goto final_lit_dump;
				529	}
				530
				531	reset_refs(refs, sizeof(refs));
				532
				533	strm->outbuf = out;
				534
				535	#ifndef UNALIGNED_FASTER
				536	word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
				537	#endif
				538	while (rem >= 4) {
				539	#ifndef UNALIGNED_FASTER
				540	word = ((unsigned char)in[pos + 3] << 24) + (word >> 8);
				541	#else
				542	word = (uint32_t )&in[pos];
				543	#endif
				544	h = slz_hash(word);
				545	asm volatile ("" ::); // prevent gcc from trying to be smart with the prefetch
				546
				547	if (sizeof(long) >= 8) {
				548	ent = refs[h].by64;
				549	last = (uint32_t)ent;
				550	ent >>= 32;
				551	refs[h].by64 = ((uint64_t)pos) + ((uint64_t)word << 32);
				552	} else {
				553	ent = refs[h].by32.word;
				554	last = refs[h].by32.pos;
				555	refs[h].by32.pos = pos;
				556	refs[h].by32.word = word;
				557	}
				558
Willy Tarreau	fc89c3f	2021-08-28 12:10:49 +0200	[diff] [blame]	559	#ifdef FIND_OPTIMAL_MATCH
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	560	/* Experimental code to see what could be saved with an ideal
				561	* longest match lookup algorithm. This one is very slow but
				562	* scans the whole window. In short, here are the savings :
				563	* file orig fast(ratio) optimal(ratio)
				564	* README 5185 3419 (65.9%) 3165 (61.0%) -7.5%
				565	* index.html 76799 35662 (46.4%) 29875 (38.9%) -16.3%
				566	* rfc1952.c 29383 13442 (45.7%) 11793 (40.1%) -12.3%
				567	*
				568	* Thus the savings to expect for large files is at best 16%.
				569	*
				570	* A non-colliding hash gives 33025 instead of 35662 (-7.4%),
				571	* and keeping the last two entries gives 31724 (-11.0%).
				572	*/
				573	unsigned long scan;
				574	int saved = 0;
				575	int bestpos = 0;
				576	int bestlen = 0;
				577	int firstlen = 0;
				578	int max_lookup = 2; // 0 = no limit
				579
				580	for (scan = pos - 1; scan < pos && (unsigned long)(pos - scan - 1) < 32768; scan--) {
Willy Tarreau	eda36f1	2021-12-13 09:07:05 +0100	[diff] [blame]	581	int len;
				582
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	583	if ((uint32_t )(in + scan) != word)
				584	continue;
				585
				586	len = memmatch(in + pos, in + scan, rem);
				587	if (!bestlen)
				588	firstlen = len;
				589
				590	if (len > bestlen) {
				591	bestlen = len;
				592	bestpos = scan;
				593	}
				594	if (!--max_lookup)
				595	break;
				596	}
				597	if (bestlen) {
				598	//printf("pos=%d last=%d bestpos=%d word=%08x ent=%08x len=%d\n",
				599	// (int)pos, (int)last, (int)bestpos, (int)word, (int)ent, bestlen);
				600	last = bestpos;
				601	ent = word;
				602	saved += bestlen - firstlen;
				603	}
				604	//fprintf(stderr, "first=%d best=%d saved_total=%d\n", firstlen, bestlen, saved);
				605	#endif
				606
				607	if ((uint32_t)ent != word) {
				608	send_as_lit:
				609	rem--;
				610	plit++;
				611	bit9 += ((unsigned char)word >= 144);
				612	pos++;
				613	continue;
				614	}
				615
				616	/* We reject pos = last and pos > last+32768 */
				617	if ((unsigned long)(pos - last - 1) >= 32768)
				618	goto send_as_lit;
				619
				620	/* Note: cannot encode a length larger than 258 bytes */
				621	mlen = memmatch(in + pos + 4, in + last + 4, (rem > 258 ? 258 : rem) - 4) + 4;
				622
				623	/* found a matching entry */
				624
				625	if (bit9 >= 52 && mlen < 6)
				626	goto send_as_lit;
				627
				628	/* compute the output code, its size and the length's size in
				629	* bits to know if the reference is cheaper than literals.
				630	*/
				631	code = len_fh[mlen];
				632
				633	/* direct mapping of dist->huffman code */
				634	dist = fh_dist_table[pos - last - 1];
				635
				636	/* if encoding the dist+length is more expensive than sending
				637	* the equivalent as bytes, lets keep the literals.
				638	*/
				639	if ((dist & 0x1f) + (code >> 16) + 8 >= 8 * mlen + bit9)
				640	goto send_as_lit;
				641
				642	/* first, copy pending literals */
				643	if (plit) {
				644	/* Huffman encoding requires 9 bits for octets 144..255, so this
				645	* is a waste of space for binary data. Switching between Huffman
				646	* and no-comp then huffman consumes 52 bits (7 for EOB + 3 for
				647	* block type + 7 for alignment + 32 for LEN+NLEN + 3 for next
				648	* block. Only use plain literals if there are more than 52 bits
				649	* to save then.
				650	*/
				651	if (bit9 >= 52)
				652	copy_lit(strm, in + pos - plit, plit, 1);
				653	else
				654	copy_lit_huff(strm, in + pos - plit, plit, 1);
				655
				656	plit = 0;
				657	}
				658
				659	/* use mode 01 - fixed huffman */
				660	if (strm->state == SLZ_ST_EOB) {
				661	strm->state = SLZ_ST_FIXED;
				662	enqueue8(strm, 0x02, 3); // BTYPE = 01, BFINAL = 0
				663	}
				664
				665	/* copy the length first */
				666	enqueue24(strm, code & 0xFFFF, code >> 16);
				667
				668	/* in fixed huffman mode, dist is fixed 5 bits */
				669	enqueue24(strm, dist >> 5, dist & 0x1f);
				670	bit9 = 0;
				671	rem -= mlen;
				672	pos += mlen;
				673
				674	#ifndef UNALIGNED_FASTER
				675	#ifdef UNALIGNED_LE_OK
				676	word = (uint32_t )&in[pos - 1];
				677	#else
				678	word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
				679	#endif
				680	#endif
				681	}
				682
				683	if (__builtin_expect(rem, 0)) {
				684	/* we're reading the 1..3 last bytes */
				685	plit += rem;
				686	do {
				687	bit9 += ((unsigned char)in[pos++] >= 144);
				688	} while (--rem);
				689	}
				690
				691	final_lit_dump:
				692	/* now copy remaining literals or mark the end */
				693	if (plit) {
				694	if (bit9 >= 52)
				695	copy_lit(strm, in + pos - plit, plit, more);
				696	else
				697	copy_lit_huff(strm, in + pos - plit, plit, more);
				698
				699	plit = 0;
				700	}
				701
				702	strm->ilen += ilen;
				703	return strm->outbuf - out;
				704	}
				705
				706	/* Initializes stream <strm> for use with raw deflate (rfc1951). The CRC is
				707	* unused but set to zero. The compression level passed in <level> is set. This
				708	* value can only be 0 (no compression) or 1 (compression) and other values
				709	* will lead to unpredictable behaviour. The function always returns 0.
				710	*/
				711	int slz_rfc1951_init(struct slz_stream *strm, int level)
				712	{
				713	strm->state = SLZ_ST_EOB; // no header
				714	strm->level = level;
				715	strm->format = SLZ_FMT_DEFLATE;
				716	strm->crc32 = 0;
				717	strm->ilen = 0;
				718	strm->qbits = 0;
				719	strm->queue = 0;
				720	return 0;
				721	}
				722
				723	/* Flushes any pending for stream <strm> into buffer <buf>, then sends BTYPE=1
				724	* and BFINAL=1 if needed. The stream ends in SLZ_ST_DONE. It returns the number
				725	* of bytes emitted. The trailer consists in flushing the possibly pending bits
				726	* from the queue (up to 7 bits), then possibly EOB (7 bits), then 3 bits, EOB,
				727	* a rounding to the next byte, which amounts to a total of 4 bytes max, that
				728	* the caller must ensure are available before calling the function.
				729	*/
				730	int slz_rfc1951_finish(struct slz_stream strm, unsigned char buf)
				731	{
				732	strm->outbuf = buf;
				733
				734	if (strm->state == SLZ_ST_FIXED \|\| strm->state == SLZ_ST_LAST) {
				735	strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
				736	send_eob(strm);
				737	}
				738
				739	if (strm->state != SLZ_ST_DONE) {
				740	/* send BTYPE=1, BFINAL=1 */
				741	enqueue8(strm, 3, 3);
				742	send_eob(strm);
				743	strm->state = SLZ_ST_DONE;
				744	}
				745
				746	flush_bits(strm);
				747	return strm->outbuf - buf;
				748	}
				749
				750	/* Now RFC1952-specific declarations and extracts from RFC.
				751	* From RFC1952 about the GZIP file format :
				752
				753	A gzip file consists of a series of "members" ...
				754
				755	2.3. Member format
				756
				757	Each member has the following structure:
				758
				759	+---+---+---+---+---+---+---+---+---+---+
				760	\|ID1\|ID2\|CM \|FLG\| MTIME \|XFL\|OS \| (more-->)
				761	+---+---+---+---+---+---+---+---+---+---+
				762
				763	(if FLG.FEXTRA set)
				764
				765	+---+---+=================================+
				766	\| XLEN \|...XLEN bytes of "extra field"...\| (more-->)
				767	+---+---+=================================+
				768
				769	(if FLG.FNAME set)
				770
				771	+=========================================+
				772	\|...original file name, zero-terminated...\| (more-->)
				773	+=========================================+
				774
				775	(if FLG.FCOMMENT set)
				776
				777	+===================================+
				778	\|...file comment, zero-terminated...\| (more-->)
				779	+===================================+
				780
				781	(if FLG.FHCRC set)
				782
				783	+---+---+
				784	\| CRC16 \|
				785	+---+---+
				786
				787	+=======================+
				788	\|...compressed blocks...\| (more-->)
				789	+=======================+
				790
				791	0 1 2 3 4 5 6 7
				792	+---+---+---+---+---+---+---+---+
				793	\| CRC32 \| ISIZE \|
				794	+---+---+---+---+---+---+---+---+
				795
				796
				797	2.3.1. Member header and trailer
				798
				799	ID1 (IDentification 1)
				800	ID2 (IDentification 2)
				801	These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139
				802	(0x8b, \213), to identify the file as being in gzip format.
				803
				804	CM (Compression Method)
				805	This identifies the compression method used in the file. CM
				806	= 0-7 are reserved. CM = 8 denotes the "deflate"
				807	compression method, which is the one customarily used by
				808	gzip and which is documented elsewhere.
				809
				810	FLG (FLaGs)
				811	This flag byte is divided into individual bits as follows:
				812
				813	bit 0 FTEXT
				814	bit 1 FHCRC
				815	bit 2 FEXTRA
				816	bit 3 FNAME
				817	bit 4 FCOMMENT
				818	bit 5 reserved
				819	bit 6 reserved
				820	bit 7 reserved
				821
				822	Reserved FLG bits must be zero.
				823
				824	MTIME (Modification TIME)
				825	This gives the most recent modification time of the original
				826	file being compressed. The time is in Unix format, i.e.,
				827	seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this
				828	may cause problems for MS-DOS and other systems that use
				829	local rather than Universal time.) If the compressed data
				830	did not come from a file, MTIME is set to the time at which
				831	compression started. MTIME = 0 means no time stamp is
				832	available.
				833
				834	XFL (eXtra FLags)
				835	These flags are available for use by specific compression
				836	methods. The "deflate" method (CM = 8) sets these flags as
				837	follows:
				838
				839	XFL = 2 - compressor used maximum compression,
				840	slowest algorithm
				841	XFL = 4 - compressor used fastest algorithm
				842
				843	OS (Operating System)
				844	This identifies the type of file system on which compression
				845	took place. This may be useful in determining end-of-line
				846	convention for text files. The currently defined values are
				847	as follows:
				848
				849	0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
				850	1 - Amiga
				851	2 - VMS (or OpenVMS)
				852	3 - Unix
				853	4 - VM/CMS
				854	5 - Atari TOS
				855	6 - HPFS filesystem (OS/2, NT)
				856	7 - Macintosh
				857	8 - Z-System
				858	9 - CP/M
				859	10 - TOPS-20
				860	11 - NTFS filesystem (NT)
				861	12 - QDOS
				862	13 - Acorn RISCOS
				863	255 - unknown
				864
				865	==> A file compressed using "gzip -1" on Unix-like systems can be :
				866
				867	1F 8B 08 00 00 00 00 00 04 03
				868	<deflate-compressed stream>
				869	crc32 size32
				870	*/
				871
				872	static const unsigned char gzip_hdr[] = { 0x1F, 0x8B, // ID1, ID2
				873	0x08, 0x00, // Deflate, flags (none)
				874	0x00, 0x00, 0x00, 0x00, // mtime: none
				875	0x04, 0x03 }; // fastest comp, OS=Unix
				876
				877	static inline uint32_t crc32_char(uint32_t crc, uint8_t x)
				878	{
				879	#if defined(__ARM_FEATURE_CRC32)
				880	crc = ~crc;
Willy Tarreau	b154422	2021-12-03 17:38:42 +0100	[diff] [blame]	881	# if defined(__ARM_ARCH_ISA_A64)
				882	// 64 bit mode
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	883	__asm__ volatile("crc32b %w0,%w0,%w1" : "+r"(crc) : "r"(x));
Willy Tarreau	b154422	2021-12-03 17:38:42 +0100	[diff] [blame]	884	# else
				885	// 32 bit mode (e.g. armv7 compiler building for armv8
				886	__asm__ volatile("crc32b %0,%0,%1" : "+r"(crc) : "r"(x));
				887	# endif
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	888	crc = ~crc;
				889	#else
				890	crc = crc32_fast[0][(crc ^ x) & 0xff] ^ (crc >> 8);
				891	#endif
				892	return crc;
				893	}
				894
				895	static inline uint32_t crc32_uint32(uint32_t data)
				896	{
				897	#if defined(__ARM_FEATURE_CRC32)
Willy Tarreau	b154422	2021-12-03 17:38:42 +0100	[diff] [blame]	898	# if defined(__ARM_ARCH_ISA_A64)
				899	// 64 bit mode
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	900	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(data) : "r"(~0UL));
Willy Tarreau	b154422	2021-12-03 17:38:42 +0100	[diff] [blame]	901	# else
				902	// 32 bit mode (e.g. armv7 compiler building for armv8
				903	__asm__ volatile("crc32w %0,%0,%1" : "+r"(data) : "r"(~0UL));
				904	# endif
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	905	data = ~data;
				906	#else
				907	data = crc32_fast[3][(data >> 0) & 0xff] ^
				908	crc32_fast[2][(data >> 8) & 0xff] ^
				909	crc32_fast[1][(data >> 16) & 0xff] ^
				910	crc32_fast[0][(data >> 24) & 0xff];
				911	#endif
				912	return data;
				913	}
				914
				915	/* Modified version originally from RFC1952, working with non-inverting CRCs */
				916	uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len)
				917	{
				918	int n;
				919
				920	for (n = 0; n < len; n++)
				921	crc = crc32_char(crc, buf[n]);
				922	return crc;
				923	}
				924
				925	/* This version computes the crc32 of <buf> over <len> bytes, doing most of it
				926	* in 32-bit chunks.
				927	*/
				928	uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len)
				929	{
				930	const unsigned char *end = buf + len;
				931
				932	while (buf <= end - 16) {
				933	#ifdef UNALIGNED_LE_OK
				934	#if defined(__ARM_FEATURE_CRC32)
				935	crc = ~crc;
Willy Tarreau	b154422	2021-12-03 17:38:42 +0100	[diff] [blame]	936	# if defined(__ARM_ARCH_ISA_A64)
				937	// 64 bit mode
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	938	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"((uint32_t)(buf)));
				939	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"((uint32_t)(buf + 4)));
				940	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"((uint32_t)(buf + 8)));
				941	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"((uint32_t)(buf + 12)));
Willy Tarreau	b154422	2021-12-03 17:38:42 +0100	[diff] [blame]	942	# else
				943	// 32 bit mode (e.g. armv7 compiler building for armv8
				944	__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"((uint32_t)(buf)));
				945	__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"((uint32_t)(buf + 4)));
				946	__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"((uint32_t)(buf + 8)));
				947	__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"((uint32_t)(buf + 12)));
				948	# endif
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	949	crc = ~crc;
				950	#else
				951	crc ^= (uint32_t )buf;
				952	crc = crc32_uint32(crc);
				953
				954	crc ^= (uint32_t )(buf + 4);
				955	crc = crc32_uint32(crc);
				956
				957	crc ^= (uint32_t )(buf + 8);
				958	crc = crc32_uint32(crc);
				959
				960	crc ^= (uint32_t )(buf + 12);
				961	crc = crc32_uint32(crc);
				962	#endif
				963	#else
				964	crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^
				965	crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^
				966	crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
				967	crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];
				968
				969	crc = crc32_fast[3][(buf[4] ^ (crc >> 0)) & 0xff] ^
				970	crc32_fast[2][(buf[5] ^ (crc >> 8)) & 0xff] ^
				971	crc32_fast[1][(buf[6] ^ (crc >> 16)) & 0xff] ^
				972	crc32_fast[0][(buf[7] ^ (crc >> 24)) & 0xff];
				973
				974	crc = crc32_fast[3][(buf[8] ^ (crc >> 0)) & 0xff] ^
				975	crc32_fast[2][(buf[9] ^ (crc >> 8)) & 0xff] ^
				976	crc32_fast[1][(buf[10] ^ (crc >> 16)) & 0xff] ^
				977	crc32_fast[0][(buf[11] ^ (crc >> 24)) & 0xff];
				978
				979	crc = crc32_fast[3][(buf[12] ^ (crc >> 0)) & 0xff] ^
				980	crc32_fast[2][(buf[13] ^ (crc >> 8)) & 0xff] ^
				981	crc32_fast[1][(buf[14] ^ (crc >> 16)) & 0xff] ^
				982	crc32_fast[0][(buf[15] ^ (crc >> 24)) & 0xff];
				983	#endif
				984	buf += 16;
				985	}
				986
				987	while (buf <= end - 4) {
				988	#ifdef UNALIGNED_LE_OK
				989	crc ^= (uint32_t )buf;
				990	crc = crc32_uint32(crc);
				991	#else
				992	crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^
				993	crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^
				994	crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
				995	crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];
				996	#endif
				997	buf += 4;
				998	}
				999
				1000	while (buf < end)
Willy Tarreau	027fdcb	2021-05-12 08:34:36 +0200	[diff] [blame]	1001	crc = crc32_char(crc, *buf++);
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	1002	return crc;
				1003	}
				1004
				1005	/* uses the most suitable crc32 function to update crc on <buf, len> */
				1006	static inline uint32_t update_crc(uint32_t crc, const void *buf, int len)
				1007	{
				1008	return slz_crc32_by4(crc, buf, len);
				1009	}
				1010
				1011	/* Sends the gzip header for stream <strm> into buffer <buf>. When it's done,
				1012	* the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
				1013	* emitted which is always 10. The caller is responsible for ensuring there's
				1014	* always enough room in the buffer.
				1015	*/
				1016	int slz_rfc1952_send_header(struct slz_stream strm, unsigned char buf)
				1017	{
				1018	memcpy(buf, gzip_hdr, sizeof(gzip_hdr));
				1019	strm->state = SLZ_ST_EOB;
				1020	return sizeof(gzip_hdr);
				1021	}
				1022
				1023	/* Encodes the block according to rfc1952. This means that the CRC of the input
				1024	* block is computed according to the CRC32 algorithm. If the header was never
				1025	* sent, it may be sent first. The number of output bytes is returned.
				1026	*/
				1027	long slz_rfc1952_encode(struct slz_stream strm, unsigned char out, const unsigned char *in, long ilen, int more)
				1028	{
				1029	long ret = 0;
				1030
				1031	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
				1032	ret += slz_rfc1952_send_header(strm, out);
				1033
				1034	strm->crc32 = update_crc(strm->crc32, in, ilen);
				1035	ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
				1036	return ret;
				1037	}
				1038
				1039	/* Initializes stream <strm> for use with the gzip format (rfc1952). The
				1040	* compression level passed in <level> is set. This value can only be 0 (no
				1041	* compression) or 1 (compression) and other values will lead to unpredictable
				1042	* behaviour. The function always returns 0.
				1043	*/
				1044	int slz_rfc1952_init(struct slz_stream *strm, int level)
				1045	{
				1046	strm->state = SLZ_ST_INIT;
				1047	strm->level = level;
				1048	strm->format = SLZ_FMT_GZIP;
				1049	strm->crc32 = 0;
				1050	strm->ilen = 0;
				1051	strm->qbits = 0;
				1052	strm->queue = 0;
				1053	return 0;
				1054	}
				1055
				1056	/* Flushes pending bits and sends the gzip trailer for stream <strm> into
				1057	* buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
				1058	* returns the number of bytes emitted. The trailer consists in flushing the
				1059	* possibly pending bits from the queue (up to 24 bits), rounding to the next
				1060	* byte, then 4 bytes for the CRC and another 4 bytes for the input length.
Ilya Shipitsin	b2be9a1	2021-04-24 13:25:42 +0500	[diff] [blame]	1061	* That may about to 4+4+4 = 12 bytes, that the caller must ensure are
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	1062	* available before calling the function. Note that if the initial header was
				1063	* never sent, it will be sent first as well (10 extra bytes).
				1064	*/
				1065	int slz_rfc1952_finish(struct slz_stream strm, unsigned char buf)
				1066	{
				1067	strm->outbuf = buf;
				1068
				1069	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
				1070	strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
				1071
				1072	slz_rfc1951_finish(strm, strm->outbuf);
				1073	copy_32b(strm, strm->crc32);
				1074	copy_32b(strm, strm->ilen);
				1075	strm->state = SLZ_ST_END;
				1076
				1077	return strm->outbuf - buf;
				1078	}
				1079
				1080
				1081	/* RFC1950-specific stuff. This is for the Zlib stream format.
				1082	* From RFC1950 (zlib) :
				1083	*
				1084
				1085	2.2. Data format
				1086
				1087	A zlib stream has the following structure:
				1088
				1089	0 1
				1090	+---+---+
				1091	\|CMF\|FLG\| (more-->)
				1092	+---+---+
				1093
				1094
				1095	(if FLG.FDICT set)
				1096
				1097	0 1 2 3
				1098	+---+---+---+---+
				1099	\| DICTID \| (more-->)
				1100	+---+---+---+---+
				1101
				1102	+=====================+---+---+---+---+
				1103	\|...compressed data...\| ADLER32 \|
				1104	+=====================+---+---+---+---+
				1105
				1106	Any data which may appear after ADLER32 are not part of the zlib
				1107	stream.
				1108
				1109	CMF (Compression Method and flags)
				1110	This byte is divided into a 4-bit compression method and a 4-
				1111	bit information field depending on the compression method.
				1112
				1113	bits 0 to 3 CM Compression method
				1114	bits 4 to 7 CINFO Compression info
				1115
				1116	CM (Compression method)
				1117	This identifies the compression method used in the file. CM = 8
				1118	denotes the "deflate" compression method with a window size up
				1119	to 32K. This is the method used by gzip and PNG (see
				1120	references [1] and [2] in Chapter 3, below, for the reference
				1121	documents). CM = 15 is reserved. It might be used in a future
				1122	version of this specification to indicate the presence of an
				1123	extra field before the compressed data.
				1124
				1125	CINFO (Compression info)
				1126	For CM = 8, CINFO is the base-2 logarithm of the LZ77 window
				1127	size, minus eight (CINFO=7 indicates a 32K window size). Values
				1128	of CINFO above 7 are not allowed in this version of the
				1129	specification. CINFO is not defined in this specification for
				1130	CM not equal to 8.
				1131
				1132	FLG (FLaGs)
				1133	This flag byte is divided as follows:
				1134
				1135	bits 0 to 4 FCHECK (check bits for CMF and FLG)
				1136	bit 5 FDICT (preset dictionary)
				1137	bits 6 to 7 FLEVEL (compression level)
				1138
				1139	The FCHECK value must be such that CMF and FLG, when viewed as
				1140	a 16-bit unsigned integer stored in MSB order (CMF*256 + FLG),
				1141	is a multiple of 31.
				1142
				1143
				1144	FDICT (Preset dictionary)
				1145	If FDICT is set, a DICT dictionary identifier is present
				1146	immediately after the FLG byte. The dictionary is a sequence of
				1147	bytes which are initially fed to the compressor without
				1148	producing any compressed output. DICT is the Adler-32 checksum
				1149	of this sequence of bytes (see the definition of ADLER32
				1150	below). The decompressor can use this identifier to determine
				1151	which dictionary has been used by the compressor.
				1152
				1153	FLEVEL (Compression level)
				1154	These flags are available for use by specific compression
				1155	methods. The "deflate" method (CM = 8) sets these flags as
				1156	follows:
				1157
				1158	0 - compressor used fastest algorithm
				1159	1 - compressor used fast algorithm
				1160	2 - compressor used default algorithm
				1161	3 - compressor used maximum compression, slowest algorithm
				1162
				1163	The information in FLEVEL is not needed for decompression; it
				1164	is there to indicate if recompression might be worthwhile.
				1165
				1166	compressed data
				1167	For compression method 8, the compressed data is stored in the
				1168	deflate compressed data format as described in the document
				1169	"DEFLATE Compressed Data Format Specification" by L. Peter
				1170	Deutsch. (See reference [3] in Chapter 3, below)
				1171
				1172	Other compressed data formats are not specified in this version
				1173	of the zlib specification.
				1174
				1175	ADLER32 (Adler-32 checksum)
				1176	This contains a checksum value of the uncompressed data
				1177	(excluding any dictionary data) computed according to Adler-32
				1178	algorithm. This algorithm is a 32-bit extension and improvement
				1179	of the Fletcher algorithm, used in the ITU-T X.224 / ISO 8073
				1180	standard. See references [4] and [5] in Chapter 3, below)
				1181
				1182	Adler-32 is composed of two sums accumulated per byte: s1 is
				1183	the sum of all bytes, s2 is the sum of all s1 values. Both sums
				1184	are done modulo 65521. s1 is initialized to 1, s2 to zero. The
				1185	Adler-32 checksum is stored as s2*65536 + s1 in most-
				1186	significant-byte first (network) order.
				1187
				1188	==> The stream can start with only 2 bytes :
				1189	- CM = 0x78 : CMINFO=7 (32kB window), CM=8 (deflate)
				1190	- FLG = 0x01 : FLEVEL = 0 (fastest), FDICT=0 (no dict), FCHECK=1 so
				1191	that 0x7801 is a multiple of 31 (30721 = 991 * 31).
				1192
				1193	==> and it ends with only 4 bytes, the Adler-32 checksum in big-endian format.
				1194
				1195	*/
				1196
				1197	static const unsigned char zlib_hdr[] = { 0x78, 0x01 }; // 32k win, deflate, chk=1
				1198
				1199
				1200	/* Original version from RFC1950, verified and works OK */
				1201	uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len)
				1202	{
				1203	uint32_t s1 = crc & 0xffff;
				1204	uint32_t s2 = (crc >> 16) & 0xffff;
				1205	int n;
				1206
				1207	for (n = 0; n < len; n++) {
				1208	s1 = (s1 + buf[n]) % 65521;
				1209	s2 = (s2 + s1) % 65521;
				1210	}
				1211	return (s2 << 16) + s1;
				1212	}
				1213
				1214	/* Computes the adler32 sum on <buf> for <len> bytes. It avoids the expensive
				1215	* modulus by retrofitting the number of bytes missed between 65521 and 65536
				1216	* which is easy to count : For every sum above 65536, the modulus is offset
				1217	* by (65536-65521) = 15. So for any value, we can count the accumulated extra
				1218	* values by dividing the sum by 65536 and multiplying this value by
				1219	* (65536-65521). That's easier with a drawing with boxes and marbles. It gives
				1220	* this :
				1221	* x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
				1222	* = (x & 0xffff) + (x >> 16) * 15.
				1223	*/
				1224	uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len)
				1225	{
				1226	long s1 = crc & 0xffff;
				1227	long s2 = (crc >> 16);
				1228	long blk;
				1229	long n;
				1230
				1231	do {
				1232	blk = len;
				1233	/* ensure we never overflow s2 (limit is about 2^((32-8)/2) */
				1234	if (blk > (1U << 12))
				1235	blk = 1U << 12;
				1236	len -= blk;
				1237
				1238	for (n = 0; n < blk; n++) {
				1239	s1 = (s1 + buf[n]);
				1240	s2 = (s2 + s1);
				1241	}
				1242
				1243	/* Largest value here is 2^12 * 255 = 1044480 < 2^20. We can
				1244	* still overflow once, but not twice because the right hand
				1245	* size is 225 max, so the total is 65761. However we also
				1246	* have to take care of the values between 65521 and 65536.
				1247	*/
				1248	s1 = (s1 & 0xffff) + 15 * (s1 >> 16);
				1249	if (s1 >= 65521)
				1250	s1 -= 65521;
				1251
				1252	/* For s2, the largest value is estimated to 2^32-1 for
				1253	* simplicity, so the right hand side is about 15*65535
				1254	* = 983025. We can overflow twice at most.
				1255	*/
				1256	s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
				1257	s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
				1258	if (s2 >= 65521)
				1259	s2 -= 65521;
				1260
				1261	buf += blk;
				1262	} while (len);
				1263	return (s2 << 16) + s1;
				1264	}
				1265
				1266	/* Sends the zlib header for stream <strm> into buffer <buf>. When it's done,
				1267	* the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
				1268	* emitted which is always 2. The caller is responsible for ensuring there's
				1269	* always enough room in the buffer.
				1270	*/
				1271	int slz_rfc1950_send_header(struct slz_stream strm, unsigned char buf)
				1272	{
				1273	memcpy(buf, zlib_hdr, sizeof(zlib_hdr));
				1274	strm->state = SLZ_ST_EOB;
				1275	return sizeof(zlib_hdr);
				1276	}
				1277
				1278	/* Encodes the block according to rfc1950. This means that the CRC of the input
				1279	* block is computed according to the ADLER32 algorithm. If the header was never
				1280	* sent, it may be sent first. The number of output bytes is returned.
				1281	*/
				1282	long slz_rfc1950_encode(struct slz_stream strm, unsigned char out, const unsigned char *in, long ilen, int more)
				1283	{
				1284	long ret = 0;
				1285
				1286	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
				1287	ret += slz_rfc1950_send_header(strm, out);
				1288
				1289	strm->crc32 = slz_adler32_block(strm->crc32, in, ilen);
				1290	ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
				1291	return ret;
				1292	}
				1293
				1294	/* Initializes stream <strm> for use with the zlib format (rfc1952). The
				1295	* compression level passed in <level> is set. This value can only be 0 (no
				1296	* compression) or 1 (compression) and other values will lead to unpredictable
				1297	* behaviour. The function always returns 0.
				1298	*/
				1299	int slz_rfc1950_init(struct slz_stream *strm, int level)
				1300	{
				1301	strm->state = SLZ_ST_INIT;
				1302	strm->level = level;
				1303	strm->format = SLZ_FMT_ZLIB;
				1304	strm->crc32 = 1; // rfc1950/zlib starts with initial crc=1
				1305	strm->ilen = 0;
				1306	strm->qbits = 0;
				1307	strm->queue = 0;
				1308	return 0;
				1309	}
				1310
				1311	/* Flushes pending bits and sends the gzip trailer for stream <strm> into
				1312	* buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
				1313	* returns the number of bytes emitted. The trailer consists in flushing the
				1314	* possibly pending bits from the queue (up to 24 bits), rounding to the next
Ilya Shipitsin	b2be9a1	2021-04-24 13:25:42 +0500	[diff] [blame]	1315	* byte, then 4 bytes for the CRC. That may about to 4+4 = 8 bytes, that the
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	1316	* caller must ensure are available before calling the function. Note that if
				1317	* the initial header was never sent, it will be sent first as well (2 extra
				1318	* bytes).
				1319	*/
				1320	int slz_rfc1950_finish(struct slz_stream strm, unsigned char buf)
				1321	{
				1322	strm->outbuf = buf;
				1323
				1324	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
				1325	strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
				1326
				1327	slz_rfc1951_finish(strm, strm->outbuf);
				1328	copy_8b(strm, (strm->crc32 >> 24) & 0xff);
				1329	copy_8b(strm, (strm->crc32 >> 16) & 0xff);
				1330	copy_8b(strm, (strm->crc32 >> 8) & 0xff);
				1331	copy_8b(strm, (strm->crc32 >> 0) & 0xff);
				1332	strm->state = SLZ_ST_END;
				1333	return strm->outbuf - buf;
				1334	}
				1335
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	1336	__attribute__((constructor))
				1337	static void __slz_initialize(void)
				1338	{
Willy Tarreau	9e27428	2021-05-12 08:36:09 +0200	[diff] [blame]	1339	#if !defined(__ARM_FEATURE_CRC32)
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	1340	__slz_make_crc_table();
Willy Tarreau	9e27428	2021-05-12 08:36:09 +0200	[diff] [blame]	1341	#endif
Willy Tarreau	ab2b782	2021-04-22 14:09:44 +0200	[diff] [blame]	1342	__slz_prepare_dist_table();
				1343	}