/*
 * Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <import/slz.h>
#include <import/slz-tables.h>

/* First, RFC1951-specific declarations and extracts from the RFC.
 *
 * RFC1951 - deflate stream format


 * Data elements are packed into bytes in order of
   increasing bit number within the byte, i.e., starting
   with the least-significant bit of the byte.
 * Data elements other than Huffman codes are packed
   starting with the least-significant bit of the data
   element.
 * Huffman codes are packed starting with the most-
   significant bit of the code.

 3.2.3. Details of block format

   Each block of compressed data begins with 3 header bits
   containing the following data:

      first bit       BFINAL
      next 2 bits     BTYPE

   Note that the header bits do not necessarily begin on a byte
   boundary, since a block does not necessarily occupy an integral
   number of bytes.

   BFINAL is set if and only if this is the last block of the data
   set.

   BTYPE specifies how the data are compressed, as follows:

      00 - no compression
      01 - compressed with fixed Huffman codes
      10 - compressed with dynamic Huffman codes
      11 - reserved (error)

 3.2.4. Non-compressed blocks (BTYPE=00)

   Any bits of input up to the next byte boundary are ignored.
   The rest of the block consists of the following information:

        0   1   2   3   4...
      +---+---+---+---+================================+
      |  LEN  | NLEN  |... LEN bytes of literal data...|
      +---+---+---+---+================================+

   LEN is the number of data bytes in the block.  NLEN is the
   one's complement of LEN.

 3.2.5. Compressed blocks (length and distance codes)

   As noted above, encoded data blocks in the "deflate" format
   consist of sequences of symbols drawn from three conceptually
   distinct alphabets: either literal bytes, from the alphabet of
   byte values (0..255), or <length, backward distance> pairs,
   where the length is drawn from (3..258) and the distance is
   drawn from (1..32,768).  In fact, the literal and length
   alphabets are merged into a single alphabet (0..285), where
   values 0..255 represent literal bytes, the value 256 indicates
   end-of-block, and values 257..285 represent length codes
   (possibly in conjunction with extra bits following the symbol
   code) as follows:

Length encoding :
          Extra                 Extra                 Extra
     Code Bits Length(s)   Code Bits Lengths     Code Bits Length(s)
     ---- ---- ---------   ---- ---- -------     ---- ---- ---------
      257   0     3         267   1   15,16       277   4    67-82
      258   0     4         268   1   17,18       278   4    83-98
      259   0     5         269   2   19-22       279   4    99-114
      260   0     6         270   2   23-26       280   4   115-130
      261   0     7         271   2   27-30       281   5   131-162
      262   0     8         272   2   31-34       282   5   163-194
      263   0     9         273   3   35-42       283   5   195-226
      264   0    10         274   3   43-50       284   5   227-257
      265   1   11,12       275   3   51-58       285   0     258
      266   1   13,14       276   3   59-66

Distance encoding :
          Extra                Extra                  Extra
     Code Bits Dist      Code Bits  Dist        Code Bits  Distance
     ---- ---- ----      ---- ----  ------      ---- ----  --------
       0   0    1         10   4     33-48       20   9    1025-1536
       1   0    2         11   4     49-64       21   9    1537-2048
       2   0    3         12   5     65-96       22  10    2049-3072
       3   0    4         13   5     97-128      23  10    3073-4096
       4   1   5,6        14   6    129-192      24  11    4097-6144
       5   1   7,8        15   6    193-256      25  11    6145-8192
       6   2   9-12       16   7    257-384      26  12    8193-12288
       7   2  13-16       17   7    385-512      27  12   12289-16384
       8   3  17-24       18   8    513-768      28  13   16385-24576
       9   3  25-32       19   8   769-1024      29  13   24577-32768

 3.2.6. Compression with fixed Huffman codes (BTYPE=01)

   The Huffman codes for the two alphabets are fixed, and are not
   represented explicitly in the data.  The Huffman code lengths
   for the literal/length alphabet are:

             Lit Value    Bits        Codes
             ---------    ----        -----
               0 - 143     8          00110000 through
                                      10111111
             144 - 255     9          110010000 through
                                      111111111
             256 - 279     7          0000000 through
                                      0010111
             280 - 287     8          11000000 through
                                      11000111

   The code lengths are sufficient to generate the actual codes,
   as described above; we show the codes in the table for added
   clarity.  Literal/length values 286-287 will never actually
   occur in the compressed data, but participate in the code
   construction.

   Distance codes 0-31 are represented by (fixed-length) 5-bit
   codes, with possible additional bits as shown in the table
   shown in Paragraph 3.2.5, above.  Note that distance codes 30-
   31 will never actually occur in the compressed data.

*/
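
/* Purely illustrative helper (not referenced by the encoder below, and not
 * part of the original library): it maps a literal/length symbol (0..287) to
 * the number of bits of its fixed Huffman code, simply restating the RFC1951
 * table quoted above for clarity.
 */
static inline unsigned int fixed_lit_code_bits_example(unsigned int sym)
{
	if (sym <= 143)
		return 8; /* codes 00110000 .. 10111111 */
	if (sym <= 255)
		return 9; /* codes 110010000 .. 111111111 */
	if (sym <= 279)
		return 7; /* codes 0000000 .. 0010111, includes 256 = EOB */
	if (sym <= 287)
		return 8; /* codes 11000000 .. 11000111 */
	return 0;         /* not a valid literal/length symbol */
}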

/* back references, built in a way that is optimal for 32/64 bits */
union ref {
	struct {
		uint32_t pos;
		uint32_t word;
	} by32;
	uint64_t by64;
};

#if defined(USE_64BIT_QUEUE) && defined(UNALIGNED_LE_OK)

/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
 * 32-bit words into output buffer. X must not contain non-zero bits above
 * xbits.
 */
static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (__builtin_expect(qbits >= 32, 1)) {
		*(uint32_t *)strm->outbuf = queue;
		queue >>= 32;
		qbits -= 32;
		strm->outbuf += 4;
	}

	strm->queue = queue;
	strm->qbits = qbits;
}

#define enqueue8 enqueue24

/* flush the queue and align to next byte */
static inline void flush_bits(struct slz_stream *strm)
{
	if (strm->qbits > 0)
		*strm->outbuf++ = strm->queue;

	if (strm->qbits > 8)
		*strm->outbuf++ = strm->queue >> 8;

	if (strm->qbits > 16)
		*strm->outbuf++ = strm->queue >> 16;

	if (strm->qbits > 24)
		*strm->outbuf++ = strm->queue >> 24;

	strm->queue = 0;
	strm->qbits = 0;
}

#else /* non-64 bit or aligned or big endian */

/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
 * bytes into out buf. X must not contain non-zero bits above xbits. Prefer
 * enqueue8() when xbits is known for being 8 or less.
 */
static void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint32_t queue = strm->queue + (x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (qbits >= 16) {
#ifndef UNALIGNED_LE_OK
		strm->outbuf[0] = queue;
		strm->outbuf[1] = queue >> 8;
#else
		*(uint16_t *)strm->outbuf = queue;
#endif
		strm->outbuf += 2;
		queue >>= 16;
		qbits -= 16;
	}

	if (qbits >= 8) {
		qbits -= 8;
		*strm->outbuf++ = queue;
		queue >>= 8;
	}
	strm->qbits = qbits;
	strm->queue = queue;
	return;
}

/* enqueue code x of <xbits> bits (at most 8) and copy complete bytes into
 * out buf. X must not contain non-zero bits above xbits.
 */
static inline void enqueue8(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint32_t queue = strm->queue + (x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (__builtin_expect((signed)(qbits - 8) >= 0, 1)) {
		qbits -= 8;
		*strm->outbuf++ = queue;
		queue >>= 8;
	}

	strm->qbits = qbits;
	strm->queue = queue;
}

/* align to next byte */
static inline void flush_bits(struct slz_stream *strm)
{
	if (strm->qbits > 0)
		*strm->outbuf++ = strm->queue;

	if (strm->qbits > 8)
		*strm->outbuf++ = strm->queue >> 8;

	strm->queue = 0;
	strm->qbits = 0;
}
#endif
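
/* Note on the bit queue above: <queue> holds the pending bits LSB first and
 * <qbits> says how many of them are valid. For example the 3 header bits of a
 * fixed-Huffman, non-final block (BFINAL=0, BTYPE=01) are sent as
 * enqueue8(strm, 0x02, 3), which is exactly what the encoder below does.
 */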


/* only valid if buffer is already aligned */
static inline void copy_8b(struct slz_stream *strm, uint32_t x)
{
	*strm->outbuf++ = x;
}

/* only valid if buffer is already aligned */
static inline void copy_16b(struct slz_stream *strm, uint32_t x)
{
	strm->outbuf[0] = x;
	strm->outbuf[1] = x >> 8;
	strm->outbuf += 2;
}

/* only valid if buffer is already aligned */
static inline void copy_32b(struct slz_stream *strm, uint32_t x)
{
	strm->outbuf[0] = x;
	strm->outbuf[1] = x >> 8;
	strm->outbuf[2] = x >> 16;
	strm->outbuf[3] = x >> 24;
	strm->outbuf += 4;
}

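/* Sends the fixed Huffman code for literal <code>. The fixed_huff[] table
 * (from the included slz-tables.h) is expected to pack the code value in the
 * upper bits and the code length in the 4 lowest bits, which is what the
 * extraction below relies on.
 */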
static inline void send_huff(struct slz_stream *strm, uint32_t code)
{
	uint32_t bits;

	code = fixed_huff[code];
	bits = code & 15;
	code >>= 4;
	enqueue24(strm, code, bits);
}

static inline void send_eob(struct slz_stream *strm)
{
	enqueue8(strm, 0, 7); // direct encoding of 256 = EOB (cf RFC1951)
}

/* copies <len> literals from <buf>. <more> indicates that there are data past
 * buf + <len>. <len> must not be null.
 */
static void copy_lit(struct slz_stream *strm, const void *buf, uint32_t len, int more)
{
	uint32_t len2;

	do {
		len2 = len;
		if (__builtin_expect(len2 > 65535, 0))
			len2 = 65535;

		len -= len2;

		if (strm->state != SLZ_ST_EOB)
			send_eob(strm);

		strm->state = (more || len) ? SLZ_ST_EOB : SLZ_ST_DONE;

		enqueue8(strm, !(more || len), 3); // BFINAL = !more ; BTYPE = 00
		flush_bits(strm);
		copy_16b(strm, len2);  // len2
		copy_16b(strm, ~len2); // nlen2
		memcpy(strm->outbuf, buf, len2);
		buf += len2;
		strm->outbuf += len2;
	} while (len);
}

/* copies <len> literals from <buf>. <more> indicates that there are data past
 * buf + <len>. <len> must not be null.
 */
static void copy_lit_huff(struct slz_stream *strm, const unsigned char *buf, uint32_t len, int more)
{
	uint32_t pos;

	/* This ugly construct limits the amount of tests and optimizes for the
	 * most common case (more > 0).
	 */
	if (strm->state == SLZ_ST_EOB) {
	eob:
		strm->state = more ? SLZ_ST_FIXED : SLZ_ST_LAST;
		enqueue8(strm, 2 + !more, 3); // BFINAL = !more ; BTYPE = 01
	}
	else if (!more) {
		send_eob(strm);
		goto eob;
	}

	pos = 0;
	do {
		send_huff(strm, buf[pos++]);
	} while (pos < len);
}

/* format:
 * bit0..31  = word
 * bit32..63 = last position in buffer of similar content
 */

/* This hash provides good average results on HTML contents, and is among the
 * few which provide almost optimal results on various different pages.
 */
static inline uint32_t slz_hash(uint32_t a)
{
#if defined(__ARM_FEATURE_CRC32)
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(a) : "r"(0));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32w %0,%0,%1" : "+r"(a) : "r"(0));
# endif
	return a >> (32 - HASH_BITS);
#else
	return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
#endif
}

/* This function compares buffers <a> and <b> and reads 32 or 64 bits at a time
 * during the approach. It makes use of unaligned little endian memory accesses
 * on capable architectures. <max> is the maximum number of bytes that can be
 * read, so both <a> and <b> must have at least <max> bytes ahead. <max> may
 * safely be null or negative if that simplifies computations in the caller.
 */
static inline long memmatch(const unsigned char *a, const unsigned char *b, long max)
{
	long len = 0;

#ifdef UNALIGNED_LE_OK
	unsigned long xor;

	while (1) {
		if ((long)(len + 2 * sizeof(long)) > max) {
			while (len < max) {
				if (a[len] != b[len])
					break;
				len++;
			}
			return len;
		}

		xor = *(long *)&a[len] ^ *(long *)&b[len];
		if (xor)
			break;
		len += sizeof(long);

		xor = *(long *)&a[len] ^ *(long *)&b[len];
		if (xor)
			break;
		len += sizeof(long);
	}

#if defined(__x86_64__) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
	/* x86 has bsf. We know that xor is non-null here */
	asm("bsf %1,%0\n" : "=r"(xor) : "0" (xor));
	return len + xor / 8;
#else
	if (sizeof(long) > 4 && !(xor & 0xffffffff)) {
		/* This code is optimized out on 32-bit archs, but we still
		 * need to shift in two passes to avoid a warning. It is
		 * properly optimized out as a single shift.
		 */
		xor >>= 16; xor >>= 16;
		if (xor & 0xffff) {
			if (xor & 0xff)
				return len + 4;
			return len + 5;
		}
		if (xor & 0xffffff)
			return len + 6;
		return len + 7;
	}

	if (xor & 0xffff) {
		if (xor & 0xff)
			return len;
		return len + 1;
	}
	if (xor & 0xffffff)
		return len + 2;
	return len + 3;
#endif // x86

#else // UNALIGNED_LE_OK
	/* This is the generic version for big endian or unaligned-incompatible
	 * architectures.
	 */
	while (len < max) {
		if (a[len] != b[len])
			break;
		len++;
	}
	return len;

#endif
}

/* sets <count> BYTES to -32769 in <refs> so that any uninitialized entry will
 * verify (pos-last-1 >= 32768) and be ignored. <count> must be a multiple of
 * 128 bytes and <refs> must be at least <count> bytes long. It's supposed to
 * be applied to 64-bit aligned data exclusively, which makes it slightly
 * faster than the regular memset() since no alignment check is performed.
 */
void reset_refs(union ref *refs, long count)
{
	/* avoid a shift/mask by casting to void* */
	union ref *end = (void *)refs + count;

	do {
		refs[ 0].by64 = -32769;
		refs[ 1].by64 = -32769;
		refs[ 2].by64 = -32769;
		refs[ 3].by64 = -32769;
		refs[ 4].by64 = -32769;
		refs[ 5].by64 = -32769;
		refs[ 6].by64 = -32769;
		refs[ 7].by64 = -32769;
		refs[ 8].by64 = -32769;
		refs[ 9].by64 = -32769;
		refs[10].by64 = -32769;
		refs[11].by64 = -32769;
		refs[12].by64 = -32769;
		refs[13].by64 = -32769;
		refs[14].by64 = -32769;
		refs[15].by64 = -32769;
		refs += 16;
	} while (refs < end);
}

/* Compresses <ilen> bytes from <in> into <out> according to RFC1951. The
 * output result may be up to 5 bytes larger than the input, to which 2 extra
 * bytes may be added to send the last chunk due to BFINAL+EOB encoding (10
 * bits) when <more> is not set. The caller is responsible for ensuring there
 * is enough room in the output buffer for this. The amount of output bytes is
 * returned, and no CRC is computed.
 */
long slz_rfc1951_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
{
	long rem = ilen;
	unsigned long pos = 0;
	unsigned long last;
	uint32_t word = 0;
	long mlen;
	uint32_t h;
	uint64_t ent;

	uint32_t plit = 0;
	uint32_t bit9 = 0;
	uint32_t dist, code;
	union ref refs[1 << HASH_BITS];

	if (!strm->level) {
		/* force to send as literals (eg to preserve CPU) */
		strm->outbuf = out;
		plit = pos = ilen;
		bit9 = 52; /* force literal dump */
		goto final_lit_dump;
	}

	reset_refs(refs, sizeof(refs));

	strm->outbuf = out;

#ifndef UNALIGNED_FASTER
	word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
#endif
	while (rem >= 4) {
#ifndef UNALIGNED_FASTER
		word = ((unsigned char)in[pos + 3] << 24) + (word >> 8);
#else
		word = *(uint32_t *)&in[pos];
#endif
		h = slz_hash(word);
		asm volatile ("" ::); // prevent gcc from trying to be smart with the prefetch

		if (sizeof(long) >= 8) {
			ent = refs[h].by64;
			last = (uint32_t)ent;
			ent >>= 32;
			refs[h].by64 = ((uint64_t)pos) + ((uint64_t)word << 32);
		} else {
			ent = refs[h].by32.word;
			last = refs[h].by32.pos;
			refs[h].by32.pos = pos;
			refs[h].by32.word = word;
		}

#ifdef FIND_OPTIMAL_MATCH
		/* Experimental code to see what could be saved with an ideal
		 * longest match lookup algorithm. This one is very slow but
		 * scans the whole window. In short, here are the savings :
		 *   file        orig    fast(ratio)    optimal(ratio)
		 *   README       5185   3419 (65.9%)    3165 (61.0%)  -7.5%
		 *   index.html  76799  35662 (46.4%)   29875 (38.9%) -16.3%
		 *   rfc1952.c   29383  13442 (45.7%)   11793 (40.1%) -12.3%
		 *
		 * Thus the savings to expect for large files is at best 16%.
		 *
		 * A non-colliding hash gives 33025 instead of 35662 (-7.4%),
		 * and keeping the last two entries gives 31724 (-11.0%).
		 */
		unsigned long scan;
		int saved = 0;
		int bestpos = 0;
		int bestlen = 0;
		int firstlen = 0;
		int len = 0;        // match length of the currently scanned candidate
		int max_lookup = 2; // 0 = no limit

		for (scan = pos - 1; scan < pos && (unsigned long)(pos - scan - 1) < 32768; scan--) {
			if (*(uint32_t *)(in + scan) != word)
				continue;

			len = memmatch(in + pos, in + scan, rem);
			if (!bestlen)
				firstlen = len;

			if (len > bestlen) {
				bestlen = len;
				bestpos = scan;
			}
			if (!--max_lookup)
				break;
		}
		if (bestlen) {
			//printf("pos=%d last=%d bestpos=%d word=%08x ent=%08x len=%d\n",
			//       (int)pos, (int)last, (int)bestpos, (int)word, (int)ent, bestlen);
			last = bestpos;
			ent = word;
			saved += bestlen - firstlen;
		}
		//fprintf(stderr, "first=%d best=%d saved_total=%d\n", firstlen, bestlen, saved);
#endif

		if ((uint32_t)ent != word) {
		send_as_lit:
			rem--;
			plit++;
			bit9 += ((unsigned char)word >= 144);
			pos++;
			continue;
		}

		/* We reject pos = last and pos > last+32768 */
		if ((unsigned long)(pos - last - 1) >= 32768)
			goto send_as_lit;

		/* Note: cannot encode a length larger than 258 bytes */
		mlen = memmatch(in + pos + 4, in + last + 4, (rem > 258 ? 258 : rem) - 4) + 4;

		/* found a matching entry */

		if (bit9 >= 52 && mlen < 6)
			goto send_as_lit;

		/* compute the output code, its size and the length's size in
		 * bits to know if the reference is cheaper than literals.
		 */
		code = len_fh[mlen];

		/* direct mapping of dist->huffman code */
		dist = fh_dist_table[pos - last - 1];

		/* if encoding the dist+length is more expensive than sending
		 * the equivalent as bytes, lets keep the literals.
		 */
		if ((dist & 0x1f) + (code >> 16) + 8 >= 8 * mlen + bit9)
			goto send_as_lit;

		/* first, copy pending literals */
		if (plit) {
			/* Huffman encoding requires 9 bits for octets 144..255, so this
			 * is a waste of space for binary data. Switching between Huffman
			 * and no-comp then Huffman consumes 52 bits (7 for EOB + 3 for
			 * block type + 7 for alignment + 32 for LEN+NLEN + 3 for the next
			 * block). Only use plain literals if there are more than 52 bits
			 * to be saved.
			 */
			if (bit9 >= 52)
				copy_lit(strm, in + pos - plit, plit, 1);
			else
				copy_lit_huff(strm, in + pos - plit, plit, 1);

			plit = 0;
		}

		/* use mode 01 - fixed huffman */
		if (strm->state == SLZ_ST_EOB) {
			strm->state = SLZ_ST_FIXED;
			enqueue8(strm, 0x02, 3); // BTYPE = 01, BFINAL = 0
		}

		/* copy the length first */
		enqueue24(strm, code & 0xFFFF, code >> 16);

		/* in fixed huffman mode, dist is fixed 5 bits */
		enqueue24(strm, dist >> 5, dist & 0x1f);
		bit9 = 0;
		rem -= mlen;
		pos += mlen;

#ifndef UNALIGNED_FASTER
#ifdef UNALIGNED_LE_OK
		word = *(uint32_t *)&in[pos - 1];
#else
		word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
#endif
#endif
	}

	if (__builtin_expect(rem, 0)) {
		/* we're reading the 1..3 last bytes */
		plit += rem;
		do {
			bit9 += ((unsigned char)in[pos++] >= 144);
		} while (--rem);
	}

 final_lit_dump:
	/* now copy remaining literals or mark the end */
	if (plit) {
		if (bit9 >= 52)
			copy_lit(strm, in + pos - plit, plit, more);
		else
			copy_lit_huff(strm, in + pos - plit, plit, more);

		plit = 0;
	}

	strm->ilen += ilen;
	return strm->outbuf - out;
}

/* Initializes stream <strm> for use with raw deflate (rfc1951). The CRC is
 * unused but set to zero. The compression level passed in <level> is set. This
 * value can only be 0 (no compression) or 1 (compression) and other values
 * will lead to unpredictable behaviour. The function always returns 0.
 */
int slz_rfc1951_init(struct slz_stream *strm, int level)
{
	strm->state = SLZ_ST_EOB; // no header
	strm->level = level;
	strm->format = SLZ_FMT_DEFLATE;
	strm->crc32 = 0;
	strm->ilen = 0;
	strm->qbits = 0;
	strm->queue = 0;
	return 0;
}

721/* Flushes any pending for stream <strm> into buffer <buf>, then sends BTYPE=1
722 * and BFINAL=1 if needed. The stream ends in SLZ_ST_DONE. It returns the number
723 * of bytes emitted. The trailer consists in flushing the possibly pending bits
724 * from the queue (up to 7 bits), then possibly EOB (7 bits), then 3 bits, EOB,
725 * a rounding to the next byte, which amounts to a total of 4 bytes max, that
726 * the caller must ensure are available before calling the function.
727 */
int slz_rfc1951_finish(struct slz_stream *strm, unsigned char *buf)
{
	strm->outbuf = buf;

	if (strm->state == SLZ_ST_FIXED || strm->state == SLZ_ST_LAST) {
		strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
		send_eob(strm);
	}

	if (strm->state != SLZ_ST_DONE) {
		/* send BTYPE=1, BFINAL=1 */
		enqueue8(strm, 3, 3);
		send_eob(strm);
		strm->state = SLZ_ST_DONE;
	}

	flush_bits(strm);
	return strm->outbuf - buf;
}
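
/* Typical raw deflate usage of the three functions above (informal sketch,
 * not taken from the library's documentation): call slz_rfc1951_init(strm, 1)
 * once, then slz_rfc1951_encode(strm, out, in, ilen, more) for each input
 * chunk with <more> non-zero while further data is expected, and finally
 * slz_rfc1951_finish(strm, out) to emit the closing EOB and flush the bit
 * queue. A runnable gzip variant of this sequence is sketched at the end of
 * this file.
 */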

/* Now RFC1952-specific declarations and extracts from RFC.
 * From RFC1952 about the GZIP file format :

A gzip file consists of a series of "members" ...

2.3. Member format

      Each member has the following structure:

         +---+---+---+---+---+---+---+---+---+---+
         |ID1|ID2|CM |FLG|     MTIME     |XFL|OS | (more-->)
         +---+---+---+---+---+---+---+---+---+---+

      (if FLG.FEXTRA set)

         +---+---+=================================+
         | XLEN  |...XLEN bytes of "extra field"...| (more-->)
         +---+---+=================================+

      (if FLG.FNAME set)

         +=========================================+
         |...original file name, zero-terminated...| (more-->)
         +=========================================+

      (if FLG.FCOMMENT set)

         +===================================+
         |...file comment, zero-terminated...| (more-->)
         +===================================+

      (if FLG.FHCRC set)

         +---+---+
         | CRC16 |
         +---+---+

         +=======================+
         |...compressed blocks...| (more-->)
         +=======================+

           0   1   2   3   4   5   6   7
         +---+---+---+---+---+---+---+---+
         |     CRC32     |     ISIZE     |
         +---+---+---+---+---+---+---+---+


2.3.1. Member header and trailer

   ID1 (IDentification 1)
   ID2 (IDentification 2)
      These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139
      (0x8b, \213), to identify the file as being in gzip format.

   CM (Compression Method)
      This identifies the compression method used in the file. CM
      = 0-7 are reserved. CM = 8 denotes the "deflate"
      compression method, which is the one customarily used by
      gzip and which is documented elsewhere.

   FLG (FLaGs)
      This flag byte is divided into individual bits as follows:

         bit 0   FTEXT
         bit 1   FHCRC
         bit 2   FEXTRA
         bit 3   FNAME
         bit 4   FCOMMENT
         bit 5   reserved
         bit 6   reserved
         bit 7   reserved

      Reserved FLG bits must be zero.

   MTIME (Modification TIME)
      This gives the most recent modification time of the original
      file being compressed. The time is in Unix format, i.e.,
      seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this
      may cause problems for MS-DOS and other systems that use
      local rather than Universal time.) If the compressed data
      did not come from a file, MTIME is set to the time at which
      compression started. MTIME = 0 means no time stamp is
      available.

   XFL (eXtra FLags)
      These flags are available for use by specific compression
      methods. The "deflate" method (CM = 8) sets these flags as
      follows:

         XFL = 2 - compressor used maximum compression,
                   slowest algorithm
         XFL = 4 - compressor used fastest algorithm

   OS (Operating System)
      This identifies the type of file system on which compression
      took place. This may be useful in determining end-of-line
      convention for text files. The currently defined values are
      as follows:

          0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
          1 - Amiga
          2 - VMS (or OpenVMS)
          3 - Unix
          4 - VM/CMS
          5 - Atari TOS
          6 - HPFS filesystem (OS/2, NT)
          7 - Macintosh
          8 - Z-System
          9 - CP/M
         10 - TOPS-20
         11 - NTFS filesystem (NT)
         12 - QDOS
         13 - Acorn RISCOS
        255 - unknown

 ==> A file compressed using "gzip -1" on Unix-like systems can be :

        1F 8B 08 00  00 00 00 00  04 03
        <deflate-compressed stream>
        crc32 size32
*/

static const unsigned char gzip_hdr[] = { 0x1F, 0x8B,   // ID1, ID2
                                          0x08, 0x00,   // Deflate, flags (none)
                                          0x00, 0x00, 0x00, 0x00, // mtime: none
                                          0x04, 0x03 }; // fastest comp, OS=Unix

static inline uint32_t crc32_char(uint32_t crc, uint8_t x)
{
#if defined(__ARM_FEATURE_CRC32)
	crc = ~crc;
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32b %w0,%w0,%w1" : "+r"(crc) : "r"(x));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32b %0,%0,%1" : "+r"(crc) : "r"(x));
# endif
	crc = ~crc;
#else
	crc = crc32_fast[0][(crc ^ x) & 0xff] ^ (crc >> 8);
#endif
	return crc;
}

static inline uint32_t crc32_uint32(uint32_t data)
{
#if defined(__ARM_FEATURE_CRC32)
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(data) : "r"(~0UL));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32w %0,%0,%1" : "+r"(data) : "r"(~0UL));
# endif
	data = ~data;
#else
	data = crc32_fast[3][(data >>  0) & 0xff] ^
	       crc32_fast[2][(data >>  8) & 0xff] ^
	       crc32_fast[1][(data >> 16) & 0xff] ^
	       crc32_fast[0][(data >> 24) & 0xff];
#endif
	return data;
}

/* Modified version originally from RFC1952, working with non-inverting CRCs */
uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	int n;

	for (n = 0; n < len; n++)
		crc = crc32_char(crc, buf[n]);
	return crc;
}

/* This version computes the crc32 of <buf> over <len> bytes, doing most of it
 * in 32-bit chunks.
 */
uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len)
{
	const unsigned char *end = buf + len;

	while (buf <= end - 16) {
#ifdef UNALIGNED_LE_OK
#if defined(__ARM_FEATURE_CRC32)
		crc = ~crc;
# if defined(__ARM_ARCH_ISA_A64)
		// 64 bit mode
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
# else
		// 32 bit mode (e.g. armv7 compiler building for armv8)
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
# endif
		crc = ~crc;
#else
		crc ^= *(uint32_t *)buf;
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 4);
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 8);
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 12);
		crc = crc32_uint32(crc);
#endif
#else
		crc = crc32_fast[3][(buf[0] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[1] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[4] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[5] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[6] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[7] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[8] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[9] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[10] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[11] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[12] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[13] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[14] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[15] ^ (crc >> 24)) & 0xff];
#endif
		buf += 16;
	}

	while (buf <= end - 4) {
#ifdef UNALIGNED_LE_OK
		crc ^= *(uint32_t *)buf;
		crc = crc32_uint32(crc);
#else
		crc = crc32_fast[3][(buf[0] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[1] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];
#endif
		buf += 4;
	}

	while (buf < end)
		crc = crc32_char(crc, *buf++);
	return crc;
}

/* uses the most suitable crc32 function to update crc on <buf, len> */
static inline uint32_t update_crc(uint32_t crc, const void *buf, int len)
{
	return slz_crc32_by4(crc, buf, len);
}

/* Sends the gzip header for stream <strm> into buffer <buf>. When it's done,
 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
 * emitted which is always 10. The caller is responsible for ensuring there's
 * always enough room in the buffer.
 */
int slz_rfc1952_send_header(struct slz_stream *strm, unsigned char *buf)
{
	memcpy(buf, gzip_hdr, sizeof(gzip_hdr));
	strm->state = SLZ_ST_EOB;
	return sizeof(gzip_hdr);
}

/* Encodes the block according to rfc1952. This means that the CRC of the input
 * block is computed according to the CRC32 algorithm. If the header was never
 * sent, it may be sent first. The number of output bytes is returned.
 */
long slz_rfc1952_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
{
	long ret = 0;

	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
		ret += slz_rfc1952_send_header(strm, out);

	strm->crc32 = update_crc(strm->crc32, in, ilen);
	ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
	return ret;
}

/* Initializes stream <strm> for use with the gzip format (rfc1952). The
 * compression level passed in <level> is set. This value can only be 0 (no
 * compression) or 1 (compression) and other values will lead to unpredictable
 * behaviour. The function always returns 0.
 */
int slz_rfc1952_init(struct slz_stream *strm, int level)
{
	strm->state = SLZ_ST_INIT;
	strm->level = level;
	strm->format = SLZ_FMT_GZIP;
	strm->crc32 = 0;
	strm->ilen = 0;
	strm->qbits = 0;
	strm->queue = 0;
	return 0;
}

/* Flushes pending bits and sends the gzip trailer for stream <strm> into
 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
 * returns the number of bytes emitted. The trailer consists of flushing the
 * possibly pending bits from the queue (up to 24 bits), rounding to the next
 * byte, then 4 bytes for the CRC and another 4 bytes for the input length.
 * That may amount to 4+4+4 = 12 bytes, which the caller must ensure are
 * available before calling the function. Note that if the initial header was
 * never sent, it will be sent first as well (10 extra bytes).
 */
1062 */
1063int slz_rfc1952_finish(struct slz_stream *strm, unsigned char *buf)
1064{
1065 strm->outbuf = buf;
1066
1067 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1068 strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
1069
1070 slz_rfc1951_finish(strm, strm->outbuf);
1071 copy_32b(strm, strm->crc32);
1072 copy_32b(strm, strm->ilen);
1073 strm->state = SLZ_ST_END;
1074
1075 return strm->outbuf - buf;
1076}


/* RFC1950-specific stuff. This is for the Zlib stream format.
 * From RFC1950 (zlib) :
 *

   2.2. Data format

      A zlib stream has the following structure:

           0   1
         +---+---+
         |CMF|FLG|   (more-->)
         +---+---+


      (if FLG.FDICT set)

           0   1   2   3
         +---+---+---+---+
         |     DICTID    |   (more-->)
         +---+---+---+---+

         +=====================+---+---+---+---+
         |...compressed data...|    ADLER32    |
         +=====================+---+---+---+---+

      Any data which may appear after ADLER32 are not part of the zlib
      stream.

      CMF (Compression Method and flags)
         This byte is divided into a 4-bit compression method and a 4-
         bit information field depending on the compression method.

            bits 0 to 3  CM     Compression method
            bits 4 to 7  CINFO  Compression info

      CM (Compression method)
         This identifies the compression method used in the file. CM = 8
         denotes the "deflate" compression method with a window size up
         to 32K. This is the method used by gzip and PNG (see
         references [1] and [2] in Chapter 3, below, for the reference
         documents). CM = 15 is reserved. It might be used in a future
         version of this specification to indicate the presence of an
         extra field before the compressed data.

      CINFO (Compression info)
         For CM = 8, CINFO is the base-2 logarithm of the LZ77 window
         size, minus eight (CINFO=7 indicates a 32K window size). Values
         of CINFO above 7 are not allowed in this version of the
         specification. CINFO is not defined in this specification for
         CM not equal to 8.

      FLG (FLaGs)
         This flag byte is divided as follows:

            bits 0 to 4  FCHECK  (check bits for CMF and FLG)
            bit  5       FDICT   (preset dictionary)
            bits 6 to 7  FLEVEL  (compression level)

         The FCHECK value must be such that CMF and FLG, when viewed as
         a 16-bit unsigned integer stored in MSB order (CMF*256 + FLG),
         is a multiple of 31.


      FDICT (Preset dictionary)
         If FDICT is set, a DICT dictionary identifier is present
         immediately after the FLG byte. The dictionary is a sequence of
         bytes which are initially fed to the compressor without
         producing any compressed output. DICT is the Adler-32 checksum
         of this sequence of bytes (see the definition of ADLER32
         below). The decompressor can use this identifier to determine
         which dictionary has been used by the compressor.

      FLEVEL (Compression level)
         These flags are available for use by specific compression
         methods. The "deflate" method (CM = 8) sets these flags as
         follows:

            0 - compressor used fastest algorithm
            1 - compressor used fast algorithm
            2 - compressor used default algorithm
            3 - compressor used maximum compression, slowest algorithm

         The information in FLEVEL is not needed for decompression; it
         is there to indicate if recompression might be worthwhile.

      compressed data
         For compression method 8, the compressed data is stored in the
         deflate compressed data format as described in the document
         "DEFLATE Compressed Data Format Specification" by L. Peter
         Deutsch. (See reference [3] in Chapter 3, below)

         Other compressed data formats are not specified in this version
         of the zlib specification.

      ADLER32 (Adler-32 checksum)
         This contains a checksum value of the uncompressed data
         (excluding any dictionary data) computed according to Adler-32
         algorithm. This algorithm is a 32-bit extension and improvement
         of the Fletcher algorithm, used in the ITU-T X.224 / ISO 8073
         standard. See references [4] and [5] in Chapter 3, below)

         Adler-32 is composed of two sums accumulated per byte: s1 is
         the sum of all bytes, s2 is the sum of all s1 values. Both sums
         are done modulo 65521. s1 is initialized to 1, s2 to zero. The
         Adler-32 checksum is stored as s2*65536 + s1 in most-
         significant-byte first (network) order.

 ==> The stream can start with only 2 bytes :
     - CMF = 0x78 : CINFO=7 (32kB window), CM=8 (deflate)
     - FLG = 0x01 : FLEVEL = 0 (fastest), FDICT=0 (no dict), FCHECK=1 so
       that 0x7801 is a multiple of 31 (30721 = 991 * 31).

 ==> and it ends with only 4 bytes, the Adler-32 checksum in big-endian format.

 */

static const unsigned char zlib_hdr[] = { 0x78, 0x01 };   // 32k win, deflate, chk=1


/* Original version from RFC1950, verified and works OK */
uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	uint32_t s1 = crc & 0xffff;
	uint32_t s2 = (crc >> 16) & 0xffff;
	int n;

	for (n = 0; n < len; n++) {
		s1 = (s1 + buf[n]) % 65521;
		s2 = (s2 + s1) % 65521;
	}
	return (s2 << 16) + s1;
}

/* Computes the adler32 sum on <buf> for <len> bytes. It avoids the expensive
 * modulus by retrofitting the number of bytes missed between 65521 and 65536
 * which is easy to count : For every sum above 65536, the modulus is offset
 * by (65536-65521) = 15. So for any value, we can count the accumulated extra
 * values by dividing the sum by 65536 and multiplying this value by
 * (65536-65521). That's easier with a drawing with boxes and marbles. It gives
 * this :
 *          x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
 *                    = (x & 0xffff) + (x >> 16) * 15.
 */
uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len)
{
	long s1 = crc & 0xffff;
	long s2 = (crc >> 16);
	long blk;
	long n;

	do {
		blk = len;
		/* ensure we never overflow s2 (limit is about 2^((32-8)/2)) */
		if (blk > (1U << 12))
			blk = 1U << 12;
		len -= blk;

		for (n = 0; n < blk; n++) {
			s1 = (s1 + buf[n]);
			s2 = (s2 + s1);
		}

		/* Largest value here is 2^12 * 255 = 1044480 < 2^20. We can
		 * still overflow once, but not twice because the right hand
		 * side is 225 max, so the total is 65761. However we also
		 * have to take care of the values between 65521 and 65536.
		 */
		s1 = (s1 & 0xffff) + 15 * (s1 >> 16);
		if (s1 >= 65521)
			s1 -= 65521;

		/* For s2, the largest value is estimated to 2^32-1 for
		 * simplicity, so the right hand side is about 15*65535
		 * = 983025. We can overflow twice at most.
		 */
		s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
		s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
		if (s2 >= 65521)
			s2 -= 65521;

		buf += blk;
	} while (len);
	return (s2 << 16) + s1;
}
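
/* Worked example for the retrofit above (illustrative, not part of the
 * original code): for x = 70000, x % 65521 = 4479, and
 * (x & 0xffff) + (x >> 16) * 15 = 4464 + 1 * 15 = 4479 as well. The
 * hypothetical helper below applies the folding twice plus one conditional
 * subtraction, which yields x % 65521 for any non-negative 32-bit input;
 * this mirrors what slz_adler32_block() does for s2.
 */
static inline long adler_fold_example(long x)
{
	x = (x & 0xffff) + 15 * (x >> 16); /* result below 2^20 for 32-bit x */
	x = (x & 0xffff) + 15 * (x >> 16); /* result below 65761 < 2*65521   */
	if (x >= 65521)
		x -= 65521;
	return x;
}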

/* Sends the zlib header for stream <strm> into buffer <buf>. When it's done,
 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
 * emitted which is always 2. The caller is responsible for ensuring there's
 * always enough room in the buffer.
 */
int slz_rfc1950_send_header(struct slz_stream *strm, unsigned char *buf)
{
	memcpy(buf, zlib_hdr, sizeof(zlib_hdr));
	strm->state = SLZ_ST_EOB;
	return sizeof(zlib_hdr);
}

/* Encodes the block according to rfc1950. This means that the CRC of the input
 * block is computed according to the ADLER32 algorithm. If the header was never
 * sent, it may be sent first. The number of output bytes is returned.
 */
long slz_rfc1950_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
{
	long ret = 0;

	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
		ret += slz_rfc1950_send_header(strm, out);

	strm->crc32 = slz_adler32_block(strm->crc32, in, ilen);
	ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
	return ret;
}

/* Initializes stream <strm> for use with the zlib format (rfc1950). The
 * compression level passed in <level> is set. This value can only be 0 (no
 * compression) or 1 (compression) and other values will lead to unpredictable
 * behaviour. The function always returns 0.
 */
int slz_rfc1950_init(struct slz_stream *strm, int level)
{
	strm->state = SLZ_ST_INIT;
	strm->level = level;
	strm->format = SLZ_FMT_ZLIB;
	strm->crc32 = 1; // rfc1950/zlib starts with initial crc=1
	strm->ilen = 0;
	strm->qbits = 0;
	strm->queue = 0;
	return 0;
}

/* Flushes pending bits and sends the zlib trailer for stream <strm> into
 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
 * returns the number of bytes emitted. The trailer consists of flushing the
 * possibly pending bits from the queue (up to 24 bits), rounding to the next
 * byte, then 4 bytes for the Adler-32 checksum. That may amount to 4+4 = 8
 * bytes, which the caller must ensure are available before calling the
 * function. Note that if the initial header was never sent, it will be sent
 * first as well (2 extra bytes).
 */
int slz_rfc1950_finish(struct slz_stream *strm, unsigned char *buf)
{
	strm->outbuf = buf;

	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
		strm->outbuf += slz_rfc1950_send_header(strm, strm->outbuf);

	slz_rfc1951_finish(strm, strm->outbuf);
	copy_8b(strm, (strm->crc32 >> 24) & 0xff);
	copy_8b(strm, (strm->crc32 >> 16) & 0xff);
	copy_8b(strm, (strm->crc32 >>  8) & 0xff);
	copy_8b(strm, (strm->crc32 >>  0) & 0xff);
	strm->state = SLZ_ST_END;
	return strm->outbuf - buf;
}

__attribute__((constructor))
static void __slz_initialize(void)
{
#if !defined(__ARM_FEATURE_CRC32)
	__slz_make_crc_table();
#endif
	__slz_prepare_dist_table();
}
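
/* A minimal, hypothetical usage sketch (not part of the library; compile this
 * file with -DSLZ_EXAMPLE_MAIN to try it): it reads stdin and writes a gzip
 * stream to stdout with the rfc1952 functions above. Buffer sizing follows
 * the comments of slz_rfc1951_encode() and slz_rfc1952_finish(): a 64-byte
 * margin on top of the chunk size comfortably covers the per-call expansion,
 * the 10-byte header and the 12-byte trailer.
 */
#if defined(SLZ_EXAMPLE_MAIN)
#include <unistd.h>

int main(void)
{
	static unsigned char in[65536];
	static unsigned char out[65536 + 64];
	struct slz_stream strm;
	ssize_t ilen;
	long olen;

	slz_rfc1952_init(&strm, 1);

	while ((ilen = read(0, in, sizeof(in))) > 0) {
		/* <more> is non-zero: further input chunks may follow */
		olen = slz_rfc1952_encode(&strm, out, in, ilen, 1);
		if (write(1, out, olen) != olen)
			return 1;
	}

	/* close the deflate stream, then append CRC32 and ISIZE */
	olen = slz_rfc1952_finish(&strm, out);
	return write(1, out, olen) != olen;
}
#endif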