blob: 1560bac343fb76bb23d1b15b5b1ce8bfd04a9a68 [file] [log] [blame]
Willy Tarreauab2b7822021-04-22 14:09:44 +02001/*
2 * Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 */
24
Willy Tarreau388fc252021-05-14 08:44:52 +020025#include <inttypes.h>
Willy Tarreauab2b7822021-04-22 14:09:44 +020026#include <stdio.h>
27#include <string.h>
28#include <import/slz.h>
29#include <import/slz-tables.h>
30
31/* First, RFC1951-specific declarations and extracts from the RFC.
32 *
33 * RFC1951 - deflate stream format
34
35
36 * Data elements are packed into bytes in order of
37 increasing bit number within the byte, i.e., starting
38 with the least-significant bit of the byte.
39 * Data elements other than Huffman codes are packed
40 starting with the least-significant bit of the data
41 element.
42 * Huffman codes are packed starting with the most-
43 significant bit of the code.
44
45 3.2.3. Details of block format
46
47 Each block of compressed data begins with 3 header bits
48 containing the following data:
49
50 first bit BFINAL
51 next 2 bits BTYPE
52
53 Note that the header bits do not necessarily begin on a byte
54 boundary, since a block does not necessarily occupy an integral
55 number of bytes.
56
57 BFINAL is set if and only if this is the last block of the data
58 set.
59
60 BTYPE specifies how the data are compressed, as follows:
61
62 00 - no compression
63 01 - compressed with fixed Huffman codes
64 10 - compressed with dynamic Huffman codes
65 11 - reserved (error)
66
67 3.2.4. Non-compressed blocks (BTYPE=00)
68
69 Any bits of input up to the next byte boundary are ignored.
70 The rest of the block consists of the following information:
71
72 0 1 2 3 4...
73 +---+---+---+---+================================+
74 | LEN | NLEN |... LEN bytes of literal data...|
75 +---+---+---+---+================================+
76
77 LEN is the number of data bytes in the block. NLEN is the
78 one's complement of LEN.
79
80 3.2.5. Compressed blocks (length and distance codes)
81
82 As noted above, encoded data blocks in the "deflate" format
83 consist of sequences of symbols drawn from three conceptually
84 distinct alphabets: either literal bytes, from the alphabet of
85 byte values (0..255), or <length, backward distance> pairs,
86 where the length is drawn from (3..258) and the distance is
87 drawn from (1..32,768). In fact, the literal and length
88 alphabets are merged into a single alphabet (0..285), where
89 values 0..255 represent literal bytes, the value 256 indicates
90 end-of-block, and values 257..285 represent length codes
91 (possibly in conjunction with extra bits following the symbol
92 code) as follows:
93
94Length encoding :
95 Extra Extra Extra
96 Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
97 ---- ---- ------ ---- ---- ------- ---- ---- -------
98 257 0 3 267 1 15,16 277 4 67-82
99 258 0 4 268 1 17,18 278 4 83-98
100 259 0 5 269 2 19-22 279 4 99-114
101 260 0 6 270 2 23-26 280 4 115-130
102 261 0 7 271 2 27-30 281 5 131-162
103 262 0 8 272 2 31-34 282 5 163-194
104 263 0 9 273 3 35-42 283 5 195-226
105 264 0 10 274 3 43-50 284 5 227-257
106 265 1 11,12 275 3 51-58 285 0 258
107 266 1 13,14 276 3 59-66
108
109Distance encoding :
110 Extra Extra Extra
111 Code Bits Dist Code Bits Dist Code Bits Distance
112 ---- ---- ---- ---- ---- ------ ---- ---- --------
113 0 0 1 10 4 33-48 20 9 1025-1536
114 1 0 2 11 4 49-64 21 9 1537-2048
115 2 0 3 12 5 65-96 22 10 2049-3072
116 3 0 4 13 5 97-128 23 10 3073-4096
117 4 1 5,6 14 6 129-192 24 11 4097-6144
118 5 1 7,8 15 6 193-256 25 11 6145-8192
119 6 2 9-12 16 7 257-384 26 12 8193-12288
120 7 2 13-16 17 7 385-512 27 12 12289-16384
121 8 3 17-24 18 8 513-768 28 13 16385-24576
122 9 3 25-32 19 8 769-1024 29 13 24577-32768
123
124 3.2.6. Compression with fixed Huffman codes (BTYPE=01)
125
126 The Huffman codes for the two alphabets are fixed, and are not
127 represented explicitly in the data. The Huffman code lengths
128 for the literal/length alphabet are:
129
130 Lit Value Bits Codes
131 --------- ---- -----
132 0 - 143 8 00110000 through
133 10111111
134 144 - 255 9 110010000 through
135 111111111
136 256 - 279 7 0000000 through
137 0010111
138 280 - 287 8 11000000 through
139 11000111
140
141 The code lengths are sufficient to generate the actual codes,
142 as described above; we show the codes in the table for added
143 clarity. Literal/length values 286-287 will never actually
144 occur in the compressed data, but participate in the code
145 construction.
146
147 Distance codes 0-31 are represented by (fixed-length) 5-bit
148 codes, with possible additional bits as shown in the table
149 shown in Paragraph 3.2.5, above. Note that distance codes 30-
150 31 will never actually occur in the compressed data.
151
152*/
153
/* Back references, built in a way that is optimal for 32/64 bits: 64-bit
 * targets read/write both fields at once through <by64> while 32-bit targets
 * access the two 32-bit halves separately through <by32>.
 */
union ref {
	struct {
		uint32_t pos;  /* last input position where <word> was seen */
		uint32_t word; /* 4 bytes of input content sampled at <pos> */
	} by32;
	uint64_t by64;         /* both fields as a single 64-bit word */
};
162
#if defined(USE_64BIT_QUEUE) && defined(UNALIGNED_LE_OK)

/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
 * 32-bit words into output buffer. X must not contain non-zero bits above
 * xbits. Since qbits is always brought back below 32 here, qbits + xbits
 * cannot exceed 55, so the 64-bit queue never overflows.
 */
static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (__builtin_expect(qbits >= 32, 1)) {
		/* a full 32-bit word is pending: emit it little-endian
		 * (UNALIGNED_LE_OK makes this unaligned store valid).
		 */
		*(uint32_t *)strm->outbuf = queue;
		queue >>= 32;
		qbits -= 32;
		strm->outbuf += 4;
	}

	strm->queue = queue;
	strm->qbits = qbits;
}

/* with a 64-bit queue there's no benefit in a dedicated short-code variant */
#define enqueue8 enqueue24

/* flush the queue and align to next byte */
static inline void flush_bits(struct slz_stream *strm)
{
	/* after enqueue24() at most 31 bits remain queued, i.e. 4 bytes */
	if (strm->qbits > 0)
		*strm->outbuf++ = strm->queue;

	if (strm->qbits > 8)
		*strm->outbuf++ = strm->queue >> 8;

	if (strm->qbits > 16)
		*strm->outbuf++ = strm->queue >> 16;

	if (strm->qbits > 24)
		*strm->outbuf++ = strm->queue >> 24;

	strm->queue = 0;
	strm->qbits = 0;
}
205
#else /* non-64 bit or aligned or big endian */

/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
 * bytes into out buf. X must not contain non-zero bits above xbits. Prefer
 * enqueue8() when xbits is known for being 8 or less.
 */
static void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint32_t queue = strm->queue + (x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (qbits >= 16) {
		/* two complete bytes pending: emit them little-endian, using
		 * two byte stores unless unaligned 16-bit stores are safe.
		 */
#ifndef UNALIGNED_LE_OK
		strm->outbuf[0] = queue;
		strm->outbuf[1] = queue >> 8;
#else
		*(uint16_t *)strm->outbuf = queue;
#endif
		strm->outbuf += 2;
		queue >>= 16;
		qbits -= 16;
	}

	if (qbits >= 8) {
		/* one more complete byte may still be pending */
		qbits -= 8;
		*strm->outbuf++ = queue;
		queue >>= 8;
	}
	strm->qbits = qbits;
	strm->queue = queue;
	return;
}

/* enqueue code x of <xbits> bits (at most 8) and copy complete bytes into
 * out buf. X must not contain non-zero bits above xbits.
 */
static inline void enqueue8(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint32_t queue = strm->queue + (x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	/* (signed)(qbits - 8) >= 0 is equivalent to qbits >= 8, expected true */
	if (__builtin_expect((signed)(qbits - 8) >= 0, 1)) {
		qbits -= 8;
		*strm->outbuf++ = queue;
		queue >>= 8;
	}

	strm->qbits = qbits;
	strm->queue = queue;
}

/* align to next byte */
static inline void flush_bits(struct slz_stream *strm)
{
	/* at most 15 bits can remain queued here, i.e. 2 bytes */
	if (strm->qbits > 0)
		*strm->outbuf++ = strm->queue;

	if (strm->qbits > 8)
		*strm->outbuf++ = strm->queue >> 8;

	strm->queue = 0;
	strm->qbits = 0;
}
#endif
270
271
272/* only valid if buffer is already aligned */
273static inline void copy_8b(struct slz_stream *strm, uint32_t x)
274{
275 *strm->outbuf++ = x;
276}
277
278/* only valid if buffer is already aligned */
279static inline void copy_16b(struct slz_stream *strm, uint32_t x)
280{
281 strm->outbuf[0] = x;
282 strm->outbuf[1] = x >> 8;
283 strm->outbuf += 2;
284}
285
286/* only valid if buffer is already aligned */
287static inline void copy_32b(struct slz_stream *strm, uint32_t x)
288{
289 strm->outbuf[0] = x;
290 strm->outbuf[1] = x >> 8;
291 strm->outbuf[2] = x >> 16;
292 strm->outbuf[3] = x >> 24;
293 strm->outbuf += 4;
294}
295
296static inline void send_huff(struct slz_stream *strm, uint32_t code)
297{
298 uint32_t bits;
299
300 code = fixed_huff[code];
301 bits = code & 15;
302 code >>= 4;
303 enqueue24(strm, code, bits);
304}
305
/* Sends the end-of-block symbol (256), whose fixed-Huffman encoding is seven
 * zero bits (cf RFC1951 3.2.6).
 */
static inline void send_eob(struct slz_stream *strm)
{
	enqueue8(strm, 0, 7); // direct encoding of 256 = EOB (cf RFC1951)
}
310
/* copies <len> literals from <buf>. <more> indicates that there are data past
 * buf + <len>. <len> must not be null. The literals are emitted as one or
 * more uncompressed blocks (BTYPE=00) of at most 65535 bytes each.
 */
static void copy_lit(struct slz_stream *strm, const void *buf, uint32_t len, int more)
{
	uint32_t len2;

	do {
		/* LEN is a 16-bit field, so split large inputs into chunks */
		len2 = len;
		if (__builtin_expect(len2 > 65535, 0))
			len2 = 65535;

		len -= len2;

		/* an open huffman block must be terminated first */
		if (strm->state != SLZ_ST_EOB)
			send_eob(strm);

		strm->state = (more || len) ? SLZ_ST_EOB : SLZ_ST_DONE;

		enqueue8(strm, !(more || len), 3); // BFINAL = !more ; BTYPE = 00
		flush_bits(strm);
		copy_16b(strm, len2);  // len2
		copy_16b(strm, ~len2); // nlen2
		memcpy(strm->outbuf, buf, len2);
		buf += len2;
		strm->outbuf += len2;
	} while (len);
}
339
/* copies <len> literals from <buf>, encoded with the fixed huffman alphabet.
 * <more> indicates that there are data past buf + <len>. <len> must not be
 * null.
 */
static void copy_lit_huff(struct slz_stream *strm, const unsigned char *buf, uint32_t len, int more)
{
	uint32_t pos;

	/* This ugly construct limits the amount of tests and optimizes for the
	 * most common case (more > 0).
	 */
	if (strm->state == SLZ_ST_EOB) {
	eob:
		strm->state = more ? SLZ_ST_FIXED : SLZ_ST_LAST;
		enqueue8(strm, 2 + !more, 3); // BFINAL = !more ; BTYPE = 01
	}
	else if (!more) {
		/* last data of the stream: close the current block and open
		 * a final one.
		 */
		send_eob(strm);
		goto eob;
	}

	pos = 0;
	do {
		send_huff(strm, buf[pos++]);
	} while (pos < len);
}
365
/* format of each refs[] entry (see slz_rfc1951_encode(), 64-bit access):
 * bit0..31  = last position in buffer of similar content
 * bit32..63 = word (4 bytes of content sampled at that position)
 */

/* This hash provides good average results on HTML contents, and is among the
 * few which provide almost optimal results on various different pages.
 */
static inline uint32_t slz_hash(uint32_t a)
{
#if defined(__ARM_FEATURE_CRC32)
	/* a hardware CRC32 of the word makes a cheap, well-spread hash */
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(a) : "r"(0));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32w %0,%0,%1" : "+r"(a) : "r"(0));
# endif
	return a >> (32 - HASH_BITS);
#else
	/* multiply by ((1<<19) + (1<<6) - 1) using shifts, keep top HASH_BITS */
	return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
#endif
}
389
/* This function compares buffers <a> and <b> and reads 32 or 64 bits at a time
 * during the approach. It makes use of unaligned little endian memory accesses
 * on capable architectures. <max> is the maximum number of bytes that can be
 * read, so both <a> and <b> must have at least <max> bytes ahead. <max> may
 * safely be null or negative if that simplifies computations in the caller.
 * It returns the number of leading bytes common to both buffers (0..max).
 */
static inline long memmatch(const unsigned char *a, const unsigned char *b, long max)
{
	long len = 0;

#ifdef UNALIGNED_LE_OK
	unsigned long xor;

	while (1) {
		/* not enough room left for two more word reads: finish the
		 * tail byte by byte.
		 */
		if ((long)(len + 2 * sizeof(long)) > max) {
			while (len < max) {
				if (a[len] != b[len])
					break;
				len++;
			}
			return len;
		}

		/* compare one word at a time; a non-zero xor marks the first
		 * difference, whose position is resolved below.
		 */
		xor = *(long *)&a[len] ^ *(long *)&b[len];
		if (xor)
			break;
		len += sizeof(long);

		xor = *(long *)&a[len] ^ *(long *)&b[len];
		if (xor)
			break;
		len += sizeof(long);
	}

#if defined(__x86_64__) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
	/* x86 has bsf. We know that xor is non-null here */
	asm("bsf %1,%0\n" : "=r"(xor) : "0" (xor));
	return len + xor / 8;
#else
	/* locate the lowest non-zero byte of <xor> by successive masks */
	if (sizeof(long) > 4 && !(xor & 0xffffffff)) {
		/* This code is optimized out on 32-bit archs, but we still
		 * need to shift in two passes to avoid a warning. It is
		 * properly optimized out as a single shift.
		 */
		xor >>= 16; xor >>= 16;
		if (xor & 0xffff) {
			if (xor & 0xff)
				return len + 4;
			return len + 5;
		}
		if (xor & 0xffffff)
			return len + 6;
		return len + 7;
	}

	if (xor & 0xffff) {
		if (xor & 0xff)
			return len;
		return len + 1;
	}
	if (xor & 0xffffff)
		return len + 2;
	return len + 3;
#endif // x86

#else // UNALIGNED_LE_OK
	/* This is the generic version for big endian or unaligned-incompatible
	 * architectures.
	 */
	while (len < max) {
		if (a[len] != b[len])
			break;
		len++;
	}
	return len;

#endif
}
468
/* sets <count> BYTES to -32769 in <refs> so that any uninitialized entry will
 * verify (pos-last-1 >= 32768) and be ignored. <count> must be a multiple of
 * 128 bytes and <refs> must be at least one count in length. It's supposed to
 * be applied to 64-bit aligned data exclusively, which makes it slightly
 * faster than the regular memset() since no alignment check is performed.
 */
static void reset_refs(union ref *refs, long count)
{
	/* avoid a shift/mask by casting to void* */
	union ref *end = (void *)refs + count;

	/* manually unrolled: 16 entries (128 bytes) per iteration */
	do {
		refs[ 0].by64 = -32769;
		refs[ 1].by64 = -32769;
		refs[ 2].by64 = -32769;
		refs[ 3].by64 = -32769;
		refs[ 4].by64 = -32769;
		refs[ 5].by64 = -32769;
		refs[ 6].by64 = -32769;
		refs[ 7].by64 = -32769;
		refs[ 8].by64 = -32769;
		refs[ 9].by64 = -32769;
		refs[10].by64 = -32769;
		refs[11].by64 = -32769;
		refs[12].by64 = -32769;
		refs[13].by64 = -32769;
		refs[14].by64 = -32769;
		refs[15].by64 = -32769;
		refs += 16;
	} while (refs < end);
}
500
/* Compresses <ilen> bytes from <in> into <out> according to RFC1951. The
 * output result may be up to 5 bytes larger than the input, to which 2 extra
 * bytes may be added to send the last chunk due to BFINAL+EOB encoding (10
 * bits) when <more> is not set. The caller is responsible for ensuring there
 * is enough room in the output buffer for this. The amount of output bytes is
 * returned, and no CRC is computed.
 */
long slz_rfc1951_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
{
	long rem = ilen;            /* input bytes not yet consumed */
	unsigned long pos = 0;      /* current input position */
	unsigned long last;         /* previous position of the hashed word */
	uint32_t word = 0;          /* 4 bytes of input at <pos> */
	long mlen;                  /* length of the current match */
	uint32_t h;
	uint64_t ent;               /* entry loaded from the refs[] table */

	uint32_t plit = 0;          /* number of pending literals before <pos> */
	uint32_t bit9 = 0;          /* pending literals needing 9-bit codes */
	uint32_t dist, code;
	union ref refs[1 << HASH_BITS]; /* match table, lives on the stack */

	if (!strm->level) {
		/* force to send as literals (eg to preserve CPU) */
		strm->outbuf = out;
		plit = pos = ilen;
		bit9 = 52; /* force literal dump */
		goto final_lit_dump;
	}

	reset_refs(refs, sizeof(refs));

	strm->outbuf = out;

#ifndef UNALIGNED_FASTER
	/* pre-load 3 bytes so that the loop only shifts in one new byte */
	word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
#endif
	/* main loop: while at least 4 bytes remain, hash them and look for a
	 * back reference; otherwise emit literals.
	 */
	while (rem >= 4) {
#ifndef UNALIGNED_FASTER
		word = ((unsigned char)in[pos + 3] << 24) + (word >> 8);
#else
		word = *(uint32_t *)&in[pos];
#endif
		h = slz_hash(word);
		asm volatile ("" ::); // prevent gcc from trying to be smart with the prefetch

		/* load the previous (pos, word) pair and store the current one */
		if (sizeof(long) >= 8) {
			ent = refs[h].by64;
			last = (uint32_t)ent;
			ent >>= 32;
			refs[h].by64 = ((uint64_t)pos) + ((uint64_t)word << 32);
		} else {
			ent = refs[h].by32.word;
			last = refs[h].by32.pos;
			refs[h].by32.pos = pos;
			refs[h].by32.word = word;
		}

#ifdef FIND_OPTIMAL_MATCH
		/* Experimental code to see what could be saved with an ideal
		 * longest match lookup algorithm. This one is very slow but
		 * scans the whole window. In short, here are the savings :
		 *   file        orig     fast(ratio)  optimal(ratio)
		 *   README      5185     3419 (65.9%) 3165 (61.0%)  -7.5%
		 *   index.html  76799   35662 (46.4%) 29875 (38.9%) -16.3%
		 *   rfc1952.c   29383   13442 (45.7%) 11793 (40.1%) -12.3%
		 *
		 * Thus the savings to expect for large files is at best 16%.
		 *
		 * A non-colliding hash gives 33025 instead of 35662 (-7.4%),
		 * and keeping the last two entries gives 31724 (-11.0%).
		 */
		unsigned long scan;
		int saved = 0;
		int bestpos = 0;
		int bestlen = 0;
		int firstlen = 0;
		int max_lookup = 2; // 0 = no limit

		for (scan = pos - 1; scan < pos && (unsigned long)(pos - scan - 1) < 32768; scan--) {
			int len;

			if (*(uint32_t *)(in + scan) != word)
				continue;

			len = memmatch(in + pos, in + scan, rem);
			if (!bestlen)
				firstlen = len;

			if (len > bestlen) {
				bestlen = len;
				bestpos = scan;
			}
			if (!--max_lookup)
				break;
		}
		if (bestlen) {
			//printf("pos=%d last=%d bestpos=%d word=%08x ent=%08x len=%d\n",
			//       (int)pos, (int)last, (int)bestpos, (int)word, (int)ent, bestlen);
			last = bestpos;
			ent = word;
			saved += bestlen - firstlen;
		}
		//fprintf(stderr, "first=%d best=%d saved_total=%d\n", firstlen, bestlen, saved);
#endif

		/* no match: account one more pending literal and advance */
		if ((uint32_t)ent != word) {
	send_as_lit:
			rem--;
			plit++;
			bit9 += ((unsigned char)word >= 144);
			pos++;
			continue;
		}

		/* We reject pos = last and pos > last+32768 */
		if ((unsigned long)(pos - last - 1) >= 32768)
			goto send_as_lit;

		/* Note: cannot encode a length larger than 258 bytes */
		mlen = memmatch(in + pos + 4, in + last + 4, (rem > 258 ? 258 : rem) - 4) + 4;

		/* found a matching entry */

		if (bit9 >= 52 && mlen < 6)
			goto send_as_lit;

		/* compute the output code, its size and the length's size in
		 * bits to know if the reference is cheaper than literals.
		 */
		code = len_fh[mlen];

		/* direct mapping of dist->huffman code */
		dist = fh_dist_table[pos - last - 1];

		/* if encoding the dist+length is more expensive than sending
		 * the equivalent as bytes, lets keep the literals.
		 */
		if ((dist & 0x1f) + (code >> 16) + 8 >= 8 * mlen + bit9)
			goto send_as_lit;

		/* first, copy pending literals */
		if (plit) {
			/* Huffman encoding requires 9 bits for octets 144..255, so this
			 * is a waste of space for binary data. Switching between Huffman
			 * and no-comp then huffman consumes 52 bits (7 for EOB + 3 for
			 * block type + 7 for alignment + 32 for LEN+NLEN + 3 for next
			 * block. Only use plain literals if there are more than 52 bits
			 * to save then.
			 */
			if (bit9 >= 52)
				copy_lit(strm, in + pos - plit, plit, 1);
			else
				copy_lit_huff(strm, in + pos - plit, plit, 1);

			plit = 0;
		}

		/* use mode 01 - fixed huffman */
		if (strm->state == SLZ_ST_EOB) {
			strm->state = SLZ_ST_FIXED;
			enqueue8(strm, 0x02, 3); // BTYPE = 01, BFINAL = 0
		}

		/* copy the length first */
		enqueue24(strm, code & 0xFFFF, code >> 16);

		/* in fixed huffman mode, dist is fixed 5 bits */
		enqueue24(strm, dist >> 5, dist & 0x1f);
		bit9 = 0;
		rem -= mlen;
		pos += mlen;

#ifndef UNALIGNED_FASTER
#ifdef UNALIGNED_LE_OK
		word = *(uint32_t *)&in[pos - 1];
#else
		word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
#endif
#endif
	}

	if (__builtin_expect(rem, 0)) {
		/* we're reading the 1..3 last bytes */
		plit += rem;
		do {
			bit9 += ((unsigned char)in[pos++] >= 144);
		} while (--rem);
	}

 final_lit_dump:
	/* now copy remaining literals or mark the end */
	if (plit) {
		if (bit9 >= 52)
			copy_lit(strm, in + pos - plit, plit, more);
		else
			copy_lit_huff(strm, in + pos - plit, plit, more);

		plit = 0;
	}

	strm->ilen += ilen;
	return strm->outbuf - out;
}
705
706/* Initializes stream <strm> for use with raw deflate (rfc1951). The CRC is
707 * unused but set to zero. The compression level passed in <level> is set. This
708 * value can only be 0 (no compression) or 1 (compression) and other values
709 * will lead to unpredictable behaviour. The function always returns 0.
710 */
711int slz_rfc1951_init(struct slz_stream *strm, int level)
712{
713 strm->state = SLZ_ST_EOB; // no header
714 strm->level = level;
715 strm->format = SLZ_FMT_DEFLATE;
716 strm->crc32 = 0;
717 strm->ilen = 0;
718 strm->qbits = 0;
719 strm->queue = 0;
720 return 0;
721}
722
/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
 * empty literal block to byte-align the output, allowing to completely flush
 * the queue. This requires that the output buffer still has the size of the
 * queue available (up to 4 bytes), plus one byte for (BFINAL,BTYPE), plus 4
 * bytes for LEN+NLEN, or a total of 9 bytes in the worst case. The number of
 * bytes emitted is returned. It is guaranteed that the queue is empty on
 * return. This may cause some overhead by adding needless 5-byte blocks if
 * called too often.
 */
int slz_rfc1951_flush(struct slz_stream *strm, unsigned char *buf)
{
	strm->outbuf = buf;

	/* The queue is always empty on INIT, DONE, and END */
	if (!strm->qbits)
		return 0;

	/* we may need to terminate a huffman output. Lit is always in EOB state */
	if (strm->state != SLZ_ST_EOB) {
		strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
		send_eob(strm);
	}

	/* send BFINAL according to state, and BTYPE=00 (lit) */
	enqueue8(strm, (strm->state == SLZ_ST_DONE) ? 1 : 0, 3);
	flush_bits(strm);            // emit pending bits
	copy_32b(strm, 0xFFFF0000U); // len=0, nlen=~0

	/* Now the queue is empty, EOB was sent, BFINAL might have been sent if
	 * we completed the last block, and a zero-byte block was sent to byte-
	 * align the output. The last state reflects all this. Let's just
	 * return the number of bytes added to the output buffer.
	 */
	return strm->outbuf - buf;
}
758
/* Flushes any pending data for stream <strm> into buffer <buf>, then sends BTYPE=1
 * and BFINAL=1 if needed. The stream ends in SLZ_ST_DONE. It returns the number
 * of bytes emitted. The trailer consists in flushing the possibly pending bits
 * from the queue (up to 7 bits), then possibly EOB (7 bits), then 3 bits, EOB,
 * a rounding to the next byte, which amounts to a total of 4 bytes max, that
 * the caller must ensure are available before calling the function.
 */
int slz_rfc1951_finish(struct slz_stream *strm, unsigned char *buf)
{
	strm->outbuf = buf;

	/* close a possibly open huffman block */
	if (strm->state == SLZ_ST_FIXED || strm->state == SLZ_ST_LAST) {
		strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
		send_eob(strm);
	}

	if (strm->state != SLZ_ST_DONE) {
		/* send BTYPE=1, BFINAL=1 */
		enqueue8(strm, 3, 3);
		send_eob(strm);
		strm->state = SLZ_ST_DONE;
	}

	flush_bits(strm);
	return strm->outbuf - buf;
}
785
786/* Now RFC1952-specific declarations and extracts from RFC.
787 * From RFC1952 about the GZIP file format :
788
789A gzip file consists of a series of "members" ...
790
7912.3. Member format
792
793 Each member has the following structure:
794
795 +---+---+---+---+---+---+---+---+---+---+
796 |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
797 +---+---+---+---+---+---+---+---+---+---+
798
799 (if FLG.FEXTRA set)
800
801 +---+---+=================================+
802 | XLEN |...XLEN bytes of "extra field"...| (more-->)
803 +---+---+=================================+
804
805 (if FLG.FNAME set)
806
807 +=========================================+
808 |...original file name, zero-terminated...| (more-->)
809 +=========================================+
810
811 (if FLG.FCOMMENT set)
812
813 +===================================+
814 |...file comment, zero-terminated...| (more-->)
815 +===================================+
816
817 (if FLG.FHCRC set)
818
819 +---+---+
820 | CRC16 |
821 +---+---+
822
823 +=======================+
824 |...compressed blocks...| (more-->)
825 +=======================+
826
827 0 1 2 3 4 5 6 7
828 +---+---+---+---+---+---+---+---+
829 | CRC32 | ISIZE |
830 +---+---+---+---+---+---+---+---+
831
832
8332.3.1. Member header and trailer
834
835 ID1 (IDentification 1)
836 ID2 (IDentification 2)
837 These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139
838 (0x8b, \213), to identify the file as being in gzip format.
839
840 CM (Compression Method)
841 This identifies the compression method used in the file. CM
842 = 0-7 are reserved. CM = 8 denotes the "deflate"
843 compression method, which is the one customarily used by
844 gzip and which is documented elsewhere.
845
846 FLG (FLaGs)
847 This flag byte is divided into individual bits as follows:
848
849 bit 0 FTEXT
850 bit 1 FHCRC
851 bit 2 FEXTRA
852 bit 3 FNAME
853 bit 4 FCOMMENT
854 bit 5 reserved
855 bit 6 reserved
856 bit 7 reserved
857
858 Reserved FLG bits must be zero.
859
860 MTIME (Modification TIME)
861 This gives the most recent modification time of the original
862 file being compressed. The time is in Unix format, i.e.,
863 seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this
864 may cause problems for MS-DOS and other systems that use
865 local rather than Universal time.) If the compressed data
866 did not come from a file, MTIME is set to the time at which
867 compression started. MTIME = 0 means no time stamp is
868 available.
869
870 XFL (eXtra FLags)
871 These flags are available for use by specific compression
872 methods. The "deflate" method (CM = 8) sets these flags as
873 follows:
874
875 XFL = 2 - compressor used maximum compression,
876 slowest algorithm
877 XFL = 4 - compressor used fastest algorithm
878
879 OS (Operating System)
880 This identifies the type of file system on which compression
881 took place. This may be useful in determining end-of-line
882 convention for text files. The currently defined values are
883 as follows:
884
885 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
886 1 - Amiga
887 2 - VMS (or OpenVMS)
888 3 - Unix
889 4 - VM/CMS
890 5 - Atari TOS
891 6 - HPFS filesystem (OS/2, NT)
892 7 - Macintosh
893 8 - Z-System
894 9 - CP/M
895 10 - TOPS-20
896 11 - NTFS filesystem (NT)
897 12 - QDOS
898 13 - Acorn RISCOS
899 255 - unknown
900
901 ==> A file compressed using "gzip -1" on Unix-like systems can be :
902
903 1F 8B 08 00 00 00 00 00 04 03
904 <deflate-compressed stream>
905 crc32 size32
906*/
907
/* Fixed 10-byte gzip member header (cf RFC1952 2.3): deflate method, no
 * flags, no mtime, "fastest algorithm" XFL, OS=Unix.
 */
static const unsigned char gzip_hdr[] = { 0x1F, 0x8B, // ID1, ID2
                                          0x08, 0x00, // Deflate, flags (none)
                                          0x00, 0x00, 0x00, 0x00, // mtime: none
                                          0x04, 0x03 }; // fastest comp, OS=Unix
912
/* Updates CRC32 <crc> with the single byte <x> and returns the new value.
 * Works with the non-inverting convention: the pre/post complement is done
 * around the hardware instruction where one is used.
 */
static inline uint32_t crc32_char(uint32_t crc, uint8_t x)
{
#if defined(__ARM_FEATURE_CRC32)
	crc = ~crc;
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32b %w0,%w0,%w1" : "+r"(crc) : "r"(x));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32b %0,%0,%1" : "+r"(crc) : "r"(x));
# endif
	crc = ~crc;
#else
	crc = crc32_fast[0][(crc ^ x) & 0xff] ^ (crc >> 8);
#endif
	return crc;
}
930
/* Folds the 32-bit word <data> (already XORed with the current CRC by the
 * caller, see slz_crc32_by4()) through the CRC32 polynomial and returns the
 * result.
 */
static inline uint32_t crc32_uint32(uint32_t data)
{
#if defined(__ARM_FEATURE_CRC32)
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(data) : "r"(~0UL));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32w %0,%0,%1" : "+r"(data) : "r"(~0UL));
# endif
	data = ~data;
#else
	data = crc32_fast[3][(data >> 0) & 0xff] ^
	       crc32_fast[2][(data >> 8) & 0xff] ^
	       crc32_fast[1][(data >> 16) & 0xff] ^
	       crc32_fast[0][(data >> 24) & 0xff];
#endif
	return data;
}
950
/* Byte-at-a-time CRC32 (modified version originally from RFC1952, working
 * with non-inverting CRCs): folds <len> bytes from <buf> into <crc> and
 * returns the updated value.
 */
uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	const unsigned char *end = buf + len;

	while (buf < end)
		crc = crc32_char(crc, *buf++);
	return crc;
}
960
/* This version computes the crc32 of <buf> over <len> bytes, doing most of it
 * in 32-bit chunks (16 bytes per iteration of the main loop, then 4, then a
 * byte-at-a-time tail).
 */
uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len)
{
	const unsigned char *end = buf + len;

	/* main loop: 16 bytes at a time */
	while (buf <= end - 16) {
#ifdef UNALIGNED_LE_OK
#if defined(__ARM_FEATURE_CRC32)
		/* hardware CRC uses the inverting convention, so complement
		 * around the instruction sequence.
		 */
		crc = ~crc;
# if defined(__ARM_ARCH_ISA_A64)
		// 64 bit mode
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
# else
		// 32 bit mode (e.g. armv7 compiler building for armv8)
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
# endif
		crc = ~crc;
#else
		crc ^= *(uint32_t *)buf;
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 4);
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 8);
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 12);
		crc = crc32_uint32(crc);
#endif
#else
		/* table-driven version for strict-alignment / big endian */
		crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[4] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[5] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[6] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[7] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[8] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[9] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[10] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[11] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[12] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[13] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[14] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[15] ^ (crc >> 24)) & 0xff];
#endif
		buf += 16;
	}

	/* then 4 bytes at a time */
	while (buf <= end - 4) {
#ifdef UNALIGNED_LE_OK
		crc ^= *(uint32_t *)buf;
		crc = crc32_uint32(crc);
#else
		crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];
#endif
		buf += 4;
	}

	/* and the 0..3 trailing bytes */
	while (buf < end)
		crc = crc32_char(crc, *buf++);
	return crc;
}
1040
1041/* uses the most suitable crc32 function to update crc on <buf, len> */
/* Uses the most suitable crc32 function available to update <sum> with the
 * <bytes> bytes at <data>, and returns the new value.
 */
static inline uint32_t update_crc(uint32_t sum, const void *data, int bytes)
{
	return slz_crc32_by4(sum, data, bytes);
}
1046
1047/* Sends the gzip header for stream <strm> into buffer <buf>. When it's done,
1048 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
1049 * emitted which is always 10. The caller is responsible for ensuring there's
1050 * always enough room in the buffer.
1051 */
1052int slz_rfc1952_send_header(struct slz_stream *strm, unsigned char *buf)
1053{
1054 memcpy(buf, gzip_hdr, sizeof(gzip_hdr));
1055 strm->state = SLZ_ST_EOB;
1056 return sizeof(gzip_hdr);
1057}
1058
1059/* Encodes the block according to rfc1952. This means that the CRC of the input
1060 * block is computed according to the CRC32 algorithm. If the header was never
1061 * sent, it may be sent first. The number of output bytes is returned.
1062 */
1063long slz_rfc1952_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
1064{
1065 long ret = 0;
1066
1067 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1068 ret += slz_rfc1952_send_header(strm, out);
1069
1070 strm->crc32 = update_crc(strm->crc32, in, ilen);
1071 ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
1072 return ret;
1073}
1074
1075/* Initializes stream <strm> for use with the gzip format (rfc1952). The
1076 * compression level passed in <level> is set. This value can only be 0 (no
1077 * compression) or 1 (compression) and other values will lead to unpredictable
1078 * behaviour. The function always returns 0.
1079 */
1080int slz_rfc1952_init(struct slz_stream *strm, int level)
1081{
1082 strm->state = SLZ_ST_INIT;
1083 strm->level = level;
1084 strm->format = SLZ_FMT_GZIP;
1085 strm->crc32 = 0;
1086 strm->ilen = 0;
1087 strm->qbits = 0;
1088 strm->queue = 0;
1089 return 0;
1090}
1091
Willy Tarreau3ffe11e2023-06-26 19:24:55 +02001092/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
1093 * empty literal block to byte-align the output, allowing to completely flush
1094 * the queue. Note that if the initial header was never sent, it will be sent
1095 * first as well (10 extra bytes). This requires that the output buffer still
1096 * has this plus the size of the queue available (up to 4 bytes), plus one byte
1097 * for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a total of 19 bytes in the
1098 * worst case. The number of bytes emitted is returned. It is guaranteed that
1099 * the queue is empty on return. This may cause some overhead by adding
 * needless 5-byte blocks if called too often.
1101 */
1102int slz_rfc1952_flush(struct slz_stream *strm, unsigned char *buf)
1103{
1104 int sent = 0;
1105
1106 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1107 sent = slz_rfc1952_send_header(strm, buf);
1108
1109 sent += slz_rfc1951_flush(strm, buf + sent);
1110 return sent;
1111}
1112
Willy Tarreauab2b7822021-04-22 14:09:44 +02001113/* Flushes pending bits and sends the gzip trailer for stream <strm> into
1114 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
1115 * returns the number of bytes emitted. The trailer consists in flushing the
1116 * possibly pending bits from the queue (up to 24 bits), rounding to the next
1117 * byte, then 4 bytes for the CRC and another 4 bytes for the input length.
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +05001118 * That may about to 4+4+4 = 12 bytes, that the caller must ensure are
Willy Tarreauab2b7822021-04-22 14:09:44 +02001119 * available before calling the function. Note that if the initial header was
1120 * never sent, it will be sent first as well (10 extra bytes).
1121 */
1122int slz_rfc1952_finish(struct slz_stream *strm, unsigned char *buf)
1123{
1124 strm->outbuf = buf;
1125
1126 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1127 strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
1128
1129 slz_rfc1951_finish(strm, strm->outbuf);
1130 copy_32b(strm, strm->crc32);
1131 copy_32b(strm, strm->ilen);
1132 strm->state = SLZ_ST_END;
1133
1134 return strm->outbuf - buf;
1135}
1136
1137
1138/* RFC1950-specific stuff. This is for the Zlib stream format.
1139 * From RFC1950 (zlib) :
1140 *
1141
1142 2.2. Data format
1143
1144 A zlib stream has the following structure:
1145
1146 0 1
1147 +---+---+
1148 |CMF|FLG| (more-->)
1149 +---+---+
1150
1151
1152 (if FLG.FDICT set)
1153
1154 0 1 2 3
1155 +---+---+---+---+
1156 | DICTID | (more-->)
1157 +---+---+---+---+
1158
1159 +=====================+---+---+---+---+
1160 |...compressed data...| ADLER32 |
1161 +=====================+---+---+---+---+
1162
1163 Any data which may appear after ADLER32 are not part of the zlib
1164 stream.
1165
1166 CMF (Compression Method and flags)
1167 This byte is divided into a 4-bit compression method and a 4-
1168 bit information field depending on the compression method.
1169
1170 bits 0 to 3 CM Compression method
1171 bits 4 to 7 CINFO Compression info
1172
1173 CM (Compression method)
1174 This identifies the compression method used in the file. CM = 8
1175 denotes the "deflate" compression method with a window size up
1176 to 32K. This is the method used by gzip and PNG (see
1177 references [1] and [2] in Chapter 3, below, for the reference
1178 documents). CM = 15 is reserved. It might be used in a future
1179 version of this specification to indicate the presence of an
1180 extra field before the compressed data.
1181
1182 CINFO (Compression info)
1183 For CM = 8, CINFO is the base-2 logarithm of the LZ77 window
1184 size, minus eight (CINFO=7 indicates a 32K window size). Values
1185 of CINFO above 7 are not allowed in this version of the
1186 specification. CINFO is not defined in this specification for
1187 CM not equal to 8.
1188
1189 FLG (FLaGs)
1190 This flag byte is divided as follows:
1191
1192 bits 0 to 4 FCHECK (check bits for CMF and FLG)
1193 bit 5 FDICT (preset dictionary)
1194 bits 6 to 7 FLEVEL (compression level)
1195
1196 The FCHECK value must be such that CMF and FLG, when viewed as
1197 a 16-bit unsigned integer stored in MSB order (CMF*256 + FLG),
1198 is a multiple of 31.
1199
1200
1201 FDICT (Preset dictionary)
1202 If FDICT is set, a DICT dictionary identifier is present
1203 immediately after the FLG byte. The dictionary is a sequence of
1204 bytes which are initially fed to the compressor without
1205 producing any compressed output. DICT is the Adler-32 checksum
1206 of this sequence of bytes (see the definition of ADLER32
1207 below). The decompressor can use this identifier to determine
1208 which dictionary has been used by the compressor.
1209
1210 FLEVEL (Compression level)
1211 These flags are available for use by specific compression
1212 methods. The "deflate" method (CM = 8) sets these flags as
1213 follows:
1214
1215 0 - compressor used fastest algorithm
1216 1 - compressor used fast algorithm
1217 2 - compressor used default algorithm
1218 3 - compressor used maximum compression, slowest algorithm
1219
1220 The information in FLEVEL is not needed for decompression; it
1221 is there to indicate if recompression might be worthwhile.
1222
1223 compressed data
1224 For compression method 8, the compressed data is stored in the
1225 deflate compressed data format as described in the document
1226 "DEFLATE Compressed Data Format Specification" by L. Peter
1227 Deutsch. (See reference [3] in Chapter 3, below)
1228
1229 Other compressed data formats are not specified in this version
1230 of the zlib specification.
1231
1232 ADLER32 (Adler-32 checksum)
1233 This contains a checksum value of the uncompressed data
1234 (excluding any dictionary data) computed according to Adler-32
1235 algorithm. This algorithm is a 32-bit extension and improvement
1236 of the Fletcher algorithm, used in the ITU-T X.224 / ISO 8073
1237 standard. See references [4] and [5] in Chapter 3, below)
1238
1239 Adler-32 is composed of two sums accumulated per byte: s1 is
1240 the sum of all bytes, s2 is the sum of all s1 values. Both sums
1241 are done modulo 65521. s1 is initialized to 1, s2 to zero. The
1242 Adler-32 checksum is stored as s2*65536 + s1 in most-
1243 significant-byte first (network) order.
1244
1245 ==> The stream can start with only 2 bytes :
1246 - CM = 0x78 : CMINFO=7 (32kB window), CM=8 (deflate)
1247 - FLG = 0x01 : FLEVEL = 0 (fastest), FDICT=0 (no dict), FCHECK=1 so
1248 that 0x7801 is a multiple of 31 (30721 = 991 * 31).
1249
1250 ==> and it ends with only 4 bytes, the Adler-32 checksum in big-endian format.
1251
1252 */
1253
1254static const unsigned char zlib_hdr[] = { 0x78, 0x01 }; // 32k win, deflate, chk=1
1255
1256
1257/* Original version from RFC1950, verified and works OK */
/* Original version from RFC1950, verified and works OK. Updates Adler-32
 * checksum <crc> with the <len> bytes at <buf>, one byte at a time, using
 * the straightforward per-byte modulus, and returns the new value.
 */
uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	uint32_t lo = crc & 0xffff;         /* s1: running byte sum */
	uint32_t hi = (crc >> 16) & 0xffff; /* s2: sum of all s1 values */
	int pos;

	for (pos = 0; pos < len; pos++) {
		lo = (lo + buf[pos]) % 65521;
		hi = (hi + lo) % 65521;
	}
	return (hi << 16) + lo;
}
1270
1271/* Computes the adler32 sum on <buf> for <len> bytes. It avoids the expensive
1272 * modulus by retrofitting the number of bytes missed between 65521 and 65536
1273 * which is easy to count : For every sum above 65536, the modulus is offset
1274 * by (65536-65521) = 15. So for any value, we can count the accumulated extra
1275 * values by dividing the sum by 65536 and multiplying this value by
1276 * (65536-65521). That's easier with a drawing with boxes and marbles. It gives
1277 * this :
1278 * x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
1279 * = (x & 0xffff) + (x >> 16) * 15.
1280 */
/* Computes the adler32 sum on <buf> for <len> bytes. It avoids the expensive
 * modulus by retrofitting the number of bytes missed between 65521 and 65536
 * which is easy to count: for every sum above 65536, the modulus is offset
 * by (65536-65521) = 15. So for any value:
 *   x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
 *             = (x & 0xffff) + (x >> 16) * 15.
 */
uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len)
{
	long s1 = crc & 0xffff;
	long s2 = (crc >> 16);

	do {
		/* work on chunks of at most 4096 bytes so that s2 can never
		 * overflow (limit is about 2^((32-8)/2)) */
		long chunk = len;
		long i;

		if (chunk > (1U << 12))
			chunk = 1U << 12;
		len -= chunk;

		for (i = 0; i < chunk; i++) {
			s1 += buf[i];
			s2 += s1;
		}

		/* Fold s1: largest value here is 2^12 * 255 = 1044480 < 2^20,
		 * so (s1 >> 16) is at most 15 and the folded sum may exceed
		 * 65521 only once. Values between 65521 and 65536 are also
		 * handled by the final subtraction.
		 */
		s1 = (s1 & 0xffff) + 15 * (s1 >> 16);
		if (s1 >= 65521)
			s1 -= 65521;

		/* Fold s2: estimated up to 2^32-1 for simplicity, so the
		 * right hand side is about 15*65535 = 983025 and two folds
		 * are needed before the final subtraction.
		 */
		s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
		s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
		if (s2 >= 65521)
			s2 -= 65521;

		buf += chunk;
	} while (len);
	return (s2 << 16) + s1;
}
1322
1323/* Sends the zlib header for stream <strm> into buffer <buf>. When it's done,
1324 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
1325 * emitted which is always 2. The caller is responsible for ensuring there's
1326 * always enough room in the buffer.
1327 */
1328int slz_rfc1950_send_header(struct slz_stream *strm, unsigned char *buf)
1329{
1330 memcpy(buf, zlib_hdr, sizeof(zlib_hdr));
1331 strm->state = SLZ_ST_EOB;
1332 return sizeof(zlib_hdr);
1333}
1334
1335/* Encodes the block according to rfc1950. This means that the CRC of the input
1336 * block is computed according to the ADLER32 algorithm. If the header was never
1337 * sent, it may be sent first. The number of output bytes is returned.
1338 */
1339long slz_rfc1950_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
1340{
1341 long ret = 0;
1342
1343 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1344 ret += slz_rfc1950_send_header(strm, out);
1345
1346 strm->crc32 = slz_adler32_block(strm->crc32, in, ilen);
1347 ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
1348 return ret;
1349}
1350
/* Initializes stream <strm> for use with the zlib format (rfc1950). The
1352 * compression level passed in <level> is set. This value can only be 0 (no
1353 * compression) or 1 (compression) and other values will lead to unpredictable
1354 * behaviour. The function always returns 0.
1355 */
1356int slz_rfc1950_init(struct slz_stream *strm, int level)
1357{
1358 strm->state = SLZ_ST_INIT;
1359 strm->level = level;
1360 strm->format = SLZ_FMT_ZLIB;
1361 strm->crc32 = 1; // rfc1950/zlib starts with initial crc=1
1362 strm->ilen = 0;
1363 strm->qbits = 0;
1364 strm->queue = 0;
1365 return 0;
1366}
1367
Willy Tarreau3ffe11e2023-06-26 19:24:55 +02001368/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
1369 * empty literal block to byte-align the output, allowing to completely flush
1370 * the queue. Note that if the initial header was never sent, it will be sent
1371 * first as well (2 extra bytes). This requires that the output buffer still
1372 * has this plus the size of the queue available (up to 4 bytes), plus one byte
1373 * for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a total of 11 bytes in the
1374 * worst case. The number of bytes emitted is returned. It is guaranteed that
1375 * the queue is empty on return. This may cause some overhead by adding
 * needless 5-byte blocks if called too often.
1377 */
1378int slz_rfc1950_flush(struct slz_stream *strm, unsigned char *buf)
1379{
1380 int sent = 0;
1381
1382 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1383 sent = slz_rfc1950_send_header(strm, buf);
1384
1385 sent += slz_rfc1951_flush(strm, buf + sent);
1386 return sent;
1387}
1388
Willy Tarreauab2b7822021-04-22 14:09:44 +02001389/* Flushes pending bits and sends the gzip trailer for stream <strm> into
1390 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
1391 * returns the number of bytes emitted. The trailer consists in flushing the
1392 * possibly pending bits from the queue (up to 24 bits), rounding to the next
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +05001393 * byte, then 4 bytes for the CRC. That may about to 4+4 = 8 bytes, that the
Willy Tarreauab2b7822021-04-22 14:09:44 +02001394 * caller must ensure are available before calling the function. Note that if
1395 * the initial header was never sent, it will be sent first as well (2 extra
1396 * bytes).
1397 */
1398int slz_rfc1950_finish(struct slz_stream *strm, unsigned char *buf)
1399{
1400 strm->outbuf = buf;
1401
1402 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1403 strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
1404
1405 slz_rfc1951_finish(strm, strm->outbuf);
1406 copy_8b(strm, (strm->crc32 >> 24) & 0xff);
1407 copy_8b(strm, (strm->crc32 >> 16) & 0xff);
1408 copy_8b(strm, (strm->crc32 >> 8) & 0xff);
1409 copy_8b(strm, (strm->crc32 >> 0) & 0xff);
1410 strm->state = SLZ_ST_END;
1411 return strm->outbuf - buf;
1412}
1413
Willy Tarreauab2b7822021-04-22 14:09:44 +02001414__attribute__((constructor))
1415static void __slz_initialize(void)
1416{
Willy Tarreau9e274282021-05-12 08:36:09 +02001417#if !defined(__ARM_FEATURE_CRC32)
Willy Tarreauab2b7822021-04-22 14:09:44 +02001418 __slz_make_crc_table();
Willy Tarreau9e274282021-05-12 08:36:09 +02001419#endif
Willy Tarreauab2b7822021-04-22 14:09:44 +02001420 __slz_prepare_dist_table();
1421}