/*
 * Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25#include <stdint.h>
26#include <stdio.h>
27#include <string.h>
28#include <import/slz.h>
29#include <import/slz-tables.h>
30
31/* First, RFC1951-specific declarations and extracts from the RFC.
32 *
33 * RFC1951 - deflate stream format
34
35
36 * Data elements are packed into bytes in order of
37 increasing bit number within the byte, i.e., starting
38 with the least-significant bit of the byte.
39 * Data elements other than Huffman codes are packed
40 starting with the least-significant bit of the data
41 element.
42 * Huffman codes are packed starting with the most-
43 significant bit of the code.
44
45 3.2.3. Details of block format
46
47 Each block of compressed data begins with 3 header bits
48 containing the following data:
49
50 first bit BFINAL
51 next 2 bits BTYPE
52
53 Note that the header bits do not necessarily begin on a byte
54 boundary, since a block does not necessarily occupy an integral
55 number of bytes.
56
57 BFINAL is set if and only if this is the last block of the data
58 set.
59
60 BTYPE specifies how the data are compressed, as follows:
61
62 00 - no compression
63 01 - compressed with fixed Huffman codes
64 10 - compressed with dynamic Huffman codes
65 11 - reserved (error)
66
67 3.2.4. Non-compressed blocks (BTYPE=00)
68
69 Any bits of input up to the next byte boundary are ignored.
70 The rest of the block consists of the following information:
71
72 0 1 2 3 4...
73 +---+---+---+---+================================+
74 | LEN | NLEN |... LEN bytes of literal data...|
75 +---+---+---+---+================================+
76
77 LEN is the number of data bytes in the block. NLEN is the
78 one's complement of LEN.
79
80 3.2.5. Compressed blocks (length and distance codes)
81
82 As noted above, encoded data blocks in the "deflate" format
83 consist of sequences of symbols drawn from three conceptually
84 distinct alphabets: either literal bytes, from the alphabet of
85 byte values (0..255), or <length, backward distance> pairs,
86 where the length is drawn from (3..258) and the distance is
87 drawn from (1..32,768). In fact, the literal and length
88 alphabets are merged into a single alphabet (0..285), where
89 values 0..255 represent literal bytes, the value 256 indicates
90 end-of-block, and values 257..285 represent length codes
91 (possibly in conjunction with extra bits following the symbol
92 code) as follows:
93
94Length encoding :
95 Extra Extra Extra
96 Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
97 ---- ---- ------ ---- ---- ------- ---- ---- -------
98 257 0 3 267 1 15,16 277 4 67-82
99 258 0 4 268 1 17,18 278 4 83-98
100 259 0 5 269 2 19-22 279 4 99-114
101 260 0 6 270 2 23-26 280 4 115-130
102 261 0 7 271 2 27-30 281 5 131-162
103 262 0 8 272 2 31-34 282 5 163-194
104 263 0 9 273 3 35-42 283 5 195-226
105 264 0 10 274 3 43-50 284 5 227-257
106 265 1 11,12 275 3 51-58 285 0 258
107 266 1 13,14 276 3 59-66
108
109Distance encoding :
110 Extra Extra Extra
111 Code Bits Dist Code Bits Dist Code Bits Distance
112 ---- ---- ---- ---- ---- ------ ---- ---- --------
113 0 0 1 10 4 33-48 20 9 1025-1536
114 1 0 2 11 4 49-64 21 9 1537-2048
115 2 0 3 12 5 65-96 22 10 2049-3072
116 3 0 4 13 5 97-128 23 10 3073-4096
117 4 1 5,6 14 6 129-192 24 11 4097-6144
118 5 1 7,8 15 6 193-256 25 11 6145-8192
119 6 2 9-12 16 7 257-384 26 12 8193-12288
120 7 2 13-16 17 7 385-512 27 12 12289-16384
121 8 3 17-24 18 8 513-768 28 13 16385-24576
122 9 3 25-32 19 8 769-1024 29 13 24577-32768
123
124 3.2.6. Compression with fixed Huffman codes (BTYPE=01)
125
126 The Huffman codes for the two alphabets are fixed, and are not
127 represented explicitly in the data. The Huffman code lengths
128 for the literal/length alphabet are:
129
130 Lit Value Bits Codes
131 --------- ---- -----
132 0 - 143 8 00110000 through
133 10111111
134 144 - 255 9 110010000 through
135 111111111
136 256 - 279 7 0000000 through
137 0010111
138 280 - 287 8 11000000 through
139 11000111
140
141 The code lengths are sufficient to generate the actual codes,
142 as described above; we show the codes in the table for added
143 clarity. Literal/length values 286-287 will never actually
144 occur in the compressed data, but participate in the code
145 construction.
146
147 Distance codes 0-31 are represented by (fixed-length) 5-bit
148 codes, with possible additional bits as shown in the table
149 shown in Paragraph 3.2.5, above. Note that distance codes 30-
150 31 will never actually occur in the compressed data.
151
152*/
153
/* back references, built in a way that is optimal for 32/64 bits */
union ref {
	struct {
		uint32_t pos;  /* position of the last occurrence of <word> in the input */
		uint32_t word; /* 32 bits of input content seen at <pos> */
	} by32;
	uint64_t by64;         /* same pair accessed as a single 64-bit word */
};
162
163#if defined(USE_64BIT_QUEUE) && defined(UNALIGNED_LE_OK)
164
/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
 * 32-bit words into output buffer. X must not contain non-zero bits above
 * xbits.
 */
static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	/* append the new code above the bits already pending in the queue */
	uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (__builtin_expect(qbits >= 32, 1)) {
		/* a full 32-bit word is ready: store it in one unaligned
		 * little-endian write (this branch is only compiled when
		 * UNALIGNED_LE_OK is defined, see the #if above).
		 */
		*(uint32_t *)strm->outbuf = queue;
		queue >>= 32;
		qbits -= 32;
		strm->outbuf += 4;
	}

	strm->queue = queue;
	strm->qbits = qbits;
}

/* with the 64-bit queue, small codes need no dedicated implementation */
#define enqueue8 enqueue24
186
/* flush the queue and align to next byte */
static inline void flush_bits(struct slz_stream *strm)
{
	/* emit one byte per started group of 8 pending bits; at most 4
	 * bytes are written here.
	 */
	if (strm->qbits > 0)
		*strm->outbuf++ = strm->queue;

	if (strm->qbits > 8)
		*strm->outbuf++ = strm->queue >> 8;

	if (strm->qbits > 16)
		*strm->outbuf++ = strm->queue >> 16;

	if (strm->qbits > 24)
		*strm->outbuf++ = strm->queue >> 24;

	strm->queue = 0;
	strm->qbits = 0;
}
205
206#else /* non-64 bit or aligned or big endian */
207
/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
 * bytes into out buf. X must not contain non-zero bits above xbits. Prefer
 * enqueue8() when xbits is known for being 8 or less.
 */
static void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	/* append the new code above the bits already pending in the queue */
	uint32_t queue = strm->queue + (x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (qbits >= 16) {
		/* a complete 16-bit word is ready */
#ifndef UNALIGNED_LE_OK
		/* byte-by-byte store for strict-alignment/big-endian targets */
		strm->outbuf[0] = queue;
		strm->outbuf[1] = queue >> 8;
#else
		*(uint16_t *)strm->outbuf = queue;
#endif
		strm->outbuf += 2;
		queue >>= 16;
		qbits -= 16;
	}

	if (qbits >= 8) {
		/* one more complete byte may remain after the 16-bit flush */
		qbits -= 8;
		*strm->outbuf++ = queue;
		queue >>= 8;
	}
	strm->qbits = qbits;
	strm->queue = queue;
	return;
}
238
/* enqueue code x of <xbits> bits (at most 8) and copy complete bytes into
 * out buf. X must not contain non-zero bits above xbits.
 */
static inline void enqueue8(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint32_t queue = strm->queue + (x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	/* with at most 8 new bits, no more than one byte can become complete */
	if (__builtin_expect((signed)(qbits - 8) >= 0, 1)) {
		qbits -= 8;
		*strm->outbuf++ = queue;
		queue >>= 8;
	}

	strm->qbits = qbits;
	strm->queue = queue;
}
256
/* align to next byte */
static inline void flush_bits(struct slz_stream *strm)
{
	/* emit one byte per started group of 8 pending bits; at most 2
	 * bytes are written with the 32-bit queue.
	 */
	if (strm->qbits > 0)
		*strm->outbuf++ = strm->queue;

	if (strm->qbits > 8)
		*strm->outbuf++ = strm->queue >> 8;

	strm->queue = 0;
	strm->qbits = 0;
}
269#endif
270
271
272/* only valid if buffer is already aligned */
273static inline void copy_8b(struct slz_stream *strm, uint32_t x)
274{
275 *strm->outbuf++ = x;
276}
277
278/* only valid if buffer is already aligned */
279static inline void copy_16b(struct slz_stream *strm, uint32_t x)
280{
281 strm->outbuf[0] = x;
282 strm->outbuf[1] = x >> 8;
283 strm->outbuf += 2;
284}
285
286/* only valid if buffer is already aligned */
287static inline void copy_32b(struct slz_stream *strm, uint32_t x)
288{
289 strm->outbuf[0] = x;
290 strm->outbuf[1] = x >> 8;
291 strm->outbuf[2] = x >> 16;
292 strm->outbuf[3] = x >> 24;
293 strm->outbuf += 4;
294}
295
/* encodes literal/length symbol <code> with the fixed huffman table and
 * pushes it to the bit queue. fixed_huff[] packs the code value in the
 * upper bits and its length in bits in the low 4 bits.
 */
static inline void send_huff(struct slz_stream *strm, uint32_t code)
{
	uint32_t bits;

	code = fixed_huff[code];
	bits = code & 15;
	code >>= 4;
	enqueue24(strm, code, bits);
}
305
/* sends the end-of-block symbol (256), whose fixed huffman code is 7 zero bits */
static inline void send_eob(struct slz_stream *strm)
{
	enqueue8(strm, 0, 7); // direct encoding of 256 = EOB (cf RFC1951)
}
310
/* copies <len> literals from <buf>. <more> indicates that there are data past
 * buf + <len>. <len> must not be null.
 */
static void copy_lit(struct slz_stream *strm, const void *buf, uint32_t len, int more)
{
	uint32_t len2;

	do {
		/* an uncompressed block's LEN field is 16 bits, so emit at
		 * most 65535 bytes per block (RFC1951 3.2.4)
		 */
		len2 = len;
		if (__builtin_expect(len2 > 65535, 0))
			len2 = 65535;

		len -= len2;

		/* close any huffman block left open before switching to an
		 * uncompressed block
		 */
		if (strm->state != SLZ_ST_EOB)
			send_eob(strm);

		strm->state = (more || len) ? SLZ_ST_EOB : SLZ_ST_DONE;

		enqueue8(strm, !(more || len), 3); // BFINAL = !more ; BTYPE = 00
		flush_bits(strm);                  // uncompressed data is byte-aligned
		copy_16b(strm, len2);  // LEN
		copy_16b(strm, ~len2); // NLEN = one's complement of LEN
		memcpy(strm->outbuf, buf, len2);
		buf += len2;
		strm->outbuf += len2;
	} while (len);
}
339
/* copies <len> literals from <buf>. <more> indicates that there are data past
 * buf + <len>. <len> must not be null.
 */
static void copy_lit_huff(struct slz_stream *strm, const unsigned char *buf, uint32_t len, int more)
{
	uint32_t pos;

	/* This ugly construct limits the amount of tests and optimizes for the
	 * most common case (more > 0).
	 */
	if (strm->state == SLZ_ST_EOB) {
	eob:
		strm->state = more ? SLZ_ST_FIXED : SLZ_ST_LAST;
		enqueue8(strm, 2 + !more, 3); // BFINAL = !more ; BTYPE = 01
	}
	else if (!more) {
		/* last chunk: close the open block, then reopen a final one
		 * through the label above
		 */
		send_eob(strm);
		goto eob;
	}

	/* huffman-encode all literals into the current block */
	pos = 0;
	do {
		send_huff(strm, buf[pos++]);
	} while (pos < len);
}
365
/* format:
 * bit0..31  = word
 * bit32..63 = last position in buffer of similar content
 */

/* This hash provides good average results on HTML contents, and is among the
 * few which provide almost optimal results on various different pages.
 */
static inline uint32_t slz_hash(uint32_t a)
{
#if defined(__ARM_FEATURE_CRC32)
	/* crc32w over a zero accumulator mixes the word's bits in one insn */
	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(a) : "r"(0));
	return a >> (32 - HASH_BITS);
#else
	/* multiply by (2^19 + 2^6 - 1) via shifts/adds, keep the top HASH_BITS */
	return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
#endif
}
383
/* This function compares buffers <a> and <b> and reads 32 or 64 bits at a time
 * during the approach. It makes use of unaligned little endian memory accesses
 * on capable architectures. <max> is the maximum number of bytes that can be
 * read, so both <a> and <b> must have at least <max> bytes ahead. <max> may
 * safely be null or negative if that simplifies computations in the caller.
 */
static inline long memmatch(const unsigned char *a, const unsigned char *b, long max)
{
	long len = 0;

#ifdef UNALIGNED_LE_OK
	unsigned long xor;

	while (1) {
		/* finish byte-by-byte when fewer than two words remain,
		 * so the word reads below never run past <max>
		 */
		if ((long)(len + 2 * sizeof(long)) > max) {
			while (len < max) {
				if (a[len] != b[len])
					break;
				len++;
			}
			return len;
		}

		/* compare one word at a time; a non-zero xor marks the
		 * first difference (unrolled twice)
		 */
		xor = *(long *)&a[len] ^ *(long *)&b[len];
		if (xor)
			break;
		len += sizeof(long);

		xor = *(long *)&a[len] ^ *(long *)&b[len];
		if (xor)
			break;
		len += sizeof(long);
	}

#if defined(__x86_64__) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
	/* x86 has bsf. We know that xor is non-null here */
	asm("bsf %1,%0\n" : "=r"(xor) : "0" (xor));
	return len + xor / 8;
#else
	/* locate the lowest non-zero byte of xor by dichotomy */
	if (sizeof(long) > 4 && !(xor & 0xffffffff)) {
		/* This code is optimized out on 32-bit archs, but we still
		 * need to shift in two passes to avoid a warning. It is
		 * properly optimized out as a single shift.
		 */
		xor >>= 16; xor >>= 16;
		if (xor & 0xffff) {
			if (xor & 0xff)
				return len + 4;
			return len + 5;
		}
		if (xor & 0xffffff)
			return len + 6;
		return len + 7;
	}

	if (xor & 0xffff) {
		if (xor & 0xff)
			return len;
		return len + 1;
	}
	if (xor & 0xffffff)
		return len + 2;
	return len + 3;
#endif // x86

#else // UNALIGNED_LE_OK
	/* This is the generic version for big endian or unaligned-incompatible
	 * architectures.
	 */
	while (len < max) {
		if (a[len] != b[len])
			break;
		len++;
	}
	return len;

#endif
}
462
/* sets <count> BYTES to -32769 in <refs> so that any uninitialized entry will
 * verify (pos-last-1 >= 32768) and be ignored. <count> must be a multiple of
 * 128 bytes and <refs> must be at least one count in length. It's supposed to
 * be applied to 64-bit aligned data exclusively, which makes it slightly
 * faster than the regular memset() since no alignment check is performed.
 */
void reset_refs(union ref *refs, long count)
{
	/* avoid a shift/mask by casting to void* */
	union ref *end = (void *)refs + count;

	/* manually unrolled: 16 entries x 8 bytes = 128 bytes per iteration */
	do {
		refs[ 0].by64 = -32769;
		refs[ 1].by64 = -32769;
		refs[ 2].by64 = -32769;
		refs[ 3].by64 = -32769;
		refs[ 4].by64 = -32769;
		refs[ 5].by64 = -32769;
		refs[ 6].by64 = -32769;
		refs[ 7].by64 = -32769;
		refs[ 8].by64 = -32769;
		refs[ 9].by64 = -32769;
		refs[10].by64 = -32769;
		refs[11].by64 = -32769;
		refs[12].by64 = -32769;
		refs[13].by64 = -32769;
		refs[14].by64 = -32769;
		refs[15].by64 = -32769;
		refs += 16;
	} while (refs < end);
}
494
/* Compresses <ilen> bytes from <in> into <out> according to RFC1951. The
 * output result may be up to 5 bytes larger than the input, to which 2 extra
 * bytes may be added to send the last chunk due to BFINAL+EOB encoding (10
 * bits) when <more> is not set. The caller is responsible for ensuring there
 * is enough room in the output buffer for this. The amount of output bytes is
 * returned, and no CRC is computed.
 */
long slz_rfc1951_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
{
	long rem = ilen;       /* input bytes not yet consumed */
	unsigned long pos = 0; /* current read position in <in> */
	unsigned long last;    /* previous position of the hashed word */
	uint32_t word = 0;     /* next 32 input bits under examination */
	long mlen;             /* match length */
	uint32_t h;            /* hash of <word> */
	uint64_t ent;          /* hash table entry (word, position) */

	uint32_t plit = 0;     /* number of pending literals */
	uint32_t bit9 = 0;     /* pending literals >= 144, which need 9-bit huffman codes */
	uint32_t dist, code;
	union ref refs[1 << HASH_BITS];

	if (!strm->level) {
		/* force to send as literals (eg to preserve CPU) */
		strm->outbuf = out;
		plit = pos = ilen;
		bit9 = 52; /* force literal dump */
		goto final_lit_dump;
	}

	reset_refs(refs, sizeof(refs));

	strm->outbuf = out;

#ifndef UNALIGNED_FASTER
	/* pre-load 3 bytes; the 4th is shifted in at each loop iteration */
	word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
#endif
	while (rem >= 4) {
#ifndef UNALIGNED_FASTER
		word = ((unsigned char)in[pos + 3] << 24) + (word >> 8);
#else
		word = *(uint32_t *)&in[pos];
#endif
		h = slz_hash(word);
		asm volatile ("" ::); // prevent gcc from trying to be smart with the prefetch

		/* look up and replace the hash table entry for <word> */
		if (sizeof(long) >= 8) {
			ent = refs[h].by64;
			last = (uint32_t)ent;
			ent >>= 32;
			refs[h].by64 = ((uint64_t)pos) + ((uint64_t)word << 32);
		} else {
			ent = refs[h].by32.word;
			last = refs[h].by32.pos;
			refs[h].by32.pos = pos;
			refs[h].by32.word = word;
		}

#if FIND_OPTIMAL_MATCH
		/* Experimental code to see what could be saved with an ideal
		 * longest match lookup algorithm. This one is very slow but
		 * scans the whole window. In short, here are the savings :
		 *   file        orig     fast(ratio)  optimal(ratio)
		 *   README      5185     3419 (65.9%) 3165 (61.0%)  -7.5%
		 *   index.html 76799    35662 (46.4%) 29875 (38.9%) -16.3%
		 *   rfc1952.c  29383    13442 (45.7%) 11793 (40.1%) -12.3%
		 *
		 * Thus the savings to expect for large files is at best 16%.
		 *
		 * A non-colliding hash gives 33025 instead of 35662 (-7.4%),
		 * and keeping the last two entries gives 31724 (-11.0%).
		 */
		unsigned long scan;
		int saved = 0;
		int bestpos = 0;
		int bestlen = 0;
		int firstlen = 0;
		int max_lookup = 2; // 0 = no limit

		for (scan = pos - 1; scan < pos && (unsigned long)(pos - scan - 1) < 32768; scan--) {
			if (*(uint32_t *)(in + scan) != word)
				continue;

			len = memmatch(in + pos, in + scan, rem);
			if (!bestlen)
				firstlen = len;

			if (len > bestlen) {
				bestlen = len;
				bestpos = scan;
			}
			if (!--max_lookup)
				break;
		}
		if (bestlen) {
			//printf("pos=%d last=%d bestpos=%d word=%08x ent=%08x len=%d\n",
			//       (int)pos, (int)last, (int)bestpos, (int)word, (int)ent, bestlen);
			last = bestpos;
			ent = word;
			saved += bestlen - firstlen;
		}
		//fprintf(stderr, "first=%d best=%d saved_total=%d\n", firstlen, bestlen, saved);
#endif

		/* no identical word seen before: emit one literal and advance */
		if ((uint32_t)ent != word) {
		send_as_lit:
			rem--;
			plit++;
			bit9 += ((unsigned char)word >= 144);
			pos++;
			continue;
		}

		/* We reject pos = last and pos > last+32768 */
		if ((unsigned long)(pos - last - 1) >= 32768)
			goto send_as_lit;

		/* Note: cannot encode a length larger than 258 bytes */
		mlen = memmatch(in + pos + 4, in + last + 4, (rem > 258 ? 258 : rem) - 4) + 4;

		/* found a matching entry */

		if (bit9 >= 52 && mlen < 6)
			goto send_as_lit;

		/* compute the output code, its size and the length's size in
		 * bits to know if the reference is cheaper than literals.
		 */
		code = len_fh[mlen];

		/* direct mapping of dist->huffman code */
		dist = fh_dist_table[pos - last - 1];

		/* if encoding the dist+length is more expensive than sending
		 * the equivalent as bytes, lets keep the literals.
		 */
		if ((dist & 0x1f) + (code >> 16) + 8 >= 8 * mlen + bit9)
			goto send_as_lit;

		/* first, copy pending literals */
		if (plit) {
			/* Huffman encoding requires 9 bits for octets 144..255, so this
			 * is a waste of space for binary data. Switching between Huffman
			 * and no-comp then huffman consumes 52 bits (7 for EOB + 3 for
			 * block type + 7 for alignment + 32 for LEN+NLEN + 3 for next
			 * block. Only use plain literals if there are more than 52 bits
			 * to save then.
			 */
			if (bit9 >= 52)
				copy_lit(strm, in + pos - plit, plit, 1);
			else
				copy_lit_huff(strm, in + pos - plit, plit, 1);

			plit = 0;
		}

		/* use mode 01 - fixed huffman */
		if (strm->state == SLZ_ST_EOB) {
			strm->state = SLZ_ST_FIXED;
			enqueue8(strm, 0x02, 3); // BTYPE = 01, BFINAL = 0
		}

		/* copy the length first */
		enqueue24(strm, code & 0xFFFF, code >> 16);

		/* in fixed huffman mode, dist is fixed 5 bits */
		enqueue24(strm, dist >> 5, dist & 0x1f);
		bit9 = 0;
		rem -= mlen;
		pos += mlen;

#ifndef UNALIGNED_FASTER
#ifdef UNALIGNED_LE_OK
		word = *(uint32_t *)&in[pos - 1];
#else
		word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
#endif
#endif
	}

	if (__builtin_expect(rem, 0)) {
		/* we're reading the 1..3 last bytes */
		plit += rem;
		do {
			bit9 += ((unsigned char)in[pos++] >= 144);
		} while (--rem);
	}

 final_lit_dump:
	/* now copy remaining literals or mark the end */
	if (plit) {
		if (bit9 >= 52)
			copy_lit(strm, in + pos - plit, plit, more);
		else
			copy_lit_huff(strm, in + pos - plit, plit, more);

		plit = 0;
	}

	strm->ilen += ilen;
	return strm->outbuf - out;
}
697
698/* Initializes stream <strm> for use with raw deflate (rfc1951). The CRC is
699 * unused but set to zero. The compression level passed in <level> is set. This
700 * value can only be 0 (no compression) or 1 (compression) and other values
701 * will lead to unpredictable behaviour. The function always returns 0.
702 */
703int slz_rfc1951_init(struct slz_stream *strm, int level)
704{
705 strm->state = SLZ_ST_EOB; // no header
706 strm->level = level;
707 strm->format = SLZ_FMT_DEFLATE;
708 strm->crc32 = 0;
709 strm->ilen = 0;
710 strm->qbits = 0;
711 strm->queue = 0;
712 return 0;
713}
714
/* Flushes any pending for stream <strm> into buffer <buf>, then sends BTYPE=1
 * and BFINAL=1 if needed. The stream ends in SLZ_ST_DONE. It returns the number
 * of bytes emitted. The trailer consists in flushing the possibly pending bits
 * from the queue (up to 7 bits), then possibly EOB (7 bits), then 3 bits, EOB,
 * a rounding to the next byte, which amounts to a total of 4 bytes max, that
 * the caller must ensure are available before calling the function.
 */
int slz_rfc1951_finish(struct slz_stream *strm, unsigned char *buf)
{
	strm->outbuf = buf;

	/* close a huffman block still left open */
	if (strm->state == SLZ_ST_FIXED || strm->state == SLZ_ST_LAST) {
		strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
		send_eob(strm);
	}

	if (strm->state != SLZ_ST_DONE) {
		/* send BTYPE=1, BFINAL=1 */
		enqueue8(strm, 3, 3);
		send_eob(strm);
		strm->state = SLZ_ST_DONE;
	}

	flush_bits(strm);
	return strm->outbuf - buf;
}
741
742/* Now RFC1952-specific declarations and extracts from RFC.
743 * From RFC1952 about the GZIP file format :
744
745A gzip file consists of a series of "members" ...
746
7472.3. Member format
748
749 Each member has the following structure:
750
751 +---+---+---+---+---+---+---+---+---+---+
752 |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
753 +---+---+---+---+---+---+---+---+---+---+
754
755 (if FLG.FEXTRA set)
756
757 +---+---+=================================+
758 | XLEN |...XLEN bytes of "extra field"...| (more-->)
759 +---+---+=================================+
760
761 (if FLG.FNAME set)
762
763 +=========================================+
764 |...original file name, zero-terminated...| (more-->)
765 +=========================================+
766
767 (if FLG.FCOMMENT set)
768
769 +===================================+
770 |...file comment, zero-terminated...| (more-->)
771 +===================================+
772
773 (if FLG.FHCRC set)
774
775 +---+---+
776 | CRC16 |
777 +---+---+
778
779 +=======================+
780 |...compressed blocks...| (more-->)
781 +=======================+
782
783 0 1 2 3 4 5 6 7
784 +---+---+---+---+---+---+---+---+
785 | CRC32 | ISIZE |
786 +---+---+---+---+---+---+---+---+
787
788
7892.3.1. Member header and trailer
790
791 ID1 (IDentification 1)
792 ID2 (IDentification 2)
793 These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139
794 (0x8b, \213), to identify the file as being in gzip format.
795
796 CM (Compression Method)
797 This identifies the compression method used in the file. CM
798 = 0-7 are reserved. CM = 8 denotes the "deflate"
799 compression method, which is the one customarily used by
800 gzip and which is documented elsewhere.
801
802 FLG (FLaGs)
803 This flag byte is divided into individual bits as follows:
804
805 bit 0 FTEXT
806 bit 1 FHCRC
807 bit 2 FEXTRA
808 bit 3 FNAME
809 bit 4 FCOMMENT
810 bit 5 reserved
811 bit 6 reserved
812 bit 7 reserved
813
814 Reserved FLG bits must be zero.
815
816 MTIME (Modification TIME)
817 This gives the most recent modification time of the original
818 file being compressed. The time is in Unix format, i.e.,
819 seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this
820 may cause problems for MS-DOS and other systems that use
821 local rather than Universal time.) If the compressed data
822 did not come from a file, MTIME is set to the time at which
823 compression started. MTIME = 0 means no time stamp is
824 available.
825
826 XFL (eXtra FLags)
827 These flags are available for use by specific compression
828 methods. The "deflate" method (CM = 8) sets these flags as
829 follows:
830
831 XFL = 2 - compressor used maximum compression,
832 slowest algorithm
833 XFL = 4 - compressor used fastest algorithm
834
835 OS (Operating System)
836 This identifies the type of file system on which compression
837 took place. This may be useful in determining end-of-line
838 convention for text files. The currently defined values are
839 as follows:
840
841 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
842 1 - Amiga
843 2 - VMS (or OpenVMS)
844 3 - Unix
845 4 - VM/CMS
846 5 - Atari TOS
847 6 - HPFS filesystem (OS/2, NT)
848 7 - Macintosh
849 8 - Z-System
850 9 - CP/M
851 10 - TOPS-20
852 11 - NTFS filesystem (NT)
853 12 - QDOS
854 13 - Acorn RISCOS
855 255 - unknown
856
857 ==> A file compressed using "gzip -1" on Unix-like systems can be :
858
859 1F 8B 08 00 00 00 00 00 04 03
860 <deflate-compressed stream>
861 crc32 size32
862*/
863
/* fixed 10-byte gzip member header (cf RFC1952 2.3, quoted above) */
static const unsigned char gzip_hdr[] = { 0x1F, 0x8B, // ID1, ID2
                                          0x08, 0x00, // Deflate, flags (none)
                                          0x00, 0x00, 0x00, 0x00, // mtime: none
                                          0x04, 0x03 }; // fastest comp, OS=Unix
868
/* updates non-inverted CRC <crc> with the single byte <x> and returns the
 * new value
 */
static inline uint32_t crc32_char(uint32_t crc, uint8_t x)
{
#if defined(__ARM_FEATURE_CRC32)
	/* the ARM crc32b instruction works on the inverted form, hence the
	 * surrounding complements
	 */
	crc = ~crc;
	__asm__ volatile("crc32b %w0,%w0,%w1" : "+r"(crc) : "r"(x));
	crc = ~crc;
#else
	crc = crc32_fast[0][(crc ^ x) & 0xff] ^ (crc >> 8);
#endif
	return crc;
}
880
/* advances the CRC by one 32-bit word already XORed into <data> (see the
 * callers in slz_crc32_by4), using four table lookups (slice-by-4) or the
 * ARM crc32w instruction.
 */
static inline uint32_t crc32_uint32(uint32_t data)
{
#if defined(__ARM_FEATURE_CRC32)
	/* crc32w over ~0 then a final complement matches the table version */
	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(data) : "r"(~0UL));
	data = ~data;
#else
	data = crc32_fast[3][(data >>  0) & 0xff] ^
	       crc32_fast[2][(data >>  8) & 0xff] ^
	       crc32_fast[1][(data >> 16) & 0xff] ^
	       crc32_fast[0][(data >> 24) & 0xff];
#endif
	return data;
}
894
/* Modified version originally from RFC1952, working with non-inverting CRCs */
uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	const unsigned char *end = buf + len;

	while (buf < end)
		crc = crc32_char(crc, *buf++);
	return crc;
}
904
/* This version computes the crc32 of <buf> over <len> bytes, doing most of it
 * in 32-bit chunks.
 */
uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len)
{
	const unsigned char *end = buf + len;

	/* main loop: 16 bytes (4 words) per iteration */
	while (buf <= end - 16) {
#ifdef UNALIGNED_LE_OK
#if defined(__ARM_FEATURE_CRC32)
		/* the ARM instructions work on the inverted form, hence the
		 * surrounding complements
		 */
		crc = ~crc;
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
		crc = ~crc;
#else
		crc ^= *(uint32_t *)buf;
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 4);
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 8);
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 12);
		crc = crc32_uint32(crc);
#endif
#else
		/* byte-indexed slice-by-4 for strict-alignment or big endian */
		crc = crc32_fast[3][(buf[0] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[1] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[4] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[5] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[6] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[7] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[8] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[9] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[10] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[11] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[12] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[13] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[14] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[15] ^ (crc >> 24)) & 0xff];
#endif
		buf += 16;
	}

	/* then 4 bytes at a time */
	while (buf <= end - 4) {
#ifdef UNALIGNED_LE_OK
		crc ^= *(uint32_t *)buf;
		crc = crc32_uint32(crc);
#else
		crc = crc32_fast[3][(buf[0] ^ (crc >>  0)) & 0xff] ^
		      crc32_fast[2][(buf[1] ^ (crc >>  8)) & 0xff] ^
		      crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];
#endif
		buf += 4;
	}

	/* finish trailing bytes one at a time */
	while (buf < end)
		crc = crc32_fast[0][(crc ^ *buf++) & 0xff] ^ (crc >> 8);
	return crc;
}
975
/* uses the most suitable crc32 function to update crc on <buf, len> */
static inline uint32_t update_crc(uint32_t crc, const void *buf, int len)
{
	/* single indirection point so the implementation can be swapped */
	return slz_crc32_by4(crc, buf, len);
}
981
982/* Sends the gzip header for stream <strm> into buffer <buf>. When it's done,
983 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
984 * emitted which is always 10. The caller is responsible for ensuring there's
985 * always enough room in the buffer.
986 */
987int slz_rfc1952_send_header(struct slz_stream *strm, unsigned char *buf)
988{
989 memcpy(buf, gzip_hdr, sizeof(gzip_hdr));
990 strm->state = SLZ_ST_EOB;
991 return sizeof(gzip_hdr);
992}
993
/* Encodes the block according to rfc1952. This means that the CRC of the input
 * block is computed according to the CRC32 algorithm. If the header was never
 * sent, it may be sent first. The number of output bytes is returned.
 */
long slz_rfc1952_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
{
	long ret = 0;

	/* lazily emit the 10-byte gzip header on the first call */
	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
		ret += slz_rfc1952_send_header(strm, out);

	strm->crc32 = update_crc(strm->crc32, in, ilen);
	ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
	return ret;
}
1009
1010/* Initializes stream <strm> for use with the gzip format (rfc1952). The
1011 * compression level passed in <level> is set. This value can only be 0 (no
1012 * compression) or 1 (compression) and other values will lead to unpredictable
1013 * behaviour. The function always returns 0.
1014 */
1015int slz_rfc1952_init(struct slz_stream *strm, int level)
1016{
1017 strm->state = SLZ_ST_INIT;
1018 strm->level = level;
1019 strm->format = SLZ_FMT_GZIP;
1020 strm->crc32 = 0;
1021 strm->ilen = 0;
1022 strm->qbits = 0;
1023 strm->queue = 0;
1024 return 0;
1025}
1026
/* Flushes pending bits and sends the gzip trailer for stream <strm> into
 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
 * returns the number of bytes emitted. The trailer consists in flushing the
 * possibly pending bits from the queue (up to 24 bits), rounding to the next
 * byte, then 4 bytes for the CRC and another 4 bytes for the input length.
 * That may amount to 4+4+4 = 12 bytes, that the caller must ensure are
 * available before calling the function. Note that if the initial header was
 * never sent, it will be sent first as well (10 extra bytes).
 */
int slz_rfc1952_finish(struct slz_stream *strm, unsigned char *buf)
{
	strm->outbuf = buf;

	/* the header may still be pending if no data was ever encoded */
	if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
		strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);

	slz_rfc1951_finish(strm, strm->outbuf);
	copy_32b(strm, strm->crc32); // CRC32 of the uncompressed input
	copy_32b(strm, strm->ilen);  // ISIZE (cf RFC1952 trailer layout above)
	strm->state = SLZ_ST_END;

	return strm->outbuf - buf;
}
1050
1051
1052/* RFC1950-specific stuff. This is for the Zlib stream format.
1053 * From RFC1950 (zlib) :
1054 *
1055
1056 2.2. Data format
1057
1058 A zlib stream has the following structure:
1059
1060 0 1
1061 +---+---+
1062 |CMF|FLG| (more-->)
1063 +---+---+
1064
1065
1066 (if FLG.FDICT set)
1067
1068 0 1 2 3
1069 +---+---+---+---+
1070 | DICTID | (more-->)
1071 +---+---+---+---+
1072
1073 +=====================+---+---+---+---+
1074 |...compressed data...| ADLER32 |
1075 +=====================+---+---+---+---+
1076
1077 Any data which may appear after ADLER32 are not part of the zlib
1078 stream.
1079
1080 CMF (Compression Method and flags)
1081 This byte is divided into a 4-bit compression method and a 4-
1082 bit information field depending on the compression method.
1083
1084 bits 0 to 3 CM Compression method
1085 bits 4 to 7 CINFO Compression info
1086
1087 CM (Compression method)
1088 This identifies the compression method used in the file. CM = 8
1089 denotes the "deflate" compression method with a window size up
1090 to 32K. This is the method used by gzip and PNG (see
1091 references [1] and [2] in Chapter 3, below, for the reference
1092 documents). CM = 15 is reserved. It might be used in a future
1093 version of this specification to indicate the presence of an
1094 extra field before the compressed data.
1095
1096 CINFO (Compression info)
1097 For CM = 8, CINFO is the base-2 logarithm of the LZ77 window
1098 size, minus eight (CINFO=7 indicates a 32K window size). Values
1099 of CINFO above 7 are not allowed in this version of the
1100 specification. CINFO is not defined in this specification for
1101 CM not equal to 8.
1102
1103 FLG (FLaGs)
1104 This flag byte is divided as follows:
1105
1106 bits 0 to 4 FCHECK (check bits for CMF and FLG)
1107 bit 5 FDICT (preset dictionary)
1108 bits 6 to 7 FLEVEL (compression level)
1109
1110 The FCHECK value must be such that CMF and FLG, when viewed as
1111 a 16-bit unsigned integer stored in MSB order (CMF*256 + FLG),
1112 is a multiple of 31.
1113
1114
1115 FDICT (Preset dictionary)
1116 If FDICT is set, a DICT dictionary identifier is present
1117 immediately after the FLG byte. The dictionary is a sequence of
1118 bytes which are initially fed to the compressor without
1119 producing any compressed output. DICT is the Adler-32 checksum
1120 of this sequence of bytes (see the definition of ADLER32
1121 below). The decompressor can use this identifier to determine
1122 which dictionary has been used by the compressor.
1123
1124 FLEVEL (Compression level)
1125 These flags are available for use by specific compression
1126 methods. The "deflate" method (CM = 8) sets these flags as
1127 follows:
1128
1129 0 - compressor used fastest algorithm
1130 1 - compressor used fast algorithm
1131 2 - compressor used default algorithm
1132 3 - compressor used maximum compression, slowest algorithm
1133
1134 The information in FLEVEL is not needed for decompression; it
1135 is there to indicate if recompression might be worthwhile.
1136
1137 compressed data
1138 For compression method 8, the compressed data is stored in the
1139 deflate compressed data format as described in the document
1140 "DEFLATE Compressed Data Format Specification" by L. Peter
1141 Deutsch. (See reference [3] in Chapter 3, below)
1142
1143 Other compressed data formats are not specified in this version
1144 of the zlib specification.
1145
1146 ADLER32 (Adler-32 checksum)
1147 This contains a checksum value of the uncompressed data
1148 (excluding any dictionary data) computed according to Adler-32
1149 algorithm. This algorithm is a 32-bit extension and improvement
1150 of the Fletcher algorithm, used in the ITU-T X.224 / ISO 8073
1151 standard. See references [4] and [5] in Chapter 3, below)
1152
1153 Adler-32 is composed of two sums accumulated per byte: s1 is
1154 the sum of all bytes, s2 is the sum of all s1 values. Both sums
1155 are done modulo 65521. s1 is initialized to 1, s2 to zero. The
1156 Adler-32 checksum is stored as s2*65536 + s1 in most-
1157 significant-byte first (network) order.
1158
1159 ==> The stream can start with only 2 bytes :
1160 - CM = 0x78 : CMINFO=7 (32kB window), CM=8 (deflate)
1161 - FLG = 0x01 : FLEVEL = 0 (fastest), FDICT=0 (no dict), FCHECK=1 so
1162 that 0x7801 is a multiple of 31 (30721 = 991 * 31).
1163
1164 ==> and it ends with only 4 bytes, the Adler-32 checksum in big-endian format.
1165
1166 */
1167
/* Fixed 2-byte zlib stream header: CMF=0x78 (CM=8 "deflate", CINFO=7 for a
 * 32kB window) and FLG=0x01 (FLEVEL=0 fastest, FDICT=0 no dictionary, FCHECK
 * chosen so that CMF*256+FLG = 0x7801 is a multiple of 31, see RFC1950 above).
 */
static const unsigned char zlib_hdr[] = { 0x78, 0x01 }; // 32k win, deflate, chk=1
1169
1170
/* Reference Adler-32 implementation straight from RFC1950: one byte at a
 * time with a modulus on every step. Slow but known-good. <crc> is the
 * running checksum (1 for a fresh stream per the RFC) and the updated
 * checksum is returned.
 */
uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	uint32_t lo = crc & 0xffff;          /* s1: running byte sum */
	uint32_t hi = (crc >> 16) & 0xffff;  /* s2: sum of all s1 values */
	int pos = 0;

	while (pos < len) {
		lo = (lo + buf[pos++]) % 65521;
		hi = (hi + lo) % 65521;
	}
	return (hi << 16) + lo;
}
1184
/* Computes the adler32 sum on <buf> for <len> bytes, starting from <crc>,
 * and returns the updated checksum. The expensive per-byte modulus of the
 * reference implementation is avoided by accumulating raw sums over bounded
 * chunks, then folding them using the identity :
 *   x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
 *             = (x & 0xffff) + (x >> 16) * 15.
 */
uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len)
{
	long a = crc & 0xffff;   /* s1: sum of all bytes */
	long b = crc >> 16;      /* s2: sum of all s1 values */
	long chunk;
	long pos;

	do {
		/* cap each chunk so that <b> can never overflow a 32-bit
		 * range (limit is about 2^((32-8)/2)).
		 */
		chunk = len;
		if (chunk > (1U << 12))
			chunk = 1U << 12;
		len -= chunk;

		for (pos = 0; pos < chunk; pos++) {
			a += buf[pos];
			b += a;
		}

		/* Fold <a>: largest value here is 2^12 * 255 = 1044480
		 * < 2^20, and the right-hand side is at most 15*15 = 225,
		 * so a single fold can still leave up to 65761; values in
		 * [65521,65536) also need the final subtraction.
		 */
		a = (a & 0xffff) + 15 * (a >> 16);
		if (a >= 65521)
			a -= 65521;

		/* Fold <b>: conservatively bounded by 2^32-1, so the folded
		 * value may reach about 15*65535 = 983025; two rounds are
		 * needed before the final subtraction.
		 */
		b = (b & 0xffff) + 15 * (b >> 16);
		b = (b & 0xffff) + 15 * (b >> 16);
		if (b >= 65521)
			b -= 65521;

		buf += chunk;
	} while (len);
	return (b << 16) + a;
}
1236
1237/* Sends the zlib header for stream <strm> into buffer <buf>. When it's done,
1238 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
1239 * emitted which is always 2. The caller is responsible for ensuring there's
1240 * always enough room in the buffer.
1241 */
1242int slz_rfc1950_send_header(struct slz_stream *strm, unsigned char *buf)
1243{
1244 memcpy(buf, zlib_hdr, sizeof(zlib_hdr));
1245 strm->state = SLZ_ST_EOB;
1246 return sizeof(zlib_hdr);
1247}
1248
1249/* Encodes the block according to rfc1950. This means that the CRC of the input
1250 * block is computed according to the ADLER32 algorithm. If the header was never
1251 * sent, it may be sent first. The number of output bytes is returned.
1252 */
1253long slz_rfc1950_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
1254{
1255 long ret = 0;
1256
1257 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1258 ret += slz_rfc1950_send_header(strm, out);
1259
1260 strm->crc32 = slz_adler32_block(strm->crc32, in, ilen);
1261 ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
1262 return ret;
1263}
1264
1265/* Initializes stream <strm> for use with the zlib format (rfc1952). The
1266 * compression level passed in <level> is set. This value can only be 0 (no
1267 * compression) or 1 (compression) and other values will lead to unpredictable
1268 * behaviour. The function always returns 0.
1269 */
1270int slz_rfc1950_init(struct slz_stream *strm, int level)
1271{
1272 strm->state = SLZ_ST_INIT;
1273 strm->level = level;
1274 strm->format = SLZ_FMT_ZLIB;
1275 strm->crc32 = 1; // rfc1950/zlib starts with initial crc=1
1276 strm->ilen = 0;
1277 strm->qbits = 0;
1278 strm->queue = 0;
1279 return 0;
1280}
1281
1282/* Flushes pending bits and sends the gzip trailer for stream <strm> into
1283 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
1284 * returns the number of bytes emitted. The trailer consists in flushing the
1285 * possibly pending bits from the queue (up to 24 bits), rounding to the next
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +05001286 * byte, then 4 bytes for the CRC. That may about to 4+4 = 8 bytes, that the
Willy Tarreauab2b7822021-04-22 14:09:44 +02001287 * caller must ensure are available before calling the function. Note that if
1288 * the initial header was never sent, it will be sent first as well (2 extra
1289 * bytes).
1290 */
1291int slz_rfc1950_finish(struct slz_stream *strm, unsigned char *buf)
1292{
1293 strm->outbuf = buf;
1294
1295 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1296 strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
1297
1298 slz_rfc1951_finish(strm, strm->outbuf);
1299 copy_8b(strm, (strm->crc32 >> 24) & 0xff);
1300 copy_8b(strm, (strm->crc32 >> 16) & 0xff);
1301 copy_8b(strm, (strm->crc32 >> 8) & 0xff);
1302 copy_8b(strm, (strm->crc32 >> 0) & 0xff);
1303 strm->state = SLZ_ST_END;
1304 return strm->outbuf - buf;
1305}
1306
/* Library constructor: runs automatically at load time to precompute the
 * CRC and distance lookup tables used by the encoders (table builders are
 * defined earlier in this file / in slz-tables.h — not visible here).
 */
__attribute__((constructor))
static void __slz_initialize(void)
{
	__slz_make_crc_table();
	__slz_prepare_dist_table();
}