Willy Tarreauab2b7822021-04-22 14:09:44 +02001/*
2 * Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 */
24
Willy Tarreau388fc252021-05-14 08:44:52 +020025#include <inttypes.h>
Willy Tarreauab2b7822021-04-22 14:09:44 +020026#include <stdio.h>
27#include <string.h>
28#include <import/slz.h>
29#include <import/slz-tables.h>
30
31/* First, RFC1951-specific declarations and extracts from the RFC.
32 *
33 * RFC1951 - deflate stream format
34
35
36 * Data elements are packed into bytes in order of
37 increasing bit number within the byte, i.e., starting
38 with the least-significant bit of the byte.
39 * Data elements other than Huffman codes are packed
40 starting with the least-significant bit of the data
41 element.
42 * Huffman codes are packed starting with the most-
43 significant bit of the code.
44
45 3.2.3. Details of block format
46
47 Each block of compressed data begins with 3 header bits
48 containing the following data:
49
50 first bit BFINAL
51 next 2 bits BTYPE
52
53 Note that the header bits do not necessarily begin on a byte
54 boundary, since a block does not necessarily occupy an integral
55 number of bytes.
56
57 BFINAL is set if and only if this is the last block of the data
58 set.
59
60 BTYPE specifies how the data are compressed, as follows:
61
62 00 - no compression
63 01 - compressed with fixed Huffman codes
64 10 - compressed with dynamic Huffman codes
65 11 - reserved (error)
66
67 3.2.4. Non-compressed blocks (BTYPE=00)
68
69 Any bits of input up to the next byte boundary are ignored.
70 The rest of the block consists of the following information:
71
72 0 1 2 3 4...
73 +---+---+---+---+================================+
74 | LEN | NLEN |... LEN bytes of literal data...|
75 +---+---+---+---+================================+
76
77 LEN is the number of data bytes in the block. NLEN is the
78 one's complement of LEN.
79
80 3.2.5. Compressed blocks (length and distance codes)
81
82 As noted above, encoded data blocks in the "deflate" format
83 consist of sequences of symbols drawn from three conceptually
84 distinct alphabets: either literal bytes, from the alphabet of
85 byte values (0..255), or <length, backward distance> pairs,
86 where the length is drawn from (3..258) and the distance is
87 drawn from (1..32,768). In fact, the literal and length
88 alphabets are merged into a single alphabet (0..285), where
89 values 0..255 represent literal bytes, the value 256 indicates
90 end-of-block, and values 257..285 represent length codes
91 (possibly in conjunction with extra bits following the symbol
92 code) as follows:
93
94Length encoding :
95 Extra Extra Extra
96 Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
97 ---- ---- ------ ---- ---- ------- ---- ---- -------
98 257 0 3 267 1 15,16 277 4 67-82
99 258 0 4 268 1 17,18 278 4 83-98
100 259 0 5 269 2 19-22 279 4 99-114
101 260 0 6 270 2 23-26 280 4 115-130
102 261 0 7 271 2 27-30 281 5 131-162
103 262 0 8 272 2 31-34 282 5 163-194
104 263 0 9 273 3 35-42 283 5 195-226
105 264 0 10 274 3 43-50 284 5 227-257
106 265 1 11,12 275 3 51-58 285 0 258
107 266 1 13,14 276 3 59-66
108
109Distance encoding :
110 Extra Extra Extra
111 Code Bits Dist Code Bits Dist Code Bits Distance
112 ---- ---- ---- ---- ---- ------ ---- ---- --------
113 0 0 1 10 4 33-48 20 9 1025-1536
114 1 0 2 11 4 49-64 21 9 1537-2048
115 2 0 3 12 5 65-96 22 10 2049-3072
116 3 0 4 13 5 97-128 23 10 3073-4096
117 4 1 5,6 14 6 129-192 24 11 4097-6144
118 5 1 7,8 15 6 193-256 25 11 6145-8192
119 6 2 9-12 16 7 257-384 26 12 8193-12288
120 7 2 13-16 17 7 385-512 27 12 12289-16384
121 8 3 17-24 18 8 513-768 28 13 16385-24576
122 9 3 25-32 19 8 769-1024 29 13 24577-32768
123
124 3.2.6. Compression with fixed Huffman codes (BTYPE=01)
125
126 The Huffman codes for the two alphabets are fixed, and are not
127 represented explicitly in the data. The Huffman code lengths
128 for the literal/length alphabet are:
129
130 Lit Value Bits Codes
131 --------- ---- -----
132 0 - 143 8 00110000 through
133 10111111
134 144 - 255 9 110010000 through
135 111111111
136 256 - 279 7 0000000 through
137 0010111
138 280 - 287 8 11000000 through
139 11000111
140
141 The code lengths are sufficient to generate the actual codes,
142 as described above; we show the codes in the table for added
143 clarity. Literal/length values 286-287 will never actually
144 occur in the compressed data, but participate in the code
145 construction.
146
147 Distance codes 0-31 are represented by (fixed-length) 5-bit
148 codes, with possible additional bits as shown in the table
149 shown in Paragraph 3.2.5, above. Note that distance codes 30-
150 31 will never actually occur in the compressed data.
151
152*/
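
/* Illustrative example (not part of the original sources): with the fixed
 * Huffman codes of 3.2.6 above, a match of length 10 at distance 30 would be
 * sent as length symbol 264 (which encodes length 10 with no extra bit),
 * whose 7-bit code is 0001000 (264 = 256 + 8), followed by the 5-bit
 * distance code 9 (covering distances 25..32), i.e. 01001, followed by the
 * 3 extra distance bits carrying 30 - 25 = 5. Huffman codes are emitted MSB
 * first and extra bits LSB first, as quoted from the RFC above.
 */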
153
154/* back references, built in a way that is optimal for 32/64 bits */
155union ref {
156 struct {
157 uint32_t pos;
158 uint32_t word;
159 } by32;
160 uint64_t by64;
161};
162
163#if defined(USE_64BIT_QUEUE) && defined(UNALIGNED_LE_OK)
164
165/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
166 * 32-bit words into output buffer. X must not contain non-zero bits above
167 * xbits.
168 */
169static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
170{
171 uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits);
172 uint32_t qbits = strm->qbits + xbits;
173
174 if (__builtin_expect(qbits >= 32, 1)) {
175 *(uint32_t *)strm->outbuf = queue;
176 queue >>= 32;
177 qbits -= 32;
178 strm->outbuf += 4;
179 }
180
181 strm->queue = queue;
182 strm->qbits = qbits;
183}
184
185#define enqueue8 enqueue24
186
187/* flush the queue and align to next byte */
188static inline void flush_bits(struct slz_stream *strm)
189{
190 if (strm->qbits > 0)
191 *strm->outbuf++ = strm->queue;
192
193 if (strm->qbits > 8)
194 *strm->outbuf++ = strm->queue >> 8;
195
196 if (strm->qbits > 16)
197 *strm->outbuf++ = strm->queue >> 16;
198
199 if (strm->qbits > 24)
200 *strm->outbuf++ = strm->queue >> 24;
201
202 strm->queue = 0;
203 strm->qbits = 0;
204}
205
206#else /* non-64 bit or aligned or big endian */
207
208/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
209 * bytes into out buf. X must not contain non-zero bits above xbits. Prefer
210 * enqueue8() when xbits is known for being 8 or less.
211 */
212static void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
213{
214 uint32_t queue = strm->queue + (x << strm->qbits);
215 uint32_t qbits = strm->qbits + xbits;
216
217 if (qbits >= 16) {
218#ifndef UNALIGNED_LE_OK
219 strm->outbuf[0] = queue;
220 strm->outbuf[1] = queue >> 8;
221#else
222 *(uint16_t *)strm->outbuf = queue;
223#endif
224 strm->outbuf += 2;
225 queue >>= 16;
226 qbits -= 16;
227 }
228
229 if (qbits >= 8) {
230 qbits -= 8;
231 *strm->outbuf++ = queue;
232 queue >>= 8;
233 }
234 strm->qbits = qbits;
235 strm->queue = queue;
236 return;
237}
238
239/* enqueue code x of <xbits> bits (at most 8) and copy complete bytes into
240 * out buf. X must not contain non-zero bits above xbits.
241 */
242static inline void enqueue8(struct slz_stream *strm, uint32_t x, uint32_t xbits)
243{
244 uint32_t queue = strm->queue + (x << strm->qbits);
245 uint32_t qbits = strm->qbits + xbits;
246
247 if (__builtin_expect((signed)(qbits - 8) >= 0, 1)) {
248 qbits -= 8;
249 *strm->outbuf++ = queue;
250 queue >>= 8;
251 }
252
253 strm->qbits = qbits;
254 strm->queue = queue;
255}
256
257/* align to next byte */
258static inline void flush_bits(struct slz_stream *strm)
259{
260 if (strm->qbits > 0)
261 *strm->outbuf++ = strm->queue;
262
263 if (strm->qbits > 8)
264 *strm->outbuf++ = strm->queue >> 8;
265
266 strm->queue = 0;
267 strm->qbits = 0;
268}
269#endif
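
/* Illustrative example of the bit queue (both implementations behave the
 * same way): starting from an empty queue, enqueue8(strm, 0x5, 3) followed
 * by enqueue8(strm, 0x6, 4) leaves queue = 0x35 (0110101b) with qbits = 7;
 * flush_bits() then emits the single byte 0x35 and resets the queue. New
 * codes are appended above the previously queued bits and emitted LSB
 * first, as RFC1951 requires for non-Huffman data elements.
 */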
270
271
272/* only valid if buffer is already aligned */
273static inline void copy_8b(struct slz_stream *strm, uint32_t x)
274{
275 *strm->outbuf++ = x;
276}
277
278/* only valid if buffer is already aligned */
279static inline void copy_16b(struct slz_stream *strm, uint32_t x)
280{
281 strm->outbuf[0] = x;
282 strm->outbuf[1] = x >> 8;
283 strm->outbuf += 2;
284}
285
286/* only valid if buffer is already aligned */
287static inline void copy_32b(struct slz_stream *strm, uint32_t x)
288{
289 strm->outbuf[0] = x;
290 strm->outbuf[1] = x >> 8;
291 strm->outbuf[2] = x >> 16;
292 strm->outbuf[3] = x >> 24;
293 strm->outbuf += 4;
294}
295
296static inline void send_huff(struct slz_stream *strm, uint32_t code)
297{
298 uint32_t bits;
299
300 code = fixed_huff[code];
301 bits = code & 15;
302 code >>= 4;
303 enqueue24(strm, code, bits);
304}
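
/* Note on the table layout visible above: fixed_huff[] packs, for each
 * literal/length symbol, the code length in its 4 lowest bits and the code
 * itself in the upper bits. Since enqueue24() emits bits LSB first while
 * RFC1951 requires Huffman codes to appear MSB first, the stored code is
 * presumably already bit-reversed (e.g. literal 'A' = 65 has the 8-bit code
 * 00110000b + 65 = 01110001b, which would then be stored reversed as
 * 10001110b). This is an inference from send_huff(), not a statement about
 * the actual contents of slz-tables.h.
 */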
305
306static inline void send_eob(struct slz_stream *strm)
307{
308 enqueue8(strm, 0, 7); // direct encoding of 256 = EOB (cf RFC1951)
309}
310
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +0500311/* copies <len> literals from <buf>. <more> indicates that there are data past
Willy Tarreauab2b7822021-04-22 14:09:44 +0200312 * buf + <len>. <len> must not be null.
313 */
314static void copy_lit(struct slz_stream *strm, const void *buf, uint32_t len, int more)
315{
316 uint32_t len2;
317
318 do {
319 len2 = len;
320 if (__builtin_expect(len2 > 65535, 0))
321 len2 = 65535;
322
323 len -= len2;
324
325 if (strm->state != SLZ_ST_EOB)
326 send_eob(strm);
327
328 strm->state = (more || len) ? SLZ_ST_EOB : SLZ_ST_DONE;
329
330 enqueue8(strm, !(more || len), 3); // BFINAL set only on the last chunk ; BTYPE = 00
331 flush_bits(strm);
332 copy_16b(strm, len2); // len2
333 copy_16b(strm, ~len2); // nlen2
334 memcpy(strm->outbuf, buf, len2);
335 buf += len2;
336 strm->outbuf += len2;
337 } while (len);
338}
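
/* Illustrative example (not from the original sources): copying the 5 bytes
 * "hello" as the last chunk (more = 0, len = 5), with the stream already at
 * an end-of-block boundary (state SLZ_ST_EOB), emits a 3-bit header of value
 * 1 via enqueue8(strm, 1, 3) (BFINAL = 1, BTYPE = 00), pads to the next byte
 * with flush_bits(), then writes LEN = 0x0005, NLEN = 0xFFFA and the five
 * literal bytes, exactly as described in paragraph 3.2.4 quoted above.
 */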
339
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +0500340/* copies <len> literals from <buf>. <more> indicates that there are data past
Willy Tarreauab2b7822021-04-22 14:09:44 +0200341 * buf + <len>. <len> must not be null.
342 */
343static void copy_lit_huff(struct slz_stream *strm, const unsigned char *buf, uint32_t len, int more)
344{
345 uint32_t pos;
346
347 /* This ugly construct limits the amount of tests and optimizes for the
348 * most common case (more > 0).
349 */
350 if (strm->state == SLZ_ST_EOB) {
351 eob:
352 strm->state = more ? SLZ_ST_FIXED : SLZ_ST_LAST;
353 enqueue8(strm, 2 + !more, 3); // BFINAL = !more ; BTYPE = 01
354 }
355 else if (!more) {
356 send_eob(strm);
357 goto eob;
358 }
359
360 pos = 0;
361 do {
362 send_huff(strm, buf[pos++]);
363 } while (pos < len);
364}
365
366/* format:
367 * bit0..31 = last position in buffer of similar content
368 * bit32..63 = word
369 */
370
371/* This hash provides good average results on HTML contents, and is among the
372 * few which provide almost optimal results on various different pages.
373 */
374static inline uint32_t slz_hash(uint32_t a)
375{
376#if defined(__ARM_FEATURE_CRC32)
Willy Tarreaub1544222021-12-03 17:38:42 +0100377# if defined(__ARM_ARCH_ISA_A64)
378 // 64 bit mode
Willy Tarreauab2b7822021-04-22 14:09:44 +0200379 __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(a) : "r"(0));
Willy Tarreaub1544222021-12-03 17:38:42 +0100380# else
381 // 32 bit mode (e.g. armv7 compiler building for armv8)
382 __asm__ volatile("crc32w %0,%0,%1" : "+r"(a) : "r"(0));
383# endif
Willy Tarreauab2b7822021-04-22 14:09:44 +0200384 return a >> (32 - HASH_BITS);
385#else
386 return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
387#endif
388}
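
/* Note: (a << 19) + (a << 6) - a is simply a * 524351 (0x8003F); the
 * shift/add form above just spells out that multiplication, so on builds
 * without the CRC32 instruction the hash is (a * 524351U) >> (32 - HASH_BITS).
 */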
389
390/* This function compares buffers <a> and <b> and reads 32 or 64 bits at a time
391 * during the approach. It makes use of unaligned little endian memory accesses
392 * on capable architectures. <max> is the maximum number of bytes that can be
393 * read, so both <a> and <b> must have at least <max> bytes ahead. <max> may
394 * safely be null or negative if that simplifies computations in the caller.
395 */
396static inline long memmatch(const unsigned char *a, const unsigned char *b, long max)
397{
398 long len = 0;
399
400#ifdef UNALIGNED_LE_OK
401 unsigned long xor;
402
403 while (1) {
404 if ((long)(len + 2 * sizeof(long)) > max) {
405 while (len < max) {
406 if (a[len] != b[len])
407 break;
408 len++;
409 }
410 return len;
411 }
412
413 xor = *(long *)&a[len] ^ *(long *)&b[len];
414 if (xor)
415 break;
416 len += sizeof(long);
417
418 xor = *(long *)&a[len] ^ *(long *)&b[len];
419 if (xor)
420 break;
421 len += sizeof(long);
422 }
423
424#if defined(__x86_64__) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
425 /* x86 has bsf. We know that xor is non-null here */
426 asm("bsf %1,%0\n" : "=r"(xor) : "0" (xor));
427 return len + xor / 8;
428#else
429 if (sizeof(long) > 4 && !(xor & 0xffffffff)) {
430 /* This code is optimized out on 32-bit archs, but we still
431 * need to shift in two passes to avoid a warning. It is
432 * properly optimized out as a single shift.
433 */
434 xor >>= 16; xor >>= 16;
435 if (xor & 0xffff) {
436 if (xor & 0xff)
437 return len + 4;
438 return len + 5;
439 }
440 if (xor & 0xffffff)
441 return len + 6;
442 return len + 7;
443 }
444
445 if (xor & 0xffff) {
446 if (xor & 0xff)
447 return len;
448 return len + 1;
449 }
450 if (xor & 0xffffff)
451 return len + 2;
452 return len + 3;
453#endif // x86
454
455#else // UNALIGNED_LE_OK
456 /* This is the generic version for big endian or unaligned-incompatible
457 * architectures.
458 */
459 while (len < max) {
460 if (a[len] != b[len])
461 break;
462 len++;
463 }
464 return len;
465
466#endif
467}
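
/* Illustrative note: on little endian machines byte <i> of a loaded word
 * occupies bits 8*i..8*i+7, so if the first difference between <a> and <b>
 * falls at byte offset 2 of the current word, the low 16 bits of <xor> are
 * zero, bsf (or the portable fallback above) finds the first set bit in
 * positions 16..23, and dividing by 8 yields the 2 extra matching bytes.
 */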
468
469/* sets <count> BYTES to -32769 in <refs> so that any uninitialized entry will
470 * verify (pos-last-1 >= 32768) and be ignored. <count> must be a multiple of
471 * 128 bytes and <refs> must be at least one count in length. It's supposed to
472 * be applied to 64-bit aligned data exclusively, which makes it slightly
473 * faster than the regular memset() since no alignment check is performed.
474 */
Tim Duesterhuseaf16fc2021-09-20 19:59:42 +0200475static void reset_refs(union ref *refs, long count)
Willy Tarreauab2b7822021-04-22 14:09:44 +0200476{
477 /* avoid a shift/mask by casting to void* */
478 union ref *end = (void *)refs + count;
479
480 do {
481 refs[ 0].by64 = -32769;
482 refs[ 1].by64 = -32769;
483 refs[ 2].by64 = -32769;
484 refs[ 3].by64 = -32769;
485 refs[ 4].by64 = -32769;
486 refs[ 5].by64 = -32769;
487 refs[ 6].by64 = -32769;
488 refs[ 7].by64 = -32769;
489 refs[ 8].by64 = -32769;
490 refs[ 9].by64 = -32769;
491 refs[10].by64 = -32769;
492 refs[11].by64 = -32769;
493 refs[12].by64 = -32769;
494 refs[13].by64 = -32769;
495 refs[14].by64 = -32769;
496 refs[15].by64 = -32769;
497 refs += 16;
498 } while (refs < end);
499}
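
/* Illustrative check: an uninitialized entry yields last = (uint32_t)-32769
 * = 0xFFFF7FFF in the encoder below, so (pos - last - 1), computed in
 * unsigned arithmetic, is at least 32768 for any realistic input position,
 * and the entry is rejected by the window check as intended.
 */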
500
501/* Compresses <ilen> bytes from <in> into <out> according to RFC1951. The
502 * output result may be up to 5 bytes larger than the input, to which 2 extra
503 * bytes may be added to send the last chunk due to BFINAL+EOB encoding (10
504 * bits) when <more> is not set. The caller is responsible for ensuring there
505 * is enough room in the output buffer for this. The amount of output bytes is
506 * returned, and no CRC is computed.
507 */
508long slz_rfc1951_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
509{
510 long rem = ilen;
511 unsigned long pos = 0;
512 unsigned long last;
513 uint32_t word = 0;
514 long mlen;
515 uint32_t h;
516 uint64_t ent;
517
518 uint32_t plit = 0;
519 uint32_t bit9 = 0;
520 uint32_t dist, code;
521 union ref refs[1 << HASH_BITS];
522
523 if (!strm->level) {
524 /* force to send as literals (eg to preserve CPU) */
525 strm->outbuf = out;
526 plit = pos = ilen;
527 bit9 = 52; /* force literal dump */
528 goto final_lit_dump;
529 }
530
531 reset_refs(refs, sizeof(refs));
532
533 strm->outbuf = out;
534
535#ifndef UNALIGNED_FASTER
536 word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
537#endif
538 while (rem >= 4) {
539#ifndef UNALIGNED_FASTER
540 word = ((unsigned char)in[pos + 3] << 24) + (word >> 8);
541#else
542 word = *(uint32_t *)&in[pos];
543#endif
544 h = slz_hash(word);
545 asm volatile ("" ::); // prevent gcc from trying to be smart with the prefetch
546
547 if (sizeof(long) >= 8) {
548 ent = refs[h].by64;
549 last = (uint32_t)ent;
550 ent >>= 32;
551 refs[h].by64 = ((uint64_t)pos) + ((uint64_t)word << 32);
552 } else {
553 ent = refs[h].by32.word;
554 last = refs[h].by32.pos;
555 refs[h].by32.pos = pos;
556 refs[h].by32.word = word;
557 }
558
Willy Tarreaufc89c3f2021-08-28 12:10:49 +0200559#ifdef FIND_OPTIMAL_MATCH
Willy Tarreauab2b7822021-04-22 14:09:44 +0200560 /* Experimental code to see what could be saved with an ideal
561 * longest match lookup algorithm. This one is very slow but
562 * scans the whole window. In short, here are the savings :
563 * file orig fast(ratio) optimal(ratio)
564 * README 5185 3419 (65.9%) 3165 (61.0%) -7.5%
565 * index.html 76799 35662 (46.4%) 29875 (38.9%) -16.3%
566 * rfc1952.c 29383 13442 (45.7%) 11793 (40.1%) -12.3%
567 *
568 * Thus the savings to expect for large files is at best 16%.
569 *
570 * A non-colliding hash gives 33025 instead of 35662 (-7.4%),
571 * and keeping the last two entries gives 31724 (-11.0%).
572 */
573 unsigned long scan;
574 int saved = 0;
575 int bestpos = 0;
576 int bestlen = 0;
577 int firstlen = 0;
578 int max_lookup = 2; // 0 = no limit
579
580 for (scan = pos - 1; scan < pos && (unsigned long)(pos - scan - 1) < 32768; scan--) {
581 if (*(uint32_t *)(in + scan) != word)
582 continue;
583
584 long len = memmatch(in + pos, in + scan, rem);
585 if (!bestlen)
586 firstlen = len;
587
588 if (len > bestlen) {
589 bestlen = len;
590 bestpos = scan;
591 }
592 if (!--max_lookup)
593 break;
594 }
595 if (bestlen) {
596 //printf("pos=%d last=%d bestpos=%d word=%08x ent=%08x len=%d\n",
597 // (int)pos, (int)last, (int)bestpos, (int)word, (int)ent, bestlen);
598 last = bestpos;
599 ent = word;
600 saved += bestlen - firstlen;
601 }
602 //fprintf(stderr, "first=%d best=%d saved_total=%d\n", firstlen, bestlen, saved);
603#endif
604
605 if ((uint32_t)ent != word) {
606 send_as_lit:
607 rem--;
608 plit++;
609 bit9 += ((unsigned char)word >= 144);
610 pos++;
611 continue;
612 }
613
614 /* We reject pos = last and pos > last+32768 */
615 if ((unsigned long)(pos - last - 1) >= 32768)
616 goto send_as_lit;
617
618 /* Note: cannot encode a length larger than 258 bytes */
619 mlen = memmatch(in + pos + 4, in + last + 4, (rem > 258 ? 258 : rem) - 4) + 4;
620
621 /* found a matching entry */
622
623 if (bit9 >= 52 && mlen < 6)
624 goto send_as_lit;
625
626 /* compute the output code, its size and the length's size in
627 * bits to know if the reference is cheaper than literals.
628 */
629 code = len_fh[mlen];
630
631 /* direct mapping of dist->huffman code */
632 dist = fh_dist_table[pos - last - 1];
633
634 /* if encoding the dist+length is more expensive than sending
635 * the equivalent as bytes, lets keep the literals.
636 */
637 if ((dist & 0x1f) + (code >> 16) + 8 >= 8 * mlen + bit9)
638 goto send_as_lit;
639
640 /* first, copy pending literals */
641 if (plit) {
642 /* Huffman encoding requires 9 bits for octets 144..255, so this
643 * is a waste of space for binary data. Switching between Huffman
644 * and no-comp then huffman consumes 52 bits (7 for EOB + 3 for
645 * block type + 7 for alignment + 32 for LEN+NLEN + 3 for next
646 * block). Only use plain literals if there are more than 52 bits
647 * to be saved.
648 */
649 if (bit9 >= 52)
650 copy_lit(strm, in + pos - plit, plit, 1);
651 else
652 copy_lit_huff(strm, in + pos - plit, plit, 1);
653
654 plit = 0;
655 }
656
657 /* use mode 01 - fixed huffman */
658 if (strm->state == SLZ_ST_EOB) {
659 strm->state = SLZ_ST_FIXED;
660 enqueue8(strm, 0x02, 3); // BTYPE = 01, BFINAL = 0
661 }
662
663 /* copy the length first */
664 enqueue24(strm, code & 0xFFFF, code >> 16);
665
666 /* in fixed huffman mode, dist is fixed 5 bits */
667 enqueue24(strm, dist >> 5, dist & 0x1f);
668 bit9 = 0;
669 rem -= mlen;
670 pos += mlen;
671
672#ifndef UNALIGNED_FASTER
673#ifdef UNALIGNED_LE_OK
674 word = *(uint32_t *)&in[pos - 1];
675#else
676 word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
677#endif
678#endif
679 }
680
681 if (__builtin_expect(rem, 0)) {
682 /* we're reading the 1..3 last bytes */
683 plit += rem;
684 do {
685 bit9 += ((unsigned char)in[pos++] >= 144);
686 } while (--rem);
687 }
688
689 final_lit_dump:
690 /* now copy remaining literals or mark the end */
691 if (plit) {
692 if (bit9 >= 52)
693 copy_lit(strm, in + pos - plit, plit, more);
694 else
695 copy_lit_huff(strm, in + pos - plit, plit, more);
696
697 plit = 0;
698 }
699
700 strm->ilen += ilen;
701 return strm->outbuf - out;
702}
703
704/* Initializes stream <strm> for use with raw deflate (rfc1951). The CRC is
705 * unused but set to zero. The compression level passed in <level> is set. This
706 * value can only be 0 (no compression) or 1 (compression) and other values
707 * will lead to unpredictable behaviour. The function always returns 0.
708 */
709int slz_rfc1951_init(struct slz_stream *strm, int level)
710{
711 strm->state = SLZ_ST_EOB; // no header
712 strm->level = level;
713 strm->format = SLZ_FMT_DEFLATE;
714 strm->crc32 = 0;
715 strm->ilen = 0;
716 strm->qbits = 0;
717 strm->queue = 0;
718 return 0;
719}
720
721/* Flushes any pending data for stream <strm> into buffer <buf>, then sends BTYPE=1
722 * and BFINAL=1 if needed. The stream ends in SLZ_ST_DONE. It returns the number
723 * of bytes emitted. The trailer consists of flushing the possibly pending bits
724 * from the queue (up to 7 bits), then possibly EOB (7 bits), then 3 bits, EOB,
725 * a rounding to the next byte, which amounts to a total of 4 bytes max, that
726 * the caller must ensure are available before calling the function.
727 */
728int slz_rfc1951_finish(struct slz_stream *strm, unsigned char *buf)
729{
730 strm->outbuf = buf;
731
732 if (strm->state == SLZ_ST_FIXED || strm->state == SLZ_ST_LAST) {
733 strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
734 send_eob(strm);
735 }
736
737 if (strm->state != SLZ_ST_DONE) {
738 /* send BTYPE=1, BFINAL=1 */
739 enqueue8(strm, 3, 3);
740 send_eob(strm);
741 strm->state = SLZ_ST_DONE;
742 }
743
744 flush_bits(strm);
745 return strm->outbuf - buf;
746}
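
/* Illustrative sketch (not part of the library): a minimal raw-deflate
 * compression of a single memory buffer using the functions above, assuming
 * <out> is large enough as documented in slz_rfc1951_encode() and
 * slz_rfc1951_finish():
 *
 *   struct slz_stream strm;
 *   long olen;
 *
 *   slz_rfc1951_init(&strm, 1);
 *   olen  = slz_rfc1951_encode(&strm, out, in, ilen, 0);
 *   olen += slz_rfc1951_finish(&strm, out + olen);
 */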
747
748/* Now RFC1952-specific declarations and extracts from RFC.
749 * From RFC1952 about the GZIP file format :
750
751A gzip file consists of a series of "members" ...
752
7532.3. Member format
754
755 Each member has the following structure:
756
757 +---+---+---+---+---+---+---+---+---+---+
758 |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
759 +---+---+---+---+---+---+---+---+---+---+
760
761 (if FLG.FEXTRA set)
762
763 +---+---+=================================+
764 | XLEN |...XLEN bytes of "extra field"...| (more-->)
765 +---+---+=================================+
766
767 (if FLG.FNAME set)
768
769 +=========================================+
770 |...original file name, zero-terminated...| (more-->)
771 +=========================================+
772
773 (if FLG.FCOMMENT set)
774
775 +===================================+
776 |...file comment, zero-terminated...| (more-->)
777 +===================================+
778
779 (if FLG.FHCRC set)
780
781 +---+---+
782 | CRC16 |
783 +---+---+
784
785 +=======================+
786 |...compressed blocks...| (more-->)
787 +=======================+
788
789 0 1 2 3 4 5 6 7
790 +---+---+---+---+---+---+---+---+
791 | CRC32 | ISIZE |
792 +---+---+---+---+---+---+---+---+
793
794
7952.3.1. Member header and trailer
796
797 ID1 (IDentification 1)
798 ID2 (IDentification 2)
799 These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139
800 (0x8b, \213), to identify the file as being in gzip format.
801
802 CM (Compression Method)
803 This identifies the compression method used in the file. CM
804 = 0-7 are reserved. CM = 8 denotes the "deflate"
805 compression method, which is the one customarily used by
806 gzip and which is documented elsewhere.
807
808 FLG (FLaGs)
809 This flag byte is divided into individual bits as follows:
810
811 bit 0 FTEXT
812 bit 1 FHCRC
813 bit 2 FEXTRA
814 bit 3 FNAME
815 bit 4 FCOMMENT
816 bit 5 reserved
817 bit 6 reserved
818 bit 7 reserved
819
820 Reserved FLG bits must be zero.
821
822 MTIME (Modification TIME)
823 This gives the most recent modification time of the original
824 file being compressed. The time is in Unix format, i.e.,
825 seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this
826 may cause problems for MS-DOS and other systems that use
827 local rather than Universal time.) If the compressed data
828 did not come from a file, MTIME is set to the time at which
829 compression started. MTIME = 0 means no time stamp is
830 available.
831
832 XFL (eXtra FLags)
833 These flags are available for use by specific compression
834 methods. The "deflate" method (CM = 8) sets these flags as
835 follows:
836
837 XFL = 2 - compressor used maximum compression,
838 slowest algorithm
839 XFL = 4 - compressor used fastest algorithm
840
841 OS (Operating System)
842 This identifies the type of file system on which compression
843 took place. This may be useful in determining end-of-line
844 convention for text files. The currently defined values are
845 as follows:
846
847 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
848 1 - Amiga
849 2 - VMS (or OpenVMS)
850 3 - Unix
851 4 - VM/CMS
852 5 - Atari TOS
853 6 - HPFS filesystem (OS/2, NT)
854 7 - Macintosh
855 8 - Z-System
856 9 - CP/M
857 10 - TOPS-20
858 11 - NTFS filesystem (NT)
859 12 - QDOS
860 13 - Acorn RISCOS
861 255 - unknown
862
863 ==> A file compressed using "gzip -1" on Unix-like systems can be :
864
865 1F 8B 08 00 00 00 00 00 04 03
866 <deflate-compressed stream>
867 crc32 size32
868*/
869
870static const unsigned char gzip_hdr[] = { 0x1F, 0x8B, // ID1, ID2
871 0x08, 0x00, // Deflate, flags (none)
872 0x00, 0x00, 0x00, 0x00, // mtime: none
873 0x04, 0x03 }; // fastest comp, OS=Unix
874
875static inline uint32_t crc32_char(uint32_t crc, uint8_t x)
876{
877#if defined(__ARM_FEATURE_CRC32)
878 crc = ~crc;
Willy Tarreaub1544222021-12-03 17:38:42 +0100879# if defined(__ARM_ARCH_ISA_A64)
880 // 64 bit mode
Willy Tarreauab2b7822021-04-22 14:09:44 +0200881 __asm__ volatile("crc32b %w0,%w0,%w1" : "+r"(crc) : "r"(x));
Willy Tarreaub1544222021-12-03 17:38:42 +0100882# else
883 // 32 bit mode (e.g. armv7 compiler building for armv8)
884 __asm__ volatile("crc32b %0,%0,%1" : "+r"(crc) : "r"(x));
885# endif
Willy Tarreauab2b7822021-04-22 14:09:44 +0200886 crc = ~crc;
887#else
888 crc = crc32_fast[0][(crc ^ x) & 0xff] ^ (crc >> 8);
889#endif
890 return crc;
891}
892
893static inline uint32_t crc32_uint32(uint32_t data)
894{
895#if defined(__ARM_FEATURE_CRC32)
Willy Tarreaub1544222021-12-03 17:38:42 +0100896# if defined(__ARM_ARCH_ISA_A64)
897 // 64 bit mode
Willy Tarreauab2b7822021-04-22 14:09:44 +0200898 __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(data) : "r"(~0UL));
Willy Tarreaub1544222021-12-03 17:38:42 +0100899# else
900 // 32 bit mode (e.g. armv7 compiler building for armv8)
901 __asm__ volatile("crc32w %0,%0,%1" : "+r"(data) : "r"(~0UL));
902# endif
Willy Tarreauab2b7822021-04-22 14:09:44 +0200903 data = ~data;
904#else
905 data = crc32_fast[3][(data >> 0) & 0xff] ^
906 crc32_fast[2][(data >> 8) & 0xff] ^
907 crc32_fast[1][(data >> 16) & 0xff] ^
908 crc32_fast[0][(data >> 24) & 0xff];
909#endif
910 return data;
911}
912
913/* Modified version originally from RFC1952, working with non-inverting CRCs */
914uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len)
915{
916 int n;
917
918 for (n = 0; n < len; n++)
919 crc = crc32_char(crc, buf[n]);
920 return crc;
921}
922
923/* This version computes the crc32 of <buf> over <len> bytes, doing most of it
924 * in 32-bit chunks.
925 */
926uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len)
927{
928 const unsigned char *end = buf + len;
929
930 while (buf <= end - 16) {
931#ifdef UNALIGNED_LE_OK
932#if defined(__ARM_FEATURE_CRC32)
933 crc = ~crc;
Willy Tarreaub1544222021-12-03 17:38:42 +0100934# if defined(__ARM_ARCH_ISA_A64)
935 // 64 bit mode
Willy Tarreauab2b7822021-04-22 14:09:44 +0200936 __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
937 __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
938 __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
939 __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
Willy Tarreaub1544222021-12-03 17:38:42 +0100940# else
941 // 32 bit mode (e.g. armv7 compiler building for armv8)
942 __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
943 __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
944 __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
945 __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
946# endif
Willy Tarreauab2b7822021-04-22 14:09:44 +0200947 crc = ~crc;
948#else
949 crc ^= *(uint32_t *)buf;
950 crc = crc32_uint32(crc);
951
952 crc ^= *(uint32_t *)(buf + 4);
953 crc = crc32_uint32(crc);
954
955 crc ^= *(uint32_t *)(buf + 8);
956 crc = crc32_uint32(crc);
957
958 crc ^= *(uint32_t *)(buf + 12);
959 crc = crc32_uint32(crc);
960#endif
961#else
962 crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^
963 crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^
964 crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
965 crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];
966
967 crc = crc32_fast[3][(buf[4] ^ (crc >> 0)) & 0xff] ^
968 crc32_fast[2][(buf[5] ^ (crc >> 8)) & 0xff] ^
969 crc32_fast[1][(buf[6] ^ (crc >> 16)) & 0xff] ^
970 crc32_fast[0][(buf[7] ^ (crc >> 24)) & 0xff];
971
972 crc = crc32_fast[3][(buf[8] ^ (crc >> 0)) & 0xff] ^
973 crc32_fast[2][(buf[9] ^ (crc >> 8)) & 0xff] ^
974 crc32_fast[1][(buf[10] ^ (crc >> 16)) & 0xff] ^
975 crc32_fast[0][(buf[11] ^ (crc >> 24)) & 0xff];
976
977 crc = crc32_fast[3][(buf[12] ^ (crc >> 0)) & 0xff] ^
978 crc32_fast[2][(buf[13] ^ (crc >> 8)) & 0xff] ^
979 crc32_fast[1][(buf[14] ^ (crc >> 16)) & 0xff] ^
980 crc32_fast[0][(buf[15] ^ (crc >> 24)) & 0xff];
981#endif
982 buf += 16;
983 }
984
985 while (buf <= end - 4) {
986#ifdef UNALIGNED_LE_OK
987 crc ^= *(uint32_t *)buf;
988 crc = crc32_uint32(crc);
989#else
990 crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^
991 crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^
992 crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
993 crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];
994#endif
995 buf += 4;
996 }
997
998 while (buf < end)
Willy Tarreau027fdcb2021-05-12 08:34:36 +0200999 crc = crc32_char(crc, *buf++);
Willy Tarreauab2b7822021-04-22 14:09:44 +02001000 return crc;
1001}
1002
1003/* uses the most suitable crc32 function to update crc on <buf, len> */
1004static inline uint32_t update_crc(uint32_t crc, const void *buf, int len)
1005{
1006 return slz_crc32_by4(crc, buf, len);
1007}
1008
1009/* Sends the gzip header for stream <strm> into buffer <buf>. When it's done,
1010 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
1011 * emitted which is always 10. The caller is responsible for ensuring there's
1012 * always enough room in the buffer.
1013 */
1014int slz_rfc1952_send_header(struct slz_stream *strm, unsigned char *buf)
1015{
1016 memcpy(buf, gzip_hdr, sizeof(gzip_hdr));
1017 strm->state = SLZ_ST_EOB;
1018 return sizeof(gzip_hdr);
1019}
1020
1021/* Encodes the block according to rfc1952. This means that the CRC of the input
1022 * block is computed according to the CRC32 algorithm. If the header was never
1023 * sent, it may be sent first. The number of output bytes is returned.
1024 */
1025long slz_rfc1952_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
1026{
1027 long ret = 0;
1028
1029 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1030 ret += slz_rfc1952_send_header(strm, out);
1031
1032 strm->crc32 = update_crc(strm->crc32, in, ilen);
1033 ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
1034 return ret;
1035}
1036
1037/* Initializes stream <strm> for use with the gzip format (rfc1952). The
1038 * compression level passed in <level> is set. This value can only be 0 (no
1039 * compression) or 1 (compression) and other values will lead to unpredictable
1040 * behaviour. The function always returns 0.
1041 */
1042int slz_rfc1952_init(struct slz_stream *strm, int level)
1043{
1044 strm->state = SLZ_ST_INIT;
1045 strm->level = level;
1046 strm->format = SLZ_FMT_GZIP;
1047 strm->crc32 = 0;
1048 strm->ilen = 0;
1049 strm->qbits = 0;
1050 strm->queue = 0;
1051 return 0;
1052}
1053
1054/* Flushes pending bits and sends the gzip trailer for stream <strm> into
1055 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
1056 * returns the number of bytes emitted. The trailer consists of flushing the
1057 * possibly pending bits from the queue (up to 24 bits), rounding to the next
1058 * byte, then 4 bytes for the CRC and another 4 bytes for the input length.
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +05001059 * That may amount to 4+4+4 = 12 bytes, which the caller must ensure are
Willy Tarreauab2b7822021-04-22 14:09:44 +02001060 * available before calling the function. Note that if the initial header was
1061 * never sent, it will be sent first as well (10 extra bytes).
1062 */
1063int slz_rfc1952_finish(struct slz_stream *strm, unsigned char *buf)
1064{
1065 strm->outbuf = buf;
1066
1067 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1068 strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
1069
1070 slz_rfc1951_finish(strm, strm->outbuf);
1071 copy_32b(strm, strm->crc32);
1072 copy_32b(strm, strm->ilen);
1073 strm->state = SLZ_ST_END;
1074
1075 return strm->outbuf - buf;
1076}
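
/* Illustrative sketch (not part of the library): producing a complete gzip
 * member from a single input buffer, assuming <out> can hold the 10-byte
 * header, the compressed data and the trailer (up to 12 bytes as documented
 * in slz_rfc1952_finish() above):
 *
 *   struct slz_stream strm;
 *   long olen;
 *
 *   slz_rfc1952_init(&strm, 1);
 *   olen  = slz_rfc1952_encode(&strm, out, in, ilen, 0);
 *   olen += slz_rfc1952_finish(&strm, out + olen);
 */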
1077
1078
1079/* RFC1950-specific stuff. This is for the Zlib stream format.
1080 * From RFC1950 (zlib) :
1081 *
1082
1083 2.2. Data format
1084
1085 A zlib stream has the following structure:
1086
1087 0 1
1088 +---+---+
1089 |CMF|FLG| (more-->)
1090 +---+---+
1091
1092
1093 (if FLG.FDICT set)
1094
1095 0 1 2 3
1096 +---+---+---+---+
1097 | DICTID | (more-->)
1098 +---+---+---+---+
1099
1100 +=====================+---+---+---+---+
1101 |...compressed data...| ADLER32 |
1102 +=====================+---+---+---+---+
1103
1104 Any data which may appear after ADLER32 are not part of the zlib
1105 stream.
1106
1107 CMF (Compression Method and flags)
1108 This byte is divided into a 4-bit compression method and a 4-
1109 bit information field depending on the compression method.
1110
1111 bits 0 to 3 CM Compression method
1112 bits 4 to 7 CINFO Compression info
1113
1114 CM (Compression method)
1115 This identifies the compression method used in the file. CM = 8
1116 denotes the "deflate" compression method with a window size up
1117 to 32K. This is the method used by gzip and PNG (see
1118 references [1] and [2] in Chapter 3, below, for the reference
1119 documents). CM = 15 is reserved. It might be used in a future
1120 version of this specification to indicate the presence of an
1121 extra field before the compressed data.
1122
1123 CINFO (Compression info)
1124 For CM = 8, CINFO is the base-2 logarithm of the LZ77 window
1125 size, minus eight (CINFO=7 indicates a 32K window size). Values
1126 of CINFO above 7 are not allowed in this version of the
1127 specification. CINFO is not defined in this specification for
1128 CM not equal to 8.
1129
1130 FLG (FLaGs)
1131 This flag byte is divided as follows:
1132
1133 bits 0 to 4 FCHECK (check bits for CMF and FLG)
1134 bit 5 FDICT (preset dictionary)
1135 bits 6 to 7 FLEVEL (compression level)
1136
1137 The FCHECK value must be such that CMF and FLG, when viewed as
1138 a 16-bit unsigned integer stored in MSB order (CMF*256 + FLG),
1139 is a multiple of 31.
1140
1141
1142 FDICT (Preset dictionary)
1143 If FDICT is set, a DICT dictionary identifier is present
1144 immediately after the FLG byte. The dictionary is a sequence of
1145 bytes which are initially fed to the compressor without
1146 producing any compressed output. DICT is the Adler-32 checksum
1147 of this sequence of bytes (see the definition of ADLER32
1148 below). The decompressor can use this identifier to determine
1149 which dictionary has been used by the compressor.
1150
1151 FLEVEL (Compression level)
1152 These flags are available for use by specific compression
1153 methods. The "deflate" method (CM = 8) sets these flags as
1154 follows:
1155
1156 0 - compressor used fastest algorithm
1157 1 - compressor used fast algorithm
1158 2 - compressor used default algorithm
1159 3 - compressor used maximum compression, slowest algorithm
1160
1161 The information in FLEVEL is not needed for decompression; it
1162 is there to indicate if recompression might be worthwhile.
1163
1164 compressed data
1165 For compression method 8, the compressed data is stored in the
1166 deflate compressed data format as described in the document
1167 "DEFLATE Compressed Data Format Specification" by L. Peter
1168 Deutsch. (See reference [3] in Chapter 3, below)
1169
1170 Other compressed data formats are not specified in this version
1171 of the zlib specification.
1172
1173 ADLER32 (Adler-32 checksum)
1174 This contains a checksum value of the uncompressed data
1175 (excluding any dictionary data) computed according to Adler-32
1176 algorithm. This algorithm is a 32-bit extension and improvement
1177 of the Fletcher algorithm, used in the ITU-T X.224 / ISO 8073
1178 standard. See references [4] and [5] in Chapter 3, below)
1179
1180 Adler-32 is composed of two sums accumulated per byte: s1 is
1181 the sum of all bytes, s2 is the sum of all s1 values. Both sums
1182 are done modulo 65521. s1 is initialized to 1, s2 to zero. The
1183 Adler-32 checksum is stored as s2*65536 + s1 in most-
1184 significant-byte first (network) order.
1185
1186 ==> The stream can start with only 2 bytes :
1187 - CM = 0x78 : CMINFO=7 (32kB window), CM=8 (deflate)
1188 - FLG = 0x01 : FLEVEL = 0 (fastest), FDICT=0 (no dict), FCHECK=1 so
1189 that 0x7801 is a multiple of 31 (30721 = 991 * 31).
1190
1191 ==> and it ends with only 4 bytes, the Adler-32 checksum in big-endian format.
1192
1193 */
1194
1195static const unsigned char zlib_hdr[] = { 0x78, 0x01 }; // 32k win, deflate, chk=1
1196
1197
1198/* Original version from RFC1950, verified and works OK */
1199uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len)
1200{
1201 uint32_t s1 = crc & 0xffff;
1202 uint32_t s2 = (crc >> 16) & 0xffff;
1203 int n;
1204
1205 for (n = 0; n < len; n++) {
1206 s1 = (s1 + buf[n]) % 65521;
1207 s2 = (s2 + s1) % 65521;
1208 }
1209 return (s2 << 16) + s1;
1210}
1211
1212/* Computes the adler32 sum on <buf> for <len> bytes. It avoids the expensive
1213 * modulus by retrofitting the number of bytes missed between 65521 and 65536
1214 * which is easy to count : For every sum above 65536, the modulus is offset
1215 * by (65536-65521) = 15. So for any value, we can count the accumulated extra
1216 * values by dividing the sum by 65536 and multiplying this value by
1217 * (65536-65521). That's easier with a drawing with boxes and marbles. It gives
1218 * this :
1219 * x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
1220 * = (x & 0xffff) + (x >> 16) * 15.
1221 */
1222uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len)
1223{
1224 long s1 = crc & 0xffff;
1225 long s2 = (crc >> 16);
1226 long blk;
1227 long n;
1228
1229 do {
1230 blk = len;
1231 /* ensure we never overflow s2 (limit is about 2^((32-8)/2)) */
1232 if (blk > (1U << 12))
1233 blk = 1U << 12;
1234 len -= blk;
1235
1236 for (n = 0; n < blk; n++) {
1237 s1 = (s1 + buf[n]);
1238 s2 = (s2 + s1);
1239 }
1240
1241 /* Largest value here is 2^12 * 255 = 1044480 < 2^20. We can
1242 * still overflow once, but not twice because the right hand
1243 * side is 225 max, so the total is 65761. However we also
1244 * have to take care of the values between 65521 and 65536.
1245 */
1246 s1 = (s1 & 0xffff) + 15 * (s1 >> 16);
1247 if (s1 >= 65521)
1248 s1 -= 65521;
1249
1250 /* For s2, the largest value is estimated to 2^32-1 for
1251 * simplicity, so the right hand side is about 15*65535
1252 * = 983025. We can overflow twice at most.
1253 */
1254 s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
1255 s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
1256 if (s2 >= 65521)
1257 s2 -= 65521;
1258
1259 buf += blk;
1260 } while (len);
1261 return (s2 << 16) + s1;
1262}
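
/* Illustrative check of the folding used above: for x = 70000,
 * x % 65521 = 4479 and (x & 0xffff) + 15 * (x >> 16) = 4464 + 15 = 4479.
 * A single fold can still leave a value slightly above 65521, which is why
 * s2 is folded twice and both sums get a final conditional subtraction.
 */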
1263
1264/* Sends the zlib header for stream <strm> into buffer <buf>. When it's done,
1265 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
1266 * emitted which is always 2. The caller is responsible for ensuring there's
1267 * always enough room in the buffer.
1268 */
1269int slz_rfc1950_send_header(struct slz_stream *strm, unsigned char *buf)
1270{
1271 memcpy(buf, zlib_hdr, sizeof(zlib_hdr));
1272 strm->state = SLZ_ST_EOB;
1273 return sizeof(zlib_hdr);
1274}
1275
1276/* Encodes the block according to rfc1950. This means that the CRC of the input
1277 * block is computed according to the ADLER32 algorithm. If the header was never
1278 * sent, it may be sent first. The number of output bytes is returned.
1279 */
1280long slz_rfc1950_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
1281{
1282 long ret = 0;
1283
1284 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1285 ret += slz_rfc1950_send_header(strm, out);
1286
1287 strm->crc32 = slz_adler32_block(strm->crc32, in, ilen);
1288 ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
1289 return ret;
1290}
1291
1292/* Initializes stream <strm> for use with the zlib format (rfc1950). The
1293 * compression level passed in <level> is set. This value can only be 0 (no
1294 * compression) or 1 (compression) and other values will lead to unpredictable
1295 * behaviour. The function always returns 0.
1296 */
1297int slz_rfc1950_init(struct slz_stream *strm, int level)
1298{
1299 strm->state = SLZ_ST_INIT;
1300 strm->level = level;
1301 strm->format = SLZ_FMT_ZLIB;
1302 strm->crc32 = 1; // rfc1950/zlib starts with initial crc=1
1303 strm->ilen = 0;
1304 strm->qbits = 0;
1305 strm->queue = 0;
1306 return 0;
1307}
1308
1309/* Flushes pending bits and sends the zlib trailer for stream <strm> into
1310 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
1311 * returns the number of bytes emitted. The trailer consists of flushing the
1312 * possibly pending bits from the queue (up to 24 bits), rounding to the next
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +05001313 * byte, then 4 bytes for the CRC. That may amount to 4+4 = 8 bytes, which the
Willy Tarreauab2b7822021-04-22 14:09:44 +02001314 * caller must ensure are available before calling the function. Note that if
1315 * the initial header was never sent, it will be sent first as well (2 extra
1316 * bytes).
1317 */
1318int slz_rfc1950_finish(struct slz_stream *strm, unsigned char *buf)
1319{
1320 strm->outbuf = buf;
1321
1322 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1323 strm->outbuf += slz_rfc1950_send_header(strm, strm->outbuf);
1324
1325 slz_rfc1951_finish(strm, strm->outbuf);
1326 copy_8b(strm, (strm->crc32 >> 24) & 0xff);
1327 copy_8b(strm, (strm->crc32 >> 16) & 0xff);
1328 copy_8b(strm, (strm->crc32 >> 8) & 0xff);
1329 copy_8b(strm, (strm->crc32 >> 0) & 0xff);
1330 strm->state = SLZ_ST_END;
1331 return strm->outbuf - buf;
1332}
1333
Willy Tarreauab2b7822021-04-22 14:09:44 +02001334__attribute__((constructor))
1335static void __slz_initialize(void)
1336{
Willy Tarreau9e274282021-05-12 08:36:09 +02001337#if !defined(__ARM_FEATURE_CRC32)
Willy Tarreauab2b7822021-04-22 14:09:44 +02001338 __slz_make_crc_table();
Willy Tarreau9e274282021-05-12 08:36:09 +02001339#endif
Willy Tarreauab2b7822021-04-22 14:09:44 +02001340 __slz_prepare_dist_table();
1341}