blob: 1560bac343fb76bb23d1b15b5b1ce8bfd04a9a68 [file] [log] [blame]
Willy Tarreauab2b7822021-04-22 14:09:44 +02001/*
2 * Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 */
24
Willy Tarreau388fc252021-05-14 08:44:52 +020025#include <inttypes.h>
Willy Tarreauab2b7822021-04-22 14:09:44 +020026#include <stdio.h>
27#include <string.h>
28#include <import/slz.h>
29#include <import/slz-tables.h>
30
31/* First, RFC1951-specific declarations and extracts from the RFC.
32 *
33 * RFC1951 - deflate stream format
34
35
36 * Data elements are packed into bytes in order of
37 increasing bit number within the byte, i.e., starting
38 with the least-significant bit of the byte.
39 * Data elements other than Huffman codes are packed
40 starting with the least-significant bit of the data
41 element.
42 * Huffman codes are packed starting with the most-
43 significant bit of the code.
44
45 3.2.3. Details of block format
46
47 Each block of compressed data begins with 3 header bits
48 containing the following data:
49
50 first bit BFINAL
51 next 2 bits BTYPE
52
53 Note that the header bits do not necessarily begin on a byte
54 boundary, since a block does not necessarily occupy an integral
55 number of bytes.
56
57 BFINAL is set if and only if this is the last block of the data
58 set.
59
60 BTYPE specifies how the data are compressed, as follows:
61
62 00 - no compression
63 01 - compressed with fixed Huffman codes
64 10 - compressed with dynamic Huffman codes
65 11 - reserved (error)
66
67 3.2.4. Non-compressed blocks (BTYPE=00)
68
69 Any bits of input up to the next byte boundary are ignored.
70 The rest of the block consists of the following information:
71
72 0 1 2 3 4...
73 +---+---+---+---+================================+
74 | LEN | NLEN |... LEN bytes of literal data...|
75 +---+---+---+---+================================+
76
77 LEN is the number of data bytes in the block. NLEN is the
78 one's complement of LEN.
79
80 3.2.5. Compressed blocks (length and distance codes)
81
82 As noted above, encoded data blocks in the "deflate" format
83 consist of sequences of symbols drawn from three conceptually
84 distinct alphabets: either literal bytes, from the alphabet of
85 byte values (0..255), or <length, backward distance> pairs,
86 where the length is drawn from (3..258) and the distance is
87 drawn from (1..32,768). In fact, the literal and length
88 alphabets are merged into a single alphabet (0..285), where
89 values 0..255 represent literal bytes, the value 256 indicates
90 end-of-block, and values 257..285 represent length codes
91 (possibly in conjunction with extra bits following the symbol
92 code) as follows:
93
94Length encoding :
95 Extra Extra Extra
96 Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
97 ---- ---- ------ ---- ---- ------- ---- ---- -------
98 257 0 3 267 1 15,16 277 4 67-82
99 258 0 4 268 1 17,18 278 4 83-98
100 259 0 5 269 2 19-22 279 4 99-114
101 260 0 6 270 2 23-26 280 4 115-130
102 261 0 7 271 2 27-30 281 5 131-162
103 262 0 8 272 2 31-34 282 5 163-194
104 263 0 9 273 3 35-42 283 5 195-226
105 264 0 10 274 3 43-50 284 5 227-257
106 265 1 11,12 275 3 51-58 285 0 258
107 266 1 13,14 276 3 59-66
108
109Distance encoding :
110 Extra Extra Extra
111 Code Bits Dist Code Bits Dist Code Bits Distance
112 ---- ---- ---- ---- ---- ------ ---- ---- --------
113 0 0 1 10 4 33-48 20 9 1025-1536
114 1 0 2 11 4 49-64 21 9 1537-2048
115 2 0 3 12 5 65-96 22 10 2049-3072
116 3 0 4 13 5 97-128 23 10 3073-4096
117 4 1 5,6 14 6 129-192 24 11 4097-6144
118 5 1 7,8 15 6 193-256 25 11 6145-8192
119 6 2 9-12 16 7 257-384 26 12 8193-12288
120 7 2 13-16 17 7 385-512 27 12 12289-16384
121 8 3 17-24 18 8 513-768 28 13 16385-24576
122 9 3 25-32 19 8 769-1024 29 13 24577-32768
123
124 3.2.6. Compression with fixed Huffman codes (BTYPE=01)
125
126 The Huffman codes for the two alphabets are fixed, and are not
127 represented explicitly in the data. The Huffman code lengths
128 for the literal/length alphabet are:
129
130 Lit Value Bits Codes
131 --------- ---- -----
132 0 - 143 8 00110000 through
133 10111111
134 144 - 255 9 110010000 through
135 111111111
136 256 - 279 7 0000000 through
137 0010111
138 280 - 287 8 11000000 through
139 11000111
140
141 The code lengths are sufficient to generate the actual codes,
142 as described above; we show the codes in the table for added
143 clarity. Literal/length values 286-287 will never actually
144 occur in the compressed data, but participate in the code
145 construction.
146
147 Distance codes 0-31 are represented by (fixed-length) 5-bit
148 codes, with possible additional bits as shown in the table
149 shown in Paragraph 3.2.5, above. Note that distance codes 30-
150 31 will never actually occur in the compressed data.
151
152*/
153
/* Back references, built in a way that is optimal for 32/64 bits: 64-bit
 * targets read/write both fields at once through <by64> while 32-bit targets
 * access the two 32-bit halves separately through <by32>.
 */
union ref {
	struct {
		uint32_t pos;  /* last input position where <word> was seen */
		uint32_t word; /* 4 bytes of input content sampled at <pos> */
	} by32;
	uint64_t by64;         /* both fields as a single 64-bit word */
};
162
#if defined(USE_64BIT_QUEUE) && defined(UNALIGNED_LE_OK)

/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
 * 32-bit words into output buffer. X must not contain non-zero bits above
 * xbits. Since qbits is always brought back below 32 here, qbits + xbits
 * cannot exceed 55, so the 64-bit queue never overflows.
 */
static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (__builtin_expect(qbits >= 32, 1)) {
		/* a full 32-bit word is pending: emit it little-endian
		 * (UNALIGNED_LE_OK makes this unaligned store valid).
		 */
		*(uint32_t *)strm->outbuf = queue;
		queue >>= 32;
		qbits -= 32;
		strm->outbuf += 4;
	}

	strm->queue = queue;
	strm->qbits = qbits;
}

/* with a 64-bit queue there's no benefit in a dedicated short-code variant */
#define enqueue8 enqueue24

/* flush the queue and align to next byte */
static inline void flush_bits(struct slz_stream *strm)
{
	/* after enqueue24() at most 31 bits remain queued, i.e. 4 bytes */
	if (strm->qbits > 0)
		*strm->outbuf++ = strm->queue;

	if (strm->qbits > 8)
		*strm->outbuf++ = strm->queue >> 8;

	if (strm->qbits > 16)
		*strm->outbuf++ = strm->queue >> 16;

	if (strm->qbits > 24)
		*strm->outbuf++ = strm->queue >> 24;

	strm->queue = 0;
	strm->qbits = 0;
}
205
#else /* non-64 bit or aligned or big endian */

/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete
 * bytes into out buf. X must not contain non-zero bits above xbits. Prefer
 * enqueue8() when xbits is known for being 8 or less.
 */
static void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint32_t queue = strm->queue + (x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	if (qbits >= 16) {
		/* two complete bytes pending: emit them little-endian, using
		 * two byte stores unless unaligned 16-bit stores are safe.
		 */
#ifndef UNALIGNED_LE_OK
		strm->outbuf[0] = queue;
		strm->outbuf[1] = queue >> 8;
#else
		*(uint16_t *)strm->outbuf = queue;
#endif
		strm->outbuf += 2;
		queue >>= 16;
		qbits -= 16;
	}

	if (qbits >= 8) {
		/* one more complete byte may still be pending */
		qbits -= 8;
		*strm->outbuf++ = queue;
		queue >>= 8;
	}
	strm->qbits = qbits;
	strm->queue = queue;
	return;
}

/* enqueue code x of <xbits> bits (at most 8) and copy complete bytes into
 * out buf. X must not contain non-zero bits above xbits.
 */
static inline void enqueue8(struct slz_stream *strm, uint32_t x, uint32_t xbits)
{
	uint32_t queue = strm->queue + (x << strm->qbits);
	uint32_t qbits = strm->qbits + xbits;

	/* (signed)(qbits - 8) >= 0 is equivalent to qbits >= 8, expected true */
	if (__builtin_expect((signed)(qbits - 8) >= 0, 1)) {
		qbits -= 8;
		*strm->outbuf++ = queue;
		queue >>= 8;
	}

	strm->qbits = qbits;
	strm->queue = queue;
}

/* align to next byte */
static inline void flush_bits(struct slz_stream *strm)
{
	/* at most 15 bits can remain queued here, i.e. 2 bytes */
	if (strm->qbits > 0)
		*strm->outbuf++ = strm->queue;

	if (strm->qbits > 8)
		*strm->outbuf++ = strm->queue >> 8;

	strm->queue = 0;
	strm->qbits = 0;
}
#endif
270
271
272/* only valid if buffer is already aligned */
273static inline void copy_8b(struct slz_stream *strm, uint32_t x)
274{
275 *strm->outbuf++ = x;
276}
277
278/* only valid if buffer is already aligned */
279static inline void copy_16b(struct slz_stream *strm, uint32_t x)
280{
281 strm->outbuf[0] = x;
282 strm->outbuf[1] = x >> 8;
283 strm->outbuf += 2;
284}
285
286/* only valid if buffer is already aligned */
287static inline void copy_32b(struct slz_stream *strm, uint32_t x)
288{
289 strm->outbuf[0] = x;
290 strm->outbuf[1] = x >> 8;
291 strm->outbuf[2] = x >> 16;
292 strm->outbuf[3] = x >> 24;
293 strm->outbuf += 4;
294}
295
296static inline void send_huff(struct slz_stream *strm, uint32_t code)
297{
298 uint32_t bits;
299
300 code = fixed_huff[code];
301 bits = code & 15;
302 code >>= 4;
303 enqueue24(strm, code, bits);
304}
305
/* Sends the end-of-block symbol (256), whose fixed-Huffman encoding is seven
 * zero bits (cf RFC1951 3.2.6).
 */
static inline void send_eob(struct slz_stream *strm)
{
	enqueue8(strm, 0, 7); // direct encoding of 256 = EOB (cf RFC1951)
}
310
/* copies <len> literals from <buf>. <more> indicates that there are data past
 * buf + <len>. <len> must not be null. The literals are emitted as one or
 * more uncompressed blocks (BTYPE=00) of at most 65535 bytes each.
 */
static void copy_lit(struct slz_stream *strm, const void *buf, uint32_t len, int more)
{
	uint32_t len2;

	do {
		/* LEN is a 16-bit field, so split large inputs into chunks */
		len2 = len;
		if (__builtin_expect(len2 > 65535, 0))
			len2 = 65535;

		len -= len2;

		/* an open huffman block must be terminated first */
		if (strm->state != SLZ_ST_EOB)
			send_eob(strm);

		strm->state = (more || len) ? SLZ_ST_EOB : SLZ_ST_DONE;

		enqueue8(strm, !(more || len), 3); // BFINAL = !more ; BTYPE = 00
		flush_bits(strm);
		copy_16b(strm, len2);  // len2
		copy_16b(strm, ~len2); // nlen2
		memcpy(strm->outbuf, buf, len2);
		buf += len2;
		strm->outbuf += len2;
	} while (len);
}
339
/* copies <len> literals from <buf>, encoded with the fixed huffman alphabet.
 * <more> indicates that there are data past buf + <len>. <len> must not be
 * null.
 */
static void copy_lit_huff(struct slz_stream *strm, const unsigned char *buf, uint32_t len, int more)
{
	uint32_t pos;

	/* This ugly construct limits the amount of tests and optimizes for the
	 * most common case (more > 0).
	 */
	if (strm->state == SLZ_ST_EOB) {
	eob:
		strm->state = more ? SLZ_ST_FIXED : SLZ_ST_LAST;
		enqueue8(strm, 2 + !more, 3); // BFINAL = !more ; BTYPE = 01
	}
	else if (!more) {
		/* last data of the stream: close the current block and open
		 * a final one.
		 */
		send_eob(strm);
		goto eob;
	}

	pos = 0;
	do {
		send_huff(strm, buf[pos++]);
	} while (pos < len);
}
365
/* format of each refs[] entry (see slz_rfc1951_encode(), 64-bit access):
 * bit0..31  = last position in buffer of similar content
 * bit32..63 = word (4 bytes of content sampled at that position)
 */

/* This hash provides good average results on HTML contents, and is among the
 * few which provide almost optimal results on various different pages.
 */
static inline uint32_t slz_hash(uint32_t a)
{
#if defined(__ARM_FEATURE_CRC32)
	/* a hardware CRC32 of the word makes a cheap, well-spread hash */
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(a) : "r"(0));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32w %0,%0,%1" : "+r"(a) : "r"(0));
# endif
	return a >> (32 - HASH_BITS);
#else
	/* multiply by ((1<<19) + (1<<6) - 1) using shifts, keep top HASH_BITS */
	return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
#endif
}
389
/* This function compares buffers <a> and <b> and reads 32 or 64 bits at a time
 * during the approach. It makes use of unaligned little endian memory accesses
 * on capable architectures. <max> is the maximum number of bytes that can be
 * read, so both <a> and <b> must have at least <max> bytes ahead. <max> may
 * safely be null or negative if that simplifies computations in the caller.
 * It returns the number of leading bytes common to both buffers (0..max).
 */
static inline long memmatch(const unsigned char *a, const unsigned char *b, long max)
{
	long len = 0;

#ifdef UNALIGNED_LE_OK
	unsigned long xor;

	while (1) {
		/* not enough room left for two more word reads: finish the
		 * tail byte by byte.
		 */
		if ((long)(len + 2 * sizeof(long)) > max) {
			while (len < max) {
				if (a[len] != b[len])
					break;
				len++;
			}
			return len;
		}

		/* compare one word at a time; a non-zero xor marks the first
		 * difference, whose position is resolved below.
		 */
		xor = *(long *)&a[len] ^ *(long *)&b[len];
		if (xor)
			break;
		len += sizeof(long);

		xor = *(long *)&a[len] ^ *(long *)&b[len];
		if (xor)
			break;
		len += sizeof(long);
	}

#if defined(__x86_64__) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
	/* x86 has bsf. We know that xor is non-null here */
	asm("bsf %1,%0\n" : "=r"(xor) : "0" (xor));
	return len + xor / 8;
#else
	/* locate the lowest non-zero byte of <xor> by successive masks */
	if (sizeof(long) > 4 && !(xor & 0xffffffff)) {
		/* This code is optimized out on 32-bit archs, but we still
		 * need to shift in two passes to avoid a warning. It is
		 * properly optimized out as a single shift.
		 */
		xor >>= 16; xor >>= 16;
		if (xor & 0xffff) {
			if (xor & 0xff)
				return len + 4;
			return len + 5;
		}
		if (xor & 0xffffff)
			return len + 6;
		return len + 7;
	}

	if (xor & 0xffff) {
		if (xor & 0xff)
			return len;
		return len + 1;
	}
	if (xor & 0xffffff)
		return len + 2;
	return len + 3;
#endif // x86

#else // UNALIGNED_LE_OK
	/* This is the generic version for big endian or unaligned-incompatible
	 * architectures.
	 */
	while (len < max) {
		if (a[len] != b[len])
			break;
		len++;
	}
	return len;

#endif
}
468
/* sets <count> BYTES to -32769 in <refs> so that any uninitialized entry will
 * verify (pos-last-1 >= 32768) and be ignored. <count> must be a multiple of
 * 128 bytes and <refs> must be at least one count in length. It's supposed to
 * be applied to 64-bit aligned data exclusively, which makes it slightly
 * faster than the regular memset() since no alignment check is performed.
 */
static void reset_refs(union ref *refs, long count)
{
	/* avoid a shift/mask by casting to void* */
	union ref *end = (void *)refs + count;

	/* manually unrolled: 16 entries (128 bytes) per iteration */
	do {
		refs[ 0].by64 = -32769;
		refs[ 1].by64 = -32769;
		refs[ 2].by64 = -32769;
		refs[ 3].by64 = -32769;
		refs[ 4].by64 = -32769;
		refs[ 5].by64 = -32769;
		refs[ 6].by64 = -32769;
		refs[ 7].by64 = -32769;
		refs[ 8].by64 = -32769;
		refs[ 9].by64 = -32769;
		refs[10].by64 = -32769;
		refs[11].by64 = -32769;
		refs[12].by64 = -32769;
		refs[13].by64 = -32769;
		refs[14].by64 = -32769;
		refs[15].by64 = -32769;
		refs += 16;
	} while (refs < end);
}
500
/* Compresses <ilen> bytes from <in> into <out> according to RFC1951. The
 * output result may be up to 5 bytes larger than the input, to which 2 extra
 * bytes may be added to send the last chunk due to BFINAL+EOB encoding (10
 * bits) when <more> is not set. The caller is responsible for ensuring there
 * is enough room in the output buffer for this. The amount of output bytes is
 * returned, and no CRC is computed.
 */
long slz_rfc1951_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
{
	long rem = ilen;            /* input bytes not yet consumed */
	unsigned long pos = 0;      /* current input position */
	unsigned long last;         /* previous position of the hashed word */
	uint32_t word = 0;          /* 4 bytes of input at <pos> */
	long mlen;                  /* length of the current match */
	uint32_t h;
	uint64_t ent;               /* entry loaded from the refs[] table */

	uint32_t plit = 0;          /* number of pending literals before <pos> */
	uint32_t bit9 = 0;          /* pending literals needing 9-bit codes */
	uint32_t dist, code;
	union ref refs[1 << HASH_BITS]; /* match table, lives on the stack */

	if (!strm->level) {
		/* force to send as literals (eg to preserve CPU) */
		strm->outbuf = out;
		plit = pos = ilen;
		bit9 = 52; /* force literal dump */
		goto final_lit_dump;
	}

	reset_refs(refs, sizeof(refs));

	strm->outbuf = out;

#ifndef UNALIGNED_FASTER
	/* pre-load 3 bytes so that the loop only shifts in one new byte */
	word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
#endif
	/* main loop: while at least 4 bytes remain, hash them and look for a
	 * back reference; otherwise emit literals.
	 */
	while (rem >= 4) {
#ifndef UNALIGNED_FASTER
		word = ((unsigned char)in[pos + 3] << 24) + (word >> 8);
#else
		word = *(uint32_t *)&in[pos];
#endif
		h = slz_hash(word);
		asm volatile ("" ::); // prevent gcc from trying to be smart with the prefetch

		/* load the previous (pos, word) pair and store the current one */
		if (sizeof(long) >= 8) {
			ent = refs[h].by64;
			last = (uint32_t)ent;
			ent >>= 32;
			refs[h].by64 = ((uint64_t)pos) + ((uint64_t)word << 32);
		} else {
			ent = refs[h].by32.word;
			last = refs[h].by32.pos;
			refs[h].by32.pos = pos;
			refs[h].by32.word = word;
		}

#ifdef FIND_OPTIMAL_MATCH
		/* Experimental code to see what could be saved with an ideal
		 * longest match lookup algorithm. This one is very slow but
		 * scans the whole window. In short, here are the savings :
		 *   file        orig     fast(ratio)  optimal(ratio)
		 *   README      5185     3419 (65.9%) 3165 (61.0%)  -7.5%
		 *   index.html  76799   35662 (46.4%) 29875 (38.9%) -16.3%
		 *   rfc1952.c   29383   13442 (45.7%) 11793 (40.1%) -12.3%
		 *
		 * Thus the savings to expect for large files is at best 16%.
		 *
		 * A non-colliding hash gives 33025 instead of 35662 (-7.4%),
		 * and keeping the last two entries gives 31724 (-11.0%).
		 */
		unsigned long scan;
		int saved = 0;
		int bestpos = 0;
		int bestlen = 0;
		int firstlen = 0;
		int max_lookup = 2; // 0 = no limit

		for (scan = pos - 1; scan < pos && (unsigned long)(pos - scan - 1) < 32768; scan--) {
			int len;

			if (*(uint32_t *)(in + scan) != word)
				continue;

			len = memmatch(in + pos, in + scan, rem);
			if (!bestlen)
				firstlen = len;

			if (len > bestlen) {
				bestlen = len;
				bestpos = scan;
			}
			if (!--max_lookup)
				break;
		}
		if (bestlen) {
			//printf("pos=%d last=%d bestpos=%d word=%08x ent=%08x len=%d\n",
			//       (int)pos, (int)last, (int)bestpos, (int)word, (int)ent, bestlen);
			last = bestpos;
			ent = word;
			saved += bestlen - firstlen;
		}
		//fprintf(stderr, "first=%d best=%d saved_total=%d\n", firstlen, bestlen, saved);
#endif

		/* no match: account one more pending literal and advance */
		if ((uint32_t)ent != word) {
	send_as_lit:
			rem--;
			plit++;
			bit9 += ((unsigned char)word >= 144);
			pos++;
			continue;
		}

		/* We reject pos = last and pos > last+32768 */
		if ((unsigned long)(pos - last - 1) >= 32768)
			goto send_as_lit;

		/* Note: cannot encode a length larger than 258 bytes */
		mlen = memmatch(in + pos + 4, in + last + 4, (rem > 258 ? 258 : rem) - 4) + 4;

		/* found a matching entry */

		if (bit9 >= 52 && mlen < 6)
			goto send_as_lit;

		/* compute the output code, its size and the length's size in
		 * bits to know if the reference is cheaper than literals.
		 */
		code = len_fh[mlen];

		/* direct mapping of dist->huffman code */
		dist = fh_dist_table[pos - last - 1];

		/* if encoding the dist+length is more expensive than sending
		 * the equivalent as bytes, lets keep the literals.
		 */
		if ((dist & 0x1f) + (code >> 16) + 8 >= 8 * mlen + bit9)
			goto send_as_lit;

		/* first, copy pending literals */
		if (plit) {
			/* Huffman encoding requires 9 bits for octets 144..255, so this
			 * is a waste of space for binary data. Switching between Huffman
			 * and no-comp then huffman consumes 52 bits (7 for EOB + 3 for
			 * block type + 7 for alignment + 32 for LEN+NLEN + 3 for next
			 * block. Only use plain literals if there are more than 52 bits
			 * to save then.
			 */
			if (bit9 >= 52)
				copy_lit(strm, in + pos - plit, plit, 1);
			else
				copy_lit_huff(strm, in + pos - plit, plit, 1);

			plit = 0;
		}

		/* use mode 01 - fixed huffman */
		if (strm->state == SLZ_ST_EOB) {
			strm->state = SLZ_ST_FIXED;
			enqueue8(strm, 0x02, 3); // BTYPE = 01, BFINAL = 0
		}

		/* copy the length first */
		enqueue24(strm, code & 0xFFFF, code >> 16);

		/* in fixed huffman mode, dist is fixed 5 bits */
		enqueue24(strm, dist >> 5, dist & 0x1f);
		bit9 = 0;
		rem -= mlen;
		pos += mlen;

#ifndef UNALIGNED_FASTER
#ifdef UNALIGNED_LE_OK
		word = *(uint32_t *)&in[pos - 1];
#else
		word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
#endif
#endif
	}

	if (__builtin_expect(rem, 0)) {
		/* we're reading the 1..3 last bytes */
		plit += rem;
		do {
			bit9 += ((unsigned char)in[pos++] >= 144);
		} while (--rem);
	}

 final_lit_dump:
	/* now copy remaining literals or mark the end */
	if (plit) {
		if (bit9 >= 52)
			copy_lit(strm, in + pos - plit, plit, more);
		else
			copy_lit_huff(strm, in + pos - plit, plit, more);

		plit = 0;
	}

	strm->ilen += ilen;
	return strm->outbuf - out;
}
705
706/* Initializes stream <strm> for use with raw deflate (rfc1951). The CRC is
707 * unused but set to zero. The compression level passed in <level> is set. This
708 * value can only be 0 (no compression) or 1 (compression) and other values
709 * will lead to unpredictable behaviour. The function always returns 0.
710 */
711int slz_rfc1951_init(struct slz_stream *strm, int level)
712{
713 strm->state = SLZ_ST_EOB; // no header
714 strm->level = level;
715 strm->format = SLZ_FMT_DEFLATE;
716 strm->crc32 = 0;
717 strm->ilen = 0;
718 strm->qbits = 0;
719 strm->queue = 0;
720 return 0;
721}
722
/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
 * empty literal block to byte-align the output, allowing to completely flush
 * the queue. This requires that the output buffer still has the size of the
 * queue available (up to 4 bytes), plus one byte for (BFINAL,BTYPE), plus 4
 * bytes for LEN+NLEN, or a total of 9 bytes in the worst case. The number of
 * bytes emitted is returned. It is guaranteed that the queue is empty on
 * return. This may cause some overhead by adding needless 5-byte blocks if
 * called too often.
 */
int slz_rfc1951_flush(struct slz_stream *strm, unsigned char *buf)
{
	strm->outbuf = buf;

	/* The queue is always empty on INIT, DONE, and END */
	if (!strm->qbits)
		return 0;

	/* we may need to terminate a huffman output. Lit is always in EOB state */
	if (strm->state != SLZ_ST_EOB) {
		strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
		send_eob(strm);
	}

	/* send BFINAL according to state, and BTYPE=00 (lit) */
	enqueue8(strm, (strm->state == SLZ_ST_DONE) ? 1 : 0, 3);
	flush_bits(strm);            // emit pending bits
	copy_32b(strm, 0xFFFF0000U); // len=0, nlen=~0

	/* Now the queue is empty, EOB was sent, BFINAL might have been sent if
	 * we completed the last block, and a zero-byte block was sent to byte-
	 * align the output. The last state reflects all this. Let's just
	 * return the number of bytes added to the output buffer.
	 */
	return strm->outbuf - buf;
}
758
/* Flushes any pending data for stream <strm> into buffer <buf>, then sends BTYPE=1
 * and BFINAL=1 if needed. The stream ends in SLZ_ST_DONE. It returns the number
 * of bytes emitted. The trailer consists in flushing the possibly pending bits
 * from the queue (up to 7 bits), then possibly EOB (7 bits), then 3 bits, EOB,
 * a rounding to the next byte, which amounts to a total of 4 bytes max, that
 * the caller must ensure are available before calling the function.
 */
int slz_rfc1951_finish(struct slz_stream *strm, unsigned char *buf)
{
	strm->outbuf = buf;

	/* close a possibly open huffman block */
	if (strm->state == SLZ_ST_FIXED || strm->state == SLZ_ST_LAST) {
		strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
		send_eob(strm);
	}

	if (strm->state != SLZ_ST_DONE) {
		/* send BTYPE=1, BFINAL=1 */
		enqueue8(strm, 3, 3);
		send_eob(strm);
		strm->state = SLZ_ST_DONE;
	}

	flush_bits(strm);
	return strm->outbuf - buf;
}
785
786/* Now RFC1952-specific declarations and extracts from RFC.
787 * From RFC1952 about the GZIP file format :
788
789A gzip file consists of a series of "members" ...
790
7912.3. Member format
792
793 Each member has the following structure:
794
795 +---+---+---+---+---+---+---+---+---+---+
796 |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
797 +---+---+---+---+---+---+---+---+---+---+
798
799 (if FLG.FEXTRA set)
800
801 +---+---+=================================+
802 | XLEN |...XLEN bytes of "extra field"...| (more-->)
803 +---+---+=================================+
804
805 (if FLG.FNAME set)
806
807 +=========================================+
808 |...original file name, zero-terminated...| (more-->)
809 +=========================================+
810
811 (if FLG.FCOMMENT set)
812
813 +===================================+
814 |...file comment, zero-terminated...| (more-->)
815 +===================================+
816
817 (if FLG.FHCRC set)
818
819 +---+---+
820 | CRC16 |
821 +---+---+
822
823 +=======================+
824 |...compressed blocks...| (more-->)
825 +=======================+
826
827 0 1 2 3 4 5 6 7
828 +---+---+---+---+---+---+---+---+
829 | CRC32 | ISIZE |
830 +---+---+---+---+---+---+---+---+
831
832
8332.3.1. Member header and trailer
834
835 ID1 (IDentification 1)
836 ID2 (IDentification 2)
837 These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139
838 (0x8b, \213), to identify the file as being in gzip format.
839
840 CM (Compression Method)
841 This identifies the compression method used in the file. CM
842 = 0-7 are reserved. CM = 8 denotes the "deflate"
843 compression method, which is the one customarily used by
844 gzip and which is documented elsewhere.
845
846 FLG (FLaGs)
847 This flag byte is divided into individual bits as follows:
848
849 bit 0 FTEXT
850 bit 1 FHCRC
851 bit 2 FEXTRA
852 bit 3 FNAME
853 bit 4 FCOMMENT
854 bit 5 reserved
855 bit 6 reserved
856 bit 7 reserved
857
858 Reserved FLG bits must be zero.
859
860 MTIME (Modification TIME)
861 This gives the most recent modification time of the original
862 file being compressed. The time is in Unix format, i.e.,
863 seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this
864 may cause problems for MS-DOS and other systems that use
865 local rather than Universal time.) If the compressed data
866 did not come from a file, MTIME is set to the time at which
867 compression started. MTIME = 0 means no time stamp is
868 available.
869
870 XFL (eXtra FLags)
871 These flags are available for use by specific compression
872 methods. The "deflate" method (CM = 8) sets these flags as
873 follows:
874
875 XFL = 2 - compressor used maximum compression,
876 slowest algorithm
877 XFL = 4 - compressor used fastest algorithm
878
879 OS (Operating System)
880 This identifies the type of file system on which compression
881 took place. This may be useful in determining end-of-line
882 convention for text files. The currently defined values are
883 as follows:
884
885 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
886 1 - Amiga
887 2 - VMS (or OpenVMS)
888 3 - Unix
889 4 - VM/CMS
890 5 - Atari TOS
891 6 - HPFS filesystem (OS/2, NT)
892 7 - Macintosh
893 8 - Z-System
894 9 - CP/M
895 10 - TOPS-20
896 11 - NTFS filesystem (NT)
897 12 - QDOS
898 13 - Acorn RISCOS
899 255 - unknown
900
901 ==> A file compressed using "gzip -1" on Unix-like systems can be :
902
903 1F 8B 08 00 00 00 00 00 04 03
904 <deflate-compressed stream>
905 crc32 size32
906*/
907
/* Fixed 10-byte gzip member header (cf RFC1952 2.3): deflate method, no
 * flags, no mtime, "fastest algorithm" XFL, OS=Unix.
 */
static const unsigned char gzip_hdr[] = { 0x1F, 0x8B, // ID1, ID2
                                          0x08, 0x00, // Deflate, flags (none)
                                          0x00, 0x00, 0x00, 0x00, // mtime: none
                                          0x04, 0x03 }; // fastest comp, OS=Unix
912
/* Updates CRC32 <crc> with the single byte <x> and returns the new value.
 * Works with the non-inverting convention: the pre/post complement is done
 * around the hardware instruction where one is used.
 */
static inline uint32_t crc32_char(uint32_t crc, uint8_t x)
{
#if defined(__ARM_FEATURE_CRC32)
	crc = ~crc;
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32b %w0,%w0,%w1" : "+r"(crc) : "r"(x));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32b %0,%0,%1" : "+r"(crc) : "r"(x));
# endif
	crc = ~crc;
#else
	crc = crc32_fast[0][(crc ^ x) & 0xff] ^ (crc >> 8);
#endif
	return crc;
}
930
/* Folds the 32-bit word <data> (already XORed with the current CRC by the
 * caller, see slz_crc32_by4()) through the CRC32 polynomial and returns the
 * result.
 */
static inline uint32_t crc32_uint32(uint32_t data)
{
#if defined(__ARM_FEATURE_CRC32)
# if defined(__ARM_ARCH_ISA_A64)
	// 64 bit mode
	__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(data) : "r"(~0UL));
# else
	// 32 bit mode (e.g. armv7 compiler building for armv8)
	__asm__ volatile("crc32w %0,%0,%1" : "+r"(data) : "r"(~0UL));
# endif
	data = ~data;
#else
	data = crc32_fast[3][(data >> 0) & 0xff] ^
	       crc32_fast[2][(data >> 8) & 0xff] ^
	       crc32_fast[1][(data >> 16) & 0xff] ^
	       crc32_fast[0][(data >> 24) & 0xff];
#endif
	return data;
}
950
/* Byte-at-a-time CRC32 (modified version originally from RFC1952, working
 * with non-inverting CRCs): folds <len> bytes from <buf> into <crc> and
 * returns the updated value.
 */
uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	const unsigned char *end = buf + len;

	while (buf < end)
		crc = crc32_char(crc, *buf++);
	return crc;
}
960
/* This version computes the crc32 of <buf> over <len> bytes, doing most of it
 * in 32-bit chunks (16 bytes per iteration of the main loop, then 4, then a
 * byte-at-a-time tail).
 */
uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len)
{
	const unsigned char *end = buf + len;

	/* main loop: 16 bytes at a time */
	while (buf <= end - 16) {
#ifdef UNALIGNED_LE_OK
#if defined(__ARM_FEATURE_CRC32)
		/* hardware CRC uses the inverting convention, so complement
		 * around the instruction sequence.
		 */
		crc = ~crc;
# if defined(__ARM_ARCH_ISA_A64)
		// 64 bit mode
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
		__asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
# else
		// 32 bit mode (e.g. armv7 compiler building for armv8)
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8)));
		__asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12)));
# endif
		crc = ~crc;
#else
		crc ^= *(uint32_t *)buf;
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 4);
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 8);
		crc = crc32_uint32(crc);

		crc ^= *(uint32_t *)(buf + 12);
		crc = crc32_uint32(crc);
#endif
#else
		/* table-driven version for strict-alignment / big endian */
		crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[4] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[5] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[6] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[7] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[8] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[9] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[10] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[11] ^ (crc >> 24)) & 0xff];

		crc = crc32_fast[3][(buf[12] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[13] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[14] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[15] ^ (crc >> 24)) & 0xff];
#endif
		buf += 16;
	}

	/* then 4 bytes at a time */
	while (buf <= end - 4) {
#ifdef UNALIGNED_LE_OK
		crc ^= *(uint32_t *)buf;
		crc = crc32_uint32(crc);
#else
		crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^
		      crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^
		      crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^
		      crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff];
#endif
		buf += 4;
	}

	/* and the 0..3 trailing bytes */
	while (buf < end)
		crc = crc32_char(crc, *buf++);
	return crc;
}
1040
1041/* uses the most suitable crc32 function to update crc on <buf, len> */
/* Uses the most suitable crc32 function available to update <sum> with the
 * <bytes> bytes at <data>, and returns the new value.
 */
static inline uint32_t update_crc(uint32_t sum, const void *data, int bytes)
{
	return slz_crc32_by4(sum, data, bytes);
}
1046
1047/* Sends the gzip header for stream <strm> into buffer <buf>. When it's done,
1048 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
1049 * emitted which is always 10. The caller is responsible for ensuring there's
1050 * always enough room in the buffer.
1051 */
1052int slz_rfc1952_send_header(struct slz_stream *strm, unsigned char *buf)
1053{
1054 memcpy(buf, gzip_hdr, sizeof(gzip_hdr));
1055 strm->state = SLZ_ST_EOB;
1056 return sizeof(gzip_hdr);
1057}
1058
1059/* Encodes the block according to rfc1952. This means that the CRC of the input
1060 * block is computed according to the CRC32 algorithm. If the header was never
1061 * sent, it may be sent first. The number of output bytes is returned.
1062 */
1063long slz_rfc1952_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
1064{
1065 long ret = 0;
1066
1067 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1068 ret += slz_rfc1952_send_header(strm, out);
1069
1070 strm->crc32 = update_crc(strm->crc32, in, ilen);
1071 ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
1072 return ret;
1073}
1074
1075/* Initializes stream <strm> for use with the gzip format (rfc1952). The
1076 * compression level passed in <level> is set. This value can only be 0 (no
1077 * compression) or 1 (compression) and other values will lead to unpredictable
1078 * behaviour. The function always returns 0.
1079 */
1080int slz_rfc1952_init(struct slz_stream *strm, int level)
1081{
1082 strm->state = SLZ_ST_INIT;
1083 strm->level = level;
1084 strm->format = SLZ_FMT_GZIP;
1085 strm->crc32 = 0;
1086 strm->ilen = 0;
1087 strm->qbits = 0;
1088 strm->queue = 0;
1089 return 0;
1090}
1091
Willy Tarreau3ffe11e2023-06-26 19:24:55 +02001092/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
1093 * empty literal block to byte-align the output, allowing to completely flush
1094 * the queue. Note that if the initial header was never sent, it will be sent
1095 * first as well (10 extra bytes). This requires that the output buffer still
1096 * has this plus the size of the queue available (up to 4 bytes), plus one byte
1097 * for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a total of 19 bytes in the
1098 * worst case. The number of bytes emitted is returned. It is guaranteed that
1099 * the queue is empty on return. This may cause some overhead by adding
 * needless 5-byte blocks if called too often.
1101 */
1102int slz_rfc1952_flush(struct slz_stream *strm, unsigned char *buf)
1103{
1104 int sent = 0;
1105
1106 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1107 sent = slz_rfc1952_send_header(strm, buf);
1108
1109 sent += slz_rfc1951_flush(strm, buf + sent);
1110 return sent;
1111}
1112
Willy Tarreauab2b7822021-04-22 14:09:44 +02001113/* Flushes pending bits and sends the gzip trailer for stream <strm> into
1114 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
1115 * returns the number of bytes emitted. The trailer consists in flushing the
1116 * possibly pending bits from the queue (up to 24 bits), rounding to the next
1117 * byte, then 4 bytes for the CRC and another 4 bytes for the input length.
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +05001118 * That may about to 4+4+4 = 12 bytes, that the caller must ensure are
Willy Tarreauab2b7822021-04-22 14:09:44 +02001119 * available before calling the function. Note that if the initial header was
1120 * never sent, it will be sent first as well (10 extra bytes).
1121 */
1122int slz_rfc1952_finish(struct slz_stream *strm, unsigned char *buf)
1123{
1124 strm->outbuf = buf;
1125
1126 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1127 strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
1128
1129 slz_rfc1951_finish(strm, strm->outbuf);
1130 copy_32b(strm, strm->crc32);
1131 copy_32b(strm, strm->ilen);
1132 strm->state = SLZ_ST_END;
1133
1134 return strm->outbuf - buf;
1135}
1136
1137
1138/* RFC1950-specific stuff. This is for the Zlib stream format.
1139 * From RFC1950 (zlib) :
1140 *
1141
1142 2.2. Data format
1143
1144 A zlib stream has the following structure:
1145
1146 0 1
1147 +---+---+
1148 |CMF|FLG| (more-->)
1149 +---+---+
1150
1151
1152 (if FLG.FDICT set)
1153
1154 0 1 2 3
1155 +---+---+---+---+
1156 | DICTID | (more-->)
1157 +---+---+---+---+
1158
1159 +=====================+---+---+---+---+
1160 |...compressed data...| ADLER32 |
1161 +=====================+---+---+---+---+
1162
1163 Any data which may appear after ADLER32 are not part of the zlib
1164 stream.
1165
1166 CMF (Compression Method and flags)
1167 This byte is divided into a 4-bit compression method and a 4-
1168 bit information field depending on the compression method.
1169
1170 bits 0 to 3 CM Compression method
1171 bits 4 to 7 CINFO Compression info
1172
1173 CM (Compression method)
1174 This identifies the compression method used in the file. CM = 8
1175 denotes the "deflate" compression method with a window size up
1176 to 32K. This is the method used by gzip and PNG (see
1177 references [1] and [2] in Chapter 3, below, for the reference
1178 documents). CM = 15 is reserved. It might be used in a future
1179 version of this specification to indicate the presence of an
1180 extra field before the compressed data.
1181
1182 CINFO (Compression info)
1183 For CM = 8, CINFO is the base-2 logarithm of the LZ77 window
1184 size, minus eight (CINFO=7 indicates a 32K window size). Values
1185 of CINFO above 7 are not allowed in this version of the
1186 specification. CINFO is not defined in this specification for
1187 CM not equal to 8.
1188
1189 FLG (FLaGs)
1190 This flag byte is divided as follows:
1191
1192 bits 0 to 4 FCHECK (check bits for CMF and FLG)
1193 bit 5 FDICT (preset dictionary)
1194 bits 6 to 7 FLEVEL (compression level)
1195
1196 The FCHECK value must be such that CMF and FLG, when viewed as
1197 a 16-bit unsigned integer stored in MSB order (CMF*256 + FLG),
1198 is a multiple of 31.
1199
1200
1201 FDICT (Preset dictionary)
1202 If FDICT is set, a DICT dictionary identifier is present
1203 immediately after the FLG byte. The dictionary is a sequence of
1204 bytes which are initially fed to the compressor without
1205 producing any compressed output. DICT is the Adler-32 checksum
1206 of this sequence of bytes (see the definition of ADLER32
1207 below). The decompressor can use this identifier to determine
1208 which dictionary has been used by the compressor.
1209
1210 FLEVEL (Compression level)
1211 These flags are available for use by specific compression
1212 methods. The "deflate" method (CM = 8) sets these flags as
1213 follows:
1214
1215 0 - compressor used fastest algorithm
1216 1 - compressor used fast algorithm
1217 2 - compressor used default algorithm
1218 3 - compressor used maximum compression, slowest algorithm
1219
1220 The information in FLEVEL is not needed for decompression; it
1221 is there to indicate if recompression might be worthwhile.
1222
1223 compressed data
1224 For compression method 8, the compressed data is stored in the
1225 deflate compressed data format as described in the document
1226 "DEFLATE Compressed Data Format Specification" by L. Peter
1227 Deutsch. (See reference [3] in Chapter 3, below)
1228
1229 Other compressed data formats are not specified in this version
1230 of the zlib specification.
1231
1232 ADLER32 (Adler-32 checksum)
1233 This contains a checksum value of the uncompressed data
1234 (excluding any dictionary data) computed according to Adler-32
1235 algorithm. This algorithm is a 32-bit extension and improvement
1236 of the Fletcher algorithm, used in the ITU-T X.224 / ISO 8073
1237 standard. See references [4] and [5] in Chapter 3, below)
1238
1239 Adler-32 is composed of two sums accumulated per byte: s1 is
1240 the sum of all bytes, s2 is the sum of all s1 values. Both sums
1241 are done modulo 65521. s1 is initialized to 1, s2 to zero. The
1242 Adler-32 checksum is stored as s2*65536 + s1 in most-
1243 significant-byte first (network) order.
1244
1245 ==> The stream can start with only 2 bytes :
1246 - CM = 0x78 : CMINFO=7 (32kB window), CM=8 (deflate)
1247 - FLG = 0x01 : FLEVEL = 0 (fastest), FDICT=0 (no dict), FCHECK=1 so
1248 that 0x7801 is a multiple of 31 (30721 = 991 * 31).
1249
1250 ==> and it ends with only 4 bytes, the Adler-32 checksum in big-endian format.
1251
1252 */
1253
1254static const unsigned char zlib_hdr[] = { 0x78, 0x01 }; // 32k win, deflate, chk=1
1255
1256
1257/* Original version from RFC1950, verified and works OK */
/* Original version from RFC1950, verified and works OK. Updates Adler-32
 * checksum <crc> with the <len> bytes at <buf>, one byte at a time, using
 * the straightforward per-byte modulus, and returns the new value.
 */
uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	uint32_t lo = crc & 0xffff;         /* s1: running byte sum */
	uint32_t hi = (crc >> 16) & 0xffff; /* s2: sum of all s1 values */
	int pos;

	for (pos = 0; pos < len; pos++) {
		lo = (lo + buf[pos]) % 65521;
		hi = (hi + lo) % 65521;
	}
	return (hi << 16) + lo;
}
1270
1271/* Computes the adler32 sum on <buf> for <len> bytes. It avoids the expensive
1272 * modulus by retrofitting the number of bytes missed between 65521 and 65536
1273 * which is easy to count : For every sum above 65536, the modulus is offset
1274 * by (65536-65521) = 15. So for any value, we can count the accumulated extra
1275 * values by dividing the sum by 65536 and multiplying this value by
1276 * (65536-65521). That's easier with a drawing with boxes and marbles. It gives
1277 * this :
1278 * x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
1279 * = (x & 0xffff) + (x >> 16) * 15.
1280 */
/* Computes the adler32 sum on <buf> for <len> bytes. It avoids the expensive
 * modulus by retrofitting the number of bytes missed between 65521 and 65536
 * which is easy to count: for every sum above 65536, the modulus is offset
 * by (65536-65521) = 15. So for any value:
 *   x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
 *             = (x & 0xffff) + (x >> 16) * 15.
 */
uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len)
{
	long s1 = crc & 0xffff;
	long s2 = (crc >> 16);

	do {
		/* work on chunks of at most 4096 bytes so that s2 can never
		 * overflow (limit is about 2^((32-8)/2)) */
		long chunk = len;
		long i;

		if (chunk > (1U << 12))
			chunk = 1U << 12;
		len -= chunk;

		for (i = 0; i < chunk; i++) {
			s1 += buf[i];
			s2 += s1;
		}

		/* Fold s1: largest value here is 2^12 * 255 = 1044480 < 2^20,
		 * so (s1 >> 16) is at most 15 and the folded sum may exceed
		 * 65521 only once. Values between 65521 and 65536 are also
		 * handled by the final subtraction.
		 */
		s1 = (s1 & 0xffff) + 15 * (s1 >> 16);
		if (s1 >= 65521)
			s1 -= 65521;

		/* Fold s2: estimated up to 2^32-1 for simplicity, so the
		 * right hand side is about 15*65535 = 983025 and two folds
		 * are needed before the final subtraction.
		 */
		s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
		s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
		if (s2 >= 65521)
			s2 -= 65521;

		buf += chunk;
	} while (len);
	return (s2 << 16) + s1;
}
1322
1323/* Sends the zlib header for stream <strm> into buffer <buf>. When it's done,
1324 * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
1325 * emitted which is always 2. The caller is responsible for ensuring there's
1326 * always enough room in the buffer.
1327 */
1328int slz_rfc1950_send_header(struct slz_stream *strm, unsigned char *buf)
1329{
1330 memcpy(buf, zlib_hdr, sizeof(zlib_hdr));
1331 strm->state = SLZ_ST_EOB;
1332 return sizeof(zlib_hdr);
1333}
1334
1335/* Encodes the block according to rfc1950. This means that the CRC of the input
1336 * block is computed according to the ADLER32 algorithm. If the header was never
1337 * sent, it may be sent first. The number of output bytes is returned.
1338 */
1339long slz_rfc1950_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
1340{
1341 long ret = 0;
1342
1343 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1344 ret += slz_rfc1950_send_header(strm, out);
1345
1346 strm->crc32 = slz_adler32_block(strm->crc32, in, ilen);
1347 ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
1348 return ret;
1349}
1350
/* Initializes stream <strm> for use with the zlib format (rfc1950). The
1352 * compression level passed in <level> is set. This value can only be 0 (no
1353 * compression) or 1 (compression) and other values will lead to unpredictable
1354 * behaviour. The function always returns 0.
1355 */
1356int slz_rfc1950_init(struct slz_stream *strm, int level)
1357{
1358 strm->state = SLZ_ST_INIT;
1359 strm->level = level;
1360 strm->format = SLZ_FMT_ZLIB;
1361 strm->crc32 = 1; // rfc1950/zlib starts with initial crc=1
1362 strm->ilen = 0;
1363 strm->qbits = 0;
1364 strm->queue = 0;
1365 return 0;
1366}
1367
Willy Tarreau3ffe11e2023-06-26 19:24:55 +02001368/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
1369 * empty literal block to byte-align the output, allowing to completely flush
1370 * the queue. Note that if the initial header was never sent, it will be sent
1371 * first as well (2 extra bytes). This requires that the output buffer still
1372 * has this plus the size of the queue available (up to 4 bytes), plus one byte
1373 * for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a total of 11 bytes in the
1374 * worst case. The number of bytes emitted is returned. It is guaranteed that
1375 * the queue is empty on return. This may cause some overhead by adding
 * needless 5-byte blocks if called too often.
1377 */
1378int slz_rfc1950_flush(struct slz_stream *strm, unsigned char *buf)
1379{
1380 int sent = 0;
1381
1382 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1383 sent = slz_rfc1950_send_header(strm, buf);
1384
1385 sent += slz_rfc1951_flush(strm, buf + sent);
1386 return sent;
1387}
1388
Willy Tarreauab2b7822021-04-22 14:09:44 +02001389/* Flushes pending bits and sends the gzip trailer for stream <strm> into
1390 * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
1391 * returns the number of bytes emitted. The trailer consists in flushing the
1392 * possibly pending bits from the queue (up to 24 bits), rounding to the next
Ilya Shipitsinb2be9a12021-04-24 13:25:42 +05001393 * byte, then 4 bytes for the CRC. That may about to 4+4 = 8 bytes, that the
Willy Tarreauab2b7822021-04-22 14:09:44 +02001394 * caller must ensure are available before calling the function. Note that if
1395 * the initial header was never sent, it will be sent first as well (2 extra
1396 * bytes).
1397 */
1398int slz_rfc1950_finish(struct slz_stream *strm, unsigned char *buf)
1399{
1400 strm->outbuf = buf;
1401
1402 if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
1403 strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
1404
1405 slz_rfc1951_finish(strm, strm->outbuf);
1406 copy_8b(strm, (strm->crc32 >> 24) & 0xff);
1407 copy_8b(strm, (strm->crc32 >> 16) & 0xff);
1408 copy_8b(strm, (strm->crc32 >> 8) & 0xff);
1409 copy_8b(strm, (strm->crc32 >> 0) & 0xff);
1410 strm->state = SLZ_ST_END;
1411 return strm->outbuf - buf;
1412}
1413
Willy Tarreauab2b7822021-04-22 14:09:44 +02001414__attribute__((constructor))
1415static void __slz_initialize(void)
1416{
Willy Tarreau9e274282021-05-12 08:36:09 +02001417#if !defined(__ARM_FEATURE_CRC32)
Willy Tarreauab2b7822021-04-22 14:09:44 +02001418 __slz_make_crc_table();
Willy Tarreau9e274282021-05-12 08:36:09 +02001419#endif
Willy Tarreauab2b7822021-04-22 14:09:44 +02001420 __slz_prepare_dist_table();
1421}