| /* |
| * Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu> |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining |
| * a copy of this software and associated documentation files (the |
| * "Software"), to deal in the Software without restriction, including |
| * without limitation the rights to use, copy, modify, merge, publish, |
| * distribute, sublicense, and/or sell copies of the Software, and to |
| * permit persons to whom the Software is furnished to do so, subject to |
| * the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be |
| * included in all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
| * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| * OTHER DEALINGS IN THE SOFTWARE. |
| */ |
| |
| #include <inttypes.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <import/slz.h> |
| #include <import/slz-tables.h> |
| |
| /* First, RFC1951-specific declarations and extracts from the RFC. |
| * |
| * RFC1951 - deflate stream format |
| |
| |
| * Data elements are packed into bytes in order of |
| increasing bit number within the byte, i.e., starting |
| with the least-significant bit of the byte. |
| * Data elements other than Huffman codes are packed |
| starting with the least-significant bit of the data |
| element. |
| * Huffman codes are packed starting with the most- |
| significant bit of the code. |
| |
| 3.2.3. Details of block format |
| |
| Each block of compressed data begins with 3 header bits |
| containing the following data: |
| |
| first bit BFINAL |
| next 2 bits BTYPE |
| |
| Note that the header bits do not necessarily begin on a byte |
| boundary, since a block does not necessarily occupy an integral |
| number of bytes. |
| |
| BFINAL is set if and only if this is the last block of the data |
| set. |
| |
| BTYPE specifies how the data are compressed, as follows: |
| |
| 00 - no compression |
| 01 - compressed with fixed Huffman codes |
| 10 - compressed with dynamic Huffman codes |
| 11 - reserved (error) |
| |
| 3.2.4. Non-compressed blocks (BTYPE=00) |
| |
| Any bits of input up to the next byte boundary are ignored. |
| The rest of the block consists of the following information: |
| |
| 0 1 2 3 4... |
| +---+---+---+---+================================+ |
| | LEN | NLEN |... LEN bytes of literal data...| |
| +---+---+---+---+================================+ |
| |
| LEN is the number of data bytes in the block. NLEN is the |
| one's complement of LEN. |
| |
| 3.2.5. Compressed blocks (length and distance codes) |
| |
| As noted above, encoded data blocks in the "deflate" format |
| consist of sequences of symbols drawn from three conceptually |
| distinct alphabets: either literal bytes, from the alphabet of |
| byte values (0..255), or <length, backward distance> pairs, |
| where the length is drawn from (3..258) and the distance is |
| drawn from (1..32,768). In fact, the literal and length |
| alphabets are merged into a single alphabet (0..285), where |
| values 0..255 represent literal bytes, the value 256 indicates |
| end-of-block, and values 257..285 represent length codes |
| (possibly in conjunction with extra bits following the symbol |
| code) as follows: |
| |
| Length encoding : |
| Extra Extra Extra |
| Code Bits Length(s) Code Bits Lengths Code Bits Length(s) |
| ---- ---- ------ ---- ---- ------- ---- ---- ------- |
| 257 0 3 267 1 15,16 277 4 67-82 |
| 258 0 4 268 1 17,18 278 4 83-98 |
| 259 0 5 269 2 19-22 279 4 99-114 |
| 260 0 6 270 2 23-26 280 4 115-130 |
| 261 0 7 271 2 27-30 281 5 131-162 |
| 262 0 8 272 2 31-34 282 5 163-194 |
| 263 0 9 273 3 35-42 283 5 195-226 |
| 264 0 10 274 3 43-50 284 5 227-257 |
| 265 1 11,12 275 3 51-58 285 0 258 |
| 266 1 13,14 276 3 59-66 |
| |
| Distance encoding : |
| Extra Extra Extra |
| Code Bits Dist Code Bits Dist Code Bits Distance |
| ---- ---- ---- ---- ---- ------ ---- ---- -------- |
| 0 0 1 10 4 33-48 20 9 1025-1536 |
| 1 0 2 11 4 49-64 21 9 1537-2048 |
| 2 0 3 12 5 65-96 22 10 2049-3072 |
| 3 0 4 13 5 97-128 23 10 3073-4096 |
| 4 1 5,6 14 6 129-192 24 11 4097-6144 |
| 5 1 7,8 15 6 193-256 25 11 6145-8192 |
| 6 2 9-12 16 7 257-384 26 12 8193-12288 |
| 7 2 13-16 17 7 385-512 27 12 12289-16384 |
| 8 3 17-24 18 8 513-768 28 13 16385-24576 |
| 9 3 25-32 19 8 769-1024 29 13 24577-32768 |
| |
| 3.2.6. Compression with fixed Huffman codes (BTYPE=01) |
| |
| The Huffman codes for the two alphabets are fixed, and are not |
| represented explicitly in the data. The Huffman code lengths |
| for the literal/length alphabet are: |
| |
| Lit Value Bits Codes |
| --------- ---- ----- |
| 0 - 143 8 00110000 through |
| 10111111 |
| 144 - 255 9 110010000 through |
| 111111111 |
| 256 - 279 7 0000000 through |
| 0010111 |
| 280 - 287 8 11000000 through |
| 11000111 |
| |
| The code lengths are sufficient to generate the actual codes, |
| as described above; we show the codes in the table for added |
| clarity. Literal/length values 286-287 will never actually |
| occur in the compressed data, but participate in the code |
| construction. |
| |
| Distance codes 0-31 are represented by (fixed-length) 5-bit |
| codes, with possible additional bits as shown in the table |
| shown in Paragraph 3.2.5, above. Note that distance codes 30- |
| 31 will never actually occur in the compressed data. |
| |
| */ |
| |
/* One slot of the back-reference hash table. The same 8 bytes may be accessed
 * either as a single 64-bit integer (one load/store on 64-bit machines) or as
 * two independent 32-bit fields (used when long is narrower than 64 bits).
 */
union ref {
	uint64_t by64;          /* the whole entry accessed at once */
	struct {
		uint32_t pos;   /* input offset at which <word> was recorded */
		uint32_t word;  /* 32-bit input word recorded at <pos> */
	} by32;
};
| |
| #if defined(USE_64BIT_QUEUE) && defined(UNALIGNED_LE_OK) |
| |
| /* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete |
| * 32-bit words into output buffer. X must not contain non-zero bits above |
| * xbits. |
| */ |
| static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits) |
| { |
| uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits); |
| uint32_t qbits = strm->qbits + xbits; |
| |
| if (__builtin_expect(qbits >= 32, 1)) { |
| *(uint32_t *)strm->outbuf = queue; |
| queue >>= 32; |
| qbits -= 32; |
| strm->outbuf += 4; |
| } |
| |
| strm->queue = queue; |
| strm->qbits = qbits; |
| } |
| |
| #define enqueue8 enqueue24 |
| |
| /* flush the queue and align to next byte */ |
| static inline void flush_bits(struct slz_stream *strm) |
| { |
| if (strm->qbits > 0) |
| *strm->outbuf++ = strm->queue; |
| |
| if (strm->qbits > 8) |
| *strm->outbuf++ = strm->queue >> 8; |
| |
| if (strm->qbits > 16) |
| *strm->outbuf++ = strm->queue >> 16; |
| |
| if (strm->qbits > 24) |
| *strm->outbuf++ = strm->queue >> 24; |
| |
| strm->queue = 0; |
| strm->qbits = 0; |
| } |
| |
| #else /* non-64 bit or aligned or big endian */ |
| |
| /* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete |
| * bytes into out buf. X must not contain non-zero bits above xbits. Prefer |
| * enqueue8() when xbits is known for being 8 or less. |
| */ |
| static void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits) |
| { |
| uint32_t queue = strm->queue + (x << strm->qbits); |
| uint32_t qbits = strm->qbits + xbits; |
| |
| if (qbits >= 16) { |
| #ifndef UNALIGNED_LE_OK |
| strm->outbuf[0] = queue; |
| strm->outbuf[1] = queue >> 8; |
| #else |
| *(uint16_t *)strm->outbuf = queue; |
| #endif |
| strm->outbuf += 2; |
| queue >>= 16; |
| qbits -= 16; |
| } |
| |
| if (qbits >= 8) { |
| qbits -= 8; |
| *strm->outbuf++ = queue; |
| queue >>= 8; |
| } |
| strm->qbits = qbits; |
| strm->queue = queue; |
| return; |
| } |
| |
| /* enqueue code x of <xbits> bits (at most 8) and copy complete bytes into |
| * out buf. X must not contain non-zero bits above xbits. |
| */ |
| static inline void enqueue8(struct slz_stream *strm, uint32_t x, uint32_t xbits) |
| { |
| uint32_t queue = strm->queue + (x << strm->qbits); |
| uint32_t qbits = strm->qbits + xbits; |
| |
| if (__builtin_expect((signed)(qbits - 8) >= 0, 1)) { |
| qbits -= 8; |
| *strm->outbuf++ = queue; |
| queue >>= 8; |
| } |
| |
| strm->qbits = qbits; |
| strm->queue = queue; |
| } |
| |
| /* align to next byte */ |
| static inline void flush_bits(struct slz_stream *strm) |
| { |
| if (strm->qbits > 0) |
| *strm->outbuf++ = strm->queue; |
| |
| if (strm->qbits > 8) |
| *strm->outbuf++ = strm->queue >> 8; |
| |
| strm->queue = 0; |
| strm->qbits = 0; |
| } |
| #endif |
| |
| |
| /* only valid if buffer is already aligned */ |
| static inline void copy_8b(struct slz_stream *strm, uint32_t x) |
| { |
| *strm->outbuf++ = x; |
| } |
| |
| /* only valid if buffer is already aligned */ |
| static inline void copy_16b(struct slz_stream *strm, uint32_t x) |
| { |
| strm->outbuf[0] = x; |
| strm->outbuf[1] = x >> 8; |
| strm->outbuf += 2; |
| } |
| |
| /* only valid if buffer is already aligned */ |
| static inline void copy_32b(struct slz_stream *strm, uint32_t x) |
| { |
| strm->outbuf[0] = x; |
| strm->outbuf[1] = x >> 8; |
| strm->outbuf[2] = x >> 16; |
| strm->outbuf[3] = x >> 24; |
| strm->outbuf += 4; |
| } |
| |
| static inline void send_huff(struct slz_stream *strm, uint32_t code) |
| { |
| uint32_t bits; |
| |
| code = fixed_huff[code]; |
| bits = code & 15; |
| code >>= 4; |
| enqueue24(strm, code, bits); |
| } |
| |
/* Enqueues the end-of-block symbol. In the fixed Huffman table of RFC1951,
 * symbol 256 (EOB) is directly encoded as the 7-bit code 0000000.
 */
static inline void send_eob(struct slz_stream *strm)
{
	const uint32_t eob_code = 0;
	const uint32_t eob_bits = 7;

	enqueue8(strm, eob_code, eob_bits);
}
| |
| /* copies <len> literals from <buf>. <more> indicates that there are data past |
| * buf + <len>. <len> must not be null. |
| */ |
| static void copy_lit(struct slz_stream *strm, const void *buf, uint32_t len, int more) |
| { |
| uint32_t len2; |
| |
| do { |
| len2 = len; |
| if (__builtin_expect(len2 > 65535, 0)) |
| len2 = 65535; |
| |
| len -= len2; |
| |
| if (strm->state != SLZ_ST_EOB) |
| send_eob(strm); |
| |
| strm->state = (more || len) ? SLZ_ST_EOB : SLZ_ST_DONE; |
| |
| enqueue8(strm, !(more || len), 3); // BFINAL = !more ; BTYPE = 00 |
| flush_bits(strm); |
| copy_16b(strm, len2); // len2 |
| copy_16b(strm, ~len2); // nlen2 |
| memcpy(strm->outbuf, buf, len2); |
| buf += len2; |
| strm->outbuf += len2; |
| } while (len); |
| } |
| |
| /* copies <len> literals from <buf>. <more> indicates that there are data past |
| * buf + <len>. <len> must not be null. |
| */ |
| static void copy_lit_huff(struct slz_stream *strm, const unsigned char *buf, uint32_t len, int more) |
| { |
| uint32_t pos; |
| |
| /* This ugly construct limits the mount of tests and optimizes for the |
| * most common case (more > 0). |
| */ |
| if (strm->state == SLZ_ST_EOB) { |
| eob: |
| strm->state = more ? SLZ_ST_FIXED : SLZ_ST_LAST; |
| enqueue8(strm, 2 + !more, 3); // BFINAL = !more ; BTYPE = 01 |
| } |
| else if (!more) { |
| send_eob(strm); |
| goto eob; |
| } |
| |
| pos = 0; |
| do { |
| send_huff(strm, buf[pos++]); |
| } while (pos < len); |
| } |
| |
/* format of a hash table entry (union ref) when accessed as 64 bits:
 * bit0..31 = last position in buffer of similar content
 * bit32..63 = word
 */
| |
| /* This hash provides good average results on HTML contents, and is among the |
| * few which provide almost optimal results on various different pages. |
| */ |
| static inline uint32_t slz_hash(uint32_t a) |
| { |
| #if defined(__ARM_FEATURE_CRC32) |
| # if defined(__ARM_ARCH_ISA_A64) |
| // 64 bit mode |
| __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(a) : "r"(0)); |
| # else |
| // 32 bit mode (e.g. armv7 compiler building for armv8 |
| __asm__ volatile("crc32w %0,%0,%1" : "+r"(a) : "r"(0)); |
| # endif |
| return a >> (32 - HASH_BITS); |
| #else |
| return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS); |
| #endif |
| } |
| |
/* Returns the length of the common prefix of buffers <a> and <b>, limited to
 * <max> bytes. On architectures supporting unaligned little endian accesses
 * (UNALIGNED_LE_OK), the buffers are compared one machine word at a time, two
 * words per loop, and the remaining bytes (fewer than two words) are compared
 * one by one. <max> is the maximum number of bytes that can be read, so both
 * <a> and <b> must have at least <max> bytes ahead. <max> may safely be null
 * or negative if that simplifies computations in the caller, in which case 0
 * is returned.
 */
static inline long memmatch(const unsigned char *a, const unsigned char *b, long max)
{
	long len = 0;

#ifdef UNALIGNED_LE_OK
	unsigned long wa, wb, diff;

	while (1) {
		if ((long)(len + 2 * sizeof(long)) > max) {
			/* fewer than two words left: finish byte by byte */
			while (len < max && a[len] == b[len])
				len++;
			return len;
		}

		/* memcpy() performs the unaligned loads without casting away
		 * const nor breaking aliasing rules; with a constant size it
		 * is normally compiled to a single load.
		 */
		memcpy(&wa, a + len, sizeof(wa));
		memcpy(&wb, b + len, sizeof(wb));
		diff = wa ^ wb;
		if (diff)
			break;
		len += sizeof(long);

		memcpy(&wa, a + len, sizeof(wa));
		memcpy(&wb, b + len, sizeof(wb));
		diff = wa ^ wb;
		if (diff)
			break;
		len += sizeof(long);
	}

	/* <diff> is non-null here. On little endian, the lowest set bit
	 * belongs to the first differing byte, so the number of trailing
	 * zero bits divided by 8 is the count of equal bytes in this word.
	 */
	return len + __builtin_ctzl(diff) / 8;

#else // UNALIGNED_LE_OK
	/* This is the generic version for big endian or unaligned-incompatible
	 * architectures.
	 */
	while (len < max && a[len] == b[len])
		len++;
	return len;

#endif
}
| |
| /* sets <count> BYTES to -32769 in <refs> so that any uninitialized entry will |
| * verify (pos-last-1 >= 32768) and be ignored. <count> must be a multiple of |
| * 128 bytes and <refs> must be at least one count in length. It's supposed to |
| * be applied to 64-bit aligned data exclusively, which makes it slightly |
| * faster than the regular memset() since no alignment check is performed. |
| */ |
| static void reset_refs(union ref *refs, long count) |
| { |
| /* avoid a shift/mask by casting to void* */ |
| union ref *end = (void *)refs + count; |
| |
| do { |
| refs[ 0].by64 = -32769; |
| refs[ 1].by64 = -32769; |
| refs[ 2].by64 = -32769; |
| refs[ 3].by64 = -32769; |
| refs[ 4].by64 = -32769; |
| refs[ 5].by64 = -32769; |
| refs[ 6].by64 = -32769; |
| refs[ 7].by64 = -32769; |
| refs[ 8].by64 = -32769; |
| refs[ 9].by64 = -32769; |
| refs[10].by64 = -32769; |
| refs[11].by64 = -32769; |
| refs[12].by64 = -32769; |
| refs[13].by64 = -32769; |
| refs[14].by64 = -32769; |
| refs[15].by64 = -32769; |
| refs += 16; |
| } while (refs < end); |
| } |
| |
| /* Compresses <ilen> bytes from <in> into <out> according to RFC1951. The |
| * output result may be up to 5 bytes larger than the input, to which 2 extra |
| * bytes may be added to send the last chunk due to BFINAL+EOB encoding (10 |
| * bits) when <more> is not set. The caller is responsible for ensuring there |
| * is enough room in the output buffer for this. The amount of output bytes is |
| * returned, and no CRC is computed. |
| */ |
| long slz_rfc1951_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more) |
| { |
| long rem = ilen; |
| unsigned long pos = 0; |
| unsigned long last; |
| uint32_t word = 0; |
| long mlen; |
| uint32_t h; |
| uint64_t ent; |
| |
| uint32_t plit = 0; |
| uint32_t bit9 = 0; |
| uint32_t dist, code; |
| union ref refs[1 << HASH_BITS]; |
| |
| if (!strm->level) { |
| /* force to send as literals (eg to preserve CPU) */ |
| strm->outbuf = out; |
| plit = pos = ilen; |
| bit9 = 52; /* force literal dump */ |
| goto final_lit_dump; |
| } |
| |
| reset_refs(refs, sizeof(refs)); |
| |
| strm->outbuf = out; |
| |
| #ifndef UNALIGNED_FASTER |
| word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24); |
| #endif |
| while (rem >= 4) { |
| #ifndef UNALIGNED_FASTER |
| word = ((unsigned char)in[pos + 3] << 24) + (word >> 8); |
| #else |
| word = *(uint32_t *)&in[pos]; |
| #endif |
| h = slz_hash(word); |
| asm volatile ("" ::); // prevent gcc from trying to be smart with the prefetch |
| |
| if (sizeof(long) >= 8) { |
| ent = refs[h].by64; |
| last = (uint32_t)ent; |
| ent >>= 32; |
| refs[h].by64 = ((uint64_t)pos) + ((uint64_t)word << 32); |
| } else { |
| ent = refs[h].by32.word; |
| last = refs[h].by32.pos; |
| refs[h].by32.pos = pos; |
| refs[h].by32.word = word; |
| } |
| |
| #ifdef FIND_OPTIMAL_MATCH |
| /* Experimental code to see what could be saved with an ideal |
| * longest match lookup algorithm. This one is very slow but |
| * scans the whole window. In short, here are the savings : |
| * file orig fast(ratio) optimal(ratio) |
| * README 5185 3419 (65.9%) 3165 (61.0%) -7.5% |
| * index.html 76799 35662 (46.4%) 29875 (38.9%) -16.3% |
| * rfc1952.c 29383 13442 (45.7%) 11793 (40.1%) -12.3% |
| * |
| * Thus the savings to expect for large files is at best 16%. |
| * |
| * A non-colliding hash gives 33025 instead of 35662 (-7.4%), |
| * and keeping the last two entries gives 31724 (-11.0%). |
| */ |
| unsigned long scan; |
| int saved = 0; |
| int bestpos = 0; |
| int bestlen = 0; |
| int firstlen = 0; |
| int max_lookup = 2; // 0 = no limit |
| |
| for (scan = pos - 1; scan < pos && (unsigned long)(pos - scan - 1) < 32768; scan--) { |
| int len; |
| |
| if (*(uint32_t *)(in + scan) != word) |
| continue; |
| |
| len = memmatch(in + pos, in + scan, rem); |
| if (!bestlen) |
| firstlen = len; |
| |
| if (len > bestlen) { |
| bestlen = len; |
| bestpos = scan; |
| } |
| if (!--max_lookup) |
| break; |
| } |
| if (bestlen) { |
| //printf("pos=%d last=%d bestpos=%d word=%08x ent=%08x len=%d\n", |
| // (int)pos, (int)last, (int)bestpos, (int)word, (int)ent, bestlen); |
| last = bestpos; |
| ent = word; |
| saved += bestlen - firstlen; |
| } |
| //fprintf(stderr, "first=%d best=%d saved_total=%d\n", firstlen, bestlen, saved); |
| #endif |
| |
| if ((uint32_t)ent != word) { |
| send_as_lit: |
| rem--; |
| plit++; |
| bit9 += ((unsigned char)word >= 144); |
| pos++; |
| continue; |
| } |
| |
| /* We reject pos = last and pos > last+32768 */ |
| if ((unsigned long)(pos - last - 1) >= 32768) |
| goto send_as_lit; |
| |
| /* Note: cannot encode a length larger than 258 bytes */ |
| mlen = memmatch(in + pos + 4, in + last + 4, (rem > 258 ? 258 : rem) - 4) + 4; |
| |
| /* found a matching entry */ |
| |
| if (bit9 >= 52 && mlen < 6) |
| goto send_as_lit; |
| |
| /* compute the output code, its size and the length's size in |
| * bits to know if the reference is cheaper than literals. |
| */ |
| code = len_fh[mlen]; |
| |
| /* direct mapping of dist->huffman code */ |
| dist = fh_dist_table[pos - last - 1]; |
| |
| /* if encoding the dist+length is more expensive than sending |
| * the equivalent as bytes, lets keep the literals. |
| */ |
| if ((dist & 0x1f) + (code >> 16) + 8 >= 8 * mlen + bit9) |
| goto send_as_lit; |
| |
| /* first, copy pending literals */ |
| if (plit) { |
| /* Huffman encoding requires 9 bits for octets 144..255, so this |
| * is a waste of space for binary data. Switching between Huffman |
| * and no-comp then huffman consumes 52 bits (7 for EOB + 3 for |
| * block type + 7 for alignment + 32 for LEN+NLEN + 3 for next |
| * block. Only use plain literals if there are more than 52 bits |
| * to save then. |
| */ |
| if (bit9 >= 52) |
| copy_lit(strm, in + pos - plit, plit, 1); |
| else |
| copy_lit_huff(strm, in + pos - plit, plit, 1); |
| |
| plit = 0; |
| } |
| |
| /* use mode 01 - fixed huffman */ |
| if (strm->state == SLZ_ST_EOB) { |
| strm->state = SLZ_ST_FIXED; |
| enqueue8(strm, 0x02, 3); // BTYPE = 01, BFINAL = 0 |
| } |
| |
| /* copy the length first */ |
| enqueue24(strm, code & 0xFFFF, code >> 16); |
| |
| /* in fixed huffman mode, dist is fixed 5 bits */ |
| enqueue24(strm, dist >> 5, dist & 0x1f); |
| bit9 = 0; |
| rem -= mlen; |
| pos += mlen; |
| |
| #ifndef UNALIGNED_FASTER |
| #ifdef UNALIGNED_LE_OK |
| word = *(uint32_t *)&in[pos - 1]; |
| #else |
| word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24); |
| #endif |
| #endif |
| } |
| |
| if (__builtin_expect(rem, 0)) { |
| /* we're reading the 1..3 last bytes */ |
| plit += rem; |
| do { |
| bit9 += ((unsigned char)in[pos++] >= 144); |
| } while (--rem); |
| } |
| |
| final_lit_dump: |
| /* now copy remaining literals or mark the end */ |
| if (plit) { |
| if (bit9 >= 52) |
| copy_lit(strm, in + pos - plit, plit, more); |
| else |
| copy_lit_huff(strm, in + pos - plit, plit, more); |
| |
| plit = 0; |
| } |
| |
| strm->ilen += ilen; |
| return strm->outbuf - out; |
| } |
| |
| /* Initializes stream <strm> for use with raw deflate (rfc1951). The CRC is |
| * unused but set to zero. The compression level passed in <level> is set. This |
| * value can only be 0 (no compression) or 1 (compression) and other values |
| * will lead to unpredictable behaviour. The function always returns 0. |
| */ |
| int slz_rfc1951_init(struct slz_stream *strm, int level) |
| { |
| strm->state = SLZ_ST_EOB; // no header |
| strm->level = level; |
| strm->format = SLZ_FMT_DEFLATE; |
| strm->crc32 = 0; |
| strm->ilen = 0; |
| strm->qbits = 0; |
| strm->queue = 0; |
| return 0; |
| } |
| |
| /* Flushes any pending data for stream <strm> into buffer <buf>, then emits an |
| * empty literal block to byte-align the output, allowing to completely flush |
| * the queue. This requires that the output buffer still has the size of the |
| * queue available (up to 4 bytes), plus one byte for (BFINAL,BTYPE), plus 4 |
| * bytes for LEN+NLEN, or a total of 9 bytes in the worst case. The number of |
| * bytes emitted is returned. It is guaranteed that the queue is empty on |
| * return. This may cause some overhead by adding needless 5-byte blocks if |
| * called to often. |
| */ |
| int slz_rfc1951_flush(struct slz_stream *strm, unsigned char *buf) |
| { |
| strm->outbuf = buf; |
| |
| /* The queue is always empty on INIT, DONE, and END */ |
| if (!strm->qbits) |
| return 0; |
| |
| /* we may need to terminate a huffman output. Lit is always in EOB state */ |
| if (strm->state != SLZ_ST_EOB) { |
| strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB; |
| send_eob(strm); |
| } |
| |
| /* send BFINAL according to state, and BTYPE=00 (lit) */ |
| enqueue8(strm, (strm->state == SLZ_ST_DONE) ? 1 : 0, 3); |
| flush_bits(strm); // emit pending bits |
| copy_32b(strm, 0xFFFF0000U); // len=0, nlen=~0 |
| |
| /* Now the queue is empty, EOB was sent, BFINAL might have been sent if |
| * we completed the last block, and a zero-byte block was sent to byte- |
| * align the output. The last state reflects all this. Let's just |
| * return the number of bytes added to the output buffer. |
| */ |
| return strm->outbuf - buf; |
| } |
| |
| /* Flushes any pending for stream <strm> into buffer <buf>, then sends BTYPE=1 |
| * and BFINAL=1 if needed. The stream ends in SLZ_ST_DONE. It returns the number |
| * of bytes emitted. The trailer consists in flushing the possibly pending bits |
| * from the queue (up to 7 bits), then possibly EOB (7 bits), then 3 bits, EOB, |
| * a rounding to the next byte, which amounts to a total of 4 bytes max, that |
| * the caller must ensure are available before calling the function. |
| */ |
| int slz_rfc1951_finish(struct slz_stream *strm, unsigned char *buf) |
| { |
| strm->outbuf = buf; |
| |
| if (strm->state == SLZ_ST_FIXED || strm->state == SLZ_ST_LAST) { |
| strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB; |
| send_eob(strm); |
| } |
| |
| if (strm->state != SLZ_ST_DONE) { |
| /* send BTYPE=1, BFINAL=1 */ |
| enqueue8(strm, 3, 3); |
| send_eob(strm); |
| strm->state = SLZ_ST_DONE; |
| } |
| |
| flush_bits(strm); |
| return strm->outbuf - buf; |
| } |
| |
| /* Now RFC1952-specific declarations and extracts from RFC. |
| * From RFC1952 about the GZIP file format : |
| |
| A gzip file consists of a series of "members" ... |
| |
| 2.3. Member format |
| |
| Each member has the following structure: |
| |
| +---+---+---+---+---+---+---+---+---+---+ |
| |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->) |
| +---+---+---+---+---+---+---+---+---+---+ |
| |
| (if FLG.FEXTRA set) |
| |
| +---+---+=================================+ |
| | XLEN |...XLEN bytes of "extra field"...| (more-->) |
| +---+---+=================================+ |
| |
| (if FLG.FNAME set) |
| |
| +=========================================+ |
| |...original file name, zero-terminated...| (more-->) |
| +=========================================+ |
| |
| (if FLG.FCOMMENT set) |
| |
| +===================================+ |
| |...file comment, zero-terminated...| (more-->) |
| +===================================+ |
| |
| (if FLG.FHCRC set) |
| |
| +---+---+ |
| | CRC16 | |
| +---+---+ |
| |
| +=======================+ |
| |...compressed blocks...| (more-->) |
| +=======================+ |
| |
| 0 1 2 3 4 5 6 7 |
| +---+---+---+---+---+---+---+---+ |
| | CRC32 | ISIZE | |
| +---+---+---+---+---+---+---+---+ |
| |
| |
| 2.3.1. Member header and trailer |
| |
| ID1 (IDentification 1) |
| ID2 (IDentification 2) |
| These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139 |
| (0x8b, \213), to identify the file as being in gzip format. |
| |
| CM (Compression Method) |
| This identifies the compression method used in the file. CM |
| = 0-7 are reserved. CM = 8 denotes the "deflate" |
| compression method, which is the one customarily used by |
| gzip and which is documented elsewhere. |
| |
| FLG (FLaGs) |
| This flag byte is divided into individual bits as follows: |
| |
| bit 0 FTEXT |
| bit 1 FHCRC |
| bit 2 FEXTRA |
| bit 3 FNAME |
| bit 4 FCOMMENT |
| bit 5 reserved |
| bit 6 reserved |
| bit 7 reserved |
| |
| Reserved FLG bits must be zero. |
| |
| MTIME (Modification TIME) |
| This gives the most recent modification time of the original |
| file being compressed. The time is in Unix format, i.e., |
| seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this |
| may cause problems for MS-DOS and other systems that use |
| local rather than Universal time.) If the compressed data |
| did not come from a file, MTIME is set to the time at which |
| compression started. MTIME = 0 means no time stamp is |
| available. |
| |
| XFL (eXtra FLags) |
| These flags are available for use by specific compression |
| methods. The "deflate" method (CM = 8) sets these flags as |
| follows: |
| |
| XFL = 2 - compressor used maximum compression, |
| slowest algorithm |
| XFL = 4 - compressor used fastest algorithm |
| |
| OS (Operating System) |
| This identifies the type of file system on which compression |
| took place. This may be useful in determining end-of-line |
| convention for text files. The currently defined values are |
| as follows: |
| |
| 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32) |
| 1 - Amiga |
| 2 - VMS (or OpenVMS) |
| 3 - Unix |
| 4 - VM/CMS |
| 5 - Atari TOS |
| 6 - HPFS filesystem (OS/2, NT) |
| 7 - Macintosh |
| 8 - Z-System |
| 9 - CP/M |
| 10 - TOPS-20 |
| 11 - NTFS filesystem (NT) |
| 12 - QDOS |
| 13 - Acorn RISCOS |
| 255 - unknown |
| |
| ==> A file compressed using "gzip -1" on Unix-like systems can be : |
| |
| 1F 8B 08 00 00 00 00 00 04 03 |
| <deflate-compressed stream> |
| crc32 size32 |
| */ |
| |
/* The fixed 10-byte gzip member header sent in front of the deflate stream */
static const unsigned char gzip_hdr[] = {
	0x1F, 0x8B,             /* ID1, ID2 */
	0x08,                   /* CM: deflate */
	0x00,                   /* FLG: none */
	0x00, 0x00, 0x00, 0x00, /* MTIME: none */
	0x04,                   /* XFL: fastest compression */
	0x03,                   /* OS: Unix */
};
| |
| static inline uint32_t crc32_char(uint32_t crc, uint8_t x) |
| { |
| #if defined(__ARM_FEATURE_CRC32) |
| crc = ~crc; |
| # if defined(__ARM_ARCH_ISA_A64) |
| // 64 bit mode |
| __asm__ volatile("crc32b %w0,%w0,%w1" : "+r"(crc) : "r"(x)); |
| # else |
| // 32 bit mode (e.g. armv7 compiler building for armv8 |
| __asm__ volatile("crc32b %0,%0,%1" : "+r"(crc) : "r"(x)); |
| # endif |
| crc = ~crc; |
| #else |
| crc = crc32_fast[0][(crc ^ x) & 0xff] ^ (crc >> 8); |
| #endif |
| return crc; |
| } |
| |
| static inline uint32_t crc32_uint32(uint32_t data) |
| { |
| #if defined(__ARM_FEATURE_CRC32) |
| # if defined(__ARM_ARCH_ISA_A64) |
| // 64 bit mode |
| __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(data) : "r"(~0UL)); |
| # else |
| // 32 bit mode (e.g. armv7 compiler building for armv8 |
| __asm__ volatile("crc32w %0,%0,%1" : "+r"(data) : "r"(~0UL)); |
| # endif |
| data = ~data; |
| #else |
| data = crc32_fast[3][(data >> 0) & 0xff] ^ |
| crc32_fast[2][(data >> 8) & 0xff] ^ |
| crc32_fast[1][(data >> 16) & 0xff] ^ |
| crc32_fast[0][(data >> 24) & 0xff]; |
| #endif |
| return data; |
| } |
| |
/* Returns <crc> updated with the <len> bytes of <buf>, one byte at a time.
 * Modified version originally from RFC1952, working with non-inverting CRCs.
 * Nothing is done when <len> is null or negative.
 */
uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	while (len > 0) {
		crc = crc32_char(crc, *buf++);
		len--;
	}
	return crc;
}
| |
| /* This version computes the crc32 of <buf> over <len> bytes, doing most of it |
| * in 32-bit chunks. |
| */ |
| uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len) |
| { |
| const unsigned char *end = buf + len; |
| |
| while (buf <= end - 16) { |
| #ifdef UNALIGNED_LE_OK |
| #if defined(__ARM_FEATURE_CRC32) |
| crc = ~crc; |
| # if defined(__ARM_ARCH_ISA_A64) |
| // 64 bit mode |
| __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf))); |
| __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4))); |
| __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8))); |
| __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12))); |
| # else |
| // 32 bit mode (e.g. armv7 compiler building for armv8 |
| __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf))); |
| __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4))); |
| __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8))); |
| __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12))); |
| # endif |
| crc = ~crc; |
| #else |
| crc ^= *(uint32_t *)buf; |
| crc = crc32_uint32(crc); |
| |
| crc ^= *(uint32_t *)(buf + 4); |
| crc = crc32_uint32(crc); |
| |
| crc ^= *(uint32_t *)(buf + 8); |
| crc = crc32_uint32(crc); |
| |
| crc ^= *(uint32_t *)(buf + 12); |
| crc = crc32_uint32(crc); |
| #endif |
| #else |
| crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^ |
| crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^ |
| crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^ |
| crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff]; |
| |
| crc = crc32_fast[3][(buf[4] ^ (crc >> 0)) & 0xff] ^ |
| crc32_fast[2][(buf[5] ^ (crc >> 8)) & 0xff] ^ |
| crc32_fast[1][(buf[6] ^ (crc >> 16)) & 0xff] ^ |
| crc32_fast[0][(buf[7] ^ (crc >> 24)) & 0xff]; |
| |
| crc = crc32_fast[3][(buf[8] ^ (crc >> 0)) & 0xff] ^ |
| crc32_fast[2][(buf[9] ^ (crc >> 8)) & 0xff] ^ |
| crc32_fast[1][(buf[10] ^ (crc >> 16)) & 0xff] ^ |
| crc32_fast[0][(buf[11] ^ (crc >> 24)) & 0xff]; |
| |
| crc = crc32_fast[3][(buf[12] ^ (crc >> 0)) & 0xff] ^ |
| crc32_fast[2][(buf[13] ^ (crc >> 8)) & 0xff] ^ |
| crc32_fast[1][(buf[14] ^ (crc >> 16)) & 0xff] ^ |
| crc32_fast[0][(buf[15] ^ (crc >> 24)) & 0xff]; |
| #endif |
| buf += 16; |
| } |
| |
| while (buf <= end - 4) { |
| #ifdef UNALIGNED_LE_OK |
| crc ^= *(uint32_t *)buf; |
| crc = crc32_uint32(crc); |
| #else |
| crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^ |
| crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^ |
| crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^ |
| crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff]; |
| #endif |
| buf += 4; |
| } |
| |
| while (buf < end) |
| crc = crc32_char(crc, *buf++); |
| return crc; |
| } |
| |
/* Updates the running CRC32 value <crc> with the <len> bytes located at <buf>
 * and returns the new value. This is a thin wrapper which delegates the work
 * to slz_crc32_by4().
 */
static inline uint32_t update_crc(uint32_t crc, const void *buf, int len)
{
	uint32_t updated = slz_crc32_by4(crc, buf, len);

	return updated;
}
| |
| /* Sends the gzip header for stream <strm> into buffer <buf>. When it's done, |
| * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes |
| * emitted which is always 10. The caller is responsible for ensuring there's |
| * always enough room in the buffer. |
| */ |
| int slz_rfc1952_send_header(struct slz_stream *strm, unsigned char *buf) |
| { |
| memcpy(buf, gzip_hdr, sizeof(gzip_hdr)); |
| strm->state = SLZ_ST_EOB; |
| return sizeof(gzip_hdr); |
| } |
| |
| /* Encodes the block according to rfc1952. This means that the CRC of the input |
| * block is computed according to the CRC32 algorithm. If the header was never |
| * sent, it may be sent first. The number of output bytes is returned. |
| */ |
| long slz_rfc1952_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more) |
| { |
| long ret = 0; |
| |
| if (__builtin_expect(strm->state == SLZ_ST_INIT, 0)) |
| ret += slz_rfc1952_send_header(strm, out); |
| |
| strm->crc32 = update_crc(strm->crc32, in, ilen); |
| ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more); |
| return ret; |
| } |
| |
| /* Initializes stream <strm> for use with the gzip format (rfc1952). The |
| * compression level passed in <level> is set. This value can only be 0 (no |
| * compression) or 1 (compression) and other values will lead to unpredictable |
| * behaviour. The function always returns 0. |
| */ |
| int slz_rfc1952_init(struct slz_stream *strm, int level) |
| { |
| strm->state = SLZ_ST_INIT; |
| strm->level = level; |
| strm->format = SLZ_FMT_GZIP; |
| strm->crc32 = 0; |
| strm->ilen = 0; |
| strm->qbits = 0; |
| strm->queue = 0; |
| return 0; |
| } |
| |
| /* Flushes any pending data for stream <strm> into buffer <buf>, then emits an |
| * empty literal block to byte-align the output, allowing to completely flush |
| * the queue. Note that if the initial header was never sent, it will be sent |
| * first as well (10 extra bytes). This requires that the output buffer still |
| * has this plus the size of the queue available (up to 4 bytes), plus one byte |
| * for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a total of 19 bytes in the |
| * worst case. The number of bytes emitted is returned. It is guaranteed that |
| * the queue is empty on return. This may cause some overhead by adding |
| * needless 5-byte blocks if called to often. |
| */ |
| int slz_rfc1952_flush(struct slz_stream *strm, unsigned char *buf) |
| { |
| int sent = 0; |
| |
| if (__builtin_expect(strm->state == SLZ_ST_INIT, 0)) |
| sent = slz_rfc1952_send_header(strm, buf); |
| |
| sent += slz_rfc1951_flush(strm, buf + sent); |
| return sent; |
| } |
| |
| /* Flushes pending bits and sends the gzip trailer for stream <strm> into |
| * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It |
| * returns the number of bytes emitted. The trailer consists in flushing the |
| * possibly pending bits from the queue (up to 24 bits), rounding to the next |
| * byte, then 4 bytes for the CRC and another 4 bytes for the input length. |
| * That may about to 4+4+4 = 12 bytes, that the caller must ensure are |
| * available before calling the function. Note that if the initial header was |
| * never sent, it will be sent first as well (10 extra bytes). |
| */ |
| int slz_rfc1952_finish(struct slz_stream *strm, unsigned char *buf) |
| { |
| strm->outbuf = buf; |
| |
| if (__builtin_expect(strm->state == SLZ_ST_INIT, 0)) |
| strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf); |
| |
| slz_rfc1951_finish(strm, strm->outbuf); |
| copy_32b(strm, strm->crc32); |
| copy_32b(strm, strm->ilen); |
| strm->state = SLZ_ST_END; |
| |
| return strm->outbuf - buf; |
| } |
| |
| |
| /* RFC1950-specific stuff. This is for the Zlib stream format. |
| * From RFC1950 (zlib) : |
| * |
| |
| 2.2. Data format |
| |
| A zlib stream has the following structure: |
| |
| 0 1 |
| +---+---+ |
| |CMF|FLG| (more-->) |
| +---+---+ |
| |
| |
| (if FLG.FDICT set) |
| |
| 0 1 2 3 |
| +---+---+---+---+ |
| | DICTID | (more-->) |
| +---+---+---+---+ |
| |
| +=====================+---+---+---+---+ |
| |...compressed data...| ADLER32 | |
| +=====================+---+---+---+---+ |
| |
| Any data which may appear after ADLER32 are not part of the zlib |
| stream. |
| |
| CMF (Compression Method and flags) |
| This byte is divided into a 4-bit compression method and a 4- |
| bit information field depending on the compression method. |
| |
| bits 0 to 3 CM Compression method |
| bits 4 to 7 CINFO Compression info |
| |
| CM (Compression method) |
| This identifies the compression method used in the file. CM = 8 |
| denotes the "deflate" compression method with a window size up |
| to 32K. This is the method used by gzip and PNG (see |
| references [1] and [2] in Chapter 3, below, for the reference |
| documents). CM = 15 is reserved. It might be used in a future |
| version of this specification to indicate the presence of an |
| extra field before the compressed data. |
| |
| CINFO (Compression info) |
| For CM = 8, CINFO is the base-2 logarithm of the LZ77 window |
| size, minus eight (CINFO=7 indicates a 32K window size). Values |
| of CINFO above 7 are not allowed in this version of the |
| specification. CINFO is not defined in this specification for |
| CM not equal to 8. |
| |
| FLG (FLaGs) |
| This flag byte is divided as follows: |
| |
| bits 0 to 4 FCHECK (check bits for CMF and FLG) |
| bit 5 FDICT (preset dictionary) |
| bits 6 to 7 FLEVEL (compression level) |
| |
| The FCHECK value must be such that CMF and FLG, when viewed as |
| a 16-bit unsigned integer stored in MSB order (CMF*256 + FLG), |
| is a multiple of 31. |
| |
| |
| FDICT (Preset dictionary) |
| If FDICT is set, a DICT dictionary identifier is present |
| immediately after the FLG byte. The dictionary is a sequence of |
| bytes which are initially fed to the compressor without |
| producing any compressed output. DICT is the Adler-32 checksum |
| of this sequence of bytes (see the definition of ADLER32 |
| below). The decompressor can use this identifier to determine |
| which dictionary has been used by the compressor. |
| |
| FLEVEL (Compression level) |
| These flags are available for use by specific compression |
| methods. The "deflate" method (CM = 8) sets these flags as |
| follows: |
| |
| 0 - compressor used fastest algorithm |
| 1 - compressor used fast algorithm |
| 2 - compressor used default algorithm |
| 3 - compressor used maximum compression, slowest algorithm |
| |
| The information in FLEVEL is not needed for decompression; it |
| is there to indicate if recompression might be worthwhile. |
| |
| compressed data |
| For compression method 8, the compressed data is stored in the |
| deflate compressed data format as described in the document |
| "DEFLATE Compressed Data Format Specification" by L. Peter |
| Deutsch. (See reference [3] in Chapter 3, below) |
| |
| Other compressed data formats are not specified in this version |
| of the zlib specification. |
| |
| ADLER32 (Adler-32 checksum) |
| This contains a checksum value of the uncompressed data |
| (excluding any dictionary data) computed according to Adler-32 |
| algorithm. This algorithm is a 32-bit extension and improvement |
| of the Fletcher algorithm, used in the ITU-T X.224 / ISO 8073 |
| standard. See references [4] and [5] in Chapter 3, below) |
| |
| Adler-32 is composed of two sums accumulated per byte: s1 is |
| the sum of all bytes, s2 is the sum of all s1 values. Both sums |
| are done modulo 65521. s1 is initialized to 1, s2 to zero. The |
| Adler-32 checksum is stored as s2*65536 + s1 in most- |
| significant-byte first (network) order. |
| |
| ==> The stream can start with only 2 bytes : |
- CMF = 0x78 : CINFO=7 (32kB window), CM=8 (deflate)
| - FLG = 0x01 : FLEVEL = 0 (fastest), FDICT=0 (no dict), FCHECK=1 so |
| that 0x7801 is a multiple of 31 (30721 = 991 * 31). |
| |
| ==> and it ends with only 4 bytes, the Adler-32 checksum in big-endian format. |
| |
| */ |
| |
/* The two-byte zlib stream header, CMF followed by FLG:
 *   CMF = 0x78 : CINFO=7 (32kB window), CM=8 (deflate)
 *   FLG = 0x01 : FLEVEL=0, FDICT=0, FCHECK=1 so that 0x7801 is a multiple
 *                of 31.
 */
static const unsigned char zlib_hdr[] = {
	0x78, /* CMF */
	0x01, /* FLG */
};
| |
| |
/* Reference Adler-32 implementation, processing one byte at a time exactly as
 * described in RFC1950. <crc> is the checksum accumulated so far (1 for a new
 * stream), with s1 in its low 16 bits and s2 in its high 16 bits. The <len>
 * bytes at <buf> are added to it and the updated checksum is returned. Both
 * sums are reduced modulo 65521 after each byte. Nothing is read when <len>
 * is zero or negative.
 */
uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len)
{
	const uint32_t modulus = 65521;
	uint32_t lo = crc & 0xffff;         /* s1: sum of all bytes */
	uint32_t hi = (crc >> 16) & 0xffff; /* s2: sum of all s1 values */
	const unsigned char *p = buf;
	int left = len;

	while (left > 0) {
		lo = (lo + *p++) % modulus;
		hi = (hi + lo) % modulus;
		left--;
	}
	return (hi << 16) + lo;
}
| |
/* Computes the adler32 sum on <buf> for <len> bytes, starting from the
 * previous checksum <crc> (1 for a new stream), and returns the updated
 * checksum with s1 in the low 16 bits and s2 in the high 16 bits. A zero or
 * negative <len> reads nothing.
 *
 * It avoids the expensive modulus by retrofitting the number of bytes missed
 * between 65521 and 65536 which is easy to count : since 65536 is congruent
 * to (65536-65521) = 15 modulo 65521, every multiple of 65536 contained in a
 * sum can be replaced with 15 without changing its remainder :
 *     x = (x & 0xffff) + (x >> 16) * 15    (mod 65521)
 * Applying this fold once or twice brings the value below 2*65521, after
 * which a single conditional subtraction completes the reduction.
 *
 * The sums are accumulated in unsigned 32-bit variables. Input is processed
 * by blocks of at most 4096 bytes so that s2 cannot wrap : in the worst case
 * (s1 and s2 starting at 65535 and all bytes equal to 255) it reaches
 * 65535 + 4096*65535 + 255*4096*4097/2 = 2408114175, which fits in 32 bits
 * but would not fit in a signed 32-bit integer.
 */
uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len)
{
	const long max_blk = 1L << 12;
	uint32_t s1 = crc & 0xffff;
	uint32_t s2 = crc >> 16;
	long blk;
	long n;

	/* a negative length carries no data */
	if (len < 0)
		len = 0;

	do {
		blk = len;
		/* ensure we never overflow s2 (see the bound above) */
		if (blk > max_blk)
			blk = max_blk;
		len -= blk;

		for (n = 0; n < blk; n++) {
			s1 += buf[n];
			s2 += s1;
		}

		/* s1 is at most 65535 + 4096 * 255 = 1110015 here, so
		 * (s1 >> 16) is at most 16 and one fold leaves at most
		 * 65535 + 15 * 16 = 65775, which is below 2 * 65521.
		 */
		s1 = (s1 & 0xffff) + 15 * (s1 >> 16);
		if (s1 >= 65521)
			s1 -= 65521;

		/* s2 may use the full 32 bits, so the first fold leaves up to
		 * 65535 + 15 * 65535 = 1048560 and a second one is needed to
		 * get below 2 * 65521.
		 */
		s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
		s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
		if (s2 >= 65521)
			s2 -= 65521;

		buf += blk;
	} while (len);
	return (s2 << 16) + s1;
}
| |
| /* Sends the zlib header for stream <strm> into buffer <buf>. When it's done, |
| * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes |
| * emitted which is always 2. The caller is responsible for ensuring there's |
| * always enough room in the buffer. |
| */ |
| int slz_rfc1950_send_header(struct slz_stream *strm, unsigned char *buf) |
| { |
| memcpy(buf, zlib_hdr, sizeof(zlib_hdr)); |
| strm->state = SLZ_ST_EOB; |
| return sizeof(zlib_hdr); |
| } |
| |
| /* Encodes the block according to rfc1950. This means that the CRC of the input |
| * block is computed according to the ADLER32 algorithm. If the header was never |
| * sent, it may be sent first. The number of output bytes is returned. |
| */ |
| long slz_rfc1950_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more) |
| { |
| long ret = 0; |
| |
| if (__builtin_expect(strm->state == SLZ_ST_INIT, 0)) |
| ret += slz_rfc1950_send_header(strm, out); |
| |
| strm->crc32 = slz_adler32_block(strm->crc32, in, ilen); |
| ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more); |
| return ret; |
| } |
| |
| /* Initializes stream <strm> for use with the zlib format (rfc1952). The |
| * compression level passed in <level> is set. This value can only be 0 (no |
| * compression) or 1 (compression) and other values will lead to unpredictable |
| * behaviour. The function always returns 0. |
| */ |
| int slz_rfc1950_init(struct slz_stream *strm, int level) |
| { |
| strm->state = SLZ_ST_INIT; |
| strm->level = level; |
| strm->format = SLZ_FMT_ZLIB; |
| strm->crc32 = 1; // rfc1950/zlib starts with initial crc=1 |
| strm->ilen = 0; |
| strm->qbits = 0; |
| strm->queue = 0; |
| return 0; |
| } |
| |
| /* Flushes any pending data for stream <strm> into buffer <buf>, then emits an |
| * empty literal block to byte-align the output, allowing to completely flush |
| * the queue. Note that if the initial header was never sent, it will be sent |
| * first as well (2 extra bytes). This requires that the output buffer still |
| * has this plus the size of the queue available (up to 4 bytes), plus one byte |
| * for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a total of 11 bytes in the |
| * worst case. The number of bytes emitted is returned. It is guaranteed that |
| * the queue is empty on return. This may cause some overhead by adding |
| * needless 5-byte blocks if called to often. |
| */ |
| int slz_rfc1950_flush(struct slz_stream *strm, unsigned char *buf) |
| { |
| int sent = 0; |
| |
| if (__builtin_expect(strm->state == SLZ_ST_INIT, 0)) |
| sent = slz_rfc1950_send_header(strm, buf); |
| |
| sent += slz_rfc1951_flush(strm, buf + sent); |
| return sent; |
| } |
| |
| /* Flushes pending bits and sends the gzip trailer for stream <strm> into |
| * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It |
| * returns the number of bytes emitted. The trailer consists in flushing the |
| * possibly pending bits from the queue (up to 24 bits), rounding to the next |
| * byte, then 4 bytes for the CRC. That may about to 4+4 = 8 bytes, that the |
| * caller must ensure are available before calling the function. Note that if |
| * the initial header was never sent, it will be sent first as well (2 extra |
| * bytes). |
| */ |
| int slz_rfc1950_finish(struct slz_stream *strm, unsigned char *buf) |
| { |
| strm->outbuf = buf; |
| |
| if (__builtin_expect(strm->state == SLZ_ST_INIT, 0)) |
| strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf); |
| |
| slz_rfc1951_finish(strm, strm->outbuf); |
| copy_8b(strm, (strm->crc32 >> 24) & 0xff); |
| copy_8b(strm, (strm->crc32 >> 16) & 0xff); |
| copy_8b(strm, (strm->crc32 >> 8) & 0xff); |
| copy_8b(strm, (strm->crc32 >> 0) & 0xff); |
| strm->state = SLZ_ST_END; |
| return strm->outbuf - buf; |
| } |
| |
/* Library constructor, run automatically at load time thanks to the
 * constructor attribute. It builds the CRC lookup table unless the build
 * defines __ARM_FEATURE_CRC32 (presumably because the CRC is then computed
 * with dedicated instructions), then prepares the distance table.
 */
__attribute__((constructor))
static void __slz_initialize(void)
{
#ifndef __ARM_FEATURE_CRC32
	__slz_make_crc_table();
#endif
	__slz_prepare_dist_table();
}