blob: 2408ee90c18920b7f0ebbe859f4476f0f0118450 [file] [log] [blame]
Simon Glass0a30e422016-01-05 09:31:00 -07001
2/*-------------------------------------------------------------*/
3/*--- Block sorting machinery ---*/
4/*--- blocksort.c ---*/
5/*-------------------------------------------------------------*/
6
7/*--
8 This file is a part of bzip2 and/or libbzip2, a program and
9 library for lossless, block-sorting data compression.
10
11 Copyright (C) 1996-2002 Julian R Seward. All rights reserved.
12
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions
15 are met:
16
17 1. Redistributions of source code must retain the above copyright
18 notice, this list of conditions and the following disclaimer.
19
20 2. The origin of this software must not be misrepresented; you must
21 not claim that you wrote the original software. If you use this
22 software in a product, an acknowledgment in the product
23 documentation would be appreciated but is not required.
24
25 3. Altered source versions must be plainly marked as such, and must
26 not be misrepresented as being the original software.
27
28 4. The name of the author may not be used to endorse or promote
29 products derived from this software without specific prior written
30 permission.
31
32 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
33 OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43
44 Julian Seward, Cambridge, UK.
45 jseward@acm.org
46 bzip2/libbzip2 version 1.0.6 of 6 September 2010
47 Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
48
49 This program is based on (at least) the work of:
50 Mike Burrows
51 David Wheeler
52 Peter Fenwick
53 Alistair Moffat
54 Radford Neal
55 Ian H. Witten
56 Robert Sedgewick
57 Jon L. Bentley
58
59 For more information on these sources, see the manual.
60--*/
61
62#include "bzlib_private.h"
Simon Glass0f2af882020-05-10 11:40:05 -060063#include <log.h>
Simon Glass0a30e422016-01-05 09:31:00 -070064
65/*---------------------------------------------*/
66/*--- Fallback O(N log(N)^2) sorting ---*/
67/*--- algorithm, for repetitive blocks ---*/
68/*---------------------------------------------*/
69
70/*---------------------------------------------*/
71static
72__inline__
73void fallbackSimpleSort ( UInt32* fmap,
74 UInt32* eclass,
75 Int32 lo,
76 Int32 hi )
77{
78 Int32 i, j, tmp;
79 UInt32 ec_tmp;
80
81 if (lo == hi) return;
82
83 if (hi - lo > 3) {
84 for ( i = hi-4; i >= lo; i-- ) {
85 tmp = fmap[i];
86 ec_tmp = eclass[tmp];
87 for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
88 fmap[j-4] = fmap[j];
89 fmap[j-4] = tmp;
90 }
91 }
92
93 for ( i = hi-1; i >= lo; i-- ) {
94 tmp = fmap[i];
95 ec_tmp = eclass[tmp];
96 for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
97 fmap[j-1] = fmap[j];
98 fmap[j-1] = tmp;
99 }
100}
101
Simon Glass0a30e422016-01-05 09:31:00 -0700102/*---------------------------------------------*/
103#define fswap(zz1, zz2) \
104 { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
105
106#define fvswap(zzp1, zzp2, zzn) \
107{ \
108 Int32 yyp1 = (zzp1); \
109 Int32 yyp2 = (zzp2); \
110 Int32 yyn = (zzn); \
111 while (yyn > 0) { \
112 fswap(fmap[yyp1], fmap[yyp2]); \
113 yyp1++; yyp2++; yyn--; \
114 } \
115}
116
Simon Glass0a30e422016-01-05 09:31:00 -0700117#define fmin(a,b) ((a) < (b)) ? (a) : (b)
118
119#define fpush(lz,hz) { stackLo[sp] = lz; \
120 stackHi[sp] = hz; \
121 sp++; }
122
123#define fpop(lz,hz) { sp--; \
124 lz = stackLo[sp]; \
125 hz = stackHi[sp]; }
126
127#define FALLBACK_QSORT_SMALL_THRESH 10
128#define FALLBACK_QSORT_STACK_SIZE 100
129
Simon Glass0a30e422016-01-05 09:31:00 -0700130static
131void fallbackQSort3 ( UInt32* fmap,
132 UInt32* eclass,
133 Int32 loSt,
134 Int32 hiSt )
135{
136 Int32 unLo, unHi, ltLo, gtHi, n, m;
137 Int32 sp, lo, hi;
138 UInt32 med, r, r3;
139 Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
140 Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
141
142 r = 0;
143
144 sp = 0;
145 fpush ( loSt, hiSt );
146
147 while (sp > 0) {
148
149 AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
150
151 fpop ( lo, hi );
152 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
153 fallbackSimpleSort ( fmap, eclass, lo, hi );
154 continue;
155 }
156
157 /* Random partitioning. Median of 3 sometimes fails to
158 avoid bad cases. Median of 9 seems to help but
159 looks rather expensive. This too seems to work but
160 is cheaper. Guidance for the magic constants
161 7621 and 32768 is taken from Sedgewick's algorithms
162 book, chapter 35.
163 */
164 r = ((r * 7621) + 1) % 32768;
165 r3 = r % 3;
166 if (r3 == 0) med = eclass[fmap[lo]]; else
167 if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
168 med = eclass[fmap[hi]];
169
170 unLo = ltLo = lo;
171 unHi = gtHi = hi;
172
173 while (1) {
174 while (1) {
175 if (unLo > unHi) break;
176 n = (Int32)eclass[fmap[unLo]] - (Int32)med;
177 if (n == 0) {
178 fswap(fmap[unLo], fmap[ltLo]);
179 ltLo++; unLo++;
180 continue;
181 };
182 if (n > 0) break;
183 unLo++;
184 }
185 while (1) {
186 if (unLo > unHi) break;
187 n = (Int32)eclass[fmap[unHi]] - (Int32)med;
188 if (n == 0) {
189 fswap(fmap[unHi], fmap[gtHi]);
190 gtHi--; unHi--;
191 continue;
192 };
193 if (n < 0) break;
194 unHi--;
195 }
196 if (unLo > unHi) break;
197 fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
198 }
199
200 AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
201
202 if (gtHi < ltLo) continue;
203
204 n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
205 m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
206
207 n = lo + unLo - ltLo - 1;
208 m = hi - (gtHi - unHi) + 1;
209
210 if (n - lo > hi - m) {
211 fpush ( lo, n );
212 fpush ( m, hi );
213 } else {
214 fpush ( m, hi );
215 fpush ( lo, n );
216 }
217 }
218}
219
220#undef fmin
221#undef fpush
222#undef fpop
223#undef fswap
224#undef fvswap
225#undef FALLBACK_QSORT_SMALL_THRESH
226#undef FALLBACK_QSORT_STACK_SIZE
227
Simon Glass0a30e422016-01-05 09:31:00 -0700228/*---------------------------------------------*/
229/* Pre:
230 nblock > 0
231 eclass exists for [0 .. nblock-1]
232 ((UChar*)eclass) [0 .. nblock-1] holds block
233 ptr exists for [0 .. nblock-1]
234
235 Post:
236 ((UChar*)eclass) [0 .. nblock-1] holds block
237 All other areas of eclass destroyed
238 fmap [0 .. nblock-1] holds sorted order
239 bhtab [ 0 .. 2+(nblock/32) ] destroyed
240*/
241
242#define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
243#define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
244#define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
245#define WORD_BH(zz) bhtab[(zz) >> 5]
246#define UNALIGNED_BH(zz) ((zz) & 0x01f)
247
248static
249void fallbackSort ( UInt32* fmap,
250 UInt32* eclass,
251 UInt32* bhtab,
252 Int32 nblock,
253 Int32 verb )
254{
255 Int32 ftab[257];
256 Int32 ftabCopy[256];
257 Int32 H, i, j, k, l, r, cc, cc1;
258 Int32 nNotDone;
259 Int32 nBhtab;
260 UChar* eclass8 = (UChar*)eclass;
261
262 /*--
263 Initial 1-char radix sort to generate
264 initial fmap and initial BH bits.
265 --*/
266 if (verb >= 4)
267 VPrintf0 ( " bucket sorting ...\n" );
268 for (i = 0; i < 257; i++) ftab[i] = 0;
269 for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
270 for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
271 for (i = 1; i < 257; i++) ftab[i] += ftab[i-1];
272
273 for (i = 0; i < nblock; i++) {
274 j = eclass8[i];
275 k = ftab[j] - 1;
276 ftab[j] = k;
277 fmap[k] = i;
278 }
279
280 nBhtab = 2 + (nblock / 32);
281 for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
282 for (i = 0; i < 256; i++) SET_BH(ftab[i]);
283
284 /*--
285 Inductively refine the buckets. Kind-of an
286 "exponential radix sort" (!), inspired by the
287 Manber-Myers suffix array construction algorithm.
288 --*/
289
290 /*-- set sentinel bits for block-end detection --*/
291 for (i = 0; i < 32; i++) {
292 SET_BH(nblock + 2*i);
293 CLEAR_BH(nblock + 2*i + 1);
294 }
295
296 /*-- the log(N) loop --*/
297 H = 1;
298 while (1) {
299
300 if (verb >= 4)
301 VPrintf1 ( " depth %6d has ", H );
302
303 j = 0;
304 for (i = 0; i < nblock; i++) {
305 if (ISSET_BH(i)) j = i;
306 k = fmap[i] - H; if (k < 0) k += nblock;
307 eclass[k] = j;
308 }
309
310 nNotDone = 0;
311 r = -1;
312 while (1) {
313
314 /*-- find the next non-singleton bucket --*/
315 k = r + 1;
316 while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
317 if (ISSET_BH(k)) {
318 while (WORD_BH(k) == 0xffffffff) k += 32;
319 while (ISSET_BH(k)) k++;
320 }
321 l = k - 1;
322 if (l >= nblock) break;
323 while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
324 if (!ISSET_BH(k)) {
325 while (WORD_BH(k) == 0x00000000) k += 32;
326 while (!ISSET_BH(k)) k++;
327 }
328 r = k - 1;
329 if (r >= nblock) break;
330
331 /*-- now [l, r] bracket current bucket --*/
332 if (r > l) {
333 nNotDone += (r - l + 1);
334 fallbackQSort3 ( fmap, eclass, l, r );
335
336 /*-- scan bucket and generate header bits-- */
337 cc = -1;
338 for (i = l; i <= r; i++) {
339 cc1 = eclass[fmap[i]];
340 if (cc != cc1) { SET_BH(i); cc = cc1; };
341 }
342 }
343 }
344
345 if (verb >= 4)
346 VPrintf1 ( "%6d unresolved strings\n", nNotDone );
347
348 H *= 2;
349 if (H > nblock || nNotDone == 0) break;
350 }
351
352 /*--
353 Reconstruct the original block in
354 eclass8 [0 .. nblock-1], since the
355 previous phase destroyed it.
356 --*/
357 if (verb >= 4)
358 VPrintf0 ( " reconstructing block ...\n" );
359 j = 0;
360 for (i = 0; i < nblock; i++) {
361 while (ftabCopy[j] == 0) j++;
362 ftabCopy[j]--;
363 eclass8[fmap[i]] = (UChar)j;
364 }
365 AssertH ( j < 256, 1005 );
366}
367
368#undef SET_BH
369#undef CLEAR_BH
370#undef ISSET_BH
371#undef WORD_BH
372#undef UNALIGNED_BH
373
Simon Glass0a30e422016-01-05 09:31:00 -0700374/*---------------------------------------------*/
375/*--- The main, O(N^2 log(N)) sorting ---*/
376/*--- algorithm. Faster for "normal" ---*/
377/*--- non-repetitive blocks. ---*/
378/*---------------------------------------------*/
379
380/*---------------------------------------------*/
381static
382__inline__
383Bool mainGtU ( UInt32 i1,
384 UInt32 i2,
385 UChar* block,
386 UInt16* quadrant,
387 UInt32 nblock,
388 Int32* budget )
389{
390 Int32 k;
391 UChar c1, c2;
392 UInt16 s1, s2;
393
394 AssertD ( i1 != i2, "mainGtU" );
395 /* 1 */
396 c1 = block[i1]; c2 = block[i2];
397 if (c1 != c2) return (c1 > c2);
398 i1++; i2++;
399 /* 2 */
400 c1 = block[i1]; c2 = block[i2];
401 if (c1 != c2) return (c1 > c2);
402 i1++; i2++;
403 /* 3 */
404 c1 = block[i1]; c2 = block[i2];
405 if (c1 != c2) return (c1 > c2);
406 i1++; i2++;
407 /* 4 */
408 c1 = block[i1]; c2 = block[i2];
409 if (c1 != c2) return (c1 > c2);
410 i1++; i2++;
411 /* 5 */
412 c1 = block[i1]; c2 = block[i2];
413 if (c1 != c2) return (c1 > c2);
414 i1++; i2++;
415 /* 6 */
416 c1 = block[i1]; c2 = block[i2];
417 if (c1 != c2) return (c1 > c2);
418 i1++; i2++;
419 /* 7 */
420 c1 = block[i1]; c2 = block[i2];
421 if (c1 != c2) return (c1 > c2);
422 i1++; i2++;
423 /* 8 */
424 c1 = block[i1]; c2 = block[i2];
425 if (c1 != c2) return (c1 > c2);
426 i1++; i2++;
427 /* 9 */
428 c1 = block[i1]; c2 = block[i2];
429 if (c1 != c2) return (c1 > c2);
430 i1++; i2++;
431 /* 10 */
432 c1 = block[i1]; c2 = block[i2];
433 if (c1 != c2) return (c1 > c2);
434 i1++; i2++;
435 /* 11 */
436 c1 = block[i1]; c2 = block[i2];
437 if (c1 != c2) return (c1 > c2);
438 i1++; i2++;
439 /* 12 */
440 c1 = block[i1]; c2 = block[i2];
441 if (c1 != c2) return (c1 > c2);
442 i1++; i2++;
443
444 k = nblock + 8;
445
446 do {
447 /* 1 */
448 c1 = block[i1]; c2 = block[i2];
449 if (c1 != c2) return (c1 > c2);
450 s1 = quadrant[i1]; s2 = quadrant[i2];
451 if (s1 != s2) return (s1 > s2);
452 i1++; i2++;
453 /* 2 */
454 c1 = block[i1]; c2 = block[i2];
455 if (c1 != c2) return (c1 > c2);
456 s1 = quadrant[i1]; s2 = quadrant[i2];
457 if (s1 != s2) return (s1 > s2);
458 i1++; i2++;
459 /* 3 */
460 c1 = block[i1]; c2 = block[i2];
461 if (c1 != c2) return (c1 > c2);
462 s1 = quadrant[i1]; s2 = quadrant[i2];
463 if (s1 != s2) return (s1 > s2);
464 i1++; i2++;
465 /* 4 */
466 c1 = block[i1]; c2 = block[i2];
467 if (c1 != c2) return (c1 > c2);
468 s1 = quadrant[i1]; s2 = quadrant[i2];
469 if (s1 != s2) return (s1 > s2);
470 i1++; i2++;
471 /* 5 */
472 c1 = block[i1]; c2 = block[i2];
473 if (c1 != c2) return (c1 > c2);
474 s1 = quadrant[i1]; s2 = quadrant[i2];
475 if (s1 != s2) return (s1 > s2);
476 i1++; i2++;
477 /* 6 */
478 c1 = block[i1]; c2 = block[i2];
479 if (c1 != c2) return (c1 > c2);
480 s1 = quadrant[i1]; s2 = quadrant[i2];
481 if (s1 != s2) return (s1 > s2);
482 i1++; i2++;
483 /* 7 */
484 c1 = block[i1]; c2 = block[i2];
485 if (c1 != c2) return (c1 > c2);
486 s1 = quadrant[i1]; s2 = quadrant[i2];
487 if (s1 != s2) return (s1 > s2);
488 i1++; i2++;
489 /* 8 */
490 c1 = block[i1]; c2 = block[i2];
491 if (c1 != c2) return (c1 > c2);
492 s1 = quadrant[i1]; s2 = quadrant[i2];
493 if (s1 != s2) return (s1 > s2);
494 i1++; i2++;
495
496 if (i1 >= nblock) i1 -= nblock;
497 if (i2 >= nblock) i2 -= nblock;
498
499 k -= 8;
500 (*budget)--;
501 }
502 while (k >= 0);
503
504 return False;
505}
506
Simon Glass0a30e422016-01-05 09:31:00 -0700507/*---------------------------------------------*/
508/*--
509 Knuth's increments seem to work better
510 than Incerpi-Sedgewick here. Possibly
511 because the number of elems to sort is
512 usually small, typically <= 20.
513--*/
514static
515Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
516 9841, 29524, 88573, 265720,
517 797161, 2391484 };
518
519static
520void mainSimpleSort ( UInt32* ptr,
521 UChar* block,
522 UInt16* quadrant,
523 Int32 nblock,
524 Int32 lo,
525 Int32 hi,
526 Int32 d,
527 Int32* budget )
528{
529 Int32 i, j, h, bigN, hp;
530 UInt32 v;
531
532 bigN = hi - lo + 1;
533 if (bigN < 2) return;
534
535 hp = 0;
536 while (incs[hp] < bigN) hp++;
537 hp--;
538
539 for (; hp >= 0; hp--) {
540 h = incs[hp];
541
542 i = lo + h;
543 while (True) {
544
545 /*-- copy 1 --*/
546 if (i > hi) break;
547 v = ptr[i];
548 j = i;
549 while ( mainGtU (
550 ptr[j-h]+d, v+d, block, quadrant, nblock, budget
551 ) ) {
552 ptr[j] = ptr[j-h];
553 j = j - h;
554 if (j <= (lo + h - 1)) break;
555 }
556 ptr[j] = v;
557 i++;
558
559 /*-- copy 2 --*/
560 if (i > hi) break;
561 v = ptr[i];
562 j = i;
563 while ( mainGtU (
564 ptr[j-h]+d, v+d, block, quadrant, nblock, budget
565 ) ) {
566 ptr[j] = ptr[j-h];
567 j = j - h;
568 if (j <= (lo + h - 1)) break;
569 }
570 ptr[j] = v;
571 i++;
572
573 /*-- copy 3 --*/
574 if (i > hi) break;
575 v = ptr[i];
576 j = i;
577 while ( mainGtU (
578 ptr[j-h]+d, v+d, block, quadrant, nblock, budget
579 ) ) {
580 ptr[j] = ptr[j-h];
581 j = j - h;
582 if (j <= (lo + h - 1)) break;
583 }
584 ptr[j] = v;
585 i++;
586
587 if (*budget < 0) return;
588 }
589 }
590}
591
Simon Glass0a30e422016-01-05 09:31:00 -0700592/*---------------------------------------------*/
593/*--
594 The following is an implementation of
595 an elegant 3-way quicksort for strings,
596 described in a paper "Fast Algorithms for
597 Sorting and Searching Strings", by Robert
598 Sedgewick and Jon L. Bentley.
599--*/
600
601#define mswap(zz1, zz2) \
602 { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
603
604#define mvswap(zzp1, zzp2, zzn) \
605{ \
606 Int32 yyp1 = (zzp1); \
607 Int32 yyp2 = (zzp2); \
608 Int32 yyn = (zzn); \
609 while (yyn > 0) { \
610 mswap(ptr[yyp1], ptr[yyp2]); \
611 yyp1++; yyp2++; yyn--; \
612 } \
613}
614
615static
616__inline__
617UChar mmed3 ( UChar a, UChar b, UChar c )
618{
619 UChar t;
620 if (a > b) { t = a; a = b; b = t; };
621 if (b > c) {
622 b = c;
623 if (a > b) b = a;
624 }
625 return b;
626}
627
628#define mmin(a,b) ((a) < (b)) ? (a) : (b)
629
630#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
631 stackHi[sp] = hz; \
632 stackD [sp] = dz; \
633 sp++; }
634
635#define mpop(lz,hz,dz) { sp--; \
636 lz = stackLo[sp]; \
637 hz = stackHi[sp]; \
638 dz = stackD [sp]; }
639
Simon Glass0a30e422016-01-05 09:31:00 -0700640#define mnextsize(az) (nextHi[az]-nextLo[az])
641
642#define mnextswap(az,bz) \
643 { Int32 tz; \
644 tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
645 tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
646 tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
647
Simon Glass0a30e422016-01-05 09:31:00 -0700648#define MAIN_QSORT_SMALL_THRESH 20
649#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
650#define MAIN_QSORT_STACK_SIZE 100
651
652static
653void mainQSort3 ( UInt32* ptr,
654 UChar* block,
655 UInt16* quadrant,
656 Int32 nblock,
657 Int32 loSt,
658 Int32 hiSt,
659 Int32 dSt,
660 Int32* budget )
661{
662 Int32 unLo, unHi, ltLo, gtHi, n, m, med;
663 Int32 sp, lo, hi, d;
664
665 Int32 stackLo[MAIN_QSORT_STACK_SIZE];
666 Int32 stackHi[MAIN_QSORT_STACK_SIZE];
667 Int32 stackD [MAIN_QSORT_STACK_SIZE];
668
669 Int32 nextLo[3];
670 Int32 nextHi[3];
671 Int32 nextD [3];
672
673 sp = 0;
674 mpush ( loSt, hiSt, dSt );
675
676 while (sp > 0) {
677
678 AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
679
680 mpop ( lo, hi, d );
681 if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
682 d > MAIN_QSORT_DEPTH_THRESH) {
683 mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
684 if (*budget < 0) return;
685 continue;
686 }
687
688 med = (Int32)
689 mmed3 ( block[ptr[ lo ]+d],
690 block[ptr[ hi ]+d],
691 block[ptr[ (lo+hi)>>1 ]+d] );
692
693 unLo = ltLo = lo;
694 unHi = gtHi = hi;
695
696 while (True) {
697 while (True) {
698 if (unLo > unHi) break;
699 n = ((Int32)block[ptr[unLo]+d]) - med;
700 if (n == 0) {
701 mswap(ptr[unLo], ptr[ltLo]);
702 ltLo++; unLo++; continue;
703 };
704 if (n > 0) break;
705 unLo++;
706 }
707 while (True) {
708 if (unLo > unHi) break;
709 n = ((Int32)block[ptr[unHi]+d]) - med;
710 if (n == 0) {
711 mswap(ptr[unHi], ptr[gtHi]);
712 gtHi--; unHi--; continue;
713 };
714 if (n < 0) break;
715 unHi--;
716 }
717 if (unLo > unHi) break;
718 mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
719 }
720
721 AssertD ( unHi == unLo-1, "mainQSort3(2)" );
722
723 if (gtHi < ltLo) {
724 mpush(lo, hi, d+1 );
725 continue;
726 }
727
728 n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
729 m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
730
731 n = lo + unLo - ltLo - 1;
732 m = hi - (gtHi - unHi) + 1;
733
734 nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
735 nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
736 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
737
738 if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
739 if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
740 if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
741
742 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
743 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
744
745 mpush (nextLo[0], nextHi[0], nextD[0]);
746 mpush (nextLo[1], nextHi[1], nextD[1]);
747 mpush (nextLo[2], nextHi[2], nextD[2]);
748 }
749}
750
751#undef mswap
752#undef mvswap
753#undef mpush
754#undef mpop
755#undef mmin
756#undef mnextsize
757#undef mnextswap
758#undef MAIN_QSORT_SMALL_THRESH
759#undef MAIN_QSORT_DEPTH_THRESH
760#undef MAIN_QSORT_STACK_SIZE
761
Simon Glass0a30e422016-01-05 09:31:00 -0700762/*---------------------------------------------*/
763/* Pre:
764 nblock > N_OVERSHOOT
765 block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
766 ((UChar*)block32) [0 .. nblock-1] holds block
767 ptr exists for [0 .. nblock-1]
768
769 Post:
770 ((UChar*)block32) [0 .. nblock-1] holds block
771 All other areas of block32 destroyed
772 ftab [0 .. 65536 ] destroyed
773 ptr [0 .. nblock-1] holds sorted order
774 if (*budget < 0), sorting was abandoned
775*/
776
777#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
778#define SETMASK (1 << 21)
779#define CLEARMASK (~(SETMASK))
780
781static
782void mainSort ( UInt32* ptr,
783 UChar* block,
784 UInt16* quadrant,
785 UInt32* ftab,
786 Int32 nblock,
787 Int32 verb,
788 Int32* budget )
789{
790 Int32 i, j, k, ss, sb;
791 Int32 runningOrder[256];
792 Bool bigDone[256];
793 Int32 copyStart[256];
794 Int32 copyEnd [256];
795 UChar c1;
796 Int32 numQSorted;
797 UInt16 s;
798 if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" );
799
800 /*-- set up the 2-byte frequency table --*/
801 for (i = 65536; i >= 0; i--) ftab[i] = 0;
802
803 j = block[0] << 8;
804 i = nblock-1;
805 for (; i >= 3; i -= 4) {
806 quadrant[i] = 0;
807 j = (j >> 8) | ( ((UInt16)block[i]) << 8);
808 ftab[j]++;
809 quadrant[i-1] = 0;
810 j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
811 ftab[j]++;
812 quadrant[i-2] = 0;
813 j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
814 ftab[j]++;
815 quadrant[i-3] = 0;
816 j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
817 ftab[j]++;
818 }
819 for (; i >= 0; i--) {
820 quadrant[i] = 0;
821 j = (j >> 8) | ( ((UInt16)block[i]) << 8);
822 ftab[j]++;
823 }
824
825 /*-- (emphasises close relationship of block & quadrant) --*/
826 for (i = 0; i < BZ_N_OVERSHOOT; i++) {
827 block [nblock+i] = block[i];
828 quadrant[nblock+i] = 0;
829 }
830
831 if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" );
832
833 /*-- Complete the initial radix sort --*/
834 for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
835
836 s = block[0] << 8;
837 i = nblock-1;
838 for (; i >= 3; i -= 4) {
839 s = (s >> 8) | (block[i] << 8);
840 j = ftab[s] -1;
841 ftab[s] = j;
842 ptr[j] = i;
843 s = (s >> 8) | (block[i-1] << 8);
844 j = ftab[s] -1;
845 ftab[s] = j;
846 ptr[j] = i-1;
847 s = (s >> 8) | (block[i-2] << 8);
848 j = ftab[s] -1;
849 ftab[s] = j;
850 ptr[j] = i-2;
851 s = (s >> 8) | (block[i-3] << 8);
852 j = ftab[s] -1;
853 ftab[s] = j;
854 ptr[j] = i-3;
855 }
856 for (; i >= 0; i--) {
857 s = (s >> 8) | (block[i] << 8);
858 j = ftab[s] -1;
859 ftab[s] = j;
860 ptr[j] = i;
861 }
862
863 /*--
864 Now ftab contains the first loc of every small bucket.
865 Calculate the running order, from smallest to largest
866 big bucket.
867 --*/
868 for (i = 0; i <= 255; i++) {
869 bigDone [i] = False;
870 runningOrder[i] = i;
871 }
872
873 {
874 Int32 vv;
875 Int32 h = 1;
876 do h = 3 * h + 1; while (h <= 256);
877 do {
878 h = h / 3;
879 for (i = h; i <= 255; i++) {
880 vv = runningOrder[i];
881 j = i;
882 while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
883 runningOrder[j] = runningOrder[j-h];
884 j = j - h;
885 if (j <= (h - 1)) goto zero;
886 }
887 zero:
888 runningOrder[j] = vv;
889 }
890 } while (h != 1);
891 }
892
893 /*--
894 The main sorting loop.
895 --*/
896
897 numQSorted = 0;
898
899 for (i = 0; i <= 255; i++) {
900
901 /*--
902 Process big buckets, starting with the least full.
903 Basically this is a 3-step process in which we call
904 mainQSort3 to sort the small buckets [ss, j], but
905 also make a big effort to avoid the calls if we can.
906 --*/
907 ss = runningOrder[i];
908
909 /*--
910 Step 1:
911 Complete the big bucket [ss] by quicksorting
912 any unsorted small buckets [ss, j], for j != ss.
913 Hopefully previous pointer-scanning phases have already
914 completed many of the small buckets [ss, j], so
915 we don't have to sort them at all.
916 --*/
917 for (j = 0; j <= 255; j++) {
918 if (j != ss) {
919 sb = (ss << 8) + j;
920 if ( ! (ftab[sb] & SETMASK) ) {
921 Int32 lo = ftab[sb] & CLEARMASK;
922 Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
923 if (hi > lo) {
924 if (verb >= 4)
925 VPrintf4 ( " qsort [0x%x, 0x%x] "
926 "done %d this %d\n",
927 ss, j, numQSorted, hi - lo + 1 );
928 mainQSort3 (
929 ptr, block, quadrant, nblock,
930 lo, hi, BZ_N_RADIX, budget
931 );
932 numQSorted += (hi - lo + 1);
933 if (*budget < 0) return;
934 }
935 }
936 ftab[sb] |= SETMASK;
937 }
938 }
939
940 AssertH ( !bigDone[ss], 1006 );
941
942 /*--
943 Step 2:
944 Now scan this big bucket [ss] so as to synthesise the
945 sorted order for small buckets [t, ss] for all t,
946 including, magically, the bucket [ss,ss] too.
947 This will avoid doing Real Work in subsequent Step 1's.
948 --*/
949 {
950 for (j = 0; j <= 255; j++) {
951 copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
952 copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
953 }
954 for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
955 k = ptr[j]-1; if (k < 0) k += nblock;
956 c1 = block[k];
957 if (!bigDone[c1])
958 ptr[ copyStart[c1]++ ] = k;
959 }
960 for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
961 k = ptr[j]-1; if (k < 0) k += nblock;
962 c1 = block[k];
963 if (!bigDone[c1])
964 ptr[ copyEnd[c1]-- ] = k;
965 }
966 }
967
968 AssertH ( (copyStart[ss]-1 == copyEnd[ss])
969 ||
970 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
971 Necessity for this case is demonstrated by compressing
972 a sequence of approximately 48.5 million of character
973 251; 1.0.0/1.0.1 will then die here. */
974 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
975 1007 )
976
977 for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
978
979 /*--
980 Step 3:
981 The [ss] big bucket is now done. Record this fact,
982 and update the quadrant descriptors. Remember to
983 update quadrants in the overshoot area too, if
984 necessary. The "if (i < 255)" test merely skips
985 this updating for the last bucket processed, since
986 updating for the last bucket is pointless.
987
988 The quadrant array provides a way to incrementally
989 cache sort orderings, as they appear, so as to
990 make subsequent comparisons in fullGtU() complete
991 faster. For repetitive blocks this makes a big
992 difference (but not big enough to be able to avoid
993 the fallback sorting mechanism, exponential radix sort).
994
995 The precise meaning is: at all times:
996
997 for 0 <= i < nblock and 0 <= j <= nblock
998
999 if block[i] != block[j],
1000
1001 then the relative values of quadrant[i] and
1002 quadrant[j] are meaningless.
1003
1004 else {
1005 if quadrant[i] < quadrant[j]
1006 then the string starting at i lexicographically
1007 precedes the string starting at j
1008
1009 else if quadrant[i] > quadrant[j]
1010 then the string starting at j lexicographically
1011 precedes the string starting at i
1012
1013 else
1014 the relative ordering of the strings starting
1015 at i and j has not yet been determined.
1016 }
1017 --*/
1018 bigDone[ss] = True;
1019
1020 if (i < 255) {
1021 Int32 bbStart = ftab[ss << 8] & CLEARMASK;
1022 Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1023 Int32 shifts = 0;
1024
1025 while ((bbSize >> shifts) > 65534) shifts++;
1026
1027 for (j = bbSize-1; j >= 0; j--) {
1028 Int32 a2update = ptr[bbStart + j];
1029 UInt16 qVal = (UInt16)(j >> shifts);
1030 quadrant[a2update] = qVal;
1031 if (a2update < BZ_N_OVERSHOOT)
1032 quadrant[a2update + nblock] = qVal;
1033 }
1034 AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1035 }
1036
1037 }
1038
1039 if (verb >= 4)
1040 VPrintf3 ( " %d pointers, %d sorted, %d scanned\n",
1041 nblock, numQSorted, nblock - numQSorted );
1042}
1043
1044#undef BIGFREQ
1045#undef SETMASK
1046#undef CLEARMASK
1047
Simon Glass0a30e422016-01-05 09:31:00 -07001048/*---------------------------------------------*/
1049/* Pre:
1050 nblock > 0
1051 arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1052 ((UChar*)arr2) [0 .. nblock-1] holds block
1053 arr1 exists for [0 .. nblock-1]
1054
1055 Post:
1056 ((UChar*)arr2) [0 .. nblock-1] holds block
1057 All other areas of block destroyed
1058 ftab [ 0 .. 65536 ] destroyed
1059 arr1 [0 .. nblock-1] holds sorted order
1060*/
1061void BZ2_blockSort ( EState* s )
1062{
1063 UInt32* ptr = s->ptr;
1064 UChar* block = s->block;
1065 UInt32* ftab = s->ftab;
1066 Int32 nblock = s->nblock;
1067 Int32 verb = s->verbosity;
1068 Int32 wfact = s->workFactor;
1069 UInt16* quadrant;
1070 Int32 budget;
1071 Int32 budgetInit;
1072 Int32 i;
1073
1074 if (nblock < 10000) {
1075 fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1076 } else {
1077 /* Calculate the location for quadrant, remembering to get
1078 the alignment right. Assumes that &(block[0]) is at least
1079 2-byte aligned -- this should be ok since block is really
1080 the first section of arr2.
1081 */
1082 i = nblock+BZ_N_OVERSHOOT;
1083 if (i & 1) i++;
1084 quadrant = (UInt16*)(&(block[i]));
1085
1086 /* (wfact-1) / 3 puts the default-factor-30
1087 transition point at very roughly the same place as
1088 with v0.1 and v0.9.0.
1089 Not that it particularly matters any more, since the
1090 resulting compressed stream is now the same regardless
1091 of whether or not we use the main sort or fallback sort.
1092 */
1093 if (wfact < 1 ) wfact = 1;
1094 if (wfact > 100) wfact = 100;
1095 budgetInit = nblock * ((wfact-1) / 3);
1096 budget = budgetInit;
1097
1098 mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1099 if (verb >= 3)
1100 VPrintf3 ( " %d work, %d block, ratio %5.2f\n",
1101 budgetInit - budget,
1102 nblock,
1103 (float)(budgetInit - budget) /
1104 (float)(nblock==0 ? 1 : nblock) );
1105 if (budget < 0) {
1106 if (verb >= 2)
1107 VPrintf0 ( " too repetitive; using fallback"
1108 " sorting algorithm\n" );
1109 fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1110 }
1111 }
1112
1113 s->origPtr = -1;
1114 for (i = 0; i < s->nblock; i++)
1115 if (ptr[i] == 0)
1116 { s->origPtr = i; break; };
1117
1118 AssertH( s->origPtr != -1, 1003 );
1119}
1120
Simon Glass0a30e422016-01-05 09:31:00 -07001121/*-------------------------------------------------------------*/
1122/*--- end blocksort.c ---*/
1123/*-------------------------------------------------------------*/