Magellan Linux

Contents of /trunk/mkinitrd-magellan/busybox/archival/bz/blocksort.c

Revision 816 - Fri Apr 24 18:33:46 2009 UTC - niro
-updated to busybox-1.13.4
/*
 * bzip2 is written by Julian Seward <jseward@bzip.org>.
 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
 * See README and LICENSE files in this directory for more information.
 */

/*-------------------------------------------------------------*/
/*--- Block sorting machinery                               ---*/
/*---                                           blocksort.c ---*/
/*-------------------------------------------------------------*/

/* ------------------------------------------------------------------
   This file is part of bzip2/libbzip2, a program and library for
   lossless, block-sorting data compression.

   bzip2/libbzip2 version 1.0.4 of 20 December 2006
   Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>

   Please read the WARNING, DISCLAIMER and PATENTS sections in the
   README file.

   This program is released under the terms of the license contained
   in the file LICENSE.
   ------------------------------------------------------------------ */

/* #include "bzlib_private.h" */

#define mswap(zz1, zz2) \
{ \
    int32_t zztmp = zz1; \
    zz1 = zz2; \
    zz2 = zztmp; \
}

static
/* No measurable speed gain with inlining */
/* ALWAYS_INLINE */
void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
{
    while (zzn > 0) {
        mswap(ptr[zzp1], ptr[zzp2]);
        zzp1++;
        zzp2++;
        zzn--;
    }
}

static
ALWAYS_INLINE
int32_t mmin(int32_t a, int32_t b)
{
    return (a < b) ? a : b;
}


/*---------------------------------------------*/
/*--- Fallback O(N log(N)^2) sorting        ---*/
/*--- algorithm, for repetitive blocks      ---*/
/*---------------------------------------------*/

/*---------------------------------------------*/
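/* Insertion sort of fmap[lo..hi], keyed by eclass[fmap[.]]: a gap-4 pass
 * (done only when the range holds more than four elements) followed by a
 * gap-1 pass, i.e. a two-increment shellsort for the tiny ranges that
 * fallbackQSort3 hands over.
 */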
static
inline
void fallbackSimpleSort(uint32_t* fmap,
                        uint32_t* eclass,
                        int32_t lo,
                        int32_t hi)
{
    int32_t i, j, tmp;
    uint32_t ec_tmp;

    if (lo == hi) return;

    if (hi - lo > 3) {
        for (i = hi-4; i >= lo; i--) {
            tmp = fmap[i];
            ec_tmp = eclass[tmp];
            for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
                fmap[j-4] = fmap[j];
            fmap[j-4] = tmp;
        }
    }

    for (i = hi-1; i >= lo; i--) {
        tmp = fmap[i];
        ec_tmp = eclass[tmp];
        for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
            fmap[j-1] = fmap[j];
        fmap[j-1] = tmp;
    }
}


/*---------------------------------------------*/
#define fpush(lz,hz) { \
    stackLo[sp] = lz; \
    stackHi[sp] = hz; \
    sp++; \
}

#define fpop(lz,hz) { \
    sp--; \
    lz = stackLo[sp]; \
    hz = stackHi[sp]; \
}

#define FALLBACK_QSORT_SMALL_THRESH 10
#define FALLBACK_QSORT_STACK_SIZE 100

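/* Iterative 3-way-partition quicksort of fmap[loSt..hiSt], keyed by
 * eclass[fmap[.]]. Pending ranges live on an explicit stack; the larger
 * half is pushed first so the smaller half is popped and sorted first,
 * which keeps the stack shallow. Ranges with hi - lo below
 * FALLBACK_QSORT_SMALL_THRESH go to fallbackSimpleSort instead.
 */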
static
void fallbackQSort3(uint32_t* fmap,
                    uint32_t* eclass,
                    int32_t loSt,
                    int32_t hiSt)
{
    int32_t unLo, unHi, ltLo, gtHi, n, m;
    int32_t sp, lo, hi;
    uint32_t med, r, r3;
    int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
    int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];

    r = 0;

    sp = 0;
    fpush(loSt, hiSt);

    while (sp > 0) {
        AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);

        fpop(lo, hi);
        if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
            fallbackSimpleSort(fmap, eclass, lo, hi);
            continue;
        }

        /* Random partitioning. Median of 3 sometimes fails to
         * avoid bad cases. Median of 9 seems to help but
         * looks rather expensive. This too seems to work but
         * is cheaper. Guidance for the magic constants
         * 7621 and 32768 is taken from Sedgewick's algorithms
         * book, chapter 35.
         */
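        /* r follows a 15-bit linear congruential sequence,
         * r <- (7621*r + 1) mod 2^15; r % 3 then selects the low,
         * middle or high element as the source of the pivot value. */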
        r = ((r * 7621) + 1) % 32768;
        r3 = r % 3;
        if (r3 == 0)
            med = eclass[fmap[lo]];
        else if (r3 == 1)
            med = eclass[fmap[(lo+hi)>>1]];
        else
            med = eclass[fmap[hi]];

        unLo = ltLo = lo;
        unHi = gtHi = hi;

        while (1) {
            while (1) {
                if (unLo > unHi) break;
                n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
                if (n == 0) {
                    mswap(fmap[unLo], fmap[ltLo]);
                    ltLo++;
                    unLo++;
                    continue;
                }
                if (n > 0) break;
                unLo++;
            }
            while (1) {
                if (unLo > unHi) break;
                n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
                if (n == 0) {
                    mswap(fmap[unHi], fmap[gtHi]);
                    gtHi--; unHi--;
                    continue;
                }
                if (n < 0) break;
                unHi--;
            }
            if (unLo > unHi) break;
            mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
        }

        AssertD(unHi == unLo-1, "fallbackQSort3(2)");

        if (gtHi < ltLo) continue;

        n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
        m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);

        n = lo + unLo - ltLo - 1;
        m = hi - (gtHi - unHi) + 1;

        if (n - lo > hi - m) {
            fpush(lo, n);
            fpush(m, hi);
        } else {
            fpush(m, hi);
            fpush(lo, n);
        }
    }
}

#undef fpush
#undef fpop
#undef FALLBACK_QSORT_SMALL_THRESH
#undef FALLBACK_QSORT_STACK_SIZE


/*---------------------------------------------*/
/* Pre:
 * nblock > 0
 * eclass exists for [0 .. nblock-1]
 * ((uint8_t*)eclass) [0 .. nblock-1] holds block
 * ptr exists for [0 .. nblock-1]
 *
 * Post:
 * ((uint8_t*)eclass) [0 .. nblock-1] holds block
 * All other areas of eclass destroyed
 * fmap [0 .. nblock-1] holds sorted order
 * bhtab[0 .. 2+(nblock/32)] destroyed
 */

#define SET_BH(zz)       bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
#define CLEAR_BH(zz)     bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
#define ISSET_BH(zz)     (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
#define WORD_BH(zz)      bhtab[(zz) >> 5]
#define UNALIGNED_BH(zz) ((zz) & 0x01f)
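/* bhtab is a bit map over fmap positions: bit zz is set when fmap[zz]
 * starts a new bucket (a "bucket header"). WORD_BH fetches the whole
 * 32-bit word containing bit zz, so runs of set or clear bits can be
 * skipped a word at a time; UNALIGNED_BH is true while zz is not at a
 * 32-bit word boundary. */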

static
void fallbackSort(uint32_t* fmap,
                  uint32_t* eclass,
                  uint32_t* bhtab,
                  int32_t nblock)
{
    int32_t ftab[257];
    int32_t ftabCopy[256];
    int32_t H, i, j, k, l, r, cc, cc1;
    int32_t nNotDone;
    int32_t nBhtab;
    uint8_t* eclass8 = (uint8_t*)eclass;

    /*
     * Initial 1-char radix sort to generate
     * initial fmap and initial BH bits.
     */
    for (i = 0; i < 257; i++) ftab[i] = 0;
    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
    for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];

    j = ftab[0]; /* bbox: optimized */
    for (i = 1; i < 257; i++) {
        j += ftab[i];
        ftab[i] = j;
    }

    for (i = 0; i < nblock; i++) {
        j = eclass8[i];
        k = ftab[j] - 1;
        ftab[j] = k;
        fmap[k] = i;
    }

    nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
    for (i = 0; i < 256; i++) SET_BH(ftab[i]);

    /*
     * Inductively refine the buckets. Kind-of an
     * "exponential radix sort" (!), inspired by the
     * Manber-Myers suffix array construction algorithm.
     */
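    /* Each pass works as follows: the buckets currently group rotations
     * that agree on their first H bytes. For every position k, eclass[k]
     * is set to the fmap index of the start of the bucket holding rotation
     * k+H; re-sorting each bucket by that value refines the grouping to
     * 2H bytes, so at most about log2(nblock) passes are needed. */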

    /*-- set sentinel bits for block-end detection --*/
    for (i = 0; i < 32; i++) {
        SET_BH(nblock + 2*i);
        CLEAR_BH(nblock + 2*i + 1);
    }

    /*-- the log(N) loop --*/
    H = 1;
    while (1) {
        j = 0;
        for (i = 0; i < nblock; i++) {
            if (ISSET_BH(i))
                j = i;
            k = fmap[i] - H;
            if (k < 0)
                k += nblock;
            eclass[k] = j;
        }

        nNotDone = 0;
        r = -1;
        while (1) {

            /*-- find the next non-singleton bucket --*/
            k = r + 1;
            while (ISSET_BH(k) && UNALIGNED_BH(k))
                k++;
            if (ISSET_BH(k)) {
                while (WORD_BH(k) == 0xffffffff) k += 32;
                while (ISSET_BH(k)) k++;
            }
            l = k - 1;
            if (l >= nblock)
                break;
            while (!ISSET_BH(k) && UNALIGNED_BH(k))
                k++;
            if (!ISSET_BH(k)) {
                while (WORD_BH(k) == 0x00000000) k += 32;
                while (!ISSET_BH(k)) k++;
            }
            r = k - 1;
            if (r >= nblock)
                break;

            /*-- now [l, r] bracket current bucket --*/
            if (r > l) {
                nNotDone += (r - l + 1);
                fallbackQSort3(fmap, eclass, l, r);

                /*-- scan bucket and generate header bits --*/
                cc = -1;
                for (i = l; i <= r; i++) {
                    cc1 = eclass[fmap[i]];
                    if (cc != cc1) {
                        SET_BH(i);
                        cc = cc1;
                    }
                }
            }
        }

        H *= 2;
        if (H > nblock || nNotDone == 0)
            break;
    }

    /*
     * Reconstruct the original block in
     * eclass8 [0 .. nblock-1], since the
     * previous phase destroyed it.
     */
    j = 0;
    for (i = 0; i < nblock; i++) {
        while (ftabCopy[j] == 0)
            j++;
        ftabCopy[j]--;
        eclass8[fmap[i]] = (uint8_t)j;
    }
    AssertH(j < 256, 1005);
}

#undef SET_BH
#undef CLEAR_BH
#undef ISSET_BH
#undef WORD_BH
#undef UNALIGNED_BH


/*---------------------------------------------*/
/*--- The main, O(N^2 log(N)) sorting       ---*/
/*--- algorithm. Faster for "normal"        ---*/
/*--- non-repetitive blocks.                ---*/
/*---------------------------------------------*/

/*---------------------------------------------*/
static
NOINLINE
int mainGtU(
        uint32_t i1,
        uint32_t i2,
        uint8_t* block,
        uint16_t* quadrant,
        uint32_t nblock,
        int32_t* budget)
{
    int32_t k;
    uint8_t c1, c2;
    uint16_t s1, s2;

    /* Loop unrolling here is actually very useful
     * (generated code is much simpler),
     * code size increase is only 270 bytes (i386)
     * but speeds up compression 10% overall
     */
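    /* The TIMES_12 block compares the first 12 byte pairs directly; the
     * do/while below then compares 8 byte pairs plus 8 quadrant pairs per
     * iteration, decrementing *budget once per iteration, until roughly
     * nblock positions have been examined without finding a difference. */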

#if CONFIG_BZIP2_FEATURE_SPEED >= 1

#define TIMES_8(code) \
    code; code; code; code; \
    code; code; code; code;
#define TIMES_12(code) \
    code; code; code; code; \
    code; code; code; code; \
    code; code; code; code;

#else

#define TIMES_8(code) \
{ \
    int nn = 8; \
    do { \
        code; \
    } while (--nn); \
}
#define TIMES_12(code) \
{ \
    int nn = 12; \
    do { \
        code; \
    } while (--nn); \
}

#endif

    AssertD(i1 != i2, "mainGtU");
    TIMES_12(
        c1 = block[i1]; c2 = block[i2];
        if (c1 != c2) return (c1 > c2);
        i1++; i2++;
    )

    k = nblock + 8;

    do {
        TIMES_8(
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            s1 = quadrant[i1]; s2 = quadrant[i2];
            if (s1 != s2) return (s1 > s2);
            i1++; i2++;
        )

        if (i1 >= nblock) i1 -= nblock;
        if (i2 >= nblock) i2 -= nblock;

        (*budget)--;
        k -= 8;
    } while (k >= 0);

    return False;
}
#undef TIMES_8
#undef TIMES_12

/*---------------------------------------------*/
/*
 * Knuth's increments seem to work better
 * than Incerpi-Sedgewick here. Possibly
 * because the number of elems to sort is
 * usually small, typically <= 20.
 */
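/* incs[] is the Knuth sequence h <- 3*h + 1 starting from 1,
 * i.e. incs[k] = (3^(k+1) - 1) / 2. */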
static
const int32_t incs[14] = {
    1, 4, 13, 40, 121, 364, 1093, 3280,
    9841, 29524, 88573, 265720,
    797161, 2391484
};

static
void mainSimpleSort(uint32_t* ptr,
                    uint8_t* block,
                    uint16_t* quadrant,
                    int32_t nblock,
                    int32_t lo,
                    int32_t hi,
                    int32_t d,
                    int32_t* budget)
{
    int32_t i, j, h, bigN, hp;
    uint32_t v;

    bigN = hi - lo + 1;
    if (bigN < 2) return;

    hp = 0;
    while (incs[hp] < bigN) hp++;
    hp--;

    for (; hp >= 0; hp--) {
        h = incs[hp];

        i = lo + h;
        while (1) {
            /*-- copy 1 --*/
            if (i > hi) break;
            v = ptr[i];
            j = i;
            while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
                ptr[j] = ptr[j-h];
                j = j - h;
                if (j <= (lo + h - 1)) break;
            }
            ptr[j] = v;
            i++;

            /* 1.5% overall speedup, +290 bytes */
#if CONFIG_BZIP2_FEATURE_SPEED >= 3
            /*-- copy 2 --*/
            if (i > hi) break;
            v = ptr[i];
            j = i;
            while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
                ptr[j] = ptr[j-h];
                j = j - h;
                if (j <= (lo + h - 1)) break;
            }
            ptr[j] = v;
            i++;

            /*-- copy 3 --*/
            if (i > hi) break;
            v = ptr[i];
            j = i;
            while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
                ptr[j] = ptr[j-h];
                j = j - h;
                if (j <= (lo + h - 1)) break;
            }
            ptr[j] = v;
            i++;
#endif
            if (*budget < 0) return;
        }
    }
}


/*---------------------------------------------*/
/*
 * The following is an implementation of
 * an elegant 3-way quicksort for strings,
 * described in a paper "Fast Algorithms for
 * Sorting and Searching Strings", by Robert
 * Sedgewick and Jon L. Bentley.
 */

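/* Median of three bytes, used to pick the partitioning value in
 * mainQSort3. */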
static
ALWAYS_INLINE
uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
{
    uint8_t t;
    if (a > b) {
        t = a;
        a = b;
        b = t;
    }
    /* here b >= a */
    if (b > c) {
        b = c;
        if (a > b)
            b = a;
    }
    return b;
}

#define mpush(lz,hz,dz) \
{ \
    stackLo[sp] = lz; \
    stackHi[sp] = hz; \
    stackD [sp] = dz; \
    sp++; \
}

#define mpop(lz,hz,dz) \
{ \
    sp--; \
    lz = stackLo[sp]; \
    hz = stackHi[sp]; \
    dz = stackD [sp]; \
}

#define mnextsize(az) (nextHi[az] - nextLo[az])

#define mnextswap(az,bz) \
{ \
    int32_t tz; \
    tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
    tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
    tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
}

#define MAIN_QSORT_SMALL_THRESH 20
#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
#define MAIN_QSORT_STACK_SIZE 100

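/* Iterative 3-way radix quicksort of ptr[loSt..hiSt] in the style of
 * Bentley and Sedgewick: each range is partitioned on the byte at depth d
 * of every rotation into <, = and > groups; the = group is re-pushed with
 * depth d+1, the other two with depth d. Small or deep ranges are handed
 * to mainSimpleSort. The three subranges are pushed largest-first so that
 * the smallest is popped and processed next. */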
static
void mainQSort3(uint32_t* ptr,
                uint8_t* block,
                uint16_t* quadrant,
                int32_t nblock,
                int32_t loSt,
                int32_t hiSt,
                int32_t dSt,
                int32_t* budget)
{
    int32_t unLo, unHi, ltLo, gtHi, n, m, med;
    int32_t sp, lo, hi, d;

    int32_t stackLo[MAIN_QSORT_STACK_SIZE];
    int32_t stackHi[MAIN_QSORT_STACK_SIZE];
    int32_t stackD [MAIN_QSORT_STACK_SIZE];

    int32_t nextLo[3];
    int32_t nextHi[3];
    int32_t nextD [3];

    sp = 0;
    mpush(loSt, hiSt, dSt);

    while (sp > 0) {
        AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);

        mpop(lo, hi, d);
        if (hi - lo < MAIN_QSORT_SMALL_THRESH
         || d > MAIN_QSORT_DEPTH_THRESH
        ) {
            mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
            if (*budget < 0)
                return;
            continue;
        }
        med = (int32_t) mmed3(block[ptr[lo ] + d],
                              block[ptr[hi ] + d],
                              block[ptr[(lo+hi) >> 1] + d]);

        unLo = ltLo = lo;
        unHi = gtHi = hi;

        while (1) {
            while (1) {
                if (unLo > unHi)
                    break;
                n = ((int32_t)block[ptr[unLo]+d]) - med;
                if (n == 0) {
                    mswap(ptr[unLo], ptr[ltLo]);
                    ltLo++;
                    unLo++;
                    continue;
                }
                if (n > 0) break;
                unLo++;
            }
            while (1) {
                if (unLo > unHi)
                    break;
                n = ((int32_t)block[ptr[unHi]+d]) - med;
                if (n == 0) {
                    mswap(ptr[unHi], ptr[gtHi]);
                    gtHi--;
                    unHi--;
                    continue;
                }
                if (n < 0) break;
                unHi--;
            }
            if (unLo > unHi)
                break;
            mswap(ptr[unLo], ptr[unHi]);
            unLo++;
            unHi--;
        }

        AssertD(unHi == unLo-1, "mainQSort3(2)");

        if (gtHi < ltLo) {
            mpush(lo, hi, d + 1);
            continue;
        }

        n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
        m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);

        n = lo + unLo - ltLo - 1;
        m = hi - (gtHi - unHi) + 1;

        nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
        nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
        nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;

        if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
        if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
        if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);

        AssertD(mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
        AssertD(mnextsize(1) >= mnextsize(2), "mainQSort3(9)");

        mpush(nextLo[0], nextHi[0], nextD[0]);
        mpush(nextLo[1], nextHi[1], nextD[1]);
        mpush(nextLo[2], nextHi[2], nextD[2]);
    }
}

#undef mpush
#undef mpop
#undef mnextsize
#undef mnextswap
#undef MAIN_QSORT_SMALL_THRESH
#undef MAIN_QSORT_DEPTH_THRESH
#undef MAIN_QSORT_STACK_SIZE


/*---------------------------------------------*/
/* Pre:
 * nblock > N_OVERSHOOT
 * block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 * ((uint8_t*)block32) [0 .. nblock-1] holds block
 * ptr exists for [0 .. nblock-1]
 *
 * Post:
 * ((uint8_t*)block32) [0 .. nblock-1] holds block
 * All other areas of block32 destroyed
 * ftab[0 .. 65536] destroyed
 * ptr [0 .. nblock-1] holds sorted order
 * if (*budget < 0), sorting was abandoned
 */

#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
#define SETMASK (1 << 21)
#define CLEARMASK (~(SETMASK))
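/* ftab[] is indexed by the 2-byte prefix of a rotation (first byte in the
 * high 8 bits); after the prefix-sum pass below, ftab[x] holds the start
 * offset in ptr[] of the small bucket with prefix x, so BIGFREQ(b) is the
 * number of rotations whose first byte is b. Offsets never reach 1 << 21
 * (the block is at most 900000 bytes), so bit 21 (SETMASK) is free to mark
 * a small bucket as already sorted; CLEARMASK strips that flag again. */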

static NOINLINE
void mainSort(EState* state,
              uint32_t* ptr,
              uint8_t* block,
              uint16_t* quadrant,
              uint32_t* ftab,
              int32_t nblock,
              int32_t* budget)
{
    int32_t i, j, k, ss, sb;
    uint8_t c1;
    int32_t numQSorted;
    uint16_t s;
    Bool bigDone[256];
    /* bbox: moved to EState to save stack
    int32_t runningOrder[256];
    int32_t copyStart[256];
    int32_t copyEnd [256];
    */
#define runningOrder (state->mainSort__runningOrder)
#define copyStart    (state->mainSort__copyStart)
#define copyEnd      (state->mainSort__copyEnd)

    /*-- set up the 2-byte frequency table --*/
    /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
    memset(ftab, 0, 65537 * sizeof(ftab[0]));

    j = block[0] << 8;
    i = nblock - 1;
    /* 3%, +300 bytes */
#if CONFIG_BZIP2_FEATURE_SPEED >= 2
    for (; i >= 3; i -= 4) {
        quadrant[i] = 0;
        j = (j >> 8) | (((uint16_t)block[i]) << 8);
        ftab[j]++;
        quadrant[i-1] = 0;
        j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
        ftab[j]++;
        quadrant[i-2] = 0;
        j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
        ftab[j]++;
        quadrant[i-3] = 0;
        j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
        ftab[j]++;
    }
#endif
    for (; i >= 0; i--) {
        quadrant[i] = 0;
        j = (j >> 8) | (((uint16_t)block[i]) << 8);
        ftab[j]++;
    }

    /*-- (emphasises close relationship of block & quadrant) --*/
    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
        block   [nblock+i] = block[i];
        quadrant[nblock+i] = 0;
    }

    /*-- Complete the initial radix sort --*/
    j = ftab[0]; /* bbox: optimized */
    for (i = 1; i <= 65536; i++) {
        j += ftab[i];
        ftab[i] = j;
    }

    s = block[0] << 8;
    i = nblock - 1;
#if CONFIG_BZIP2_FEATURE_SPEED >= 2
    for (; i >= 3; i -= 4) {
        s = (s >> 8) | (block[i] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i;
        s = (s >> 8) | (block[i-1] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i-1;
        s = (s >> 8) | (block[i-2] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i-2;
        s = (s >> 8) | (block[i-3] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i-3;
    }
#endif
    for (; i >= 0; i--) {
        s = (s >> 8) | (block[i] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i;
    }

    /*
     * Now ftab contains the first loc of every small bucket.
     * Calculate the running order, from smallest to largest
     * big bucket.
     */
    for (i = 0; i <= 255; i++) {
        bigDone     [i] = False;
        runningOrder[i] = i;
    }

    {
        int32_t vv;
        /* bbox: was: int32_t h = 1; */
        /* do h = 3 * h + 1; while (h <= 256); */
        uint32_t h = 364;

        do {
            /*h = h / 3;*/
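            /* 171/512 is close enough to 1/3: starting from 364 it
             * yields exactly 121, 40, 13, 4 and then 1, the same
             * sequence as h = h / 3 would. */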
            h = (h * 171) >> 9; /* bbox: fast h/3 */
            for (i = h; i <= 255; i++) {
                vv = runningOrder[i];
                j = i;
                while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
                    runningOrder[j] = runningOrder[j-h];
                    j = j - h;
                    if (j <= (h - 1))
                        goto zero;
                }
 zero:
                runningOrder[j] = vv;
            }
        } while (h != 1);
    }

    /*
     * The main sorting loop.
     */

    numQSorted = 0;

    for (i = 0; i <= 255; i++) {

        /*
         * Process big buckets, starting with the least full.
         * Basically this is a 3-step process in which we call
         * mainQSort3 to sort the small buckets [ss, j], but
         * also make a big effort to avoid the calls if we can.
         */
        ss = runningOrder[i];

        /*
         * Step 1:
         * Complete the big bucket [ss] by quicksorting
         * any unsorted small buckets [ss, j], for j != ss.
         * Hopefully previous pointer-scanning phases have already
         * completed many of the small buckets [ss, j], so
         * we don't have to sort them at all.
         */
        for (j = 0; j <= 255; j++) {
            if (j != ss) {
                sb = (ss << 8) + j;
                if (!(ftab[sb] & SETMASK)) {
                    int32_t lo = ftab[sb] & CLEARMASK;
                    int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
                    if (hi > lo) {
                        mainQSort3(
                            ptr, block, quadrant, nblock,
                            lo, hi, BZ_N_RADIX, budget
                        );
                        if (*budget < 0) return;
                        numQSorted += (hi - lo + 1);
                    }
                }
                ftab[sb] |= SETMASK;
            }
        }

        AssertH(!bigDone[ss], 1006);

        /*
         * Step 2:
         * Now scan this big bucket [ss] so as to synthesise the
         * sorted order for small buckets [t, ss] for all t,
         * including, magically, the bucket [ss,ss] too.
         * This will avoid doing Real Work in subsequent Step 1's.
         */
        {
            for (j = 0; j <= 255; j++) {
                copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
                copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
            }
            for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
                k = ptr[j] - 1;
                if (k < 0)
                    k += nblock;
                c1 = block[k];
                if (!bigDone[c1])
                    ptr[copyStart[c1]++] = k;
            }
            for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
                k = ptr[j]-1;
                if (k < 0)
                    k += nblock;
                c1 = block[k];
                if (!bigDone[c1])
                    ptr[copyEnd[c1]--] = k;
            }
        }

        /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
         * Necessity for this case is demonstrated by compressing
         * a sequence of approximately 48.5 million of character
         * 251; 1.0.0/1.0.1 will then die here. */
        AssertH((copyStart[ss]-1 == copyEnd[ss]) \
             || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);

        for (j = 0; j <= 255; j++)
            ftab[(j << 8) + ss] |= SETMASK;

        /*
         * Step 3:
         * The [ss] big bucket is now done. Record this fact,
         * and update the quadrant descriptors. Remember to
         * update quadrants in the overshoot area too, if
         * necessary. The "if (i < 255)" test merely skips
         * this updating for the last bucket processed, since
         * updating for the last bucket is pointless.
         *
         * The quadrant array provides a way to incrementally
         * cache sort orderings, as they appear, so as to
         * make subsequent comparisons in mainGtU() complete
         * faster. For repetitive blocks this makes a big
         * difference (but not big enough to be able to avoid
         * the fallback sorting mechanism, exponential radix sort).
         *
         * The precise meaning is: at all times:
         *
         *     for 0 <= i < nblock and 0 <= j <= nblock
         *
         *     if block[i] != block[j],
         *         then the relative values of quadrant[i] and
         *         quadrant[j] are meaningless.
         *
         *     else {
         *         if quadrant[i] < quadrant[j]
         *             then the string starting at i lexicographically
         *             precedes the string starting at j
         *
         *         else if quadrant[i] > quadrant[j]
         *             then the string starting at j lexicographically
         *             precedes the string starting at i
         *
         *         else
         *             the relative ordering of the strings starting
         *             at i and j has not yet been determined.
         *     }
         */
        bigDone[ss] = True;

        if (i < 255) {
            int32_t bbStart = ftab[ss << 8] & CLEARMASK;
            int32_t bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
            int32_t shifts  = 0;

            while ((bbSize >> shifts) > 65534) shifts++;

            for (j = bbSize-1; j >= 0; j--) {
                int32_t a2update = ptr[bbStart + j];
                uint16_t qVal = (uint16_t)(j >> shifts);
                quadrant[a2update] = qVal;
                if (a2update < BZ_N_OVERSHOOT)
                    quadrant[a2update + nblock] = qVal;
            }
            AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
        }
    }
#undef runningOrder
#undef copyStart
#undef copyEnd
}

#undef BIGFREQ
#undef SETMASK
#undef CLEARMASK


/*---------------------------------------------*/
/* Pre:
 * nblock > 0
 * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
 * ((uint8_t*)arr2)[0 .. nblock-1] holds block
 * arr1 exists for [0 .. nblock-1]
 *
 * Post:
 * ((uint8_t*)arr2) [0 .. nblock-1] holds block
 * All other areas of block destroyed
 * ftab[0 .. 65536] destroyed
 * arr1[0 .. nblock-1] holds sorted order
 */
static NOINLINE
void BZ2_blockSort(EState* s)
{
    /* In original bzip2 1.0.4, it's a parameter, but 30
     * (which was the default) should work ok. */
    enum { wfact = 30 };

    uint32_t* ptr = s->ptr;
    uint8_t* block = s->block;
    uint32_t* ftab = s->ftab;
    int32_t nblock = s->nblock;
    uint16_t* quadrant;
    int32_t budget;
    int32_t i;

    if (nblock < 10000) {
        fallbackSort(s->arr1, s->arr2, ftab, nblock);
    } else {
        /* Calculate the location for quadrant, remembering to get
         * the alignment right. Assumes that &(block[0]) is at least
         * 2-byte aligned -- this should be ok since block is really
         * the first section of arr2.
         */
        i = nblock + BZ_N_OVERSHOOT;
        if (i & 1) i++;
        quadrant = (uint16_t*)(&(block[i]));

        /* (wfact-1) / 3 puts the default-factor-30
         * transition point at very roughly the same place as
         * with v0.1 and v0.9.0.
         * Not that it particularly matters any more, since the
         * resulting compressed stream is now the same regardless
         * of whether or not we use the main sort or fallback sort.
         */
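        /* With wfact = 30 this gives budget = 9 * nblock; mainGtU
         * decrements the budget once per unrolled 8-byte comparison
         * step, and if it goes negative the result of mainSort is
         * discarded in favour of fallbackSort below. */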
        budget = nblock * ((wfact-1) / 3);

        mainSort(s, ptr, block, quadrant, ftab, nblock, &budget);
        if (budget < 0) {
            fallbackSort(s->arr1, s->arr2, ftab, nblock);
        }
    }

    s->origPtr = -1;
    for (i = 0; i < s->nblock; i++)
        if (ptr[i] == 0) {
            s->origPtr = i;
            break;
        }

    AssertH(s->origPtr != -1, 1003);
}


/*-------------------------------------------------------------*/
/*--- end                                       blocksort.c ---*/
/*-------------------------------------------------------------*/