Contents of /trunk/mkinitrd-magellan/busybox/archival/bz/blocksort.c
Parent Directory | Revision Log
Revision 816 -
(show annotations)
(download)
Fri Apr 24 18:33:46 2009 UTC (15 years ago) by niro
File MIME type: text/plain
File size: 24213 byte(s)
Fri Apr 24 18:33:46 2009 UTC (15 years ago) by niro
File MIME type: text/plain
File size: 24213 byte(s)
-updated to busybox-1.13.4
1 | /* |
2 | * bzip2 is written by Julian Seward <jseward@bzip.org>. |
3 | * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>. |
4 | * See README and LICENSE files in this directory for more information. |
5 | */ |
6 | |
7 | /*-------------------------------------------------------------*/ |
8 | /*--- Block sorting machinery ---*/ |
9 | /*--- blocksort.c ---*/ |
10 | /*-------------------------------------------------------------*/ |
11 | |
12 | /* ------------------------------------------------------------------ |
13 | This file is part of bzip2/libbzip2, a program and library for |
14 | lossless, block-sorting data compression. |
15 | |
16 | bzip2/libbzip2 version 1.0.4 of 20 December 2006 |
17 | Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org> |
18 | |
19 | Please read the WARNING, DISCLAIMER and PATENTS sections in the |
20 | README file. |
21 | |
22 | This program is released under the terms of the license contained |
23 | in the file LICENSE. |
24 | ------------------------------------------------------------------ */ |
25 | |
26 | /* #include "bzlib_private.h" */ |
27 | |
28 | #define mswap(zz1, zz2) \ |
29 | { \ |
30 | int32_t zztmp = zz1; \ |
31 | zz1 = zz2; \ |
32 | zz2 = zztmp; \ |
33 | } |
34 | |
35 | static |
36 | /* No measurable speed gain with inlining */ |
37 | /* ALWAYS_INLINE */ |
38 | void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn) |
39 | { |
40 | while (zzn > 0) { |
41 | mswap(ptr[zzp1], ptr[zzp2]); |
42 | zzp1++; |
43 | zzp2++; |
44 | zzn--; |
45 | } |
46 | } |
47 | |
48 | static |
49 | ALWAYS_INLINE |
50 | int32_t mmin(int32_t a, int32_t b) |
51 | { |
52 | return (a < b) ? a : b; |
53 | } |
54 | |
55 | |
56 | /*---------------------------------------------*/ |
57 | /*--- Fallback O(N log(N)^2) sorting ---*/ |
58 | /*--- algorithm, for repetitive blocks ---*/ |
59 | /*---------------------------------------------*/ |
60 | |
61 | /*---------------------------------------------*/ |
62 | static |
63 | inline |
64 | void fallbackSimpleSort(uint32_t* fmap, |
65 | uint32_t* eclass, |
66 | int32_t lo, |
67 | int32_t hi) |
68 | { |
69 | int32_t i, j, tmp; |
70 | uint32_t ec_tmp; |
71 | |
72 | if (lo == hi) return; |
73 | |
74 | if (hi - lo > 3) { |
75 | for (i = hi-4; i >= lo; i--) { |
76 | tmp = fmap[i]; |
77 | ec_tmp = eclass[tmp]; |
78 | for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4) |
79 | fmap[j-4] = fmap[j]; |
80 | fmap[j-4] = tmp; |
81 | } |
82 | } |
83 | |
84 | for (i = hi-1; i >= lo; i--) { |
85 | tmp = fmap[i]; |
86 | ec_tmp = eclass[tmp]; |
87 | for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) |
88 | fmap[j-1] = fmap[j]; |
89 | fmap[j-1] = tmp; |
90 | } |
91 | } |
92 | |
93 | |
94 | /*---------------------------------------------*/ |
95 | #define fpush(lz,hz) { \ |
96 | stackLo[sp] = lz; \ |
97 | stackHi[sp] = hz; \ |
98 | sp++; \ |
99 | } |
100 | |
101 | #define fpop(lz,hz) { \ |
102 | sp--; \ |
103 | lz = stackLo[sp]; \ |
104 | hz = stackHi[sp]; \ |
105 | } |
106 | |
107 | #define FALLBACK_QSORT_SMALL_THRESH 10 |
108 | #define FALLBACK_QSORT_STACK_SIZE 100 |
109 | |
110 | static |
111 | void fallbackQSort3(uint32_t* fmap, |
112 | uint32_t* eclass, |
113 | int32_t loSt, |
114 | int32_t hiSt) |
115 | { |
116 | int32_t unLo, unHi, ltLo, gtHi, n, m; |
117 | int32_t sp, lo, hi; |
118 | uint32_t med, r, r3; |
119 | int32_t stackLo[FALLBACK_QSORT_STACK_SIZE]; |
120 | int32_t stackHi[FALLBACK_QSORT_STACK_SIZE]; |
121 | |
122 | r = 0; |
123 | |
124 | sp = 0; |
125 | fpush(loSt, hiSt); |
126 | |
127 | while (sp > 0) { |
128 | AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004); |
129 | |
130 | fpop(lo, hi); |
131 | if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { |
132 | fallbackSimpleSort(fmap, eclass, lo, hi); |
133 | continue; |
134 | } |
135 | |
136 | /* Random partitioning. Median of 3 sometimes fails to |
137 | * avoid bad cases. Median of 9 seems to help but |
138 | * looks rather expensive. This too seems to work but |
139 | * is cheaper. Guidance for the magic constants |
140 | * 7621 and 32768 is taken from Sedgewick's algorithms |
141 | * book, chapter 35. |
142 | */ |
143 | r = ((r * 7621) + 1) % 32768; |
144 | r3 = r % 3; |
145 | if (r3 == 0) |
146 | med = eclass[fmap[lo]]; |
147 | else if (r3 == 1) |
148 | med = eclass[fmap[(lo+hi)>>1]]; |
149 | else |
150 | med = eclass[fmap[hi]]; |
151 | |
152 | unLo = ltLo = lo; |
153 | unHi = gtHi = hi; |
154 | |
155 | while (1) { |
156 | while (1) { |
157 | if (unLo > unHi) break; |
158 | n = (int32_t)eclass[fmap[unLo]] - (int32_t)med; |
159 | if (n == 0) { |
160 | mswap(fmap[unLo], fmap[ltLo]); |
161 | ltLo++; |
162 | unLo++; |
163 | continue; |
164 | }; |
165 | if (n > 0) break; |
166 | unLo++; |
167 | } |
168 | while (1) { |
169 | if (unLo > unHi) break; |
170 | n = (int32_t)eclass[fmap[unHi]] - (int32_t)med; |
171 | if (n == 0) { |
172 | mswap(fmap[unHi], fmap[gtHi]); |
173 | gtHi--; unHi--; |
174 | continue; |
175 | }; |
176 | if (n < 0) break; |
177 | unHi--; |
178 | } |
179 | if (unLo > unHi) break; |
180 | mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; |
181 | } |
182 | |
183 | AssertD(unHi == unLo-1, "fallbackQSort3(2)"); |
184 | |
185 | if (gtHi < ltLo) continue; |
186 | |
187 | n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n); |
188 | m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m); |
189 | |
190 | n = lo + unLo - ltLo - 1; |
191 | m = hi - (gtHi - unHi) + 1; |
192 | |
193 | if (n - lo > hi - m) { |
194 | fpush(lo, n); |
195 | fpush(m, hi); |
196 | } else { |
197 | fpush(m, hi); |
198 | fpush(lo, n); |
199 | } |
200 | } |
201 | } |
202 | |
203 | #undef fpush |
204 | #undef fpop |
205 | #undef FALLBACK_QSORT_SMALL_THRESH |
206 | #undef FALLBACK_QSORT_STACK_SIZE |
207 | |
208 | |
209 | /*---------------------------------------------*/ |
210 | /* Pre: |
211 | * nblock > 0 |
212 | * eclass exists for [0 .. nblock-1] |
213 | * ((uint8_t*)eclass) [0 .. nblock-1] holds block |
214 | * ptr exists for [0 .. nblock-1] |
215 | * |
216 | * Post: |
217 | * ((uint8_t*)eclass) [0 .. nblock-1] holds block |
218 | * All other areas of eclass destroyed |
219 | * fmap [0 .. nblock-1] holds sorted order |
220 | * bhtab[0 .. 2+(nblock/32)] destroyed |
221 | */ |
222 | |
223 | #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) |
224 | #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) |
225 | #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) |
226 | #define WORD_BH(zz) bhtab[(zz) >> 5] |
227 | #define UNALIGNED_BH(zz) ((zz) & 0x01f) |
228 | |
229 | static |
230 | void fallbackSort(uint32_t* fmap, |
231 | uint32_t* eclass, |
232 | uint32_t* bhtab, |
233 | int32_t nblock) |
234 | { |
235 | int32_t ftab[257]; |
236 | int32_t ftabCopy[256]; |
237 | int32_t H, i, j, k, l, r, cc, cc1; |
238 | int32_t nNotDone; |
239 | int32_t nBhtab; |
240 | uint8_t* eclass8 = (uint8_t*)eclass; |
241 | |
242 | /* |
243 | * Initial 1-char radix sort to generate |
244 | * initial fmap and initial BH bits. |
245 | */ |
246 | for (i = 0; i < 257; i++) ftab[i] = 0; |
247 | for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; |
248 | for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; |
249 | |
250 | j = ftab[0]; /* bbox: optimized */ |
251 | for (i = 1; i < 257; i++) { |
252 | j += ftab[i]; |
253 | ftab[i] = j; |
254 | } |
255 | |
256 | for (i = 0; i < nblock; i++) { |
257 | j = eclass8[i]; |
258 | k = ftab[j] - 1; |
259 | ftab[j] = k; |
260 | fmap[k] = i; |
261 | } |
262 | |
263 | nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */ |
264 | for (i = 0; i < nBhtab; i++) bhtab[i] = 0; |
265 | for (i = 0; i < 256; i++) SET_BH(ftab[i]); |
266 | |
267 | /* |
268 | * Inductively refine the buckets. Kind-of an |
269 | * "exponential radix sort" (!), inspired by the |
270 | * Manber-Myers suffix array construction algorithm. |
271 | */ |
272 | |
273 | /*-- set sentinel bits for block-end detection --*/ |
274 | for (i = 0; i < 32; i++) { |
275 | SET_BH(nblock + 2*i); |
276 | CLEAR_BH(nblock + 2*i + 1); |
277 | } |
278 | |
279 | /*-- the log(N) loop --*/ |
280 | H = 1; |
281 | while (1) { |
282 | j = 0; |
283 | for (i = 0; i < nblock; i++) { |
284 | if (ISSET_BH(i)) |
285 | j = i; |
286 | k = fmap[i] - H; |
287 | if (k < 0) |
288 | k += nblock; |
289 | eclass[k] = j; |
290 | } |
291 | |
292 | nNotDone = 0; |
293 | r = -1; |
294 | while (1) { |
295 | |
296 | /*-- find the next non-singleton bucket --*/ |
297 | k = r + 1; |
298 | while (ISSET_BH(k) && UNALIGNED_BH(k)) |
299 | k++; |
300 | if (ISSET_BH(k)) { |
301 | while (WORD_BH(k) == 0xffffffff) k += 32; |
302 | while (ISSET_BH(k)) k++; |
303 | } |
304 | l = k - 1; |
305 | if (l >= nblock) |
306 | break; |
307 | while (!ISSET_BH(k) && UNALIGNED_BH(k)) |
308 | k++; |
309 | if (!ISSET_BH(k)) { |
310 | while (WORD_BH(k) == 0x00000000) k += 32; |
311 | while (!ISSET_BH(k)) k++; |
312 | } |
313 | r = k - 1; |
314 | if (r >= nblock) |
315 | break; |
316 | |
317 | /*-- now [l, r] bracket current bucket --*/ |
318 | if (r > l) { |
319 | nNotDone += (r - l + 1); |
320 | fallbackQSort3(fmap, eclass, l, r); |
321 | |
322 | /*-- scan bucket and generate header bits-- */ |
323 | cc = -1; |
324 | for (i = l; i <= r; i++) { |
325 | cc1 = eclass[fmap[i]]; |
326 | if (cc != cc1) { |
327 | SET_BH(i); |
328 | cc = cc1; |
329 | }; |
330 | } |
331 | } |
332 | } |
333 | |
334 | H *= 2; |
335 | if (H > nblock || nNotDone == 0) |
336 | break; |
337 | } |
338 | |
339 | /* |
340 | * Reconstruct the original block in |
341 | * eclass8 [0 .. nblock-1], since the |
342 | * previous phase destroyed it. |
343 | */ |
344 | j = 0; |
345 | for (i = 0; i < nblock; i++) { |
346 | while (ftabCopy[j] == 0) |
347 | j++; |
348 | ftabCopy[j]--; |
349 | eclass8[fmap[i]] = (uint8_t)j; |
350 | } |
351 | AssertH(j < 256, 1005); |
352 | } |
353 | |
354 | #undef SET_BH |
355 | #undef CLEAR_BH |
356 | #undef ISSET_BH |
357 | #undef WORD_BH |
358 | #undef UNALIGNED_BH |
359 | |
360 | |
361 | /*---------------------------------------------*/ |
362 | /*--- The main, O(N^2 log(N)) sorting ---*/ |
363 | /*--- algorithm. Faster for "normal" ---*/ |
364 | /*--- non-repetitive blocks. ---*/ |
365 | /*---------------------------------------------*/ |
366 | |
367 | /*---------------------------------------------*/ |
368 | static |
369 | NOINLINE |
370 | int mainGtU( |
371 | uint32_t i1, |
372 | uint32_t i2, |
373 | uint8_t* block, |
374 | uint16_t* quadrant, |
375 | uint32_t nblock, |
376 | int32_t* budget) |
377 | { |
378 | int32_t k; |
379 | uint8_t c1, c2; |
380 | uint16_t s1, s2; |
381 | |
382 | /* Loop unrolling here is actually very useful |
383 | * (generated code is much simpler), |
384 | * code size increase is only 270 bytes (i386) |
385 | * but speeds up compression 10% overall |
386 | */ |
387 | |
388 | #if CONFIG_BZIP2_FEATURE_SPEED >= 1 |
389 | |
390 | #define TIMES_8(code) \ |
391 | code; code; code; code; \ |
392 | code; code; code; code; |
393 | #define TIMES_12(code) \ |
394 | code; code; code; code; \ |
395 | code; code; code; code; \ |
396 | code; code; code; code; |
397 | |
398 | #else |
399 | |
400 | #define TIMES_8(code) \ |
401 | { \ |
402 | int nn = 8; \ |
403 | do { \ |
404 | code; \ |
405 | } while (--nn); \ |
406 | } |
407 | #define TIMES_12(code) \ |
408 | { \ |
409 | int nn = 12; \ |
410 | do { \ |
411 | code; \ |
412 | } while (--nn); \ |
413 | } |
414 | |
415 | #endif |
416 | |
417 | AssertD(i1 != i2, "mainGtU"); |
418 | TIMES_12( |
419 | c1 = block[i1]; c2 = block[i2]; |
420 | if (c1 != c2) return (c1 > c2); |
421 | i1++; i2++; |
422 | ) |
423 | |
424 | k = nblock + 8; |
425 | |
426 | do { |
427 | TIMES_8( |
428 | c1 = block[i1]; c2 = block[i2]; |
429 | if (c1 != c2) return (c1 > c2); |
430 | s1 = quadrant[i1]; s2 = quadrant[i2]; |
431 | if (s1 != s2) return (s1 > s2); |
432 | i1++; i2++; |
433 | ) |
434 | |
435 | if (i1 >= nblock) i1 -= nblock; |
436 | if (i2 >= nblock) i2 -= nblock; |
437 | |
438 | (*budget)--; |
439 | k -= 8; |
440 | } while (k >= 0); |
441 | |
442 | return False; |
443 | } |
444 | #undef TIMES_8 |
445 | #undef TIMES_12 |
446 | |
447 | /*---------------------------------------------*/ |
448 | /* |
449 | * Knuth's increments seem to work better |
450 | * than Incerpi-Sedgewick here. Possibly |
451 | * because the number of elems to sort is |
452 | * usually small, typically <= 20. |
453 | */ |
454 | static |
455 | const int32_t incs[14] = { |
456 | 1, 4, 13, 40, 121, 364, 1093, 3280, |
457 | 9841, 29524, 88573, 265720, |
458 | 797161, 2391484 |
459 | }; |
460 | |
461 | static |
462 | void mainSimpleSort(uint32_t* ptr, |
463 | uint8_t* block, |
464 | uint16_t* quadrant, |
465 | int32_t nblock, |
466 | int32_t lo, |
467 | int32_t hi, |
468 | int32_t d, |
469 | int32_t* budget) |
470 | { |
471 | int32_t i, j, h, bigN, hp; |
472 | uint32_t v; |
473 | |
474 | bigN = hi - lo + 1; |
475 | if (bigN < 2) return; |
476 | |
477 | hp = 0; |
478 | while (incs[hp] < bigN) hp++; |
479 | hp--; |
480 | |
481 | for (; hp >= 0; hp--) { |
482 | h = incs[hp]; |
483 | |
484 | i = lo + h; |
485 | while (1) { |
486 | /*-- copy 1 --*/ |
487 | if (i > hi) break; |
488 | v = ptr[i]; |
489 | j = i; |
490 | while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { |
491 | ptr[j] = ptr[j-h]; |
492 | j = j - h; |
493 | if (j <= (lo + h - 1)) break; |
494 | } |
495 | ptr[j] = v; |
496 | i++; |
497 | |
498 | /* 1.5% overall speedup, +290 bytes */ |
499 | #if CONFIG_BZIP2_FEATURE_SPEED >= 3 |
500 | /*-- copy 2 --*/ |
501 | if (i > hi) break; |
502 | v = ptr[i]; |
503 | j = i; |
504 | while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { |
505 | ptr[j] = ptr[j-h]; |
506 | j = j - h; |
507 | if (j <= (lo + h - 1)) break; |
508 | } |
509 | ptr[j] = v; |
510 | i++; |
511 | |
512 | /*-- copy 3 --*/ |
513 | if (i > hi) break; |
514 | v = ptr[i]; |
515 | j = i; |
516 | while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { |
517 | ptr[j] = ptr[j-h]; |
518 | j = j - h; |
519 | if (j <= (lo + h - 1)) break; |
520 | } |
521 | ptr[j] = v; |
522 | i++; |
523 | #endif |
524 | if (*budget < 0) return; |
525 | } |
526 | } |
527 | } |
528 | |
529 | |
530 | /*---------------------------------------------*/ |
531 | /* |
532 | * The following is an implementation of |
533 | * an elegant 3-way quicksort for strings, |
534 | * described in a paper "Fast Algorithms for |
535 | * Sorting and Searching Strings", by Robert |
536 | * Sedgewick and Jon L. Bentley. |
537 | */ |
538 | |
539 | static |
540 | ALWAYS_INLINE |
541 | uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c) |
542 | { |
543 | uint8_t t; |
544 | if (a > b) { |
545 | t = a; |
546 | a = b; |
547 | b = t; |
548 | }; |
549 | /* here b >= a */ |
550 | if (b > c) { |
551 | b = c; |
552 | if (a > b) |
553 | b = a; |
554 | } |
555 | return b; |
556 | } |
557 | |
558 | #define mpush(lz,hz,dz) \ |
559 | { \ |
560 | stackLo[sp] = lz; \ |
561 | stackHi[sp] = hz; \ |
562 | stackD [sp] = dz; \ |
563 | sp++; \ |
564 | } |
565 | |
566 | #define mpop(lz,hz,dz) \ |
567 | { \ |
568 | sp--; \ |
569 | lz = stackLo[sp]; \ |
570 | hz = stackHi[sp]; \ |
571 | dz = stackD [sp]; \ |
572 | } |
573 | |
574 | #define mnextsize(az) (nextHi[az] - nextLo[az]) |
575 | |
576 | #define mnextswap(az,bz) \ |
577 | { \ |
578 | int32_t tz; \ |
579 | tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ |
580 | tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ |
581 | tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \ |
582 | } |
583 | |
584 | #define MAIN_QSORT_SMALL_THRESH 20 |
585 | #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) |
586 | #define MAIN_QSORT_STACK_SIZE 100 |
587 | |
588 | static |
589 | void mainQSort3(uint32_t* ptr, |
590 | uint8_t* block, |
591 | uint16_t* quadrant, |
592 | int32_t nblock, |
593 | int32_t loSt, |
594 | int32_t hiSt, |
595 | int32_t dSt, |
596 | int32_t* budget) |
597 | { |
598 | int32_t unLo, unHi, ltLo, gtHi, n, m, med; |
599 | int32_t sp, lo, hi, d; |
600 | |
601 | int32_t stackLo[MAIN_QSORT_STACK_SIZE]; |
602 | int32_t stackHi[MAIN_QSORT_STACK_SIZE]; |
603 | int32_t stackD [MAIN_QSORT_STACK_SIZE]; |
604 | |
605 | int32_t nextLo[3]; |
606 | int32_t nextHi[3]; |
607 | int32_t nextD [3]; |
608 | |
609 | sp = 0; |
610 | mpush(loSt, hiSt, dSt); |
611 | |
612 | while (sp > 0) { |
613 | AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001); |
614 | |
615 | mpop(lo, hi, d); |
616 | if (hi - lo < MAIN_QSORT_SMALL_THRESH |
617 | || d > MAIN_QSORT_DEPTH_THRESH |
618 | ) { |
619 | mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget); |
620 | if (*budget < 0) |
621 | return; |
622 | continue; |
623 | } |
624 | med = (int32_t) mmed3(block[ptr[lo ] + d], |
625 | block[ptr[hi ] + d], |
626 | block[ptr[(lo+hi) >> 1] + d]); |
627 | |
628 | unLo = ltLo = lo; |
629 | unHi = gtHi = hi; |
630 | |
631 | while (1) { |
632 | while (1) { |
633 | if (unLo > unHi) |
634 | break; |
635 | n = ((int32_t)block[ptr[unLo]+d]) - med; |
636 | if (n == 0) { |
637 | mswap(ptr[unLo], ptr[ltLo]); |
638 | ltLo++; |
639 | unLo++; |
640 | continue; |
641 | }; |
642 | if (n > 0) break; |
643 | unLo++; |
644 | } |
645 | while (1) { |
646 | if (unLo > unHi) |
647 | break; |
648 | n = ((int32_t)block[ptr[unHi]+d]) - med; |
649 | if (n == 0) { |
650 | mswap(ptr[unHi], ptr[gtHi]); |
651 | gtHi--; |
652 | unHi--; |
653 | continue; |
654 | }; |
655 | if (n < 0) break; |
656 | unHi--; |
657 | } |
658 | if (unLo > unHi) |
659 | break; |
660 | mswap(ptr[unLo], ptr[unHi]); |
661 | unLo++; |
662 | unHi--; |
663 | } |
664 | |
665 | AssertD(unHi == unLo-1, "mainQSort3(2)"); |
666 | |
667 | if (gtHi < ltLo) { |
668 | mpush(lo, hi, d + 1); |
669 | continue; |
670 | } |
671 | |
672 | n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n); |
673 | m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m); |
674 | |
675 | n = lo + unLo - ltLo - 1; |
676 | m = hi - (gtHi - unHi) + 1; |
677 | |
678 | nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; |
679 | nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; |
680 | nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; |
681 | |
682 | if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1); |
683 | if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2); |
684 | if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1); |
685 | |
686 | AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)"); |
687 | AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)"); |
688 | |
689 | mpush(nextLo[0], nextHi[0], nextD[0]); |
690 | mpush(nextLo[1], nextHi[1], nextD[1]); |
691 | mpush(nextLo[2], nextHi[2], nextD[2]); |
692 | } |
693 | } |
694 | |
695 | #undef mpush |
696 | #undef mpop |
697 | #undef mnextsize |
698 | #undef mnextswap |
699 | #undef MAIN_QSORT_SMALL_THRESH |
700 | #undef MAIN_QSORT_DEPTH_THRESH |
701 | #undef MAIN_QSORT_STACK_SIZE |
702 | |
703 | |
704 | /*---------------------------------------------*/ |
705 | /* Pre: |
706 | * nblock > N_OVERSHOOT |
707 | * block32 exists for [0 .. nblock-1 +N_OVERSHOOT] |
708 | * ((uint8_t*)block32) [0 .. nblock-1] holds block |
709 | * ptr exists for [0 .. nblock-1] |
710 | * |
711 | * Post: |
712 | * ((uint8_t*)block32) [0 .. nblock-1] holds block |
713 | * All other areas of block32 destroyed |
714 | * ftab[0 .. 65536] destroyed |
715 | * ptr [0 .. nblock-1] holds sorted order |
716 | * if (*budget < 0), sorting was abandoned |
717 | */ |
718 | |
719 | #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) |
720 | #define SETMASK (1 << 21) |
721 | #define CLEARMASK (~(SETMASK)) |
722 | |
723 | static NOINLINE |
724 | void mainSort(EState* state, |
725 | uint32_t* ptr, |
726 | uint8_t* block, |
727 | uint16_t* quadrant, |
728 | uint32_t* ftab, |
729 | int32_t nblock, |
730 | int32_t* budget) |
731 | { |
732 | int32_t i, j, k, ss, sb; |
733 | uint8_t c1; |
734 | int32_t numQSorted; |
735 | uint16_t s; |
736 | Bool bigDone[256]; |
737 | /* bbox: moved to EState to save stack |
738 | int32_t runningOrder[256]; |
739 | int32_t copyStart[256]; |
740 | int32_t copyEnd [256]; |
741 | */ |
742 | #define runningOrder (state->mainSort__runningOrder) |
743 | #define copyStart (state->mainSort__copyStart) |
744 | #define copyEnd (state->mainSort__copyEnd) |
745 | |
746 | /*-- set up the 2-byte frequency table --*/ |
747 | /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */ |
748 | memset(ftab, 0, 65537 * sizeof(ftab[0])); |
749 | |
750 | j = block[0] << 8; |
751 | i = nblock - 1; |
752 | /* 3%, +300 bytes */ |
753 | #if CONFIG_BZIP2_FEATURE_SPEED >= 2 |
754 | for (; i >= 3; i -= 4) { |
755 | quadrant[i] = 0; |
756 | j = (j >> 8) | (((uint16_t)block[i]) << 8); |
757 | ftab[j]++; |
758 | quadrant[i-1] = 0; |
759 | j = (j >> 8) | (((uint16_t)block[i-1]) << 8); |
760 | ftab[j]++; |
761 | quadrant[i-2] = 0; |
762 | j = (j >> 8) | (((uint16_t)block[i-2]) << 8); |
763 | ftab[j]++; |
764 | quadrant[i-3] = 0; |
765 | j = (j >> 8) | (((uint16_t)block[i-3]) << 8); |
766 | ftab[j]++; |
767 | } |
768 | #endif |
769 | for (; i >= 0; i--) { |
770 | quadrant[i] = 0; |
771 | j = (j >> 8) | (((uint16_t)block[i]) << 8); |
772 | ftab[j]++; |
773 | } |
774 | |
775 | /*-- (emphasises close relationship of block & quadrant) --*/ |
776 | for (i = 0; i < BZ_N_OVERSHOOT; i++) { |
777 | block [nblock+i] = block[i]; |
778 | quadrant[nblock+i] = 0; |
779 | } |
780 | |
781 | /*-- Complete the initial radix sort --*/ |
782 | j = ftab[0]; /* bbox: optimized */ |
783 | for (i = 1; i <= 65536; i++) { |
784 | j += ftab[i]; |
785 | ftab[i] = j; |
786 | } |
787 | |
788 | s = block[0] << 8; |
789 | i = nblock - 1; |
790 | #if CONFIG_BZIP2_FEATURE_SPEED >= 2 |
791 | for (; i >= 3; i -= 4) { |
792 | s = (s >> 8) | (block[i] << 8); |
793 | j = ftab[s] - 1; |
794 | ftab[s] = j; |
795 | ptr[j] = i; |
796 | s = (s >> 8) | (block[i-1] << 8); |
797 | j = ftab[s] - 1; |
798 | ftab[s] = j; |
799 | ptr[j] = i-1; |
800 | s = (s >> 8) | (block[i-2] << 8); |
801 | j = ftab[s] - 1; |
802 | ftab[s] = j; |
803 | ptr[j] = i-2; |
804 | s = (s >> 8) | (block[i-3] << 8); |
805 | j = ftab[s] - 1; |
806 | ftab[s] = j; |
807 | ptr[j] = i-3; |
808 | } |
809 | #endif |
810 | for (; i >= 0; i--) { |
811 | s = (s >> 8) | (block[i] << 8); |
812 | j = ftab[s] - 1; |
813 | ftab[s] = j; |
814 | ptr[j] = i; |
815 | } |
816 | |
817 | /* |
818 | * Now ftab contains the first loc of every small bucket. |
819 | * Calculate the running order, from smallest to largest |
820 | * big bucket. |
821 | */ |
822 | for (i = 0; i <= 255; i++) { |
823 | bigDone [i] = False; |
824 | runningOrder[i] = i; |
825 | } |
826 | |
827 | { |
828 | int32_t vv; |
829 | /* bbox: was: int32_t h = 1; */ |
830 | /* do h = 3 * h + 1; while (h <= 256); */ |
831 | uint32_t h = 364; |
832 | |
833 | do { |
834 | /*h = h / 3;*/ |
835 | h = (h * 171) >> 9; /* bbox: fast h/3 */ |
836 | for (i = h; i <= 255; i++) { |
837 | vv = runningOrder[i]; |
838 | j = i; |
839 | while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) { |
840 | runningOrder[j] = runningOrder[j-h]; |
841 | j = j - h; |
842 | if (j <= (h - 1)) |
843 | goto zero; |
844 | } |
845 | zero: |
846 | runningOrder[j] = vv; |
847 | } |
848 | } while (h != 1); |
849 | } |
850 | |
851 | /* |
852 | * The main sorting loop. |
853 | */ |
854 | |
855 | numQSorted = 0; |
856 | |
857 | for (i = 0; i <= 255; i++) { |
858 | |
859 | /* |
860 | * Process big buckets, starting with the least full. |
861 | * Basically this is a 3-step process in which we call |
862 | * mainQSort3 to sort the small buckets [ss, j], but |
863 | * also make a big effort to avoid the calls if we can. |
864 | */ |
865 | ss = runningOrder[i]; |
866 | |
867 | /* |
868 | * Step 1: |
869 | * Complete the big bucket [ss] by quicksorting |
870 | * any unsorted small buckets [ss, j], for j != ss. |
871 | * Hopefully previous pointer-scanning phases have already |
872 | * completed many of the small buckets [ss, j], so |
873 | * we don't have to sort them at all. |
874 | */ |
875 | for (j = 0; j <= 255; j++) { |
876 | if (j != ss) { |
877 | sb = (ss << 8) + j; |
878 | if (!(ftab[sb] & SETMASK)) { |
879 | int32_t lo = ftab[sb] & CLEARMASK; |
880 | int32_t hi = (ftab[sb+1] & CLEARMASK) - 1; |
881 | if (hi > lo) { |
882 | mainQSort3( |
883 | ptr, block, quadrant, nblock, |
884 | lo, hi, BZ_N_RADIX, budget |
885 | ); |
886 | if (*budget < 0) return; |
887 | numQSorted += (hi - lo + 1); |
888 | } |
889 | } |
890 | ftab[sb] |= SETMASK; |
891 | } |
892 | } |
893 | |
894 | AssertH(!bigDone[ss], 1006); |
895 | |
896 | /* |
897 | * Step 2: |
898 | * Now scan this big bucket [ss] so as to synthesise the |
899 | * sorted order for small buckets [t, ss] for all t, |
900 | * including, magically, the bucket [ss,ss] too. |
901 | * This will avoid doing Real Work in subsequent Step 1's. |
902 | */ |
903 | { |
904 | for (j = 0; j <= 255; j++) { |
905 | copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; |
906 | copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; |
907 | } |
908 | for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { |
909 | k = ptr[j] - 1; |
910 | if (k < 0) |
911 | k += nblock; |
912 | c1 = block[k]; |
913 | if (!bigDone[c1]) |
914 | ptr[copyStart[c1]++] = k; |
915 | } |
916 | for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { |
917 | k = ptr[j]-1; |
918 | if (k < 0) |
919 | k += nblock; |
920 | c1 = block[k]; |
921 | if (!bigDone[c1]) |
922 | ptr[copyEnd[c1]--] = k; |
923 | } |
924 | } |
925 | |
926 | /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. |
927 | * Necessity for this case is demonstrated by compressing |
928 | * a sequence of approximately 48.5 million of character |
929 | * 251; 1.0.0/1.0.1 will then die here. */ |
930 | AssertH((copyStart[ss]-1 == copyEnd[ss]) \ |
931 | || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007); |
932 | |
933 | for (j = 0; j <= 255; j++) |
934 | ftab[(j << 8) + ss] |= SETMASK; |
935 | |
936 | /* |
937 | * Step 3: |
938 | * The [ss] big bucket is now done. Record this fact, |
939 | * and update the quadrant descriptors. Remember to |
940 | * update quadrants in the overshoot area too, if |
941 | * necessary. The "if (i < 255)" test merely skips |
942 | * this updating for the last bucket processed, since |
943 | * updating for the last bucket is pointless. |
944 | * |
945 | * The quadrant array provides a way to incrementally |
946 | * cache sort orderings, as they appear, so as to |
947 | * make subsequent comparisons in fullGtU() complete |
948 | * faster. For repetitive blocks this makes a big |
949 | * difference (but not big enough to be able to avoid |
950 | * the fallback sorting mechanism, exponential radix sort). |
951 | * |
952 | * The precise meaning is: at all times: |
953 | * |
954 | * for 0 <= i < nblock and 0 <= j <= nblock |
955 | * |
956 | * if block[i] != block[j], |
957 | * |
958 | * then the relative values of quadrant[i] and |
959 | * quadrant[j] are meaningless. |
960 | * |
961 | * else { |
962 | * if quadrant[i] < quadrant[j] |
963 | * then the string starting at i lexicographically |
964 | * precedes the string starting at j |
965 | * |
966 | * else if quadrant[i] > quadrant[j] |
967 | * then the string starting at j lexicographically |
968 | * precedes the string starting at i |
969 | * |
970 | * else |
971 | * the relative ordering of the strings starting |
972 | * at i and j has not yet been determined. |
973 | * } |
974 | */ |
975 | bigDone[ss] = True; |
976 | |
977 | if (i < 255) { |
978 | int32_t bbStart = ftab[ss << 8] & CLEARMASK; |
979 | int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; |
980 | int32_t shifts = 0; |
981 | |
982 | while ((bbSize >> shifts) > 65534) shifts++; |
983 | |
984 | for (j = bbSize-1; j >= 0; j--) { |
985 | int32_t a2update = ptr[bbStart + j]; |
986 | uint16_t qVal = (uint16_t)(j >> shifts); |
987 | quadrant[a2update] = qVal; |
988 | if (a2update < BZ_N_OVERSHOOT) |
989 | quadrant[a2update + nblock] = qVal; |
990 | } |
991 | AssertH(((bbSize-1) >> shifts) <= 65535, 1002); |
992 | } |
993 | } |
994 | #undef runningOrder |
995 | #undef copyStart |
996 | #undef copyEnd |
997 | } |
998 | |
999 | #undef BIGFREQ |
1000 | #undef SETMASK |
1001 | #undef CLEARMASK |
1002 | |
1003 | |
1004 | /*---------------------------------------------*/ |
1005 | /* Pre: |
1006 | * nblock > 0 |
1007 | * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] |
1008 | * ((uint8_t*)arr2)[0 .. nblock-1] holds block |
1009 | * arr1 exists for [0 .. nblock-1] |
1010 | * |
1011 | * Post: |
1012 | * ((uint8_t*)arr2) [0 .. nblock-1] holds block |
1013 | * All other areas of block destroyed |
1014 | * ftab[0 .. 65536] destroyed |
1015 | * arr1[0 .. nblock-1] holds sorted order |
1016 | */ |
1017 | static NOINLINE |
1018 | void BZ2_blockSort(EState* s) |
1019 | { |
1020 | /* In original bzip2 1.0.4, it's a parameter, but 30 |
1021 | * (which was the default) should work ok. */ |
1022 | enum { wfact = 30 }; |
1023 | |
1024 | uint32_t* ptr = s->ptr; |
1025 | uint8_t* block = s->block; |
1026 | uint32_t* ftab = s->ftab; |
1027 | int32_t nblock = s->nblock; |
1028 | uint16_t* quadrant; |
1029 | int32_t budget; |
1030 | int32_t i; |
1031 | |
1032 | if (nblock < 10000) { |
1033 | fallbackSort(s->arr1, s->arr2, ftab, nblock); |
1034 | } else { |
1035 | /* Calculate the location for quadrant, remembering to get |
1036 | * the alignment right. Assumes that &(block[0]) is at least |
1037 | * 2-byte aligned -- this should be ok since block is really |
1038 | * the first section of arr2. |
1039 | */ |
1040 | i = nblock + BZ_N_OVERSHOOT; |
1041 | if (i & 1) i++; |
1042 | quadrant = (uint16_t*)(&(block[i])); |
1043 | |
1044 | /* (wfact-1) / 3 puts the default-factor-30 |
1045 | * transition point at very roughly the same place as |
1046 | * with v0.1 and v0.9.0. |
1047 | * Not that it particularly matters any more, since the |
1048 | * resulting compressed stream is now the same regardless |
1049 | * of whether or not we use the main sort or fallback sort. |
1050 | */ |
1051 | budget = nblock * ((wfact-1) / 3); |
1052 | |
1053 | mainSort(s, ptr, block, quadrant, ftab, nblock, &budget); |
1054 | if (budget < 0) { |
1055 | fallbackSort(s->arr1, s->arr2, ftab, nblock); |
1056 | } |
1057 | } |
1058 | |
1059 | s->origPtr = -1; |
1060 | for (i = 0; i < s->nblock; i++) |
1061 | if (ptr[i] == 0) { |
1062 | s->origPtr = i; |
1063 | break; |
1064 | }; |
1065 | |
1066 | AssertH(s->origPtr != -1, 1003); |
1067 | } |
1068 | |
1069 | |
1070 | /*-------------------------------------------------------------*/ |
1071 | /*--- end blocksort.c ---*/ |
1072 | /*-------------------------------------------------------------*/ |