Annotation of /trunk/mkinitrd-magellan/busybox/archival/bz/blocksort.c
Parent Directory | Revision Log
Revision 984 -
(hide annotations)
(download)
Sun May 30 11:32:42 2010 UTC (13 years, 11 months ago) by niro
File MIME type: text/plain
File size: 24222 byte(s)
Sun May 30 11:32:42 2010 UTC (13 years, 11 months ago) by niro
File MIME type: text/plain
File size: 24222 byte(s)
-updated to busybox-1.16.1 and enabled blkid/uuid support in default config
1 | niro | 816 | /* |
2 | * bzip2 is written by Julian Seward <jseward@bzip.org>. | ||
3 | * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>. | ||
4 | * See README and LICENSE files in this directory for more information. | ||
5 | */ | ||
6 | |||
7 | /*-------------------------------------------------------------*/ | ||
8 | /*--- Block sorting machinery ---*/ | ||
9 | /*--- blocksort.c ---*/ | ||
10 | /*-------------------------------------------------------------*/ | ||
11 | |||
12 | /* ------------------------------------------------------------------ | ||
13 | This file is part of bzip2/libbzip2, a program and library for | ||
14 | lossless, block-sorting data compression. | ||
15 | |||
16 | bzip2/libbzip2 version 1.0.4 of 20 December 2006 | ||
17 | Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org> | ||
18 | |||
19 | Please read the WARNING, DISCLAIMER and PATENTS sections in the | ||
20 | README file. | ||
21 | |||
22 | This program is released under the terms of the license contained | ||
23 | in the file LICENSE. | ||
24 | ------------------------------------------------------------------ */ | ||
25 | |||
26 | /* #include "bzlib_private.h" */ | ||
27 | |||
28 | #define mswap(zz1, zz2) \ | ||
29 | { \ | ||
30 | int32_t zztmp = zz1; \ | ||
31 | zz1 = zz2; \ | ||
32 | zz2 = zztmp; \ | ||
33 | } | ||
34 | |||
35 | static | ||
36 | /* No measurable speed gain with inlining */ | ||
37 | /* ALWAYS_INLINE */ | ||
38 | void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn) | ||
39 | { | ||
40 | while (zzn > 0) { | ||
41 | mswap(ptr[zzp1], ptr[zzp2]); | ||
42 | zzp1++; | ||
43 | zzp2++; | ||
44 | zzn--; | ||
45 | } | ||
46 | } | ||
47 | |||
48 | static | ||
49 | ALWAYS_INLINE | ||
50 | int32_t mmin(int32_t a, int32_t b) | ||
51 | { | ||
52 | return (a < b) ? a : b; | ||
53 | } | ||
54 | |||
55 | |||
56 | /*---------------------------------------------*/ | ||
57 | /*--- Fallback O(N log(N)^2) sorting ---*/ | ||
58 | /*--- algorithm, for repetitive blocks ---*/ | ||
59 | /*---------------------------------------------*/ | ||
60 | |||
61 | /*---------------------------------------------*/ | ||
62 | static | ||
63 | inline | ||
64 | void fallbackSimpleSort(uint32_t* fmap, | ||
65 | uint32_t* eclass, | ||
66 | int32_t lo, | ||
67 | int32_t hi) | ||
68 | { | ||
69 | int32_t i, j, tmp; | ||
70 | uint32_t ec_tmp; | ||
71 | |||
72 | if (lo == hi) return; | ||
73 | |||
74 | if (hi - lo > 3) { | ||
75 | for (i = hi-4; i >= lo; i--) { | ||
76 | tmp = fmap[i]; | ||
77 | ec_tmp = eclass[tmp]; | ||
78 | for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4) | ||
79 | fmap[j-4] = fmap[j]; | ||
80 | fmap[j-4] = tmp; | ||
81 | } | ||
82 | } | ||
83 | |||
84 | for (i = hi-1; i >= lo; i--) { | ||
85 | tmp = fmap[i]; | ||
86 | ec_tmp = eclass[tmp]; | ||
87 | for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) | ||
88 | fmap[j-1] = fmap[j]; | ||
89 | fmap[j-1] = tmp; | ||
90 | } | ||
91 | } | ||
92 | |||
93 | |||
94 | /*---------------------------------------------*/ | ||
95 | #define fpush(lz,hz) { \ | ||
96 | stackLo[sp] = lz; \ | ||
97 | stackHi[sp] = hz; \ | ||
98 | sp++; \ | ||
99 | } | ||
100 | |||
101 | #define fpop(lz,hz) { \ | ||
102 | sp--; \ | ||
103 | lz = stackLo[sp]; \ | ||
104 | hz = stackHi[sp]; \ | ||
105 | } | ||
106 | |||
107 | #define FALLBACK_QSORT_SMALL_THRESH 10 | ||
108 | #define FALLBACK_QSORT_STACK_SIZE 100 | ||
109 | |||
110 | static | ||
111 | void fallbackQSort3(uint32_t* fmap, | ||
112 | uint32_t* eclass, | ||
113 | int32_t loSt, | ||
114 | int32_t hiSt) | ||
115 | { | ||
116 | int32_t unLo, unHi, ltLo, gtHi, n, m; | ||
117 | int32_t sp, lo, hi; | ||
118 | uint32_t med, r, r3; | ||
119 | int32_t stackLo[FALLBACK_QSORT_STACK_SIZE]; | ||
120 | int32_t stackHi[FALLBACK_QSORT_STACK_SIZE]; | ||
121 | |||
122 | r = 0; | ||
123 | |||
124 | sp = 0; | ||
125 | fpush(loSt, hiSt); | ||
126 | |||
127 | while (sp > 0) { | ||
128 | AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004); | ||
129 | |||
130 | fpop(lo, hi); | ||
131 | if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { | ||
132 | fallbackSimpleSort(fmap, eclass, lo, hi); | ||
133 | continue; | ||
134 | } | ||
135 | |||
136 | /* Random partitioning. Median of 3 sometimes fails to | ||
137 | * avoid bad cases. Median of 9 seems to help but | ||
138 | * looks rather expensive. This too seems to work but | ||
139 | * is cheaper. Guidance for the magic constants | ||
140 | * 7621 and 32768 is taken from Sedgewick's algorithms | ||
141 | * book, chapter 35. | ||
142 | */ | ||
143 | r = ((r * 7621) + 1) % 32768; | ||
144 | r3 = r % 3; | ||
145 | if (r3 == 0) | ||
146 | med = eclass[fmap[lo]]; | ||
147 | else if (r3 == 1) | ||
148 | med = eclass[fmap[(lo+hi)>>1]]; | ||
149 | else | ||
150 | med = eclass[fmap[hi]]; | ||
151 | |||
152 | unLo = ltLo = lo; | ||
153 | unHi = gtHi = hi; | ||
154 | |||
155 | while (1) { | ||
156 | while (1) { | ||
157 | if (unLo > unHi) break; | ||
158 | n = (int32_t)eclass[fmap[unLo]] - (int32_t)med; | ||
159 | if (n == 0) { | ||
160 | mswap(fmap[unLo], fmap[ltLo]); | ||
161 | ltLo++; | ||
162 | unLo++; | ||
163 | continue; | ||
164 | }; | ||
165 | if (n > 0) break; | ||
166 | unLo++; | ||
167 | } | ||
168 | while (1) { | ||
169 | if (unLo > unHi) break; | ||
170 | n = (int32_t)eclass[fmap[unHi]] - (int32_t)med; | ||
171 | if (n == 0) { | ||
172 | mswap(fmap[unHi], fmap[gtHi]); | ||
173 | gtHi--; unHi--; | ||
174 | continue; | ||
175 | }; | ||
176 | if (n < 0) break; | ||
177 | unHi--; | ||
178 | } | ||
179 | if (unLo > unHi) break; | ||
180 | mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; | ||
181 | } | ||
182 | |||
183 | AssertD(unHi == unLo-1, "fallbackQSort3(2)"); | ||
184 | |||
185 | if (gtHi < ltLo) continue; | ||
186 | |||
187 | n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n); | ||
188 | m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m); | ||
189 | |||
190 | n = lo + unLo - ltLo - 1; | ||
191 | m = hi - (gtHi - unHi) + 1; | ||
192 | |||
193 | if (n - lo > hi - m) { | ||
194 | fpush(lo, n); | ||
195 | fpush(m, hi); | ||
196 | } else { | ||
197 | fpush(m, hi); | ||
198 | fpush(lo, n); | ||
199 | } | ||
200 | } | ||
201 | } | ||
202 | |||
203 | #undef fpush | ||
204 | #undef fpop | ||
205 | #undef FALLBACK_QSORT_SMALL_THRESH | ||
206 | #undef FALLBACK_QSORT_STACK_SIZE | ||
207 | |||
208 | |||
209 | /*---------------------------------------------*/ | ||
210 | /* Pre: | ||
211 | * nblock > 0 | ||
212 | * eclass exists for [0 .. nblock-1] | ||
213 | * ((uint8_t*)eclass) [0 .. nblock-1] holds block | ||
214 | * ptr exists for [0 .. nblock-1] | ||
215 | * | ||
216 | * Post: | ||
217 | * ((uint8_t*)eclass) [0 .. nblock-1] holds block | ||
218 | * All other areas of eclass destroyed | ||
219 | * fmap [0 .. nblock-1] holds sorted order | ||
220 | * bhtab[0 .. 2+(nblock/32)] destroyed | ||
221 | */ | ||
222 | |||
223 | #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) | ||
224 | #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) | ||
225 | #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) | ||
226 | #define WORD_BH(zz) bhtab[(zz) >> 5] | ||
227 | #define UNALIGNED_BH(zz) ((zz) & 0x01f) | ||
228 | |||
229 | static | ||
230 | void fallbackSort(uint32_t* fmap, | ||
231 | uint32_t* eclass, | ||
232 | uint32_t* bhtab, | ||
233 | int32_t nblock) | ||
234 | { | ||
235 | int32_t ftab[257]; | ||
236 | int32_t ftabCopy[256]; | ||
237 | int32_t H, i, j, k, l, r, cc, cc1; | ||
238 | int32_t nNotDone; | ||
239 | int32_t nBhtab; | ||
240 | uint8_t* eclass8 = (uint8_t*)eclass; | ||
241 | |||
242 | /* | ||
243 | * Initial 1-char radix sort to generate | ||
244 | * initial fmap and initial BH bits. | ||
245 | */ | ||
246 | for (i = 0; i < 257; i++) ftab[i] = 0; | ||
247 | for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; | ||
248 | for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; | ||
249 | |||
250 | j = ftab[0]; /* bbox: optimized */ | ||
251 | for (i = 1; i < 257; i++) { | ||
252 | j += ftab[i]; | ||
253 | ftab[i] = j; | ||
254 | } | ||
255 | |||
256 | for (i = 0; i < nblock; i++) { | ||
257 | j = eclass8[i]; | ||
258 | k = ftab[j] - 1; | ||
259 | ftab[j] = k; | ||
260 | fmap[k] = i; | ||
261 | } | ||
262 | |||
263 | nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */ | ||
264 | for (i = 0; i < nBhtab; i++) bhtab[i] = 0; | ||
265 | for (i = 0; i < 256; i++) SET_BH(ftab[i]); | ||
266 | |||
267 | /* | ||
268 | * Inductively refine the buckets. Kind-of an | ||
269 | * "exponential radix sort" (!), inspired by the | ||
270 | * Manber-Myers suffix array construction algorithm. | ||
271 | */ | ||
272 | |||
273 | /*-- set sentinel bits for block-end detection --*/ | ||
274 | for (i = 0; i < 32; i++) { | ||
275 | SET_BH(nblock + 2*i); | ||
276 | CLEAR_BH(nblock + 2*i + 1); | ||
277 | } | ||
278 | |||
279 | /*-- the log(N) loop --*/ | ||
280 | H = 1; | ||
281 | while (1) { | ||
282 | j = 0; | ||
283 | for (i = 0; i < nblock; i++) { | ||
284 | if (ISSET_BH(i)) | ||
285 | j = i; | ||
286 | k = fmap[i] - H; | ||
287 | if (k < 0) | ||
288 | k += nblock; | ||
289 | eclass[k] = j; | ||
290 | } | ||
291 | |||
292 | nNotDone = 0; | ||
293 | r = -1; | ||
294 | while (1) { | ||
295 | |||
296 | /*-- find the next non-singleton bucket --*/ | ||
297 | k = r + 1; | ||
298 | while (ISSET_BH(k) && UNALIGNED_BH(k)) | ||
299 | k++; | ||
300 | if (ISSET_BH(k)) { | ||
301 | while (WORD_BH(k) == 0xffffffff) k += 32; | ||
302 | while (ISSET_BH(k)) k++; | ||
303 | } | ||
304 | l = k - 1; | ||
305 | if (l >= nblock) | ||
306 | break; | ||
307 | while (!ISSET_BH(k) && UNALIGNED_BH(k)) | ||
308 | k++; | ||
309 | if (!ISSET_BH(k)) { | ||
310 | while (WORD_BH(k) == 0x00000000) k += 32; | ||
311 | while (!ISSET_BH(k)) k++; | ||
312 | } | ||
313 | r = k - 1; | ||
314 | if (r >= nblock) | ||
315 | break; | ||
316 | |||
317 | /*-- now [l, r] bracket current bucket --*/ | ||
318 | if (r > l) { | ||
319 | nNotDone += (r - l + 1); | ||
320 | fallbackQSort3(fmap, eclass, l, r); | ||
321 | |||
322 | /*-- scan bucket and generate header bits-- */ | ||
323 | cc = -1; | ||
324 | for (i = l; i <= r; i++) { | ||
325 | cc1 = eclass[fmap[i]]; | ||
326 | if (cc != cc1) { | ||
327 | SET_BH(i); | ||
328 | cc = cc1; | ||
329 | }; | ||
330 | } | ||
331 | } | ||
332 | } | ||
333 | |||
334 | H *= 2; | ||
335 | if (H > nblock || nNotDone == 0) | ||
336 | break; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Reconstruct the original block in | ||
341 | * eclass8 [0 .. nblock-1], since the | ||
342 | * previous phase destroyed it. | ||
343 | */ | ||
344 | j = 0; | ||
345 | for (i = 0; i < nblock; i++) { | ||
346 | while (ftabCopy[j] == 0) | ||
347 | j++; | ||
348 | ftabCopy[j]--; | ||
349 | eclass8[fmap[i]] = (uint8_t)j; | ||
350 | } | ||
351 | AssertH(j < 256, 1005); | ||
352 | } | ||
353 | |||
354 | #undef SET_BH | ||
355 | #undef CLEAR_BH | ||
356 | #undef ISSET_BH | ||
357 | #undef WORD_BH | ||
358 | #undef UNALIGNED_BH | ||
359 | |||
360 | |||
361 | /*---------------------------------------------*/ | ||
362 | /*--- The main, O(N^2 log(N)) sorting ---*/ | ||
363 | /*--- algorithm. Faster for "normal" ---*/ | ||
364 | /*--- non-repetitive blocks. ---*/ | ||
365 | /*---------------------------------------------*/ | ||
366 | |||
367 | /*---------------------------------------------*/ | ||
368 | static | ||
369 | NOINLINE | ||
370 | int mainGtU( | ||
371 | uint32_t i1, | ||
372 | uint32_t i2, | ||
373 | uint8_t* block, | ||
374 | uint16_t* quadrant, | ||
375 | uint32_t nblock, | ||
376 | int32_t* budget) | ||
377 | { | ||
378 | int32_t k; | ||
379 | uint8_t c1, c2; | ||
380 | uint16_t s1, s2; | ||
381 | |||
382 | /* Loop unrolling here is actually very useful | ||
383 | * (generated code is much simpler), | ||
384 | * code size increase is only 270 bytes (i386) | ||
385 | * but speeds up compression 10% overall | ||
386 | */ | ||
387 | |||
388 | #if CONFIG_BZIP2_FEATURE_SPEED >= 1 | ||
389 | |||
390 | #define TIMES_8(code) \ | ||
391 | code; code; code; code; \ | ||
392 | code; code; code; code; | ||
393 | #define TIMES_12(code) \ | ||
394 | code; code; code; code; \ | ||
395 | code; code; code; code; \ | ||
396 | code; code; code; code; | ||
397 | |||
398 | #else | ||
399 | |||
400 | #define TIMES_8(code) \ | ||
401 | { \ | ||
402 | int nn = 8; \ | ||
403 | do { \ | ||
404 | code; \ | ||
405 | } while (--nn); \ | ||
406 | } | ||
407 | #define TIMES_12(code) \ | ||
408 | { \ | ||
409 | int nn = 12; \ | ||
410 | do { \ | ||
411 | code; \ | ||
412 | } while (--nn); \ | ||
413 | } | ||
414 | |||
415 | #endif | ||
416 | |||
417 | AssertD(i1 != i2, "mainGtU"); | ||
418 | TIMES_12( | ||
419 | c1 = block[i1]; c2 = block[i2]; | ||
420 | if (c1 != c2) return (c1 > c2); | ||
421 | i1++; i2++; | ||
422 | ) | ||
423 | |||
424 | k = nblock + 8; | ||
425 | |||
426 | do { | ||
427 | TIMES_8( | ||
428 | c1 = block[i1]; c2 = block[i2]; | ||
429 | if (c1 != c2) return (c1 > c2); | ||
430 | s1 = quadrant[i1]; s2 = quadrant[i2]; | ||
431 | if (s1 != s2) return (s1 > s2); | ||
432 | i1++; i2++; | ||
433 | ) | ||
434 | |||
435 | if (i1 >= nblock) i1 -= nblock; | ||
436 | if (i2 >= nblock) i2 -= nblock; | ||
437 | |||
438 | (*budget)--; | ||
439 | k -= 8; | ||
440 | } while (k >= 0); | ||
441 | |||
442 | return False; | ||
443 | } | ||
444 | #undef TIMES_8 | ||
445 | #undef TIMES_12 | ||
446 | |||
447 | /*---------------------------------------------*/ | ||
448 | /* | ||
449 | * Knuth's increments seem to work better | ||
450 | * than Incerpi-Sedgewick here. Possibly | ||
451 | * because the number of elems to sort is | ||
452 | * usually small, typically <= 20. | ||
453 | */ | ||
454 | static | ||
455 | const int32_t incs[14] = { | ||
456 | 1, 4, 13, 40, 121, 364, 1093, 3280, | ||
457 | 9841, 29524, 88573, 265720, | ||
458 | 797161, 2391484 | ||
459 | }; | ||
460 | |||
461 | static | ||
462 | void mainSimpleSort(uint32_t* ptr, | ||
463 | uint8_t* block, | ||
464 | uint16_t* quadrant, | ||
465 | int32_t nblock, | ||
466 | int32_t lo, | ||
467 | int32_t hi, | ||
468 | int32_t d, | ||
469 | int32_t* budget) | ||
470 | { | ||
471 | int32_t i, j, h, bigN, hp; | ||
472 | uint32_t v; | ||
473 | |||
474 | bigN = hi - lo + 1; | ||
475 | if (bigN < 2) return; | ||
476 | |||
477 | hp = 0; | ||
478 | while (incs[hp] < bigN) hp++; | ||
479 | hp--; | ||
480 | |||
481 | for (; hp >= 0; hp--) { | ||
482 | h = incs[hp]; | ||
483 | |||
484 | i = lo + h; | ||
485 | while (1) { | ||
486 | /*-- copy 1 --*/ | ||
487 | if (i > hi) break; | ||
488 | v = ptr[i]; | ||
489 | j = i; | ||
490 | while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { | ||
491 | ptr[j] = ptr[j-h]; | ||
492 | j = j - h; | ||
493 | if (j <= (lo + h - 1)) break; | ||
494 | } | ||
495 | ptr[j] = v; | ||
496 | i++; | ||
497 | |||
498 | /* 1.5% overall speedup, +290 bytes */ | ||
499 | #if CONFIG_BZIP2_FEATURE_SPEED >= 3 | ||
500 | /*-- copy 2 --*/ | ||
501 | if (i > hi) break; | ||
502 | v = ptr[i]; | ||
503 | j = i; | ||
504 | while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { | ||
505 | ptr[j] = ptr[j-h]; | ||
506 | j = j - h; | ||
507 | if (j <= (lo + h - 1)) break; | ||
508 | } | ||
509 | ptr[j] = v; | ||
510 | i++; | ||
511 | |||
512 | /*-- copy 3 --*/ | ||
513 | if (i > hi) break; | ||
514 | v = ptr[i]; | ||
515 | j = i; | ||
516 | while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { | ||
517 | ptr[j] = ptr[j-h]; | ||
518 | j = j - h; | ||
519 | if (j <= (lo + h - 1)) break; | ||
520 | } | ||
521 | ptr[j] = v; | ||
522 | i++; | ||
523 | #endif | ||
524 | if (*budget < 0) return; | ||
525 | } | ||
526 | } | ||
527 | } | ||
528 | |||
529 | |||
530 | /*---------------------------------------------*/ | ||
531 | /* | ||
532 | * The following is an implementation of | ||
533 | * an elegant 3-way quicksort for strings, | ||
534 | * described in a paper "Fast Algorithms for | ||
535 | * Sorting and Searching Strings", by Robert | ||
536 | * Sedgewick and Jon L. Bentley. | ||
537 | */ | ||
538 | |||
539 | static | ||
540 | ALWAYS_INLINE | ||
541 | uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c) | ||
542 | { | ||
543 | uint8_t t; | ||
544 | if (a > b) { | ||
545 | t = a; | ||
546 | a = b; | ||
547 | b = t; | ||
548 | }; | ||
549 | /* here b >= a */ | ||
550 | if (b > c) { | ||
551 | b = c; | ||
552 | if (a > b) | ||
553 | b = a; | ||
554 | } | ||
555 | return b; | ||
556 | } | ||
557 | |||
558 | #define mpush(lz,hz,dz) \ | ||
559 | { \ | ||
560 | stackLo[sp] = lz; \ | ||
561 | stackHi[sp] = hz; \ | ||
562 | stackD [sp] = dz; \ | ||
563 | sp++; \ | ||
564 | } | ||
565 | |||
566 | #define mpop(lz,hz,dz) \ | ||
567 | { \ | ||
568 | sp--; \ | ||
569 | lz = stackLo[sp]; \ | ||
570 | hz = stackHi[sp]; \ | ||
571 | dz = stackD [sp]; \ | ||
572 | } | ||
573 | |||
574 | #define mnextsize(az) (nextHi[az] - nextLo[az]) | ||
575 | |||
576 | #define mnextswap(az,bz) \ | ||
577 | { \ | ||
578 | int32_t tz; \ | ||
579 | tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ | ||
580 | tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ | ||
581 | tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \ | ||
582 | } | ||
583 | |||
584 | #define MAIN_QSORT_SMALL_THRESH 20 | ||
585 | #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) | ||
586 | #define MAIN_QSORT_STACK_SIZE 100 | ||
587 | |||
588 | niro | 984 | static NOINLINE |
589 | niro | 816 | void mainQSort3(uint32_t* ptr, |
590 | uint8_t* block, | ||
591 | uint16_t* quadrant, | ||
592 | int32_t nblock, | ||
593 | int32_t loSt, | ||
594 | int32_t hiSt, | ||
595 | int32_t dSt, | ||
596 | int32_t* budget) | ||
597 | { | ||
598 | int32_t unLo, unHi, ltLo, gtHi, n, m, med; | ||
599 | int32_t sp, lo, hi, d; | ||
600 | |||
601 | int32_t stackLo[MAIN_QSORT_STACK_SIZE]; | ||
602 | int32_t stackHi[MAIN_QSORT_STACK_SIZE]; | ||
603 | int32_t stackD [MAIN_QSORT_STACK_SIZE]; | ||
604 | |||
605 | int32_t nextLo[3]; | ||
606 | int32_t nextHi[3]; | ||
607 | int32_t nextD [3]; | ||
608 | |||
609 | sp = 0; | ||
610 | mpush(loSt, hiSt, dSt); | ||
611 | |||
612 | while (sp > 0) { | ||
613 | AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001); | ||
614 | |||
615 | mpop(lo, hi, d); | ||
616 | if (hi - lo < MAIN_QSORT_SMALL_THRESH | ||
617 | || d > MAIN_QSORT_DEPTH_THRESH | ||
618 | ) { | ||
619 | mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget); | ||
620 | if (*budget < 0) | ||
621 | return; | ||
622 | continue; | ||
623 | } | ||
624 | med = (int32_t) mmed3(block[ptr[lo ] + d], | ||
625 | block[ptr[hi ] + d], | ||
626 | block[ptr[(lo+hi) >> 1] + d]); | ||
627 | |||
628 | unLo = ltLo = lo; | ||
629 | unHi = gtHi = hi; | ||
630 | |||
631 | while (1) { | ||
632 | while (1) { | ||
633 | if (unLo > unHi) | ||
634 | break; | ||
635 | n = ((int32_t)block[ptr[unLo]+d]) - med; | ||
636 | if (n == 0) { | ||
637 | mswap(ptr[unLo], ptr[ltLo]); | ||
638 | ltLo++; | ||
639 | unLo++; | ||
640 | continue; | ||
641 | }; | ||
642 | if (n > 0) break; | ||
643 | unLo++; | ||
644 | } | ||
645 | while (1) { | ||
646 | if (unLo > unHi) | ||
647 | break; | ||
648 | n = ((int32_t)block[ptr[unHi]+d]) - med; | ||
649 | if (n == 0) { | ||
650 | mswap(ptr[unHi], ptr[gtHi]); | ||
651 | gtHi--; | ||
652 | unHi--; | ||
653 | continue; | ||
654 | }; | ||
655 | if (n < 0) break; | ||
656 | unHi--; | ||
657 | } | ||
658 | if (unLo > unHi) | ||
659 | break; | ||
660 | mswap(ptr[unLo], ptr[unHi]); | ||
661 | unLo++; | ||
662 | unHi--; | ||
663 | } | ||
664 | |||
665 | AssertD(unHi == unLo-1, "mainQSort3(2)"); | ||
666 | |||
667 | if (gtHi < ltLo) { | ||
668 | mpush(lo, hi, d + 1); | ||
669 | continue; | ||
670 | } | ||
671 | |||
672 | n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n); | ||
673 | m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m); | ||
674 | |||
675 | n = lo + unLo - ltLo - 1; | ||
676 | m = hi - (gtHi - unHi) + 1; | ||
677 | |||
678 | nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; | ||
679 | nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; | ||
680 | nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; | ||
681 | |||
682 | if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1); | ||
683 | if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2); | ||
684 | if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1); | ||
685 | |||
686 | AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)"); | ||
687 | AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)"); | ||
688 | |||
689 | mpush(nextLo[0], nextHi[0], nextD[0]); | ||
690 | mpush(nextLo[1], nextHi[1], nextD[1]); | ||
691 | mpush(nextLo[2], nextHi[2], nextD[2]); | ||
692 | } | ||
693 | } | ||
694 | |||
695 | #undef mpush | ||
696 | #undef mpop | ||
697 | #undef mnextsize | ||
698 | #undef mnextswap | ||
699 | #undef MAIN_QSORT_SMALL_THRESH | ||
700 | #undef MAIN_QSORT_DEPTH_THRESH | ||
701 | #undef MAIN_QSORT_STACK_SIZE | ||
702 | |||
703 | |||
704 | /*---------------------------------------------*/ | ||
705 | /* Pre: | ||
706 | * nblock > N_OVERSHOOT | ||
707 | * block32 exists for [0 .. nblock-1 +N_OVERSHOOT] | ||
708 | * ((uint8_t*)block32) [0 .. nblock-1] holds block | ||
709 | * ptr exists for [0 .. nblock-1] | ||
710 | * | ||
711 | * Post: | ||
712 | * ((uint8_t*)block32) [0 .. nblock-1] holds block | ||
713 | * All other areas of block32 destroyed | ||
714 | * ftab[0 .. 65536] destroyed | ||
715 | * ptr [0 .. nblock-1] holds sorted order | ||
716 | * if (*budget < 0), sorting was abandoned | ||
717 | */ | ||
718 | |||
719 | #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) | ||
720 | #define SETMASK (1 << 21) | ||
721 | #define CLEARMASK (~(SETMASK)) | ||
722 | |||
723 | static NOINLINE | ||
724 | void mainSort(EState* state, | ||
725 | uint32_t* ptr, | ||
726 | uint8_t* block, | ||
727 | uint16_t* quadrant, | ||
728 | uint32_t* ftab, | ||
729 | int32_t nblock, | ||
730 | int32_t* budget) | ||
731 | { | ||
732 | int32_t i, j, k, ss, sb; | ||
733 | uint8_t c1; | ||
734 | int32_t numQSorted; | ||
735 | uint16_t s; | ||
736 | Bool bigDone[256]; | ||
737 | /* bbox: moved to EState to save stack | ||
738 | int32_t runningOrder[256]; | ||
739 | int32_t copyStart[256]; | ||
740 | int32_t copyEnd [256]; | ||
741 | */ | ||
742 | #define runningOrder (state->mainSort__runningOrder) | ||
743 | #define copyStart (state->mainSort__copyStart) | ||
744 | #define copyEnd (state->mainSort__copyEnd) | ||
745 | |||
746 | /*-- set up the 2-byte frequency table --*/ | ||
747 | /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */ | ||
748 | memset(ftab, 0, 65537 * sizeof(ftab[0])); | ||
749 | |||
750 | j = block[0] << 8; | ||
751 | i = nblock - 1; | ||
752 | /* 3%, +300 bytes */ | ||
753 | #if CONFIG_BZIP2_FEATURE_SPEED >= 2 | ||
754 | for (; i >= 3; i -= 4) { | ||
755 | quadrant[i] = 0; | ||
756 | j = (j >> 8) | (((uint16_t)block[i]) << 8); | ||
757 | ftab[j]++; | ||
758 | quadrant[i-1] = 0; | ||
759 | j = (j >> 8) | (((uint16_t)block[i-1]) << 8); | ||
760 | ftab[j]++; | ||
761 | quadrant[i-2] = 0; | ||
762 | j = (j >> 8) | (((uint16_t)block[i-2]) << 8); | ||
763 | ftab[j]++; | ||
764 | quadrant[i-3] = 0; | ||
765 | j = (j >> 8) | (((uint16_t)block[i-3]) << 8); | ||
766 | ftab[j]++; | ||
767 | } | ||
768 | #endif | ||
769 | for (; i >= 0; i--) { | ||
770 | quadrant[i] = 0; | ||
771 | j = (j >> 8) | (((uint16_t)block[i]) << 8); | ||
772 | ftab[j]++; | ||
773 | } | ||
774 | |||
775 | /*-- (emphasises close relationship of block & quadrant) --*/ | ||
776 | for (i = 0; i < BZ_N_OVERSHOOT; i++) { | ||
777 | block [nblock+i] = block[i]; | ||
778 | quadrant[nblock+i] = 0; | ||
779 | } | ||
780 | |||
781 | /*-- Complete the initial radix sort --*/ | ||
782 | j = ftab[0]; /* bbox: optimized */ | ||
783 | for (i = 1; i <= 65536; i++) { | ||
784 | j += ftab[i]; | ||
785 | ftab[i] = j; | ||
786 | } | ||
787 | |||
788 | s = block[0] << 8; | ||
789 | i = nblock - 1; | ||
790 | #if CONFIG_BZIP2_FEATURE_SPEED >= 2 | ||
791 | for (; i >= 3; i -= 4) { | ||
792 | s = (s >> 8) | (block[i] << 8); | ||
793 | j = ftab[s] - 1; | ||
794 | ftab[s] = j; | ||
795 | ptr[j] = i; | ||
796 | s = (s >> 8) | (block[i-1] << 8); | ||
797 | j = ftab[s] - 1; | ||
798 | ftab[s] = j; | ||
799 | ptr[j] = i-1; | ||
800 | s = (s >> 8) | (block[i-2] << 8); | ||
801 | j = ftab[s] - 1; | ||
802 | ftab[s] = j; | ||
803 | ptr[j] = i-2; | ||
804 | s = (s >> 8) | (block[i-3] << 8); | ||
805 | j = ftab[s] - 1; | ||
806 | ftab[s] = j; | ||
807 | ptr[j] = i-3; | ||
808 | } | ||
809 | #endif | ||
810 | for (; i >= 0; i--) { | ||
811 | s = (s >> 8) | (block[i] << 8); | ||
812 | j = ftab[s] - 1; | ||
813 | ftab[s] = j; | ||
814 | ptr[j] = i; | ||
815 | } | ||
816 | |||
817 | /* | ||
818 | * Now ftab contains the first loc of every small bucket. | ||
819 | * Calculate the running order, from smallest to largest | ||
820 | * big bucket. | ||
821 | */ | ||
822 | for (i = 0; i <= 255; i++) { | ||
823 | bigDone [i] = False; | ||
824 | runningOrder[i] = i; | ||
825 | } | ||
826 | |||
827 | { | ||
828 | int32_t vv; | ||
829 | /* bbox: was: int32_t h = 1; */ | ||
830 | /* do h = 3 * h + 1; while (h <= 256); */ | ||
831 | uint32_t h = 364; | ||
832 | |||
833 | do { | ||
834 | /*h = h / 3;*/ | ||
835 | h = (h * 171) >> 9; /* bbox: fast h/3 */ | ||
836 | for (i = h; i <= 255; i++) { | ||
837 | vv = runningOrder[i]; | ||
838 | j = i; | ||
839 | while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) { | ||
840 | runningOrder[j] = runningOrder[j-h]; | ||
841 | j = j - h; | ||
842 | if (j <= (h - 1)) | ||
843 | goto zero; | ||
844 | } | ||
845 | zero: | ||
846 | runningOrder[j] = vv; | ||
847 | } | ||
848 | } while (h != 1); | ||
849 | } | ||
850 | |||
851 | /* | ||
852 | * The main sorting loop. | ||
853 | */ | ||
854 | |||
855 | numQSorted = 0; | ||
856 | |||
857 | for (i = 0; i <= 255; i++) { | ||
858 | |||
859 | /* | ||
860 | * Process big buckets, starting with the least full. | ||
861 | * Basically this is a 3-step process in which we call | ||
862 | * mainQSort3 to sort the small buckets [ss, j], but | ||
863 | * also make a big effort to avoid the calls if we can. | ||
864 | */ | ||
865 | ss = runningOrder[i]; | ||
866 | |||
867 | /* | ||
868 | * Step 1: | ||
869 | * Complete the big bucket [ss] by quicksorting | ||
870 | * any unsorted small buckets [ss, j], for j != ss. | ||
871 | * Hopefully previous pointer-scanning phases have already | ||
872 | * completed many of the small buckets [ss, j], so | ||
873 | * we don't have to sort them at all. | ||
874 | */ | ||
875 | for (j = 0; j <= 255; j++) { | ||
876 | if (j != ss) { | ||
877 | sb = (ss << 8) + j; | ||
878 | if (!(ftab[sb] & SETMASK)) { | ||
879 | int32_t lo = ftab[sb] & CLEARMASK; | ||
880 | int32_t hi = (ftab[sb+1] & CLEARMASK) - 1; | ||
881 | if (hi > lo) { | ||
882 | mainQSort3( | ||
883 | ptr, block, quadrant, nblock, | ||
884 | lo, hi, BZ_N_RADIX, budget | ||
885 | ); | ||
886 | if (*budget < 0) return; | ||
887 | numQSorted += (hi - lo + 1); | ||
888 | } | ||
889 | } | ||
890 | ftab[sb] |= SETMASK; | ||
891 | } | ||
892 | } | ||
893 | |||
894 | AssertH(!bigDone[ss], 1006); | ||
895 | |||
896 | /* | ||
897 | * Step 2: | ||
898 | * Now scan this big bucket [ss] so as to synthesise the | ||
899 | * sorted order for small buckets [t, ss] for all t, | ||
900 | * including, magically, the bucket [ss,ss] too. | ||
901 | * This will avoid doing Real Work in subsequent Step 1's. | ||
902 | */ | ||
903 | { | ||
904 | for (j = 0; j <= 255; j++) { | ||
905 | copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; | ||
906 | copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; | ||
907 | } | ||
908 | for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { | ||
909 | k = ptr[j] - 1; | ||
910 | if (k < 0) | ||
911 | k += nblock; | ||
912 | c1 = block[k]; | ||
913 | if (!bigDone[c1]) | ||
914 | ptr[copyStart[c1]++] = k; | ||
915 | } | ||
916 | for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { | ||
917 | k = ptr[j]-1; | ||
918 | if (k < 0) | ||
919 | k += nblock; | ||
920 | c1 = block[k]; | ||
921 | if (!bigDone[c1]) | ||
922 | ptr[copyEnd[c1]--] = k; | ||
923 | } | ||
924 | } | ||
925 | |||
926 | /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. | ||
927 | * Necessity for this case is demonstrated by compressing | ||
928 | * a sequence of approximately 48.5 million of character | ||
929 | * 251; 1.0.0/1.0.1 will then die here. */ | ||
930 | AssertH((copyStart[ss]-1 == copyEnd[ss]) \ | ||
931 | || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007); | ||
932 | |||
933 | for (j = 0; j <= 255; j++) | ||
934 | ftab[(j << 8) + ss] |= SETMASK; | ||
935 | |||
936 | /* | ||
937 | * Step 3: | ||
938 | * The [ss] big bucket is now done. Record this fact, | ||
939 | * and update the quadrant descriptors. Remember to | ||
940 | * update quadrants in the overshoot area too, if | ||
941 | * necessary. The "if (i < 255)" test merely skips | ||
942 | * this updating for the last bucket processed, since | ||
943 | * updating for the last bucket is pointless. | ||
944 | * | ||
945 | * The quadrant array provides a way to incrementally | ||
946 | * cache sort orderings, as they appear, so as to | ||
947 | * make subsequent comparisons in fullGtU() complete | ||
948 | * faster. For repetitive blocks this makes a big | ||
949 | * difference (but not big enough to be able to avoid | ||
950 | * the fallback sorting mechanism, exponential radix sort). | ||
951 | * | ||
952 | * The precise meaning is: at all times: | ||
953 | * | ||
954 | * for 0 <= i < nblock and 0 <= j <= nblock | ||
955 | * | ||
956 | * if block[i] != block[j], | ||
957 | * | ||
958 | * then the relative values of quadrant[i] and | ||
959 | * quadrant[j] are meaningless. | ||
960 | * | ||
961 | * else { | ||
962 | * if quadrant[i] < quadrant[j] | ||
963 | * then the string starting at i lexicographically | ||
964 | * precedes the string starting at j | ||
965 | * | ||
966 | * else if quadrant[i] > quadrant[j] | ||
967 | * then the string starting at j lexicographically | ||
968 | * precedes the string starting at i | ||
969 | * | ||
970 | * else | ||
971 | * the relative ordering of the strings starting | ||
972 | * at i and j has not yet been determined. | ||
973 | * } | ||
974 | */ | ||
975 | bigDone[ss] = True; | ||
976 | |||
977 | if (i < 255) { | ||
978 | int32_t bbStart = ftab[ss << 8] & CLEARMASK; | ||
979 | int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; | ||
980 | int32_t shifts = 0; | ||
981 | |||
982 | while ((bbSize >> shifts) > 65534) shifts++; | ||
983 | |||
984 | for (j = bbSize-1; j >= 0; j--) { | ||
985 | int32_t a2update = ptr[bbStart + j]; | ||
986 | uint16_t qVal = (uint16_t)(j >> shifts); | ||
987 | quadrant[a2update] = qVal; | ||
988 | if (a2update < BZ_N_OVERSHOOT) | ||
989 | quadrant[a2update + nblock] = qVal; | ||
990 | } | ||
991 | AssertH(((bbSize-1) >> shifts) <= 65535, 1002); | ||
992 | } | ||
993 | } | ||
994 | #undef runningOrder | ||
995 | #undef copyStart | ||
996 | #undef copyEnd | ||
997 | } | ||
998 | |||
999 | #undef BIGFREQ | ||
1000 | #undef SETMASK | ||
1001 | #undef CLEARMASK | ||
1002 | |||
1003 | |||
1004 | /*---------------------------------------------*/ | ||
1005 | /* Pre: | ||
1006 | * nblock > 0 | ||
1007 | * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] | ||
1008 | * ((uint8_t*)arr2)[0 .. nblock-1] holds block | ||
1009 | * arr1 exists for [0 .. nblock-1] | ||
1010 | * | ||
1011 | * Post: | ||
1012 | * ((uint8_t*)arr2) [0 .. nblock-1] holds block | ||
1013 | * All other areas of block destroyed | ||
1014 | * ftab[0 .. 65536] destroyed | ||
1015 | * arr1[0 .. nblock-1] holds sorted order | ||
1016 | */ | ||
1017 | static NOINLINE | ||
1018 | void BZ2_blockSort(EState* s) | ||
1019 | { | ||
1020 | /* In original bzip2 1.0.4, it's a parameter, but 30 | ||
1021 | * (which was the default) should work ok. */ | ||
1022 | enum { wfact = 30 }; | ||
1023 | |||
1024 | uint32_t* ptr = s->ptr; | ||
1025 | uint8_t* block = s->block; | ||
1026 | uint32_t* ftab = s->ftab; | ||
1027 | int32_t nblock = s->nblock; | ||
1028 | uint16_t* quadrant; | ||
1029 | int32_t budget; | ||
1030 | int32_t i; | ||
1031 | |||
1032 | if (nblock < 10000) { | ||
1033 | fallbackSort(s->arr1, s->arr2, ftab, nblock); | ||
1034 | } else { | ||
1035 | /* Calculate the location for quadrant, remembering to get | ||
1036 | * the alignment right. Assumes that &(block[0]) is at least | ||
1037 | * 2-byte aligned -- this should be ok since block is really | ||
1038 | * the first section of arr2. | ||
1039 | */ | ||
1040 | i = nblock + BZ_N_OVERSHOOT; | ||
1041 | if (i & 1) i++; | ||
1042 | quadrant = (uint16_t*)(&(block[i])); | ||
1043 | |||
1044 | /* (wfact-1) / 3 puts the default-factor-30 | ||
1045 | * transition point at very roughly the same place as | ||
1046 | * with v0.1 and v0.9.0. | ||
1047 | * Not that it particularly matters any more, since the | ||
1048 | * resulting compressed stream is now the same regardless | ||
1049 | * of whether or not we use the main sort or fallback sort. | ||
1050 | */ | ||
1051 | budget = nblock * ((wfact-1) / 3); | ||
1052 | |||
1053 | mainSort(s, ptr, block, quadrant, ftab, nblock, &budget); | ||
1054 | if (budget < 0) { | ||
1055 | fallbackSort(s->arr1, s->arr2, ftab, nblock); | ||
1056 | } | ||
1057 | } | ||
1058 | |||
1059 | s->origPtr = -1; | ||
1060 | for (i = 0; i < s->nblock; i++) | ||
1061 | if (ptr[i] == 0) { | ||
1062 | s->origPtr = i; | ||
1063 | break; | ||
1064 | }; | ||
1065 | |||
1066 | AssertH(s->origPtr != -1, 1003); | ||
1067 | } | ||
1068 | |||
1069 | |||
1070 | /*-------------------------------------------------------------*/ | ||
1071 | /*--- end blocksort.c ---*/ | ||
1072 | /*-------------------------------------------------------------*/ |