Magellan Linux

Annotation of /trunk/mkinitrd-magellan/busybox/archival/libunarchive/decompress_bunzip2.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1123 - (hide annotations) (download)
Wed Aug 18 21:56:57 2010 UTC (13 years, 8 months ago) by niro
File MIME type: text/plain
File size: 24548 byte(s)
-updated to busybox-1.17.1
1 niro 532 /* vi: set sw=4 ts=4: */
2     /* Small bzip2 deflate implementation, by Rob Landley (rob@landley.net).
3    
4     Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
5     which also acknowledges contributions by Mike Burrows, David Wheeler,
6     Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten,
7     Robert Sedgewick, and Jon L. Bentley.
8    
9     Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
10     */
11    
12     /*
13     Size and speed optimizations by Manuel Novoa III (mjn3@codepoet.org).
14    
15     More efficient reading of Huffman codes, a streamlined read_bunzip()
16     function, and various other tweaks. In (limited) tests, approximately
17     20% faster than bzcat on x86 and about 10% faster on arm.
18    
19     Note that about 2/3 of the time is spent in read_unzip() reversing
20     the Burrows-Wheeler transformation. Much of that time is delay
21     resulting from cache misses.
22    
23     I would ask that anyone benefiting from this work, especially those
24     using it in commercial products, consider making a donation to my local
25     non-profit hospice organization (www.hospiceacadiana.com) in the name of
26     the woman I loved, Toni W. Hagan, who passed away Feb. 12, 2003.
27    
28     Manuel
29     */
30    
31     #include "libbb.h"
32     #include "unarchive.h"
33    
34     /* Constants for Huffman coding */
35     #define MAX_GROUPS 6
36     #define GROUP_SIZE 50 /* 64 would have been more efficient */
37     #define MAX_HUFCODE_BITS 20 /* Longest Huffman code allowed */
38     #define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */
39     #define SYMBOL_RUNA 0
40     #define SYMBOL_RUNB 1
41    
42     /* Status return values */
43     #define RETVAL_OK 0
44     #define RETVAL_LAST_BLOCK (-1)
45     #define RETVAL_NOT_BZIP_DATA (-2)
46     #define RETVAL_UNEXPECTED_INPUT_EOF (-3)
47 niro 816 #define RETVAL_SHORT_WRITE (-4)
48 niro 532 #define RETVAL_DATA_ERROR (-5)
49     #define RETVAL_OUT_OF_MEMORY (-6)
50     #define RETVAL_OBSOLETE_INPUT (-7)
51    
52     /* Other housekeeping constants */
53     #define IOBUF_SIZE 4096
54    
55     /* This is what we know about each Huffman coding group */
56     struct group_data {
57 niro 816 /* We have an extra slot at the end of limit[] for a sentinel value. */
58     int limit[MAX_HUFCODE_BITS+1], base[MAX_HUFCODE_BITS], permute[MAX_SYMBOLS];
59 niro 532 int minLen, maxLen;
60     };
61    
62     /* Structure holding all the housekeeping data, including IO buffers and
63 niro 816 * memory that persists between calls to bunzip
64     * Found the most used member:
65     * cat this_file.c | sed -e 's/"/ /g' -e "s/'/ /g" | xargs -n1 \
66     * | grep 'bd->' | sed 's/^.*bd->/bd->/' | sort | $PAGER
67     * and moved it (inbufBitCount) to offset 0.
68     */
69     struct bunzip_data {
70     /* I/O tracking data (file handles, buffers, positions, etc.) */
71     unsigned inbufBitCount, inbufBits;
72     int in_fd, out_fd, inbufCount, inbufPos /*, outbufPos*/;
73     unsigned char *inbuf /*,*outbuf*/;
74 niro 532
75     /* State for interrupting output loop */
76 niro 816 int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent;
77 niro 532
78     /* The CRC values stored in the block header and calculated from the data */
79 niro 816 uint32_t headerCRC, totalCRC, writeCRC;
80 niro 532
81     /* Intermediate buffer and its size (in bytes) */
82 niro 816 unsigned *dbuf, dbufSize;
83 niro 532
84 niro 816 /* For I/O error handling */
85     jmp_buf jmpbuf;
86 niro 532
87 niro 816 /* Big things go last (register-relative addressing can be larger for big offsets) */
88     uint32_t crc32Table[256];
89 niro 532 unsigned char selectors[32768]; /* nSelectors=15 bits */
90     struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */
91 niro 816 };
92     /* typedef struct bunzip_data bunzip_data; -- done in .h file */
93 niro 532
94    
95     /* Return the next nnn bits of input. All reads from the compressed input
96     are done through this function. All reads are big endian */
97    
98 niro 816 static unsigned get_bits(bunzip_data *bd, int bits_wanted)
99 niro 532 {
100 niro 816 unsigned bits = 0;
101 niro 532
102     /* If we need to get more data from the byte buffer, do so. (Loop getting
103     one byte at a time to enforce endianness and avoid unaligned access.) */
104 niro 816 while ((int)(bd->inbufBitCount) < bits_wanted) {
105 niro 532
106     /* If we need to read more data from file into byte buffer, do so */
107 niro 816 if (bd->inbufPos == bd->inbufCount) {
108     /* if "no input fd" case: in_fd == -1, read fails, we jump */
109     bd->inbufCount = read(bd->in_fd, bd->inbuf, IOBUF_SIZE);
110     if (bd->inbufCount <= 0)
111     longjmp(bd->jmpbuf, RETVAL_UNEXPECTED_INPUT_EOF);
112     bd->inbufPos = 0;
113 niro 532 }
114    
115     /* Avoid 32-bit overflow (dump bit buffer to top of output) */
116 niro 816 if (bd->inbufBitCount >= 24) {
117     bits = bd->inbufBits & ((1 << bd->inbufBitCount) - 1);
118     bits_wanted -= bd->inbufBitCount;
119     bits <<= bits_wanted;
120     bd->inbufBitCount = 0;
121 niro 532 }
122    
123     /* Grab next 8 bits of input from buffer. */
124 niro 816 bd->inbufBits = (bd->inbufBits << 8) | bd->inbuf[bd->inbufPos++];
125     bd->inbufBitCount += 8;
126 niro 532 }
127    
128     /* Calculate result */
129 niro 816 bd->inbufBitCount -= bits_wanted;
130     bits |= (bd->inbufBits >> bd->inbufBitCount) & ((1 << bits_wanted) - 1);
131 niro 532
132     return bits;
133     }
134    
135     /* Unpacks the next block and sets up for the inverse burrows-wheeler step. */
136     static int get_next_block(bunzip_data *bd)
137     {
138     struct group_data *hufGroup;
139 niro 816 int dbufCount, nextSym, dbufSize, groupCount, *base, *limit, selector,
140     i, j, k, t, runPos, symCount, symTotal, nSelectors, byteCount[256];
141 niro 532 unsigned char uc, symToByte[256], mtfSymbol[256], *selectors;
142 niro 816 unsigned *dbuf, origPtr;
143 niro 532
144 niro 816 dbuf = bd->dbuf;
145     dbufSize = bd->dbufSize;
146     selectors = bd->selectors;
147 niro 532
148     /* Reset longjmp I/O error handling */
149 niro 816 i = setjmp(bd->jmpbuf);
150 niro 532 if (i) return i;
151    
152     /* Read in header signature and CRC, then validate signature.
153     (last block signature means CRC is for whole file, return now) */
154 niro 816 i = get_bits(bd, 24);
155     j = get_bits(bd, 24);
156     bd->headerCRC = get_bits(bd, 32);
157 niro 532 if ((i == 0x177245) && (j == 0x385090)) return RETVAL_LAST_BLOCK;
158     if ((i != 0x314159) || (j != 0x265359)) return RETVAL_NOT_BZIP_DATA;
159    
160     /* We can add support for blockRandomised if anybody complains. There was
161     some code for this in busybox 1.0.0-pre3, but nobody ever noticed that
162     it didn't actually work. */
163 niro 816 if (get_bits(bd, 1)) return RETVAL_OBSOLETE_INPUT;
164     origPtr = get_bits(bd, 24);
165     if ((int)origPtr > dbufSize) return RETVAL_DATA_ERROR;
166 niro 532
167     /* mapping table: if some byte values are never used (encoding things
168     like ascii text), the compression code removes the gaps to have fewer
169     symbols to deal with, and writes a sparse bitfield indicating which
170     values were present. We make a translation table to convert the symbols
171     back to the corresponding bytes. */
172 niro 816 t = get_bits(bd, 16);
173     symTotal = 0;
174     for (i = 0; i < 16; i++) {
175     if (t & (1 << (15-i))) {
176     k = get_bits(bd, 16);
177     for (j = 0; j < 16; j++)
178     if (k & (1 << (15-j)))
179     symToByte[symTotal++] = (16*i) + j;
180 niro 532 }
181     }
182    
183     /* How many different Huffman coding groups does this block use? */
184 niro 816 groupCount = get_bits(bd, 3);
185     if (groupCount < 2 || groupCount > MAX_GROUPS)
186     return RETVAL_DATA_ERROR;
187 niro 532
188     /* nSelectors: Every GROUP_SIZE many symbols we select a new Huffman coding
189     group. Read in the group selector list, which is stored as MTF encoded
190     bit runs. (MTF=Move To Front, as each value is used it's moved to the
191     start of the list.) */
192 niro 816 nSelectors = get_bits(bd, 15);
193     if (!nSelectors) return RETVAL_DATA_ERROR;
194     for (i = 0; i < groupCount; i++) mtfSymbol[i] = i;
195     for (i = 0; i < nSelectors; i++) {
196 niro 532
197     /* Get next value */
198 niro 816 for (j = 0; get_bits(bd, 1); j++)
199     if (j >= groupCount) return RETVAL_DATA_ERROR;
200 niro 532
201     /* Decode MTF to get the next selector */
202     uc = mtfSymbol[j];
203     for (;j;j--) mtfSymbol[j] = mtfSymbol[j-1];
204 niro 816 mtfSymbol[0] = selectors[i] = uc;
205 niro 532 }
206    
207     /* Read the Huffman coding tables for each group, which code for symTotal
208     literal symbols, plus two run symbols (RUNA, RUNB) */
209 niro 816 symCount = symTotal + 2;
210     for (j = 0; j < groupCount; j++) {
211     unsigned char length[MAX_SYMBOLS];
212     /* 8 bits is ALMOST enough for temp[], see below */
213     unsigned temp[MAX_HUFCODE_BITS+1];
214     int minLen, maxLen, pp;
215 niro 532
216     /* Read Huffman code lengths for each symbol. They're stored in
217     a way similar to mtf; record a starting value for the first symbol,
218     and an offset from the previous value for everys symbol after that.
219     (Subtracting 1 before the loop and then adding it back at the end is
220     an optimization that makes the test inside the loop simpler: symbol
221     length 0 becomes negative, so an unsigned inequality catches it.) */
222 niro 816 t = get_bits(bd, 5) - 1;
223 niro 532 for (i = 0; i < symCount; i++) {
224     for (;;) {
225 niro 816 if ((unsigned)t > (MAX_HUFCODE_BITS-1))
226 niro 532 return RETVAL_DATA_ERROR;
227    
228     /* If first bit is 0, stop. Else second bit indicates whether
229     to increment or decrement the value. Optimization: grab 2
230     bits and unget the second if the first was 0. */
231 niro 816 k = get_bits(bd, 2);
232 niro 532 if (k < 2) {
233     bd->inbufBitCount++;
234     break;
235     }
236    
237     /* Add one if second bit 1, else subtract 1. Avoids if/else */
238 niro 816 t += (((k+1) & 2) - 1);
239 niro 532 }
240    
241     /* Correct for the initial -1, to get the final symbol length */
242 niro 816 length[i] = t + 1;
243 niro 532 }
244    
245     /* Find largest and smallest lengths in this group */
246 niro 816 minLen = maxLen = length[0];
247 niro 532 for (i = 1; i < symCount; i++) {
248 niro 816 if (length[i] > maxLen) maxLen = length[i];
249     else if (length[i] < minLen) minLen = length[i];
250 niro 532 }
251    
252     /* Calculate permute[], base[], and limit[] tables from length[].
253     *
254     * permute[] is the lookup table for converting Huffman coded symbols
255     * into decoded symbols. base[] is the amount to subtract from the
256     * value of a Huffman symbol of a given length when using permute[].
257     *
258     * limit[] indicates the largest numerical value a symbol with a given
259     * number of bits can have. This is how the Huffman codes can vary in
260     * length: each code with a value>limit[length] needs another bit.
261     */
262 niro 816 hufGroup = bd->groups + j;
263 niro 532 hufGroup->minLen = minLen;
264     hufGroup->maxLen = maxLen;
265    
266     /* Note that minLen can't be smaller than 1, so we adjust the base
267     and limit array pointers so we're not always wasting the first
268     entry. We do this again when using them (during symbol decoding).*/
269 niro 816 base = hufGroup->base - 1;
270     limit = hufGroup->limit - 1;
271 niro 532
272     /* Calculate permute[]. Concurently, initialize temp[] and limit[]. */
273 niro 816 pp = 0;
274     for (i = minLen; i <= maxLen; i++) {
275     temp[i] = limit[i] = 0;
276     for (t = 0; t < symCount; t++)
277     if (length[t] == i)
278     hufGroup->permute[pp++] = t;
279 niro 532 }
280    
281     /* Count symbols coded for at each bit length */
282 niro 816 /* NB: in pathological cases, temp[8] can end ip being 256.
283     * That's why uint8_t is too small for temp[]. */
284     for (i = 0; i < symCount; i++) temp[length[i]]++;
285 niro 532
286     /* Calculate limit[] (the largest symbol-coding value at each bit
287     * length, which is (previous limit<<1)+symbols at this level), and
288     * base[] (number of symbols to ignore at each bit length, which is
289     * limit minus the cumulative count of symbols coded for already). */
290 niro 816 pp = t = 0;
291     for (i = minLen; i < maxLen; i++) {
292     pp += temp[i];
293 niro 532
294     /* We read the largest possible symbol size and then unget bits
295     after determining how many we need, and those extra bits could
296     be set to anything. (They're noise from future symbols.) At
297     each level we're really only interested in the first few bits,
298     so here we set all the trailing to-be-ignored bits to 1 so they
299     don't affect the value>limit[length] comparison. */
300 niro 816 limit[i] = (pp << (maxLen - i)) - 1;
301     pp <<= 1;
302     t += temp[i];
303     base[i+1] = pp - t;
304 niro 532 }
305 niro 816 limit[maxLen+1] = INT_MAX; /* Sentinel value for reading next sym. */
306     limit[maxLen] = pp + temp[maxLen] - 1;
307     base[minLen] = 0;
308 niro 532 }
309    
310     /* We've finished reading and digesting the block header. Now read this
311     block's Huffman coded symbols from the file and undo the Huffman coding
312 niro 816 and run length encoding, saving the result into dbuf[dbufCount++] = uc */
313 niro 532
314     /* Initialize symbol occurrence counters and symbol Move To Front table */
315 niro 816 memset(byteCount, 0, sizeof(byteCount)); /* smaller, maybe slower? */
316     for (i = 0; i < 256; i++) {
317     //byteCount[i] = 0;
318     mtfSymbol[i] = (unsigned char)i;
319 niro 532 }
320    
321     /* Loop through compressed symbols. */
322    
323 niro 816 runPos = dbufCount = selector = 0;
324 niro 532 for (;;) {
325    
326 niro 816 /* Fetch next Huffman coding group from list. */
327     symCount = GROUP_SIZE - 1;
328     if (selector >= nSelectors) return RETVAL_DATA_ERROR;
329     hufGroup = bd->groups + selectors[selector++];
330     base = hufGroup->base - 1;
331     limit = hufGroup->limit - 1;
332     continue_this_group:
333 niro 532
334     /* Read next Huffman-coded symbol. */
335    
336     /* Note: It is far cheaper to read maxLen bits and back up than it is
337     to read minLen bits and then an additional bit at a time, testing
338     as we go. Because there is a trailing last block (with file CRC),
339     there is no danger of the overread causing an unexpected EOF for a
340     valid compressed file. As a further optimization, we do the read
341     inline (falling back to a call to get_bits if the buffer runs
342     dry). The following (up to got_huff_bits:) is equivalent to
343 niro 816 j = get_bits(bd, hufGroup->maxLen);
344 niro 532 */
345 niro 816 while ((int)(bd->inbufBitCount) < hufGroup->maxLen) {
346     if (bd->inbufPos == bd->inbufCount) {
347     j = get_bits(bd, hufGroup->maxLen);
348 niro 532 goto got_huff_bits;
349     }
350 niro 816 bd->inbufBits = (bd->inbufBits << 8) | bd->inbuf[bd->inbufPos++];
351     bd->inbufBitCount += 8;
352 niro 532 };
353 niro 816 bd->inbufBitCount -= hufGroup->maxLen;
354     j = (bd->inbufBits >> bd->inbufBitCount) & ((1 << hufGroup->maxLen) - 1);
355 niro 532
356 niro 816 got_huff_bits:
357 niro 532
358     /* Figure how how many bits are in next symbol and unget extras */
359 niro 816 i = hufGroup->minLen;
360     while (j > limit[i]) ++i;
361 niro 532 bd->inbufBitCount += (hufGroup->maxLen - i);
362    
363     /* Huffman decode value to get nextSym (with bounds checking) */
364 niro 816 if (i > hufGroup->maxLen)
365 niro 532 return RETVAL_DATA_ERROR;
366 niro 816 j = (j >> (hufGroup->maxLen - i)) - base[i];
367     if ((unsigned)j >= MAX_SYMBOLS)
368     return RETVAL_DATA_ERROR;
369 niro 532 nextSym = hufGroup->permute[j];
370    
371     /* We have now decoded the symbol, which indicates either a new literal
372     byte, or a repeated run of the most recent literal byte. First,
373     check if nextSym indicates a repeated run, and if so loop collecting
374     how many times to repeat the last literal. */
375 niro 816 if ((unsigned)nextSym <= SYMBOL_RUNB) { /* RUNA or RUNB */
376 niro 532
377     /* If this is the start of a new run, zero out counter */
378 niro 816 if (!runPos) {
379 niro 532 runPos = 1;
380     t = 0;
381     }
382    
383     /* Neat trick that saves 1 symbol: instead of or-ing 0 or 1 at
384     each bit position, add 1 or 2 instead. For example,
385     1011 is 1<<0 + 1<<1 + 2<<2. 1010 is 2<<0 + 2<<1 + 1<<2.
386     You can make any bit pattern that way using 1 less symbol than
387     the basic or 0/1 method (except all bits 0, which would use no
388     symbols, but a run of length 0 doesn't mean anything in this
389     context). Thus space is saved. */
390     t += (runPos << nextSym); /* +runPos if RUNA; +2*runPos if RUNB */
391 niro 816 if (runPos < dbufSize) runPos <<= 1;
392 niro 532 goto end_of_huffman_loop;
393     }
394    
395     /* When we hit the first non-run symbol after a run, we now know
396     how many times to repeat the last literal, so append that many
397     copies to our buffer of decoded symbols (dbuf) now. (The last
398     literal used is the one at the head of the mtfSymbol array.) */
399 niro 816 if (runPos) {
400     runPos = 0;
401     if (dbufCount + t >= dbufSize) return RETVAL_DATA_ERROR;
402 niro 532
403     uc = symToByte[mtfSymbol[0]];
404     byteCount[uc] += t;
405 niro 816 while (t--) dbuf[dbufCount++] = uc;
406 niro 532 }
407    
408     /* Is this the terminating symbol? */
409 niro 816 if (nextSym > symTotal) break;
410 niro 532
411     /* At this point, nextSym indicates a new literal character. Subtract
412     one to get the position in the MTF array at which this literal is
413     currently to be found. (Note that the result can't be -1 or 0,
414     because 0 and 1 are RUNA and RUNB. But another instance of the
415     first symbol in the mtf array, position 0, would have been handled
416     as part of a run above. Therefore 1 unused mtf position minus
417     2 non-literal nextSym values equals -1.) */
418 niro 816 if (dbufCount >= dbufSize) return RETVAL_DATA_ERROR;
419 niro 532 i = nextSym - 1;
420     uc = mtfSymbol[i];
421    
422     /* Adjust the MTF array. Since we typically expect to move only a
423     * small number of symbols, and are bound by 256 in any case, using
424     * memmove here would typically be bigger and slower due to function
425     * call overhead and other assorted setup costs. */
426     do {
427     mtfSymbol[i] = mtfSymbol[i-1];
428     } while (--i);
429     mtfSymbol[0] = uc;
430 niro 816 uc = symToByte[uc];
431 niro 532
432     /* We have our literal byte. Save it into dbuf. */
433     byteCount[uc]++;
434 niro 816 dbuf[dbufCount++] = (unsigned)uc;
435 niro 532
436     /* Skip group initialization if we're not done with this group. Done
437     * this way to avoid compiler warning. */
438 niro 816 end_of_huffman_loop:
439     if (symCount--) goto continue_this_group;
440 niro 532 }
441    
442     /* At this point, we've read all the Huffman-coded symbols (and repeated
443 niro 816 runs) for this block from the input stream, and decoded them into the
444 niro 532 intermediate buffer. There are dbufCount many decoded bytes in dbuf[].
445     Now undo the Burrows-Wheeler transform on dbuf.
446     See http://dogma.net/markn/articles/bwt/bwt.htm
447     */
448    
449     /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
450 niro 816 j = 0;
451     for (i = 0; i < 256; i++) {
452     k = j + byteCount[i];
453 niro 532 byteCount[i] = j;
454 niro 816 j = k;
455 niro 532 }
456    
457     /* Figure out what order dbuf would be in if we sorted it. */
458 niro 816 for (i = 0; i < dbufCount; i++) {
459     uc = (unsigned char)(dbuf[i] & 0xff);
460 niro 532 dbuf[byteCount[uc]] |= (i << 8);
461     byteCount[uc]++;
462     }
463    
464     /* Decode first byte by hand to initialize "previous" byte. Note that it
465     doesn't get output, and if the first three characters are identical
466     it doesn't qualify as a run (hence writeRunCountdown=5). */
467 niro 816 if (dbufCount) {
468     if ((int)origPtr >= dbufCount) return RETVAL_DATA_ERROR;
469     bd->writePos = dbuf[origPtr];
470     bd->writeCurrent = (unsigned char)(bd->writePos & 0xff);
471     bd->writePos >>= 8;
472     bd->writeRunCountdown = 5;
473 niro 532 }
474 niro 816 bd->writeCount = dbufCount;
475 niro 532
476     return RETVAL_OK;
477     }
478    
479     /* Undo burrows-wheeler transform on intermediate buffer to produce output.
480     If start_bunzip was initialized with out_fd=-1, then up to len bytes of
481     data are written to outbuf. Return value is number of bytes written or
482     error (all errors are negative numbers). If out_fd!=-1, outbuf and len
483     are ignored, data is written to out_fd and return is RETVAL_OK or error.
484     */
485 niro 816 int FAST_FUNC read_bunzip(bunzip_data *bd, char *outbuf, int len)
486 niro 532 {
487 niro 816 const unsigned *dbuf;
488     int pos, current, previous, gotcount;
489 niro 532
490     /* If last read was short due to end of file, return last block now */
491 niro 816 if (bd->writeCount < 0) return bd->writeCount;
492 niro 532
493     gotcount = 0;
494 niro 816 dbuf = bd->dbuf;
495     pos = bd->writePos;
496     current = bd->writeCurrent;
497 niro 532
498     /* We will always have pending decoded data to write into the output
499     buffer unless this is the very first call (in which case we haven't
500     Huffman-decoded a block into the intermediate buffer yet). */
501     if (bd->writeCopies) {
502    
503     /* Inside the loop, writeCopies means extra copies (beyond 1) */
504     --bd->writeCopies;
505    
506     /* Loop outputting bytes */
507     for (;;) {
508    
509     /* If the output buffer is full, snapshot state and return */
510 niro 816 if (gotcount >= len) {
511     bd->writePos = pos;
512     bd->writeCurrent = current;
513 niro 532 bd->writeCopies++;
514     return len;
515     }
516    
517     /* Write next byte into output buffer, updating CRC */
518     outbuf[gotcount++] = current;
519 niro 816 bd->writeCRC = (bd->writeCRC << 8)
520     ^ bd->crc32Table[(bd->writeCRC >> 24) ^ current];
521 niro 532
522     /* Loop now if we're outputting multiple copies of this byte */
523     if (bd->writeCopies) {
524     --bd->writeCopies;
525     continue;
526     }
527 niro 816 decode_next_byte:
528 niro 532 if (!bd->writeCount--) break;
529     /* Follow sequence vector to undo Burrows-Wheeler transform */
530 niro 816 previous = current;
531     pos = dbuf[pos];
532     current = pos & 0xff;
533     pos >>= 8;
534 niro 532
535 niro 816 /* After 3 consecutive copies of the same byte, the 4th
536     * is a repeat count. We count down from 4 instead
537 niro 532 * of counting up because testing for non-zero is faster */
538 niro 816 if (--bd->writeRunCountdown) {
539     if (current != previous)
540     bd->writeRunCountdown = 4;
541 niro 532 } else {
542    
543     /* We have a repeated run, this byte indicates the count */
544 niro 816 bd->writeCopies = current;
545     current = previous;
546     bd->writeRunCountdown = 5;
547 niro 532
548     /* Sometimes there are just 3 bytes (run length 0) */
549 niro 816 if (!bd->writeCopies) goto decode_next_byte;
550 niro 532
551     /* Subtract the 1 copy we'd output anyway to get extras */
552     --bd->writeCopies;
553     }
554     }
555    
556     /* Decompression of this block completed successfully */
557 niro 816 bd->writeCRC = ~bd->writeCRC;
558     bd->totalCRC = ((bd->totalCRC << 1) | (bd->totalCRC >> 31)) ^ bd->writeCRC;
559 niro 532
560     /* If this block had a CRC error, force file level CRC error. */
561 niro 816 if (bd->writeCRC != bd->headerCRC) {
562     bd->totalCRC = bd->headerCRC + 1;
563 niro 532 return RETVAL_LAST_BLOCK;
564     }
565     }
566    
567     /* Refill the intermediate buffer by Huffman-decoding next block of input */
568     /* (previous is just a convenient unused temp variable here) */
569 niro 816 previous = get_next_block(bd);
570     if (previous) {
571     bd->writeCount = previous;
572     return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount;
573 niro 532 }
574 niro 816 bd->writeCRC = ~0;
575     pos = bd->writePos;
576     current = bd->writeCurrent;
577 niro 532 goto decode_next_byte;
578     }
579    
580     /* Allocate the structure, read file header. If in_fd==-1, inbuf must contain
581     a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are
582     ignored, and data is read from file handle into temporary buffer. */
583    
584 niro 816 /* Because bunzip2 is used for help text unpacking, and because bb_show_usage()
585     should work for NOFORK applets too, we must be extremely careful to not leak
586     any allocations! */
587     int FAST_FUNC start_bunzip(bunzip_data **bdp, int in_fd, const unsigned char *inbuf,
588 niro 532 int len)
589     {
590     bunzip_data *bd;
591 niro 816 unsigned i;
592     enum {
593     BZh0 = ('B' << 24) + ('Z' << 16) + ('h' << 8) + '0',
594     h0 = ('h' << 8) + '0',
595     };
596 niro 532
597     /* Figure out how much data to allocate */
598 niro 816 i = sizeof(bunzip_data);
599     if (in_fd != -1) i += IOBUF_SIZE;
600 niro 532
601     /* Allocate bunzip_data. Most fields initialize to zero. */
602 niro 816 bd = *bdp = xzalloc(i);
603 niro 532
604     /* Setup input buffer */
605 niro 816 bd->in_fd = in_fd;
606     if (-1 == in_fd) {
607     /* in this case, bd->inbuf is read-only */
608     bd->inbuf = (void*)inbuf; /* cast away const-ness */
609     bd->inbufCount = len;
610     } else
611     bd->inbuf = (unsigned char *)(bd + 1);
612 niro 532
613     /* Init the CRC32 table (big endian) */
614 niro 816 crc32_filltable(bd->crc32Table, 1);
615 niro 532
616     /* Setup for I/O error handling via longjmp */
617 niro 816 i = setjmp(bd->jmpbuf);
618     if (i) return i;
619 niro 532
620     /* Ensure that file starts with "BZh['1'-'9']." */
621 niro 816 /* Update: now caller verifies 1st two bytes, makes .gz/.bz2
622     * integration easier */
623     /* was: */
624     /* i = get_bits(bd, 32); */
625     /* if ((unsigned)(i - BZh0 - 1) >= 9) return RETVAL_NOT_BZIP_DATA; */
626     i = get_bits(bd, 16);
627     if ((unsigned)(i - h0 - 1) >= 9) return RETVAL_NOT_BZIP_DATA;
628 niro 532
629 niro 816 /* Fourth byte (ascii '1'-'9') indicates block size in units of 100k of
630 niro 532 uncompressed data. Allocate intermediate buffer for block. */
631 niro 816 /* bd->dbufSize = 100000 * (i - BZh0); */
632     bd->dbufSize = 100000 * (i - h0);
633 niro 532
634 niro 816 /* Cannot use xmalloc - may leak bd in NOFORK case! */
635     bd->dbuf = malloc_or_warn(bd->dbufSize * sizeof(int));
636     if (!bd->dbuf) {
637     free(bd);
638     xfunc_die();
639     }
640 niro 532 return RETVAL_OK;
641     }
642    
643 niro 816 void FAST_FUNC dealloc_bunzip(bunzip_data *bd)
644     {
645     free(bd->dbuf);
646     free(bd);
647     }
648 niro 532
649 niro 816
650     /* Decompress src_fd to dst_fd. Stops at end of bzip data, not end of file. */
651 niro 984 IF_DESKTOP(long long) int FAST_FUNC
652 niro 816 unpack_bz2_stream(int src_fd, int dst_fd)
653 niro 532 {
654 niro 984 IF_DESKTOP(long long total_written = 0;)
655 niro 532 char *outbuf;
656     bunzip_data *bd;
657     int i;
658    
659 niro 816 outbuf = xmalloc(IOBUF_SIZE);
660     i = start_bunzip(&bd, src_fd, NULL, 0);
661     if (!i) {
662 niro 532 for (;;) {
663 niro 816 i = read_bunzip(bd, outbuf, IOBUF_SIZE);
664     if (i <= 0) break;
665     if (i != full_write(dst_fd, outbuf, i)) {
666     i = RETVAL_SHORT_WRITE;
667 niro 532 break;
668     }
669 niro 984 IF_DESKTOP(total_written += i;)
670 niro 532 }
671     }
672    
673     /* Check CRC and release memory */
674    
675 niro 816 if (i == RETVAL_LAST_BLOCK) {
676     if (bd->headerCRC != bd->totalCRC) {
677     bb_error_msg("CRC error");
678 niro 532 } else {
679 niro 816 i = RETVAL_OK;
680 niro 532 }
681 niro 816 } else if (i == RETVAL_SHORT_WRITE) {
682     bb_error_msg("short write");
683 niro 532 } else {
684 niro 816 bb_error_msg("bunzip error %d", i);
685 niro 532 }
686 niro 816 dealloc_bunzip(bd);
687 niro 532 free(outbuf);
688    
689 niro 984 return i ? i : IF_DESKTOP(total_written) + 0;
690 niro 532 }
691    
692 niro 984 IF_DESKTOP(long long) int FAST_FUNC
693 niro 816 unpack_bz2_stream_prime(int src_fd, int dst_fd)
694     {
695 niro 1123 uint16_t magic2;
696     xread(src_fd, &magic2, 2);
697     if (magic2 != BZIP2_MAGIC) {
698 niro 816 bb_error_msg_and_die("invalid magic");
699     }
700     return unpack_bz2_stream(src_fd, dst_fd);
701     }
702    
703 niro 532 #ifdef TESTING
704    
705 niro 816 static char *const bunzip_errors[] = {
706     NULL, "Bad file checksum", "Not bzip data",
707     "Unexpected input EOF", "Unexpected output EOF", "Data error",
708     "Out of memory", "Obsolete (pre 0.9.5) bzip format not supported"
709     };
710 niro 532
711     /* Dumb little test thing, decompress stdin to stdout */
712 niro 816 int main(int argc, char **argv)
713 niro 532 {
714 niro 816 int i;
715 niro 532 char c;
716    
717 niro 816 int i = unpack_bz2_stream_prime(0, 1);
718     if (i < 0)
719     fprintf(stderr, "%s\n", bunzip_errors[-i]);
720     else if (read(STDIN_FILENO, &c, 1))
721     fprintf(stderr, "Trailing garbage ignored\n");
722 niro 532 return -i;
723     }
724     #endif