--- trunk/mage/usr/lib/mage/compressdoc 2011/12/28 12:23:50 1584 +++ trunk/mage/usr/lib/mage/compressdoc 2011/12/28 12:25:54 1585 @@ -1,8 +1,11 @@ #!/bin/bash -# VERSION: 20040320.0026 +# VERSION: 20080421.1623 +# $LastChangedBy: dnicholson $ +# $Date: 2008-04-21 16:27:43 -0700 (Mon, 21 Apr 2008) $ # # Compress (with bzip2 or gzip) all man pages in a hierarchy and # update symlinks - By Marc Heerdink +# # Modified to be able to gzip or bzip2 files as an option and to deal # with all symlinks properly by Mark Hymers # @@ -11,21 +14,39 @@ # to allow for changing hard-links into soft- ones, to specify the # compression level, to parse the man.conf for all occurrences of MANPATH, # to allow for a backup, to allow to keep the newest version of a page. -# Modified 20040330 by Tushar Teredesai to replace $0 by the name of the script. +# +# Modified 20040330 by Tushar Teredesai to replace $0 by the name of the +# script. # (Note: It is assumed that the script is in the user's PATH) # +# Modified 20050112 by Randy McMurchy to shorten line lengths and +# correct grammar errors. +# +# Modified 20060128 by Alexander E. Patrakov for compatibility with Man-DB. +# +# Modified 20060311 by Archaic to use Man-DB manpath utility which is a +# replacement for man --path from Man. +# +# Modified 20080421 by Dan Nicholson to properly execute the correct +# compressdoc when working recursively. This means the same compressdoc +# will be used whether a full path was given or it was resolved from PATH. +# +# Modified 20080421 by Dan Nicholson to be more robust with directories +# that don't exist or don't have sufficient permissions. +# +# Modified 20080421 by Lars Bamberger to (sort of) automatically choose +# a compression method based on the size of the manpage. A couple bug +# fixes were added by Dan Nicholson. +# +# Modified 20080421 by Dan Nicholson to suppress warnings from manpath +# since these are emitted when $MANPATH is set. Removed the TODO for +# using the $MANPATH variable since manpath(1) handles this already. +# # TODO: -# - choose a default compress method to be based on the available -# tool : gzip or bzip2; -# - offer an option to automagically choose the best compression method -# on a per page basis (eg. check which ofgzip/bzip2/whatever is the -# most effective, page per page); -# - when a MANPATH env var exists, use this instead of /etc/man.conf -# (useful for users to (de)compress their man pages; -# - offer an option to restore a previous backup; -# - add other compression engines (compress, zip, etc?). Needed? - -# $Header: /home/cvsd/magellan-cvs/magellan-src/mage/usr/lib/mage/compressdoc,v 1.6 2005-06-01 15:47:41 niro Exp $ +# - choose a default compress method to be based on the available +# tool : gzip or bzip2; +# - offer an option to restore a previous backup; +# - add other compression engines (compress, zip, etc?). Needed? # Funny enough, this function prints some help. function help () @@ -39,77 +60,86 @@ --gzip, --gz, -g --bzip2, --bz2, -b Compress using gzip or bzip2. + --automatic + Compress using either gzip or bzip2, depending on the + size of the file to be compressed. Files larger than 5 + kB are bzipped, files larger than 1 kB are gzipped and + files smaller than 1 kB are not compressed. --decompress, -d Decompress the man pages. - --backup Specify a .tar backup shall be done for every directories. - In case a backup already exists, it is saved as .tar.old prior - to making the new backup. If an .tar.old backup exist, it is - removed prior to saving the backup. + --backup Specify a .tar backup shall be done for all directories. + In case a backup already exists, it is saved as .tar.old + prior to making the new backup. If a .tar.old backup + exists, it is removed prior to saving the backup. In backup mode, no other action is performed. And where options are : -1 to -9, --fast, --best - The compression level, as accepted by gzip and bzip2. When not - specified, uses the default compression level for the given - method (-6 for gzip, and -9 for bzip2). Not used when in backup - or decompress modes. - - --force, -F Force (re-)compression, even if the previous one was the same - method. Useful when changing the compression ratio. By default, - a page will not be re-compressed if it ends with the same suffix - as the method adds (.bz2 for bzip2, .gz for gzip). - - --soft, -S Change hard-links into soft-links. Use with _caution_ as the - first encountered file will be used as a reference. Not used - when in backup mode. + The compression level, as accepted by gzip and bzip2. + When not specified, uses the default compression level + for the given method (-6 for gzip, and -9 for bzip2). + Not used when in backup or decompress modes. + + --force, -F Force (re-)compression, even if the previous one was + the same method. Useful when changing the compression + ratio. By default, a page will not be re-compressed if + it ends with the same suffix as the method adds + (.bz2 for bzip2, .gz for gzip). + + --soft, -S Change hard-links into soft-links. Use with _caution_ + as the first encountered file will be used as a + reference. Not used when in backup mode. - --hard, -H Change soft-links into hard-links. Not used when in backup mode. + --hard, -H Change soft-links into hard-links. Not used when in + backup mode. --conf=dir, --conf dir - Specify the location of man.conf. Defaults to /etc. + Specify the location of man_db.conf. Defaults to /etc. - --verbose, -v Verbose mode, print the name of the directory being processed. - Double the flag to turn it even more verbose, and to print the - name of the file being processed. + --verbose, -v Verbose mode, print the name of the directory being + processed. Double the flag to turn it even more verbose, + and to print the name of the file being processed. - --fake, -f Fakes it. Print the actual parameters compman will use. + --fake, -f Fakes it. Print the actual parameters compressdoc will use. - dirs A list of space-separated _absolute_ pathname to the man - directories. - When empty, and only then, parse ${MAN_CONF}/man.conf for all - occurrences of MANPATH. + dirs A list of space-separated _absolute_ pathnames to the + man directories. When empty, and only then, use manpath + to parse ${MAN_CONF}/man_db.conf for all valid occurrences + of MANDATORY_MANPATH. -Note about compression +Note about compression: There has been a discussion on blfs-support about compression ratios of both gzip and bzip2 on man pages, taking into account the hosting fs, the architecture, etc... On the overall, the conclusion was that gzip - was much efficient on 'small' files, and bzip2 on 'big' files, small and - big being very dependent on the content of the files. + was much more efficient on 'small' files, and bzip2 on 'big' files, + small and big being very dependent on the content of the files. - See the original post from Mickael A. Peters, titled "Bootable Utility CD", - and dated 20030409.1816(+0200), and subsequent posts: + See the original post from Mickael A. Peters, titled + "Bootable Utility CD", dated 20030409.1816(+0200), and subsequent posts: http://linuxfromscratch.org/pipermail/blfs-support/2003-April/038817.html - On my system (x86, ext3), man pages were 35564kiB before compression. gzip -9 - compressed them down to 20372kiB (57.28%), bzip2 -9 got down to 19812kiB - (55.71%). That is a 1.57% gain in space. YMMV. - - What was not taken into consideration was the decompression speed. But does - it make sense to? You gain fast access with uncompressed man pages, or you - gain space at the expense of a slight overhead in time. Well, my P4-2.5GHz - does not even let me notice this... :-) + On my system (x86, ext3), man pages were 35564KB before compression. + gzip -9 compressed them down to 20372KB (57.28%), bzip2 -9 got down to + 19812KB (55.71%). That is a 1.57% gain in space. YMMV. + + What was not taken into consideration was the decompression speed. But + does it make sense to? You gain fast access with uncompressed man + pages, or you gain space at the expense of a slight overhead in time. + Well, my P4-2.5GHz does not even let me notice this... :-) + EOT ) | less } -# This function checks that the man page is unique amongst bzip2'd, gzip'd and -# uncompressed versions. +# This function checks that the man page is unique amongst bzip2'd, +# gzip'd and uncompressed versions. # $1 the directory in which the file resides # $2 the file name for the man page -# Returns 0 (true) if the file is the latest and must be taken care of, and 1 -# (false) if the file is not the latest (and has therefore been deleted). +# Returns 0 (true) if the file is the latest and must be taken care of, +# and 1 (false) if the file is not the latest (and has therefore been +# deleted). function check_unique () { # NB. When there are hard-links to this file, these are @@ -125,7 +155,8 @@ BZ_FILE="$BASENAME".bz2 # Look for, and keep, the most recent one - LATEST=`(cd "$DIR"; ls -1rt "${BASENAME}" "${GZ_FILE}" "${BZ_FILE}" 2>/dev/null | tail -n 1)` + LATEST=`(cd "$DIR"; ls -1rt "${BASENAME}" "${GZ_FILE}" "${BZ_FILE}" \ + 2>/dev/null | tail -n 1)` for i in "${BASENAME}" "${GZ_FILE}" "${BZ_FILE}"; do [ "$LATEST" != "$i" ] && rm -f "$DIR"/"$i" done @@ -139,9 +170,10 @@ # Name of the script MY_NAME=`basename $0` -# OK, parse the command-line for arguments, and initialize to some sensible -# state, that is : don't change links state, parse /etc/man.conf, be most -# silent, search man.conf in /etc, and don't force (re-)compression. +# OK, parse the command-line for arguments, and initialize to some +# sensible state, that is: don't change links state, parse +# /etc/man_db.conf, be most silent, search man_db.conf in /etc, and don't +# force (re-)compression. COMP_METHOD= COMP_SUF= COMP_LVL= @@ -164,6 +196,11 @@ COMP_METHOD=$1 shift ;; + --automatic) + COMP_SUF=TBD + COMP_METHOD=$1 + shift + ;; --decompress|-d) COMP_SUF= COMP_LVL= @@ -219,7 +256,7 @@ exit 1 ;; *) - echo "\"$1\" is not an absolute path name" + echo "\"$1\" is not an absolute path name" exit 1 ;; esac @@ -247,28 +284,38 @@ ;; esac -# Note: on my machine, 'man --path' gives /usr/share/man twice, once with a trailing '/', once without. +# Note: on my machine, 'man --path' gives /usr/share/man twice, once +# with a trailing '/', once without. if [ -z "$MAN_DIR" ]; then - MAN_DIR=`man --path -C "$MAN_CONF"/man.conf \ + MAN_DIR=`manpath -q -C "$MAN_CONF"/man_db.conf \ | sed 's/:/\\n/g' \ | while read foo; do dirname "$foo"/.; done \ | sort -u \ | while read bar; do echo -n "$bar "; done` fi -# If no MANPATH in ${MAN_CONF}/man.conf, abort as well +# If no MANDATORY_MANPATH in ${MAN_CONF}/man_db.conf, abort as well if [ -z "$MAN_DIR" ]; then - echo "No directory specified, and no directory found with \`man --path'" + echo "No directory specified, and no directory found with \`manpath'" exit 1 fi +# Check that the specified directories actually exist and are readable +for DIR in $MAN_DIR; do + if [ ! -d "$DIR" -o ! -r "$DIR" ]; then + echo "Directory '$DIR' does not exist or is not readable" + exit 1 + fi +done + # Fake? if [ "$FAKE" != "no" ]; then echo "Actual parameters used:" echo -n "Compression.......: " case $COMP_METHOD in --bzip2|--bz2|-b) echo -n "bzip2";; - --gzip|__gz|-g) echo -n "gzip";; + --gzip|--gz|-g) echo -n "gzip";; + --automatic) echo -n "compressing";; --decompress|-d) echo -n "decompressing";; *) echo -n "unknown";; esac @@ -277,11 +324,13 @@ echo "Compression suffix: $COMP_SUF" echo -n "Force compression.: " [ "foo$FORCE_OPT" = "foo-F" ] && echo "yes" || echo "no" - echo "man.conf is.......: ${MAN_CONF}/man.conf" + echo "man_db.conf is....: ${MAN_CONF}/man_db.conf" echo -n "Hard-links........: " - [ "foo$LN_OPT" = "foo-S" ] && echo "convert to soft-links" || echo "leave as is" + [ "foo$LN_OPT" = "foo-S" ] && + echo "convert to soft-links" || echo "leave as is" echo -n "Soft-links........: " - [ "foo$LN_OPT" = "foo-H" ] && echo "convert to hard-links" || echo "leave as is" + [ "foo$LN_OPT" = "foo-H" ] && + echo "convert to hard-links" || echo "leave as is" echo "Backup............: $BACKUP" echo "Faking (yes!).....: $FAKE" echo "Directories.......: $MAN_DIR" @@ -299,11 +348,16 @@ if [ "$BACKUP" = "yes" ]; then for DIR in $MAN_DIR; do cd "${DIR}/.." + if [ ! -w "`pwd`" ]; then + echo "Directory '`pwd`' is not writable" + exit 1 + fi DIR_NAME=`basename "${DIR}"` echo "Backing up $DIR..." > $DEST_FD0 [ -f "${DIR_NAME}.tar.old" ] && rm -f "${DIR_NAME}.tar.old" - [ -f "${DIR_NAME}.tar" ] && mv "${DIR_NAME}.tar" "${DIR_NAME}.tar.old" - tar cfv "${DIR_NAME}.tar" "${DIR_NAME}" > $DEST_FD1 + [ -f "${DIR_NAME}.tar" ] && + mv "${DIR_NAME}.tar" "${DIR_NAME}.tar.old" + tar -cvf "${DIR_NAME}.tar" "${DIR_NAME}" > $DEST_FD1 done exit 0 fi @@ -312,37 +366,65 @@ # I need to take into account the localized man, so I'm going recursive for DIR in $MAN_DIR; do MEM_DIR=`pwd` + if [ ! -w "$DIR" ]; then + echo "Directory '$DIR' is not writable" + exit 1 + fi cd "$DIR" for FILE in *; do # Fixes the case were the directory is empty if [ "foo$FILE" = "foo*" ]; then continue; fi # Fixes the case when hard-links see their compression scheme change - # (from not compressed to compressed, or from bz2 to gz, or from gz to bz2) - # Also fixes the case when multiple version of the page are present, which - # are either compressed or not. + # (from not compressed to compressed, or from bz2 to gz, or from gz + # to bz2) + # Also fixes the case when multiple version of the page are present, + # which are either compressed or not. if [ ! -L "$FILE" -a ! -e "$FILE" ]; then continue; fi # Do not compress whatis files if [ "$FILE" = "whatis" ]; then continue; fi if [ -d "$FILE" ]; then - cd "${MEM_DIR}" # Go back to where we ran "$0", in case "$0"=="./compressdoc" ... # We are going recursive to that directory echo "-> Entering ${DIR}/${FILE}..." > $DEST_FD0 # I need not pass --conf, as I specify the directory to work on - # But I need exit in case of error - "$MY_NAME" ${COMP_METHOD} ${COMP_LVL} ${LN_OPT} ${VERBOSE_OPT} ${FORCE_OPT} "${DIR}/${FILE}" || exit 1 + # But I need exit in case of error. We must change back to the + # original directory so $0 is resolved correctly. + (cd "$MEM_DIR" && eval "$0" ${COMP_METHOD} ${COMP_LVL} ${LN_OPT} \ + ${VERBOSE_OPT} ${FORCE_OPT} "${DIR}/${FILE}") || exit $? echo "<- Leaving ${DIR}/${FILE}." > $DEST_FD1 - cd "$DIR" # Needed for the next iteration of the loop else # !dir if ! check_unique "$DIR" "$FILE"; then continue; fi + # With automatic compression, get the uncompressed file size of + # the file (dereferencing symlinks), and choose an appropriate + # compression method. + if [ "$COMP_METHOD" = "--automatic" ]; then + declare -i SIZE + case "$FILE" in + *.bz2) + SIZE=$(bzcat "$FILE" | wc -c) ;; + *.gz) + SIZE=$(zcat "$FILE" | wc -c) ;; + *) + SIZE=$(wc -c < "$FILE") ;; + esac + if (( $SIZE >= (5 * 2**10) )); then + COMP_SUF=.bz2 + elif (( $SIZE >= (1 * 2**10) )); then + COMP_SUF=.gz + else + COMP_SUF= + fi + fi + # Check if the file is already compressed with the specified method BASE_FILE=`basename "$FILE" .gz` BASE_FILE=`basename "$BASE_FILE" .bz2` - if [ "${FILE}" = "${BASE_FILE}${COMP_SUF}" -a "foo${FORCE_OPT}" = "foo" ]; then continue; fi + if [ "${FILE}" = "${BASE_FILE}${COMP_SUF}" \ + -a "foo${FORCE_OPT}" = "foo" ]; then continue; fi # If we have a symlink if [ -h "$FILE" ]; then @@ -356,7 +438,8 @@ esac if [ ! "$EXT" = "none" ]; then - LINK=`ls -l "$FILE" | cut -d ">" -f2 | tr -d " " | sed s/\.$EXT$//` + LINK=`ls -l "$FILE" | cut -d ">" -f2 \ + | tr -d " " | sed s/\.$EXT$//` NEWNAME=`echo "$FILE" | sed s/\.$EXT$//` mv "$FILE" "$NEWNAME" FILE="$NEWNAME" @@ -378,8 +461,9 @@ elif [ -f "$FILE" ]; then # Take care of hard-links: build the list of files hard-linked # to the one we are {de,}compressing. - # NB. This is not optimum has the file will eventually be compressed - # as many times it has hard-links. But for now, that's the safe way. + # NB. This is not optimum has the file will eventually be + # compressed as many times it has hard-links. But for now, + # that's the safe way. inode=`ls -li "$FILE" | awk '{print $1}'` HLINKS=`find . \! -name "$FILE" -inum $inode` @@ -428,14 +512,16 @@ # Keep the hard-link a hard- one ln "${FILE}$COMP_SUF" "${NEWFILE}$COMP_SUF" fi - chmod 644 "${NEWFILE}$COMP_SUF" # Really work only for hard-links. Harmless for soft-links + # Really work only for hard-links. Harmless for soft-links + chmod 644 "${NEWFILE}$COMP_SUF" done fi else - # There is a problem when we get neither a symlink nor a plain file - # Obviously, we shall never ever come here... :-( - echo "Whaooo... \"${DIR}/${FILE}\" is neither a symlink nor a plain file. Please check:" + # There is a problem when we get neither a symlink nor a plain + # file. Obviously, we shall never ever come here... :-( + echo -n "Whaooo... \"${DIR}/${FILE}\" is neither a symlink " + echo "nor a plain file. Please check:" ls -l "${DIR}/${FILE}" exit 1 fi