#!/bin/bash
# ksh might work with a little tweaking.

###############################################################################
#                      - configuration options -

# style of busy indication. set to spinner, current, dot, or none.
# The value is run as a command inside the per-file loops, so it has to be
# the name of one of the functions spinner, current, dot or none.
BUSY=spinner

# 1 for messages while running, 0 for peace and quiet
# with 0 you will still get any error messages from the sub-processes.
# Only the exact value 1 switches the messages on.
VERBOSE=1

# default man tree if none given on command line
MAN=/usr/man

# output compression level. (1 to 9, default if commented out is 6)
# These options are handed to every gzip call. The -f forces compression of
# files that have links.
GZOPTS="-f9"

# desired ownership/permissions for man pages
# Applied with chown/chmod in the last pass over each man directory.
MANUSER=root
MANGROUP=root
MANPERMS=644

#                     - end configuration options -
###############################################################################

#h
#h             trimman - Trims the fat from your man pages. 
#h   
#h Features/Description:
#h =====================
#h   * Eliminates redundant files, always keeping the newest.
#h     trimman makes 4 passes through your man directories, the first pass takes
#h     care of any redundancies.
#h     If you're like me, you have a full set of uncompressed man pages from say,
#h     a RedHat rpm, and then later installed the latest release of the manpages
#h     from an ordinary .tgz, which happened to come compressed, and so then you
#h     had two copies of almost every page! You don't want that, for two reasons
#h     For one thing, man will only display one version of the page, maybe it's
#h     the new one, maybe not :) Also, it's a very un-productive use for several
#h     megs of hard drive space. So you go to fix it. gzip * ? nope, then you
#h     end up with lots of file.1.gz.gz, gzip *.1 ? nope, then you miss all the
#h     file.1x file.1m file.1n etc.. hmmm, gzip *.1* ? nope... same as gzip *
#h     On top of these issues, there is the fact that all the symlinks go
#h     right out the window when you rename their targets and/or change the
#h     contents of their targets to some compressed form without changing the
#h     name of the symlink to match. One look in the man directories and you
#h     realize doing any of this by hand is out of the question, unless you're
#h     in prison or something and have nothing else to do for the next few weeks
#h     anyways. And you don't want to just delete the whole mess and start 
#h     fresh because by now you have lots of pages in there that came with things
#h     you installed, and they won't be in any generic man-pages package, and 
#h     you don't want to lose them, and you also don't remember what every
#h     single one was so you better just not touch the directories they are 
#h     in. That's where trimman comes in. What you need is something that will
#h     go through and examine each separate file and make a decision. 
#h     That's what trimman does. trimman tests for the existence
#h     of 4 possible versions of every page, Plain, .gz, .bz2, and .Z,
#h     and discards all but the newest. (the .bz2 check is only for reversing
#h     the effects of an earlier version of trimman.) trimman does not yet look
#h     for other forms of redundancy, such as: fvwm.1 vs fvwm.1x vs Fvwm.1
#h     but, most often, that sort of thing is not really a redundancy, but a
#h     symlink, and trimman does know what to do with those.
#h   * Compresses all pages with gzip.
#h     The next pass through your man directories sees to it that all files
#h     are compressed with gzip. Aside from previously uncompressed pages, 
#h     if any pages were already compressed in another format, it converts them
#h     to gzip. (This also corrects the action of an earlier version of trimman
#h     which bzip2'd man pages, rendering them unuseable to most man-utils.)
#h   * Regenerates all the symlinks.
#h     The third pass reconstructs all symlinks that are physically possible.
#h     Many pages are just symlinks to other pages with different names, 
#h     like mail.1 and Mail.1, where Mail.1 is just a symlink pointing to 
#h     mail.1 . If you compress mail.1, two things about the symlink need 
#h     to be updated. Mail.1 needs its target changed from mail.1 to mail.1.gz
#h     since mail.1 no longer exists. Then the link needs to be renamed to look
#h     like the content of the file it points to. If a link points to a .gz
#h     file, then it needs to be (re)named to look like a .gz file itself, or
#h     else man will not display it properly. This script takes care of all this
#h     quite nicely. In cases where a symlink is broken because the target is 
#h     actually gone, not just renamed, the broken symlink is not removed, 
#h     since, the target page might show up again at some time, (like if
#h     you re-install the program it belonged to) and if it does re-appear, then
#h     the next time you run trimman, the broken link will get re-connected with
#h     its target. This basically only happens when you manipulate your man
#h     pages by hand and accidentally remove pages you didn't want to, and using
#h     trimman will put an end to that.
#h   * Lastly, cleans up the file modes.
#h     In the final pass, trimman makes all pages so they are owned by root, 
#h     executable by none, writeable by owner only, and readable by all.
#h     For some reason, there always seems to be a scattering of man pages 
#h     with goofy file modes, like executable. Especially on a multi-user
#h     system, you don't want any of the system-wide man pages to be writable 
#h     or owned by anyone but root, or some chosen administrator group.
#h     The ownership and permissions are configurable by editing the variables
#h     in the top section of the script.
#h     Users can keep their own supplemental man pages in their own directories.
#h   * Doesn't mess with any files that don't need it. 
#h     Examines all files, but does not touch files that have already been
#h     fixed up by trimman, so it's perfectly ok and in fact, it's the intended
#h     purpose, to run trimman on the same directory periodically or anytime
#h     you like. So, the first time you run trimman, you will see a *lot* of 
#h     "Found so-and-so: doing such-and-such about it" activity, and on 
#h     subsequent runs, you will just see a nifty spinner.
#h     Then, you install a new program, and it installs a man page or two. Then
#h     you run trimman again, and you will see the nifty spinner,and just one or
#h     two "I found something" messages. Skips cat directories too. (see note
#h     about cat directories below)
#h   * Funky built-in help, especially funky on Linux :)
#h   * Should be pretty portable, You'll know it when I do the Xenix/ksh port.
#h   * 2:1 documentation:code ratio...Ok, maybe that's not exactly a "feature" :)
#h
#h
#h Usage:
#h ======
#h     trimman [path]
#h     trimman h | -h | help | --help
#h
#h
#h     path   -   toplevel man directory that you want trimman to work on.
#h         Some examples are:
#h         /usr/man  /usr/X11R6/man  /usr/lib/perl5/man  /usr/local/man
#h         
#h         Defaults to /usr/man if none specified
#h
#h         relative paths like  .  and  ../../man  and such are ok too, so 
#h         long as they point to a toplevel man directory.
#h
#h         The sanity checking for this is pretty thorough to guard against
#h         such things as compressing everything in your home dir.
#h         Ask me how I thought to put that check in, go ahead, ask me. :)
#h
#h     h | -h | help | --help   -   displays this text
#h
#h
#h Requirements:
#h =============
#h     run as root
#h     uncompress
#h     bunzip2
#h     gzip
#h     basename
#h     bash 
#h
#h     Maybe with a little work ksh will work in place of bash for non-linux
#h     systems, edit the top line. trimman for Xenix coming soon :)
#h
#h
#h Note about cat directories:
#h ===========================
#h     On a multi-user system, cat pages probably do provide some benefit, 
#h     where there may be a constant stream of man-page access by several
#h     users, so I'd keep them, but on a single-user, or not-a-huge-number-
#h     of-users system, I'd just get rid of all cat files. They are redundant
#h     fat if you don't need the slight speed/load advantage. To stop man
#h     from writing a cat file for every man page you access, just delete 
#h     all the cat directories. look in places like /usr/man/cat* /var/catman 
#h     for them.
#h
#h
#h Author:
#h =======
#h     linut@squonk.net ( Brian K. White )
#h
#h
#h Changes:
#h ========
#h 04/02/1999  version 1.06
#h             trimman now checks for plain, .Z, .gz, and .bz2 man pages,
#h             discards all but the newest, and converts the result to .gz
#h             This means trimman now automatically corrects the work of
#h             the first version of trimman, which bzip2'd the pages, rendering
#h             them unuseable to most man-utils.
#h             Added default target directory of /usr/man if none specified.
#h             Added a configuration section to the beginning of the script.
#h 03/06/1999  version 1.05
#h             added -f to the gzip compression command to force compression of
#h             files with links.
#h 02/28/1999  added -9 to the gzip compression command.
#h 10/26/1998  version 1.04
#h             Changed trimman-gz's bzip2 stuff to the equivalent for 
#h             compress. (.Z) No longer looks for .bz2 files or links.
#h             Does now work on .Z files. This makes trimman-gz more widely
#h             useful, since there are installations with .Z compressed
#h             pages, and recent versions at least of all man-related utilities
#h             can handle uncompressed, .Z compressed, and .gz compressed
#h             pages transparently. .gz offering the highest compression.
#h             Added configurable "busy" eye candy. (none, dots, spinner)
#h 10/16/1998  version 1.03b 
#h             Stop-gap release of 3 slightly different versions in which the
#h             only difference is the target compression format.
#h              trimman-bz2  same as previous trimman.
#h              trimman-gz  same, except pages are gzipped, instead of compressed.
#h              trimman-plain  same, except all compression is removed.
#h             Reason: man and all related utils all have built in support for
#h             plain, compressed (.Z), and gzipped (.gz) man pages, but not
#h             for bzip2ed pages. man itself can be made to use them easily,
#h             but none of the other man utils will work. (xman, makewhatis, 
#h             etc) also man seems to be unable to follow symlinks to .bz2
#h             files even when they exactly mimic symlinks to .gz files
#h             so after using trimman, several man pages became un-viewable,
#h             and various man-related utils stopped working. 
#h             
#h             If you ran trimman previously, then I encourage you now to
#h             run trimman-gz and then use it in place of trimman until
#h             more versatile trimman comes out, or bzip2-aware versions
#h             of the rest of the man-utils. This will result in all the
#h             same redundancy checking and symlink-fixing as trimman,
#h             except the compression pass will convert everything to gzip
#h             instead of bzip2. Your man utils should all work just fine
#h             with gzipped pages, exactly as with plain pages.
#h             
#h             In case this is not so, or in case you just want it,
#h             you can run trimman-plain, which will also do the redundancy
#h             and symlink checks, but the compression pass will remove
#h             all compression.
#h             
#h 10/05/1998  version 1.02 Full-path symlinks isn't the problem, it's
#h             links to other links. If a link points to another link, even
#h             if that link points to a good file, something in the 
#h             man-bunzip-nroff-pager chain doesn't like it. Reverted the use
#h             of basename, allowing full path links again. 
#h             Have not yet dealt with this possibility other than that.
#h             Improved the accuracy of all the sed calls.
#h              
#h 10/04/1998  version 1.01 Man seems to dislike symlinks with full path 
#h             targets. Changed the symlink pass accordingly. Now requires
#h             the basename program.
#h 09/26/1998  version 1.0  First release.
#h
#h
#h Todo:
#h =====
#h     * Use getopt() built-in and make all hard-coded features configurable.
#h     * Choice of target compression: bzip2, gzip, compress, none, etc...
#h     * Optional verbosity: messages on/off  dots on/off etc...
#h     * Optional spinner instead of dots (or no progress indicator)
#h     * Option to specify which operation(s) to perform: 
#h       just the redundancy check, just compression, symlinks etc...
#h     * Option to do nothing, just go through the motions and display what
#h       trimman would have done.
#h     * Non-interactive mode that exits, but doesn't invoke help when fed
#h       a bad target directory, so that trimman can just run itself to recurse
#h       through sub-directories without stopping.
#h     * Add some more sanity checks
#h     * Port to C and GTK+ :) *not* ...wait, hmmm..... :)
#h     * Ability to compare trees for redundancies. Currently only works on
#h       one tree at a time, and cannot compare the contents of one tree
#h       with the contents of another.
#h     * Add bzip2 support to all known man-utils a-la gzip
#h     * Option to search for man.conf and a bzip2 binary and automatically
#h       verify and/or add the necessary line(s) in man.conf
#h     * Parse man.conf and work on every man directory defined in it.
#h     * Recurse the tree as far as it goes. Currently just goes down 1 level.
#h       So, if you have something like this:
#h       /usr               "trimman /usr/man" will get you /usr/man/man*,
#h         `-man            but not /usr/man/X11/man*
#h            |-X11
#h            |  |-man1
#h            |  `-man2
#h            |-man1
#h            `-man2
#h     * Add handling for more compression formats. Even just adding 
#h       .bz2 will require a re-write to keep the code from going bananas. 
#h       However, then it will be trivial to add support for as many 
#h       compressors and file formats as you like. rar'd man-pages, anyone?
#h     * Re-write anyways, since this script has as much redundancy inside
#h       itself as the man directories it's supposed to clean up :) 
#h

# Frames of the spinner busy indicator, drawn in rotation. The last frame is
# a backslash followed by a space, which is exactly what was printed before.
# The table is set up whatever BUSY says; it is only read by spinner.
sc=('|' '/' '-' '\ ')

# spinner - busy indicator: step to the next frame and draw it, followed by
# a carriage return so that whatever is printed next overwrites it.
# Globals: s  - index of the frame drawn last (read and written)
#          sc - the frames (read)
# An unset s counts as 0, so the very first frame drawn is "/".
spinner () {
    s=$(( (s + 1) % ${#sc[@]} ))
    printf '%s\r' "${sc[s]}"
}
# dot - busy indicator: one period per call, never a newline.
dot () {
    printf .
}
# current - busy indicator: show the name of the page being examined, padded
# with blanks to cover the tail of a longer name shown before, and followed
# by a carriage return so the next output overwrites it.
# Globals: PAGE (read) - printed literally; backslashes in it are not
#          treated as escape sequences.
current () {
    printf '%s                          \r' "$PAGE"
}
# none - the "no busy indicator" choice: prints nothing and always succeeds.
none () {
    return 0
}

# getword11 - print the 11th argument on a line of its own (an empty line
# when there are fewer than 11 arguments).
# The word is printed exactly as given: it is not glob-expanded, its
# whitespace is kept, and a word such as -n is not taken for an option.
getword11 () {
    printf '%s\n' "${11}"
}

# say - progress message, shown only when VERBOSE is exactly 1.
# Arguments: handed to echo unchanged, so leading echo options (-e, -n, -en)
#            work and the whitespace inside each argument is kept. That
#            matters for the runs of blanks used to wipe the current line.
# Returns:   echo's status when the message is shown, 0 when it is not.
say () {
    if [ "$VERBOSE" = 1 ]; then
        echo "$@"
    fi
}

# check - redundancy test for two versions of the same page.
# Arguments: $1, $2 - file names to compare
# When both names are regular files (symlinks to regular files count), the
# older of the two is deleted. $1 is kept only when its modification time is
# strictly later than that of $2; on a tie $1 is the one removed.
# Exactly one file is ever deleted: a failing rm does not fall through to
# the other file.
# Returns: the status of rm, or 0 when nothing had to be done.
check () {
    local blank
    if [[ -f $1 && -f $2 ]]; then
        # wipe whatever the busy indicator left on the line
        printf -v blank '%80s' ''
        say -en "\r$blank"
        say -en "\r  Found $1 and $2: "
        if [[ $1 -nt $2 ]]; then
            say "Removing $2"
            rm -f -- "$2"
        else
            say "Removing $1"
            rm -f -- "$1"
        fi
    fi
}

# _trimman_strip_ext NAME - print NAME without one trailing .gz, .Z or .bz2
# suffix. Only a real suffix is removed; nothing inside the name is touched.
_trimman_strip_ext () {
    case $1 in
        *.gz)  printf '%s\n' "${1%.gz}" ;;
        *.Z)   printf '%s\n' "${1%.Z}" ;;
        *.bz2) printf '%s\n' "${1%.bz2}" ;;
        *)     printf '%s\n' "$1" ;;
    esac
}

# _trimman_note MESSAGE - wipe the current terminal line (and with it any
# busy indicator), then print MESSAGE through say.
_trimman_note () {
    local blank
    printf -v blank '%80s' ''
    say -en "\r$blank"
    say -e "\r$1"
}

# gototown - run the four passes over one man directory.
# Arguments: $1 - directory to work in, relative to the current directory
# Globals:   BUSY, GZOPTS, MANPERMS, MANUSER, MANGROUP (read);
#            FILE, PAGE, OLDTARGET, NEWTARGET (written; PAGE is what the
#            "current" busy indicator displays)
# Passes:    1 redundancy  - of the plain/.gz/.Z/.bz2 versions of a page
#                            only the newest survives (see check)
#            2 compression - every regular file ends up gzip compressed
#            3 symlinks    - every link is renamed to NAME.gz and pointed at
#                            TARGET.gz; links whose target is missing are
#                            rewritten the same way, never deleted
#            4 file modes  - owner, group and permissions of the pages
# Returns:   non-zero, without touching anything, when $1 cannot be entered;
#            otherwise the status of the final "cd ..".
gototown () {
    local entry
    local -a pages=()

    # Everything below works on "whatever is in the current directory", so
    # carrying on after a failed cd would process the wrong files.
    cd -- "$1" || return

    # 1st pass - redundancy
    _trimman_note "   $(pwd) : redundancy"
    for FILE in *; do
        # also skips files already removed by an earlier check in this loop
        [[ -f $FILE ]] || continue
        PAGE=$(_trimman_strip_ext "$FILE")
        check "$PAGE" "$PAGE.gz"
        check "$PAGE" "$PAGE.Z"
        check "$PAGE" "$PAGE.bz2"
        check "$PAGE.Z" "$PAGE.gz"
        check "$PAGE.Z" "$PAGE.bz2"
        check "$PAGE.gz" "$PAGE.bz2"
        "${BUSY:-none}"
    done

    # 2nd pass - compression
    _trimman_note "   $(pwd) : compression"
    for FILE in *; do
        # links are the business of the 3rd pass; directories and other
        # non-files are not pages
        [[ -f $FILE && ! -L $FILE ]] || continue
        PAGE=$(_trimman_strip_ext "$FILE")
        # The ./ prefix keeps names that start with a dash from being read
        # as options. GZOPTS is left unquoted on purpose: it may hold
        # several options, or nothing at all.
        case $FILE in
            "$PAGE")
                _trimman_note "  Found uncompressed $FILE: Compressing with gzip"
                # shellcheck disable=SC2086
                gzip $GZOPTS "./$FILE"
                ;;
            "$PAGE.Z")
                _trimman_note "  Found .Z compressed $FILE: Converting to $PAGE.gz"
                # shellcheck disable=SC2086
                uncompress "./$FILE" && gzip $GZOPTS "./$PAGE"
                ;;
            "$PAGE.bz2")
                _trimman_note "  Found bzip2 compressed $FILE: Converting to $PAGE.gz"
                # shellcheck disable=SC2086
                bunzip2 "./$FILE" && gzip $GZOPTS "./$PAGE"
                ;;
        esac
        "${BUSY:-none}"
    done

    # 3rd pass - symlinks
    _trimman_note "   $(pwd) : symlinks"
    for FILE in *; do
        [[ -L $FILE ]] || continue
        PAGE=$(_trimman_strip_ext "$FILE")
        if OLDTARGET=$(readlink "./$FILE") && [[ -n $OLDTARGET ]]; then
            NEWTARGET=$(_trimman_strip_ext "$OLDTARGET").gz
            if [[ $OLDTARGET != "$NEWTARGET" || $FILE != "$PAGE.gz" ]]; then
                _trimman_note "  Fixing symlink $FILE->$OLDTARGET to $PAGE.gz->$NEWTARGET"
                rm -f -- "./$FILE"
                ln -s -- "$NEWTARGET" "./$PAGE.gz"
            fi
        fi
        "${BUSY:-none}"
    done

    # 4th pass - ownerships & permissions
    _trimman_note "   $(pwd) : file modes"
    # Only pages: mode 644 would lock a sub-directory, and a link whose
    # target is missing cannot be changed anyway.
    for entry in *; do
        [[ -f $entry ]] && pages+=("./$entry")
    done
    if (( ${#pages[@]} > 0 )); then
        chmod -c "$MANPERMS" "${pages[@]}"
        chown -c "$MANUSER:$MANGROUP" "${pages[@]}"
    fi
    cd ..
}

# readme - show the built-in help and leave the script.
# The help is every line of this script that starts with the two characters
# "#h"; only that leading marker is removed before display.
# On Linux, when less is installed, the text is shown in less with a custom
# prompt and the script exits with the status of that pipeline. Everywhere
# else it goes through $PAGER, or more when PAGER is unset or empty, and the
# script exits 0.
readme () {
    local prompt
    case $(uname -s) in
        *inux*)
            if command -v less >/dev/null 2>&1; then
                prompt="  $(basename "$0") help   ?n (Top) :?e(Bottom):?s (%pm\%)  :       ...     [Q]=quit  [/]=search  [arrows/page]=scroll  "
                sed -n 's/^#h//p' < "$0" | less -iqn -j12 -Ps"$prompt"
                exit
            fi
            ;;
    esac
    # unquoted on purpose: PAGER may carry options, e.g. "less -R"
    # shellcheck disable=SC2086
    sed -n 's/^#h//p' < "$0" | ${PAGER:-more}
    exit 0
}

# safety - refuse to work on something that doesn't look like a man tree:
# print a diagnostic on stderr and leave the script with status 1.
# Arguments: $1 - optional tag put in front of the message as "TAG : "
# Globals:   MAN (read) - the directory that was rejected, printed literally
safety () {
    printf "%s%s doesn't appear to be a toplevel man directory.\nTry \"trimman --help\" \n" \
        "${1:+$1 : }" "$MAN" >&2
    exit 1
}

# Command line: one optional argument, either a help word or the toplevel
# man directory to work on (default: the MAN configured at the top).
case "$1" in
    h | -h | help | --help)
        readme
        ;;
    c | -c | config | --configure)
        # there is no configuration mode; these words show the help as well
        readme
        ;;
    *)
        if [[ -n $1 ]]; then
            # A directory given on the command line is judged by where it
            # points, so "." and "../../man" are fine inside a man tree.
            # One that fails the test is an error, never a silent switch to
            # the default tree.
            MAN=$1
            [[ $(CDPATH= cd -- "$MAN" 2>/dev/null && pwd) == *man* ]] || safety
        fi
        [[ -d $MAN ]] || safety
        SAVEWD=$(pwd)
        cd -- "$MAN" || safety
        # Sanity check before anything is touched: a toplevel man directory
        # holds at least one sub-directory with "man" in its name (man1 ...).
        HAVEMAN=0
        for DIR in *man*; do
            if [[ -d $DIR ]]; then
                HAVEMAN=1
                break
            fi
        done
        (( HAVEMAN )) || safety
        # cat directories and anything else without "man" in the name are
        # left alone
        for DIR in *man*; do
            if [[ -d $DIR ]]; then
                gototown "$DIR"
            fi
        done
        cd -- "$SAVEWD" || exit
        ;;
esac
