#! d:/mjb/unixutils/bin/sh.exe
#-- FILE DESCRIPTION
#--   The following command lines were used to reformat the 
#--   Netbeans (www.netbeans.org) standard documentation for the 
#--   xml module, into a plain text version, with all the html files
#--   concatenated in the same order that they are referred to in the
#--   table of contents. In addition, the 'see also' sections for
#--   each html page have been removed. These command lines were
#--   run on a ms-windows 2000 computer using the 'unxutils' unix
#--   shell and tools which are located on 'source-forge'
#--   
#--   The script also uses a Ms-Windows 'lynx' port, which is located at
#--     http://www.jim.spath.com/lynx_win32/
#--   Lynx (a text only browser) is required in order to convert
#--   html into plain text with its '-dump' command line option.
#--   The unix program 'html2text' will also do this, but I have not
#--   been able to find a Ms-Windows port of this program. 
#--   The dos program 'htmStrip' also does this, but does not seem to
#--   have good support for 'batch' processing.
#--   For this version of lynx on ms-windows you will have to 
#--   change your 'path' environment variable and add an
#--   environment variable called 'lynx_cfg' which will point to the 
#--   lynx.cfg file.
#-- 
#--   The result of these transformations is a 235 A4 page manual
#--   using an 8point font in Wordpad or a 298 page manual with
#--   a 10point font.
#--   It appears that the script will only successfully run under the
#--   unix shell if the file has a file name extension of .sh or at least
#--   not .txt

#-- Author: m.j.bishop

# A line to capitalise certain lines
# awk '/^[^ ]/ {print toupper($0)} /^ / {print} /^$/' netbeans-tomcat-guide.txt | less

#-- Some pseudo code which depends on which module's documentation
#-- you would like to reformat
#-- The jar documentation files are stored in
#--   [Netbeans Installation Dir]\modules\docs
 
# jar xf [module-name]
# cd org\netbeans\modules\xml\core\docs
# cd org\netbeans\modules\usersguide
#-- for the Tomcat documentation the html files are extracted
#-- to the following directory:
#--   [Nb Install Dir]\modules\docs\org\netbeans\modules\tomcat\tomcat40\docs\tomcat4


#-- for the Tomcat docs, the xml files are called
#-- 'tomcat-toc.xml' and 'tomcatMap.jhm'
#-- for the users guide the map file is 'Map.jhm'
#-- there does not appear to be a complete table of contents file
#--

cat tomcat-toc.xml | grep "target=" | sed -e 's/.*target="//g' -e 's/".*$//g' > tocmap.txt

#-- This uses the xml map file to find the corresponding html files
#-- in the 'html' directory 
#-- Dont get rid of the leading directory name (eg 'html') below
#--
#-- The line below looks for double quotes (") in the sed part of 
#-- the command line. The ms-windows command shell does not seem to
#-- be able to do this, since it doesn't recognise the single quote (')
#-- as a string delimiter. 
#-- Probably need to get rid of references to 'pending.html' which
#-- indicates that no documenation is available. Also a number of files
#-- occur twice in the output. Uniq wont solve this because it only
#-- removes adjacent duplicates (?)
#-- This command takes approximately 20 seconds to complete
#-- on my win2000 laptop.
for f in $(cat tocmap.txt); do grep "target=\"$f\"" tomcatMap.jhm; done | sed 's/.*url="\([^"]*\)".*/\1/g' | xpand | sed "/^[ ]*<!\-\-/d" | uniq > newtoc.txt

#cd html
#-- dont do a 'cd' but use the directory reference in the 
#-- '-map.xml' file. Otherwise for the 'usersguide' documentation, you
#-- would have to 'cd' into a large number of directories.
#--
#-- or cd tomcat4
#-- The command below took about 50 seconds on my win2000 laptop

for f in $(cat newtoc.txt); do lynx -dump -nolist $f ; done > all.txt

#-- Microsoft Windows 2000 contains a program called 'expand' which 
#-- interfers with the unix utility. For this reason, I renamed the 
#-- standard unix 'expand' program as 'xpand'

# E:\Program Files\NetBeans IDE 3.4\modules\docs\org\netbeans\modules\xml\core\docs\html>
#-- maybe I should leave the see also section in??

cat all.txt | xpand | sed -e "/\[splash\]/d" -e "/Legal Notices/d" | sed "/^[ ]*See also[ ]*$/,/^[ ]*[\-_]*[ ]*$/d" | sed "s/^[ ]*$//g" | tr -s "\n" > all-clean.txt

# Some additional lines to make a big html version
#-- This is not particularly useful since browsers have a lot of 
#-- trouble trying to load this much html.
# cat all.txt | grep "^[A-Z]" | tr " " "-" | sed "s/.*/<a href=\"#&\">&<\/a><br>/gi" > section-list.txt
# cat all.txt | sed "/^[A-Z]/s/ /-/gi" | sed "/^[A-Z]/s/.*/<\/pre><a name=\"&\">&<\/a><pre>/gi
# " > all.html
#
# (echo "<html><body>"; cat section-list.html; cat all.html; echo "</body></html>") > big.html
