#-- FILE DESCRIPTION
#-- The following command lines were used to reformat the
#-- Netbeans (www.netbeans.org) standard documentation for the
#-- xml module into a plain text version, with all the html files
#-- concatenated in the same order that they are referred to in the
#-- table of contents. In addition, the 'see also' sections of
#-- each html page have been removed. These command lines were
#-- run on a ms-windows 2000 computer using the 'unxutils' unix
#-- shell and tools, which are located on 'source-forge'.
#--
#-- The script also uses a Ms-Windows 'lynx' port, which is located at
#-- http://www.jim.spath.com/lynx_win32/
#-- Lynx (a text only browser) is required in order to convert
#-- html into plain text with its '-dump' command line option.
#-- The unix program 'html2text' will also do this, but I have not
#-- been able to find a Ms-Windows port of this program.
#-- The dos program 'htmStrip' also does this, but does not seem to
#-- have good support for 'batch' processing.
#-- For this version of lynx on ms-windows you will have to
#-- change your 'path' environment variable and add an
#-- environment variable called 'lynx_cfg' which will point to the
#-- lynx.cfg file.
#-- Author: m.j.bishop
#--
#-- Preparation (pseudo code; it depends on which module's documentation
#-- you would like to reformat). The jar documentation files are stored in
#--   [Netbeans Installation Dir]\modules\docs
#
# jar xf [module-name]
# cd org\netbeans\modules\xml\core\docs
# cd org\netbeans\modules\usersguide
#
#-- For the Tomcat documentation the html files are extracted
#-- to the following directory:
#--   [Nb Install Dir]\modules\docs\org\netbeans\modules\tomcat\tomcat40\docs\tomcat4
#-- For the Tomcat docs, the xml files are called
#-- 'tomcat-toc.xml' and 'tomcatMap.jhm'.
#-- For the users guide the map file is 'Map.jhm';
#-- there does not appear to be a complete table of contents file.
#--
#-- Step 1: list the 'target' ids of the table of contents, one per line
#-- and in table-of-contents order, in tocmap.txt.
grep 'target=' xml-toc.xml \
  | sed -e 's/.*target="//' -e 's/".*$//' > tocmap.txt

#-- Step 2: this uses the xml map file to find the corresponding html files
#-- in the 'html' directory. The result (newtoc.txt) holds one html file
#-- name per line, with the leading 'html/' removed.
#-- Dont get rid of the leading directory name (eg 'html') below
#--
#-- The line below looks for double quotes (") in the sed part of
#-- the command line. The ms-windows command shell does not seem to
#-- be able to do this, since it doesn't recognise the single quote (')
#-- as a string delimiter.
#--
#-- Each id is matched as a fixed string (grep -F) so that characters such
#-- as '.' in an id are not treated as regular expression operators. Empty
#-- ids are skipped because an empty pattern would match every map line.
while IFS= read -r target || [ -n "$target" ]; do
  [ -n "$target" ] || continue
  grep -F -- "$target" xmlMap.jhm
done < tocmap.txt \
  | sed 's/.*"html\/\([^"]*\)".*/\1/' > newtoc.txt

#-- Step 3: dump every html page as plain text, in table-of-contents order.
#-- dont do a 'cd' but use the directory reference in the
#-- '-map.xml' file. Otherwise for the 'usersguide' documentation, you
#-- would have to 'cd' into a large number of directories.
#--
#-- or cd tomcat4
cd html || exit 1

#-- newtoc.txt was written before the 'cd' above, so it is one level up.
#-- The list is read on file descriptor 3 so that lynx keeps its own
#-- standard input.
while IFS= read -r page <&3 || [ -n "$page" ]; do
  [ -n "$page" ] || continue
  lynx -dump -nolist "$page"
done 3< ../newtoc.txt > all.txt

#-- Step 4: clean up the concatenated text.
#-- Microsoft Windows 2000 contains a program called 'expand' which
#-- interfers with the unix utility. For this reason, I renamed the
#-- standard unix 'expand' program as 'xpand'
#
# E:\Program Files\NetBeans IDE 3.4\modules\docs\org\netbeans\modules\xml\core\docs\html>
#
#-- The sed expressions, in order:
#--   1. delete lines containing '[splash]'
#--   2. delete lines containing 'Legal Notices'
#--   3. delete each 'See also' section: from a line holding only
#--      'See also' up to the next line that is blank or consists only of
#--      dashes and/or underscores. The '-' comes first inside the brackets
#--      so that it is a literal dash and not a range operator.
#--   4. empty out lines that contain only spaces
#-- 'tr -s' then squeezes runs of newlines, which removes the empty lines.
xpand < all.txt \
  | sed -e '/\[splash\]/d' \
        -e '/Legal Notices/d' \
        -e '/^[ ]*See also[ ]*$/,/^[ ]*[-_]*[ ]*$/d' \
        -e 's/^[ ]*$//' \
  | tr -s '\n' > all-clean.txt