##! d:/mjb/unixutils/bin/sh.exe
#
# Description:
#   A script to reformat a text 'documentation' document into
#   some kind of html. By a 'documentation' document I mean something
#   which is in a very similar format to this block of text.
#   That is, containing 'section headings' such as 'Description:' and
#   'Notes:' and containing descriptive text for those sections.
#
#   This version makes the section headings into 'heading' elements and
#   makes the function title or name into an h3 element. This has the
#   disadvantage that unless there is a style sheet, browsers put quite a
#   lot of space after the heading elements. However, these heading
#   elements will be useful if it is desired to convert the html into some
#   more 'robust' format such as pdf. By using the 'heading' elements and
#   an html2pdf conversion engine, such as for example the 'htmldoc'
#   application, it is possible to generate a table of contents with
#   accurate page numbering for a given output device. This is the real
#   'Shangri-La' of printing as far as I am concerned at the moment.
#
#   By placing a 'for' loop around this command line in a 'sh' shell it
#   should be possible to batch process plain text documentation files
#   into html and pdf. There will be those people who are shaking their
#   heads and muttering the words xml, and sgml and document parsing etc,
#   and really I suppose that I am in sympathy with those people. But the
#   power of a dodgy plain text format should not be underestimated.
#
# Usage:
#   txtdoc2html.sh textFileName > output.html
#
#   Exit status is 0 when the file was converted, 1 when no file name was
#   given, the file cannot be read, or sed failed.
#
# Notes:
#   The lines below seem to crash the 'unxutils' shell but it appears to
#   run without problems in the cygwin bash shell.
#   It takes about 20 seconds to run on my MS Win 2000 laptop.
#   It will also crash in the 'cygwin' shell if you point the shell
#   interpreter to the /unixutils/bin/sh.exe file.
#
#   The current version seems to assume that the section headings are in
#   all capital letters, which is not always true. It would probably be
#   better to use an 'alternation' operator in a similar fashion to the
#   script 'diary2html.sh', (although this has now been altered).
#
#   One issue is how this script should 'extract' the documentation from a
#   file, given that, as is the case with this file (txtdoc2html.sh), the
#   file also contains code.
#
#   The name of this script is not very clear.
#
#   The 'i' flag and the '\|' alternation used in the sed expressions are
#   GNU sed extensions.
#
# See Also:
#   linkdoc2html.sh, plaintext2html.sh, plaintext2pdf.sh, diary2html.sh
#
# Author:
#   m.j.bishop
#

if [ "$1" = "" ]; then
  # Diagnostics go to stderr so that a redirected stdout does not end up
  # holding the usage text instead of html.
  echo "usage: $0 textFileName" >&2
  # Print the leading comment block of this script as the help text; stop
  # at the first line that is not a comment.
  sed -n -e '/^[ ]*#/!q' -e 'p' "$0" >&2
  exit 1
fi

if [ ! -r "$1" ] || [ -d "$1" ]; then
  echo "$0: cannot read file '$1'" >&2
  exit 1
fi

# NOTE(review): the exact wrapper tags originally emitted here could not be
# recovered from the available copy of this script; a minimal document
# skeleton is emitted instead -- confirm against the intended output.
echo "<html>"
echo "<head></head>"
echo "<body>"

# All substitutions run in one sed process, applied to each line in this
# order:
#   1-3. Escape '&', '<' and '>' so the text cannot be read as markup.
#        '&' has to be escaped first, otherwise the entities produced for
#        '<' and '>' would themselves be escaped.
#   4.   From the first line up to the first line containing three capital
#        letters in a row, wrap lines that start with two lower case
#        letters in an h3 element (the function title or name).
#   5.   Wrap a line consisting only of capitals and spaces in h4/i.
#   6.   Wrap a leading run of capitals that is followed by ':' or a space
#        in h4/i. Lines already handled by rule 5 now start with '<' and
#        therefore do not match again.
#   7.   Turn each pair of spaces into two non-breaking spaces so that the
#        indentation of the text survives in the browser.
#   8.   Put a '[*]' link in front of http:// and https:// addresses.
#   9.   Put a '[*]' link in front of bare www. addresses. The character in
#        front of 'www.' is captured and written back, so it is not lost,
#        and an address at the very start of a line is matched as well.
#   10.  Start every line with a line break element.
sed \
  -e 's/&/\&amp;/g' \
  -e 's/</\&lt;/g' \
  -e 's/>/\&gt;/g' \
  -e '1,/^ *[A-Z]\{3\}/s/^ *[a-z]\{2\}.*/<h3>&<\/h3>/' \
  -e 's/^ *\([A-Z]\{3,\}[A-Z ]*\)$/<h4><i>\1<\/i><\/h4> /' \
  -e 's/^ *\([A-Z]\{3,\}[A-Z ]*[: ]\)/<h4><i>\1<\/i><\/h4>/' \
  -e 's/[ ]\{2\}/\&nbsp;\&nbsp;/g' \
  -e 's/\(https\{0,1\}:\/\/[^ \n\r]\{3,\}\)/<a href="\1">[*]<\/a> \1/gi' \
  -e 's/\(^\|[^a-zA-Z\/]\)\(www\.[^ ]\{2,\}\)/\1<a href="http:\/\/\2">[*]<\/a> \2/gi' \
  -e 's/^/<br>/' \
  < "$1" || exit 1

echo "</body>"
echo "</html>"

# The table-of-contents and translation-link code that used to follow this
# point was copied from 'plaintext2html.sh' and could never run, because the
# script had already exited; it has been removed. Use 'plaintext2html.sh'
# for that behaviour.
exit 0