#!/usr/bin/env bash
#
# Description:
#   A script to reformat a plain text document which has no particular
#   format into Adobe PDF format. The script recognises some special
#   structures within the plain text document. For example:
#
#   Where the first non-whitespace character on a line is '=' then
#   all the following text on the line is formatted as a 'page title'.
#
#   Lines which have no space in front of them (and consist only of
#   letters, digits and spaces) are interpreted as headings, and
#   constitute the automatically generated table of contents.
#
#   The script will also format blocks of text between the strings -->> and
#   --<< (where they are the first string on the line) as an HTML <pre> block.
#
# Examples:
#   ./netbeans-guide2pdf.sh netbeans-userguide.txt
#     This command line, executed in some kind of a bash shell, will
#     transform a plain text file which isn't in any particular format
#     into a pdf file (that is, it will create a new pdf file and leave
#     the original text file unchanged). Also a table of contents (with
#     one entry for each heading) will be inserted in the pdf document.
#
#   ./netbeans-guide2pdf.sh mjb-work.txt new-work.pdf
#     The text file will be transformed into pdf.
#
#   ./netbeans-guide2pdf.sh resources.txt debug
#     No pdf file is created. The HTML generated from the text file
#     (which is to be used by htmldoc) is printed to standard output.
#
# Parameters:
#   textFileName [required]
#     The name of the text file which is to be transformed from text
#     into pdf.
#   outputFileName [optional]
#     This is the name of the pdf file which will be produced. If no file
#     name is provided the output file name will be 'textFileName.pdf'
#     (with the extension of the text file, if any, removed first).
#     If a filename is given here which does not end in 'pdf' then
#     the htmldoc program will generate html output with a hyper-linked
#     table of contents.
#     If the outputFileName is the string 'debug', then no file is created
#     and the intermediate HTML which is to be used by the htmldoc program
#     is written to standard output.
#
# Exit status:
#   0 on success (including 'debug' mode), 1 on a usage or set-up error,
#   otherwise the exit status of htmldoc.
#
# Notes:
#   For a document that is 1 megabyte in size, this script takes several
#   minutes to complete.
#
#   A copy of the text file is saved as 'textFileName.bak' before htmldoc
#   is run.
#
#   The script uses the 'htmldoc' program which appears to be GPL'd under
#   unix and which comes from http://www.easysw.com/. The 'htmldoc' program
#   handles the creation of tables of contents, given a set of <hN> tags
#   where N is a number. Also the tags must be complete: that is, if there
#   are <h3> tags there must also be <h1> and <h2> tags for the
#   table-of-contents building to work.
#
#   The 'htmldoc' program sometimes does not work as expected. If it does
#   not understand the html then it produces garbled pdf. It does not
#   'degrade gracefully'.
#
# See Also:
#   diary2html.sh, linkdoc2html.sh, plaintext2html.sh
# Author:
#   m.j.bishop

set -euo pipefail

# Path of the intermediate HTML file; global so the EXIT trap can see it.
tmp_html=''

# Print an error message on stderr and stop with status 1.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Remove the intermediate HTML file, if one was created.
cleanup() {
  if [[ -n "$tmp_html" ]]; then
    rm -f -- "$tmp_html"
  fi
}

# Print the usage line followed by the documentation header of this file
# (the comment block that directly follows the shebang line).
usage() {
  printf '%s\n' "usage: $0 textFileName [outputFileName] {notoc|debug}"
  sed -n -e '1{/^#!/d;}' -e '/^[ ]*#/!q' -e 'p' < "$0"
}

# Print the text of every '= title' line of file $1, one per line, with
# angle brackets escaped so the text can be embedded in HTML.
extract_title() {
  sed -n '/^[ ]*=/{s/^[ ]*=[ ]*//;s/</\&lt;/g;s/>/\&gt;/g;p;}' < "$1"
}

# Print path $1 with the final '.ext' of its last component removed.
# Only the file name is examined, so a dot in a directory name
# ('dir.d/file') or a leading dot ('.hidden') is left alone.
strip_extension() {
  local path=$1
  local name=${1##*/}
  local has_ext_re='^.+\.[^.]+$'
  if [[ "$name" =~ $has_ext_re ]]; then
    printf '%s\n' "${path%.*}"
  else
    printf '%s\n' "$path"
  fi
}

# Print the sed program that turns the (tab-expanded) text into HTML.
# It is kept in a quoted here-document so that neither the shell nor the
# sed delimiter interferes with the '/', '"', "'" and '\' characters that
# occur inside the patterns. The 'I' flag and '\|' are GNU sed extensions.
html_sed_script() {
  cat <<'SED'
# Escape angle brackets so that the source text is displayed literally.
s/</\&lt;/g
s/>/\&gt;/g
# ' = Title' (one or more leading spaces) becomes a centred page title.
s/^[ ]\{1,\}=[ ]*\(.*\)/<center><h2>\1<\/h2><\/center>/
# '-->>' and '--<<' at the start of a line open and close a <pre> block.
# They are matched in escaped form because the brackets were escaped above.
s/^[ ]*--&gt;&gt;/<pre>/
s/^[ ]*--&lt;&lt;/<\/pre>/
# An unindented line made only of letters, digits and spaces is a heading.
# NOTE(review): the original character classes also held an unescaped '"'
# which the shell removed, so quotes never actually matched; kept that way.
s/^\([a-zA-Z0-9][a-zA-Z0-9 ]*\)$/<h3>\1<\/h3>/
# Outside <pre> blocks, keep runs of spaces visible.
/<pre>/,/<\/pre>/!s/[ ]\{2\}/\&nbsp;\&nbsp;/g
# A rule that made the contents of quotes look different was disabled by
# the author because it made the text less readable:
#   s/\(['"]\)[^'"]\{1,\}\1/<tt>&<\/tt>/g
# Show URLs, e-mail addresses and bare www names in a fixed-width font.
# The character in front of each match is captured and written back.
s#\(^\|[^"]\)\(https\{0,1\}://[-a-z%0-9~\\/"'.@]\{3,\}\)#\1<tt>\2</tt>#gI
s/\(^\|[ ]\)\([^ ]\{2,\}@[^ "']\{2,\}\)/\1<tt>\2<\/tt>/g
s#\(^\|[^a-zA-Z/]\)\(www\.[-a-z%0-9~\\/"'.@]\{2,\}\)#\1<tt>\2</tt>#gI
# Outside <pre> blocks, every source line starts on a new output line.
/<pre>/,/<\/pre>/!s/^/<br>/
SED
}

# Write the complete intermediate HTML document for text file $1, using
# $2 as the document title, to standard output.
build_html() {
  local input=$1
  local title=$2

  # NOTE(review): the page wrapper and the dummy heading below were
  # reconstructed; the tags were missing from the copy of the script that
  # was reviewed. Confirm them against a pristine copy.
  printf '%s\n' '<html><body>' "<h1>${title}</h1>"

  # Output a dummy heading just to keep 'htmldoc' happy: the table of
  # contents needs every heading level above <h3> to be present. The
  # table of contents building process may still fail if there is no
  # '= header' format line in the file.
  printf '%s\n' '<h2> .</h2>'

  expand < "$input" | sed -f <(html_sed_script)

  printf '%s\n' '</body></html>'
}

# Entry point: $1 is the text file, $2 the optional output file or 'debug'.
main() {
  if [[ -z "${1:-}" ]]; then
    usage
    exit 1
  fi

  local input=$1
  local output=${2:-}

  [[ -r "$input" ]] || die "cannot read text file: $input"

  local title
  title=$(extract_title "$input")

  # Debug mode: show the intermediate HTML and create no files at all.
  if [[ "$output" == "debug" ]]; then
    build_html "$input" "$title"
    exit 0
  fi

  command -v htmldoc > /dev/null 2>&1 \
    || die "the 'htmldoc' program is required but was not found"

  tmp_html=$(mktemp "${input}.temp.XXXXXX") \
    || die "cannot create a temporary file next to $input"
  trap cleanup EXIT
  build_html "$input" "$title" > "$tmp_html"

  # The backup is best-effort: a failure here does not stop the conversion.
  cp -- "$input" "${input}.bak" \
    || printf '%s: warning: could not back up %s\n' "${0##*/}" "$input" >&2

  if [[ -z "$output" ]]; then
    output="$(strip_extension "$input").pdf"
  fi
  rm -f -- "$output"

  # htmldoc picks the output format from the extension of the -f argument.
  local status=0
  htmldoc -f "$output" --book --no-title "$tmp_html" || status=$?
  if ((status != 0)); then
    printf '%s: htmldoc exited with status %d\n' "${0##*/}" "$status" >&2
  fi
  return "$status"
}

main "$@"