$sPageTitle

# Description:
#   A script to reformat a plain text file document which contains
#   no particular format into Adobe PDF format. The script recognises some
#   special structures within the plain text document. For example:
#
#   Where the first non-whitespace character on a line is '=' then
#   all the following text on the line should be formatted as a
#   'page title'. And if the first non-whitespace character is '*' then
#   the following text should be hyperlinked. Also, url style strings
#   should be recognised and given a special formatting.
#
#   Lines which consist of only capital letters and numbers (with at least
#   a few capital letters) are interpreted as headings, and constitute the
#   automatically generated table of contents.
#
#   REVISE THIS BELOW
#   This script, like the linkdoc2html.sh script, also accepts the format
#     * Document Title|Html-Url-Or-Path|Text-Url-Or-Path|
#   The script will render this into an emphasised 'document title' with
#   hyper-links to the different formats for the document.
#
#   The script will also format blocks of text between the strings -->> and --<<
#   (where they are the first string on the line) as an HTML <pre> block
#
# Examples:
#   ./plaintext2pdf.sh mjb-work.txt
#     This command line, executed in some kind of a bash shell, will
#     transform a plain text file which isn't in any particular format,
#     into a pdf file (that is it will create a new pdf file and leave
#     the original text file unchanged). Also a table of contents (with
#     one entry for each heading) will be inserted in the pdf document.
#
#   ./plaintext2pdf.sh mjb-work.txt new-work.pdf
#     The text file will be transformed into pdf
#
#   ./plaintext2pdf.sh resources.txt debug
#     No pdf file is created. The HTML generated from the text file
#     (which is to be used by htmldoc) is printed to standard output
#
# Parameters:
#   textFileName  [required]
#     The name of the text file which is to be transformed from text into pdf
#   outputFileName[optional]
#     This is the name of the pdf file which will be produced. If no file
#     name is provided the output file name will be 'textFileName.pdf'.
#     If a filename here is given which does not end in 'pdf' then
#     the htmldoc program will generate html output with a hyper-linked
#     table of contents.
#     If the outputFileName is the string 'debug', then no file is created
#     and the intermediate HTML which is to be used by the htmldoc program
#     is output to standard-out
#
# Notes:
#   This script contains an improved url detection regular expression, better than that
#   in say txtdoc2html.sh. But the url pattern matcher still has a problem when
#   somebody puts a full stop after a url. It thinks that that dot is part of the 
#   url. There is possibly no reason why you couldn't just use the 'diary2html.sh'
#   filter script, instead of this one. The Html generated is somewhat dodgy but
#   attempts to avoid some of the more heinous html sins, such as  tags
#
#   The script uses the 'htmldoc' program which appears to be GPL'd under unix
#   and which comes from http://www.easysw.com/. The 'htmldoc' program handles
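#   the creation of tables of contents, given a set of <hN> heading tags where
#   N is a number. Also the heading tags must be complete: That is, if there
#   are lower-level heading tags (for example <h3>) there must also be the
#   higher-level ones (<h1> and <h2>) for the table-of-content building
#   to work.
#   NOTE(review): the tag names in this paragraph were reconstructed; confirm
#   against the htmldoc documentation.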
#
#   The 'htmldoc' program sometimes does not work as expected. If it does not
#   understand the html then it produces garbled pdf. It does not 'degrade graciously'
#
# See Also:
#   diary2html.sh, linkdoc2html.sh, plaintext2html.sh
# Author:
#   m.j.bishop

 
 #-- With no input file given: print a one-line usage summary followed by
 #-- every comment line of this script (the comment header doubles as the
 #-- help text), then stop with a non-zero exit status.
 if [ -z "$1" ]; then
   #-- The second argument is either an output file name or the word 'debug';
   #-- that is the only special value this script tests for.
   echo "usage: $0 textFileName [outputFileName|debug]"
   #-- "$0" is quoted so the help still prints when the script path
   #-- contains spaces.
   sed -n "/^[ ]*#/p" "$0"
   exit 1
 fi

#-- Print the text that follows the leading '=' on every title line of a
#-- file (a title line is one whose first non-blank character is '=').
#-- One title is printed per line; nothing is printed if there is none.
#--   $1  path of the plain text file to scan
extract_page_title() {
  sed -n 's/^[ ]*=[ ]*\(.*\)/\1/p' < "$1"
}

#-- Print a file name with its final ".ext" removed.
#-- Only the last path component is considered, so a dot inside a directory
#-- name is never treated as an extension, and a name that starts with a dot
#-- (".profile") is left alone.
#--   $1  file name or path
strip_extension() {
  printf '%s\n' "$1" | sed 's/\([^/]\)\.[^./]\{1,\}$/\1/'
}

#-- Title(s) taken from the '= ...' lines of the input file.
sPageTitle=$(extract_page_title "$1")
#-- Input name without extension; used to build the default output name.
sFileBaseName=$(strip_extension "$1")

 #-- The section below creates the table of contents for the documents.
 #-- This line is designed to only number lines which match a pattern
 #-- In theory 'nl -bpPATTERN' should also do this, but it insisted on
 #-- 'double-spacing' the output
 #-- Also the expressions below try and get rid of things like "can't" and "won't"
 #-- because I want to apply some formatting to the content of quotes, and these
 #-- things will get in my way.

 #-- Start the intermediate file "<input>.temp" (any existing file of that
 #-- name is overwritten) with a preamble that carries the page title.
 #-- The redirect target is quoted so input names containing spaces work.
 #-- NOTE(review): apart from the title this string holds only whitespace;
 #-- it looks like the place for the opening HTML markup - confirm what
 #-- htmldoc needs here.
 echo "
   
     
     
     
            
    
    
    
    
    
    $sPageTitle
    
    
    
    " > "$1.temp"
   

#-- Transform the text to intermediate HTML, insert headings
#-- Also delete the heading line which has already been inserted in the HTML
#-- But, the line will also delete lines beginning in == or === etc, which
#-- may not be desirable.
#-- The line below was designed to make the contents of quotes look different
#-- but I think that it made the text less readable
#--
#--  sed "s/\(['\"]\)[^'\"]\{1,\}\1/&<\/tt>/g" | \
#--
#-- Output some dummy headings just to keep 'htmldoc' happy
#-- The table of contents building process may fail (probably will) if
#-- there is no '= header' format line in the file.

#-- Convert plain text on standard input into the intermediate HTML on
#-- standard output. The stages run in this order:
#--    1. expand tabs to spaces
#--    2. escape '<' and '>' as &lt; and &gt;
#--    3. '= text' lines become a centred <h2> title
#--    4. '-->>' and '--<<' at the start of a line (already escaped by
#--       stage 2) open and close a <pre> block
#--    5. lines made only of capitals, digits, spaces and dots, with at least
#--       three consecutive capitals, become <h3> headings
#--    6. '* Title|a|b|' lines: the title is wrapped in <strong>
#--    7. other '* word' lines: the first word is wrapped in <em>
#--    8. outside <pre> blocks, pairs of spaces become &nbsp;&nbsp;
#--    9. http:// urls, e-mail like strings and www. names are wrapped in
#--       <tt>; the character in front of the match is kept in the output
#--   10. outside <pre> blocks, every line is prefixed with <br>
#-- NOTE(review): the opening tags and entities were chosen to pair with the
#-- closing tags and the <pre> range addresses used below; confirm them
#-- against what htmldoc expects. Fields 2 and 3 of the '* Title|a|b|' format
#-- are matched but not used in the output - confirm whether links are wanted.
text_to_html() {
  expand \
    | sed -e 's/</\&lt;/g' -e 's/>/\&gt;/g' \
    | sed 's/^[ ]*=[ ]*\(.*\)/<center><h2>\1<\/h2><\/center>/' \
    | sed -e 's/^[ ]*--&gt;&gt;/<pre>/' -e 's/^[ ]*--&lt;&lt;/<\/pre>/' \
    | sed 's/^\([ A-Z0-9.]*[A-Z]\{3,\}[ A-Z0-9.]*\)$/<h3>\1<\/h3>/' \
    | sed 's/^[ ]*\*[ ]*\([^|]*\)|\([^|]*\)|\([^|]*\)|/<strong>\1<\/strong>/' \
    | sed 's/^[ ]*\*[ ]*\([^ ]\{2,\}\)/<em>\1<\/em>/' \
    | sed '/<pre>/,/<\/pre>/!s/[ ]\{2\}/\&nbsp;\&nbsp;/g' \
    | sed "s|\([^\"]\)\(http://[-a-z%0-9~/\"'.@]\{3,\}\)|\1<tt>\2</tt>|gi" \
    | sed "s|\([ ]\)\([^ ]\{2,\}@[^ \"']\{2,\}\)|\1<tt>\2</tt>|g" \
    | sed "s|\([^a-zA-Z/]\)\(www\.[-a-z%0-9~/\"'.@]\{2,\}\)|\1<tt>\2</tt>|gi" \
    | sed '/<pre>/,/<\/pre>/!s/^/<br>/'
}

#-- Placeholder line written ahead of the converted text.
#-- NOTE(review): presumably meant to be a dummy heading element for
#-- htmldoc - confirm.
echo "." >> "$1.temp"

#-- Append the converted document to the intermediate file.
text_to_html < "$1" >> "$1.temp"

  #-- Append the closing part of the intermediate file. The redirect target
  #-- is quoted so input names containing spaces work.
  #-- NOTE(review): this string holds only whitespace; it looks like the
  #-- place for the closing HTML markup - confirm.
  echo "
      

      
      " >> "$1.temp"

 #-- Debug mode: print the intermediate file to standard output and stop
 #-- without running htmldoc.
 #-- NOTE(review): the exit status is 1 although nothing failed, and the
 #-- "<input>.temp" file is left in place; both are kept unchanged so that
 #-- existing callers see the same behaviour.
 if [ "$2" = "debug" ]; then
   cat -- "$1.temp"
   exit 1
 fi
 
 #-- Keep a copy of the input file next to it as "<input>.bak".
 cp -- "$1" "$1.bak"

 #-- Output file: the second argument when one is given, otherwise the
 #-- input name with its extension replaced by .pdf.
 sOutputFile=${2:-$sFileBaseName.pdf}

 #-- Remove any previous output, let htmldoc render the intermediate file,
 #-- then delete the intermediate file. All paths are quoted so names
 #-- containing spaces work.
 rm -f -- "$sOutputFile"
 htmldoc -f "$sOutputFile" --book --no-title "$1.temp"
 rm -f -- "$1.temp"

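# NOTE(review): stray fragment of the header comment; commented out so the
# shell does not try to run it as commands.
# tags # there must also be
#
# and
#
# .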