# Description:
#   A script to reformat a plain text file which contains a diary of some
#   sort into some kind of html. By a 'diary' I mean a series of entries for
#   particular dates. Specifically, the text file should contain dates in a
#   format like '* 3 march 2003, Saturday' or something similar, and each
#   date should be followed by some sort of descriptive text relating to
#   that date. The script recognises some special structures within the
#   plain text document. For example:
#
#   The diary entry dates should be on a line by themselves and should
#   begin with a * character like this:
#     * 3 January 1992, Friday
#
#   The '=' character, when it is the first non-whitespace character on a
#   line, indicates that all the following text on the line should be
#   formatted as a 'heading'. Also, url style strings should be recognised
#   and given a hyperlink token in front of them, such as '[*]'. I prefer
#   this to underlining the entire url, because I find that the underlining
#   tends to interfere with the readability of the text. Some people would
#   say, "use style-sheets" but to them I would reply that the 'heraldic'
#   visual pattern of the underlined hyperlink is imprinted in many internet
#   users' brains, and to change that 'iconography' can lead to unnecessary
#   confusion.
#
# Examples:
#   ./diary2html.sh mjb-work.txt notran > mjb-work.html
#     This command line, executed in some kind of a bash shell, will
#     transform a plain text file which has 'diary' style entries into an
#     HTML file (that is, it will create a new HTML file and leave the
#     original text file unchanged) and will not display the automatic
#     translation links to Google. Also an HTML table of contents (with one
#     entry for each date) will be inserted in the HTML document.
#
#   ./diary2html.sh mjb-work.txt notran notoc > mjb-work.html
#     The text file will be transformed into HTML but no table of contents
#     will be inserted nor any translation links.
#
#   ./diary2html.sh mjb-work.txt blah notoc > mjb-work.html
#     If translation links are desired but no table of contents, use a
#     command line similar to the one above. The string 'blah' could be
#     anything as long as it is not 'notran'. This slightly dodgy 'feature'
#     is owing to the fact that I am not using any 'getopt' style option
#     parsing.
#
# Parameters:
#   textFileName [required]
#     The name of the text file which is to be transformed from text into
#     html. The file itself is never modified; the HTML goes to stdout.
#   notran [optional]
#     If the second parameter is the string 'notran' then the javascript
#     links to the google automatic language translation engine will NOT be
#     inserted into the HTML page. This is useful, for example, when the
#     HTML page is going to be located within a 'password-protected'
#     directory, because the Google translation engine will not be able to
#     access the page, and therefore the translation links will not work.
#   notoc [optional]
#     If the third parameter is the string "notoc", then no HTML table of
#     contents will be generated.
# Notes:
#   This script contains an improved url detection regular expression,
#   better than that in say txtdoc2html.sh. Both 'http://' and 'https://'
#   urls are recognised. A url must end in a letter, a digit, '_' or '/',
#   so a full stop typed after a url is not taken to be part of the url.
#   A url is still cut short at the first '&' character.
#   The sed expressions rely on GNU sed extensions ('\|' and the 'I' flag).
#   NOTE(review): the HTML literals of earlier versions of this script could
#   not be recovered, so the markup emitted below is a minimal equivalent.
#   Confirm it against any existing output you depend on.
# See Also:
#   txtdoc2html.sh, linkdoc2html.sh, plaintext2html.sh
# Author:
#   m.j.bishop

diary_file=${1-}
tran_opt=${2-}
toc_opt=${3-}

#-- Print the usage line followed by every comment line of this script.
usage() {
  echo "usage: $0 textFileName [notran] [notoc]"
  sed -n "/^[ ]*#/p" < "$0"
}

#-- Filter: escape the characters which are special in HTML.
#-- '&' has to be handled first so that the entities which are produced
#-- for '<' and '>' are not escaped a second time.
escape_html() {
  sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

#-- Write the diary text with tabs expanded and with a running number in
#-- front of every date line (a line whose first non-blank character is
#-- '*'). All other lines are passed through unchanged.
#-- In theory 'nl -bpPATTERN' should also do this, but it insisted on
#-- 'double-spacing' the output.
numbered_text() {
  expand < "$diary_file" \
    | awk '/^[ ]*\*[ ]*[^ ]+/ { n++; print n $0; next } { print }'
}

#-- Print the opening tags of the HTML page. The title is the name of
#-- the text file without its directory part.
print_page_start() {
  echo '<html>'
  echo '<head>'
  printf '<title>%s</title>\n' \
    "$(printf '%s' "${diary_file##*/}" | escape_html)"
  echo '</head>'
  echo '<body>'
}

#-- Print one translation link.
#-- Arguments: $1 - target language code, $2 - link label (HTML),
#--            $3 - separator printed after the link (may be empty)
translation_link() {
  printf "<a href=\"javascript:location.href='https://translate.google.com/translate?sl=auto&amp;tl=%s&amp;u='+encodeURIComponent(location.href)\">%s</a>%s\n" \
    "$1" "$2" "$3"
}

#-- Print the paragraph of Google automatic translation links.
#-- The labels use HTML entities so that the accented letters do not
#-- depend on the character encoding of this script.
print_translation_links() {
  echo '<p>'
  echo 'See this page in (approximate):'
  translation_link es 'Espa&ntilde;ol' '|'
  translation_link fr 'Fran&ccedil;ais' '|'
  translation_link it 'Italiano' '|'
  translation_link de 'Deutsch' '|'
  translation_link pt 'Portugu&ecirc;s' ''
  echo '</p>'
}

#-- Print every heading line (first non-blank character '=') as a
#-- centred <h2> element.
print_headings() {
  expand < "$diary_file" \
    | sed '/^[ ]*=/!d' \
    | escape_html \
    | sed 's/^[ ]*=[ ]*\(.*\)/<center><h2>\1<\/h2><\/center>/'
}

#-- Print the table of contents: one link per date line, pointing at the
#-- anchor which print_body creates for the same running number. The
#-- names of the week days are removed to keep the entries short.
print_toc() {
  echo '<p>'
  numbered_text \
    | sed -e '/^[ ]*[0-9]\{1,\}[ ]*\*/!d' \
      -e 's/\(monday\|tuesday\|wednesday\|thursday\|friday\|saturday\|sunday\)//gI' \
    | escape_html \
    | sed 's/^[ ]*\([0-9]\{1,\}\)[ ]*\*\(.*\)/<a href="#entry\1">\2<\/a> | /'
  echo '</p>'
}

#-- Filter: a line beginning with '-->>' opens a <pre> block and a line
#-- beginning with '--<<' closes it. This runs after escape_html, so the
#-- markers are matched in their escaped form.
mark_preformatted() {
  sed -e 's/^[ ]*--&gt;&gt;/<pre>/' -e 's/^[ ]*--&lt;&lt;/<\/pre>/'
}

#-- Filter: turn each numbered date line into a named anchor followed by
#-- a link back to the table of contents.
anchor_dates() {
  sed 's/^[ ]*\([0-9]\{1,\}\)[ ]*\*\(.*\)/<u><strong><a name="entry\1">\2<\/a><\/strong><\/u> <a href="#toc">[TOC]<\/a>/'
}

#-- Filter: put a '[*]' hyperlink token in front of each url.
#-- The first expression handles urls with a scheme, the second handles
#-- bare 'www.' names. The character in front of a 'www.' name is captured
#-- and written back, so it is not lost, and a name at the very start of a
#-- line is matched too. Urls which already received a link have a '/' in
#-- front of 'www.', which keeps the second expression away from them.
link_urls() {
  sed -e 's/\(https\{0,1\}:\/\/[-a-z0-9%~_.@?=:#+\/]\{2,\}[a-z0-9_\/]\)/<a href="\1">[*]<\/a> <tt>\1<\/tt>/gI' \
    -e 's/\(^\|[^a-zA-Z\/]\)\(www\.[-a-z0-9%~_.@?=:#+\/]\{1,\}[a-z0-9_\/]\)/\1<a href="http:\/\/\2">[*]<\/a> <tt>\2<\/tt>/gI'
}

#-- Filter: outside of <pre> blocks, keep runs of spaces visible and
#-- start every line with a line break.
preserve_layout() {
  sed -e '/<pre>/,/<\/pre>/!s/[ ]\{2\}/\&nbsp;\&nbsp;/g' \
    -e '/<pre>/,/<\/pre>/!s/^/<br>/'
}

#-- Transform the text to HTML and insert the anchors.
#-- The heading lines are deleted because print_headings has already
#-- written them. Note that this also deletes lines beginning with
#-- '==' or '===' etc, which may not be desirable.
print_body() {
  numbered_text \
    | sed '/^[ ]*=/d' \
    | escape_html \
    | mark_preformatted \
    | anchor_dates \
    | link_urls \
    | preserve_layout
}

if [ -z "$diary_file" ]; then
  usage
  exit 1
fi

#-- The text file is read several times, so it has to be a readable
#-- regular file.
if [ ! -f "$diary_file" ] || [ ! -r "$diary_file" ]; then
  echo "$0: cannot read text file '$diary_file'" >&2
  exit 1
fi

print_page_start

#-- The Google automatic translation links are sometimes disabled because
#-- they will not work from within a password protected directory, since
#-- Google does not have permission to view that directory.
if [ "$tran_opt" != "notran" ]; then
  print_translation_links
fi

#-- Put the page heading before the table of contents
print_headings
echo '<hr>'

#-- Insert the table of contents. The 'toc' anchor is always written so
#-- that the [TOC] links in the body have a target even with 'notoc'.
echo '<a name="toc"></a>'
if [ "$toc_opt" != "notoc" ]; then
  print_toc
fi

print_body
echo '<hr>'

if [ "$tran_opt" != "notran" ]; then
  print_translation_links
fi

echo '</body>'
echo '</html>'