#!/usr/bin/env bash
#
# Description
#   This script uses lynx to get all the phrases and sound-file references
#   from the site www.intercat.gencat.es, which is a site with a large number
#   of sound files and phrases for the purpose of learning the Catalan
#   language. It is designed to work in conjunction with the script
#   'get-intercat-data.sh', which gets the phrase data.
#
# Output
#   For every chapter page (capitol1.htm .. capitol21.htm) the page text is
#   written to stdout with each lynx link marker "[N]" replaced by "[URL]",
#   where URL is entry N of the page's "References" list. A line break is
#   inserted in front of every such bracket that follows some text, so each
#   reference starts its own line.
#
# Notes
#   The point of all this is to extract some useful data from the 'intercat'
#   site and possibly to use it in some way or other.
#   The original author measured about 3 seconds for 342 lines.
#   The script uses some tricks, such as using sed to create another sed
#   script in order to 'resolve' the references.
#
#   A simpler variant that only dumps the raw text of all chapters:
#     for ii in $(seq 21); do
#       lynx -dump -width=150 \
#         http://www.intercat.gencat.es/guia/capitol${ii}.htm |
#         sed "s/^ *$/=/g; s/^[[:space:]]*//g"
#     done > gencat.txt
#
#   Scratch files live in a private mktemp directory that is removed on exit;
#   nothing is left behind in the current directory.

set -u -o pipefail

readonly INTERCAT_GUIDE_URL='http://www.intercat.gencat.es/guia'
readonly INTERCAT_CHAPTERS=21

# Scratch directory created by main(); empty until then.
WORK_DIR=''

#######################################
# Remove the scratch directory, if one was created.
# Globals:   WORK_DIR (read)
#######################################
cleanup() {
  if [[ -n "${WORK_DIR}" ]]; then
    rm -rf -- "${WORK_DIR}"
  fi
}

#######################################
# Dump one chapter page as plain text and normalise it: tabs expanded,
# leading/trailing blanks stripped, a leading "-" bullet removed and
# empty lines dropped.
# Globals:   INTERCAT_GUIDE_URL (read)
# Arguments: $1 - chapter number
# Outputs:   normalised page text on stdout
# Returns:   non-zero if any stage of the pipeline fails (pipefail)
#######################################
fetch_chapter() {
  local chapter=$1
  lynx -dump -width=150 "${INTERCAT_GUIDE_URL}/capitol${chapter}.htm" \
    | expand \
    | sed 's/^[ ]*//g; s/[ ]*$//g; s/^-[ ]*//g; /^[ ]*$/d'
}

#######################################
# Turn the numbered "References" list of a normalised page into a sed
# script with one command per reference:   s@\[N\]@\[URL\]@g;
#
# There are a few tricks here. A different substitution delimiter is used
# ('@' instead of '/') so that the forward slashes in the referenced URLs
# do not have to be escaped; any '@' inside a URL is parked as the
# placeholder {:at:} and restored by attach_references. The characters
# '[' and ']' are escaped because otherwise they would define a character
# class instead of the literal string we want to match.
# '&' and '\' in a URL are backslash-escaped as well: unescaped they are
# special on the right-hand side of the generated 's' command ('&' would
# expand to the matched "[N]" marker and mangle the URL).
# Arguments: $1 - file holding the normalised page text
# Outputs:   the generated sed script on stdout (empty if no references)
#######################################
build_reference_script() {
  local page=$1
  sed -e '1,/^Reference/d' \
    -e '/^[ ]*[0-9]\{1,6\}\./!d' \
    -e 's/@/{:at:}/g' \
    -e 's/[\\&]/\\&/g' \
    -e 's#^\([0-9]\{1,6\}\)\.[ ]*\(.*\)#s@\\[\1\\]@\\[\2\\]@g;#' \
    < "${page}"
}

#######################################
# Use the generated sed script to put the references next to the text
# which they relate to, then tidy the result.
#
# Newlines are inserted by the somewhat dodgy means of first writing the
# character '*' and then translating it with tr, because not every sed
# allows a newline on the right-hand side of 's'. Literal '*' characters
# in the text are parked as {:star:} beforehand and restored afterwards.
# Finally, lines for .gif images, the RealPlayer notice and lynx image
# maps are dropped.
# Arguments: $1 - sed script produced by build_reference_script
#            $2 - file holding the normalised page text
# Outputs:   the resolved text on stdout
#######################################
attach_references() {
  local refs_script=$1
  local page=$2
  sed -f "${refs_script}" "${page}" \
    | sed -e '/^References/,$d' \
      -e 's/\*/{:star:}/g' \
      -e 's/\([^ ]\{1,\}[ ]*\)\[/\1*[/g' \
    | tr '*' '\n' \
    | sed -e 's/{:star:}/*/g' \
      -e 's/{:at:}/@/g' \
      -e '/\.gif\]/d' \
      -e '/You can listen to the phrases with/d' \
      -e '/www\.real\.com/d' \
      -e '/LYNXIMGMAP/d'
}

#######################################
# Process every chapter in turn and print the resolved text to stdout.
# A chapter that cannot be downloaded is reported on stderr and skipped,
# so one bad page does not stop the remaining ones.
# Globals:   INTERCAT_CHAPTERS (read), WORK_DIR (written)
# Returns:   0 if every chapter was processed, 1 otherwise
#######################################
main() {
  if ! command -v lynx > /dev/null 2>&1; then
    printf '%s: lynx is required but was not found in PATH\n' "${0##*/}" >&2
    return 1
  fi

  if ! WORK_DIR=$(mktemp -d); then
    printf '%s: cannot create a temporary directory\n' "${0##*/}" >&2
    return 1
  fi
  trap cleanup EXIT

  local page="${WORK_DIR}/page.txt"
  local refs_script="${WORK_DIR}/refs.sed"
  local chapter
  local failures=0

  for ((chapter = 1; chapter <= INTERCAT_CHAPTERS; chapter++)); do
    if ! fetch_chapter "${chapter}" > "${page}"; then
      printf '%s: warning: could not fetch chapter %s, skipping\n' \
        "${0##*/}" "${chapter}" >&2
      failures=$((failures + 1))
      continue
    fi
    build_reference_script "${page}" > "${refs_script}"
    attach_references "${refs_script}" "${page}"
  done

  if ((failures > 0)); then
    return 1
  fi
  return 0
}

main "$@"