#!/usr/bin/env bash
#
# Description
#   Fetches bilingual word lists from 'intercat.gencat.es', a Catalan
#   government site for the promotion of the Catalan language, and prints
#   them as plain text on stdout. The site also contains many sound files of
#   Catalan and English being spoken.
#
# Usage
#   <this-script> [language-directory]
#
#   language-directory selects which bilingual word list is fetched. It is
#   the path component the site uses for that language:
#     portugues   (default)
#     frances
#     alemany
#     castella
#     ""          (empty string: English, which has no directory component)
#
# Output
#   For every chapter, a line "[URL=<page url>]" followed by the page
#   converted to text. Diagnostics go to stderr. The exit status is non-zero
#   if any chapter could not be downloaded or converted.
#
# Requirements
#   wget, html2text, and a sed that accepts the 'I' substitute flag (GNU sed).
#
# Notes
#   See the file 'catalan-resources.txt' for a discussion of the development
#   of this script.
#   The sound files from the Generalitat de Catalunya site are, in my
#   opinion, not of very good quality. This is a pity because they are
#   numerous.

set -euo pipefail

readonly INTERCAT_URL="http://www.intercat.gencat.es/guia"
readonly CHAPTER_COUNT=21

# Scratch directory for downloaded pages; created in main, removed on exit.
WORK_DIR=""

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory (installed as the EXIT trap).
cleanup() {
  if [[ -n "${WORK_DIR}" ]]; then
    rm -rf -- "${WORK_DIR}"
  fi
}

#######################################
# Build the URL of one chapter page.
# Globals:   INTERCAT_URL (read)
# Arguments: $1 - language directory ("" for English)
#            $2 - chapter number
# Outputs:   the URL on stdout
#######################################
chapter_url() {
  local lang=$1
  local chapter=$2
  # ${lang:+/...} adds the directory component only when a language is given.
  printf '%s%s/capitol%s.htm\n' "${INTERCAT_URL}" "${lang:+/${lang}}" "${chapter}"
}

#######################################
# Convert one downloaded chapter page to plain text.
# Arguments: $1 - path of the downloaded HTML file
# Outputs:   the converted text on stdout
#######################################
render_chapter() {
  local page=$1
  # Step 1: collapse doubled double-quotes and rewrite every </td> before
  #         handing the markup to html2text.
  # NOTE(review): '<\td>' in the replacement is kept exactly as the original
  #         wrote it; GNU sed treats '\t' as a tab there, so this looks like
  #         a typo for '<td>' or '<\/td>' - confirm the intent before changing.
  # Step 2: html2text renders the markup as 140-column text.
  # Step 3: byte 0x92 (octal 222) becomes an apostrophe and byte 0x85
  #         (octal 205) is dropped.
  # Step 4: delete blank lines and strip trailing whitespace.
  sed 's/""/"/g;s/<\/td>/<\td>\ \ \ \ <\/td>/gi' "${page}" \
    | html2text -nobs -width 140 \
    | tr '\222' "'" \
    | tr -d '\205' \
    | sed '/^[[:space:]]*$/d;s/[[:space:]]*$//g'
}

#######################################
# Download and print every chapter of the selected word list.
# Globals:   CHAPTER_COUNT (read), WORK_DIR (written)
# Arguments: $1 - optional language directory, default "portugues"
# Returns:   exits non-zero if any chapter failed
#######################################
main() {
  # ${1-...} rather than ${1:-...}: an explicit empty argument means English.
  local lang=${1-portugues}
  local tool chapter url page
  local failures=0

  for tool in wget html2text; do
    command -v "${tool}" >/dev/null || die "required tool not found: ${tool}"
  done

  # Work in a private directory so nothing in the caller's current directory
  # is overwritten or deleted.
  WORK_DIR=$(mktemp -d) || die "could not create a temporary directory"
  trap cleanup EXIT

  for ((chapter = 1; chapter <= CHAPTER_COUNT; chapter++)); do
    url=$(chapter_url "${lang}" "${chapter}")
    page="${WORK_DIR}/capitol${chapter}.htm"

    # -O pins the output file name, so a stale copy can never be picked up.
    if ! wget -O "${page}" "${url}"; then
      printf 'warning: could not download %s; skipping\n' "${url}" >&2
      failures=$((failures + 1))
      continue
    fi

    printf '%s\n' "[URL=${url}]"
    if ! render_chapter "${page}"; then
      printf 'warning: could not convert %s\n' "${url}" >&2
      failures=$((failures + 1))
    fi
  done

  if ((failures > 0)); then
    die "${failures} of ${CHAPTER_COUNT} chapters failed"
  fi
}

main "$@"