# Description
#   This script gets word data from the site 'intercat.gencat.es', which is a
#   Catalan government site for the promotion of the Catalan language.
#   The site also contains many sound files of Catalan and English being spoken.
#
# Notes
#   See the file 'catalan-resources.txt' for a discussion of the development of this
#   script.
#   The sound files from the Generalitat de Catalunya site are, in my opinion,
#   not of very good quality. This is a pity because they are numerous.

  # Download the chapter pages (capitol1.htm .. capitol21.htm) of one bilingual
  # word list from the Intercat guide, then print each page as plain text,
  # preceded by a "[URL=...]" line naming the page the text came from.
  #
  # Usage: <this-script> [language-dir]
  #   language-dir  Sub-directory of the guide that holds the word list:
  #                 portugues (the default), frances, alemany or castella.
  #                 Pass an empty string ('') for the English list, which has
  #                 no directory component of its own.
  #
  # Requires: wget, html2text, sed, tr, seq, mktemp.

  sIntercatUrl="http://www.intercat.gencat.es/guia"
  sLanguage="${1-portugues}"
  # Append "/<language>" only when a language directory was actually given.
  sListUrl="${sIntercatUrl}${sLanguage:+/${sLanguage}}"
  nChapters=21

  # Keep the downloads in a private scratch directory so that nothing in the
  # caller's working directory is overwritten or deleted; the directory is
  # removed again when the script exits.
  sWorkDir=$(mktemp -d) || { echo "error: cannot create a temporary directory" >&2; exit 1; }
  trap 'rm -rf -- "$sWorkDir"' EXIT

  for ii in $(seq "$nChapters")
  do
    # -O fixes the local file name; without it wget saves to "capitolN.htm.1"
    # whenever a file of that name already exists.
    if ! wget -O "${sWorkDir}/capitol${ii}.htm" "${sListUrl}/capitol${ii}.htm"
    then
      echo "warning: could not download ${sListUrl}/capitol${ii}.htm" >&2
      # wget -O leaves an empty or partial file behind on failure.
      rm -f -- "${sWorkDir}/capitol${ii}.htm"
    fi
  done

  for ii in $(seq "$nChapters")
  do
    # Pages that failed to download were already reported above.
    [ -s "${sWorkDir}/capitol${ii}.htm" ] || continue

    echo "[URL=${sListUrl}/capitol${ii}.htm]"

    # 1. sed: collapse doubled double-quotes ("") into one, and follow every
    #    closing </td> with a spacer cell of four &nbsp; so that the table
    #    columns stay apart in the text rendering. The closing tag itself is
    #    written back unchanged. (The 'i' flag is a GNU sed extension.)
    # 2. html2text: render the HTML as text, 140 columns wide.
    # 3. tr: turn byte 0222 into an apostrophe and drop byte 0205
    #    (presumably the Windows-1252 right quote and ellipsis - TODO confirm).
    #    LC_ALL=C makes tr treat the input as plain bytes.
    # 4. sed: delete blank lines and strip trailing whitespace.
    sed 's/""/"/g; s|</td>|</td><td>\&nbsp;\&nbsp;\&nbsp;\&nbsp;</td>|gi' \
        "${sWorkDir}/capitol${ii}.htm" |
      html2text -nobs -width 140 |
      LC_ALL=C tr '\222' "'" |
      LC_ALL=C tr -d '\205' |
      sed '/^[[:space:]]*$/d; s/[[:space:]]*$//'
  done