/**File Name :     Parser.java
  *
  *Authors   :     Prashanth. K
  *                Shireen Javali
  *
  *Date      :     20/05/2000
  *
  *Function  :     Parses an HTML file to extract the URLs embedded in its
  *                anchor tags, optionally skipping links to the same site
  */                        


import java.io.*;
import Tokenizer;
import java.net.URL; 


/**
 * Scans an HTML file for anchor tags and writes the absolute links it finds
 * (those starting with "http" or "ftp") to an output file, one per line.
 *
 * Example usage:
 * <pre>
 *    Parser p = new Parser();
 *    p.parse("alta.html", "links.dat", "altavista", true);
 * </pre>
 */
public class Parser
{
   /**
    * Token on which the scan stops; presumably the Tokenizer's
    * end-of-input marker (confirm against Tokenizer).
    */
   private static final String END_TOKEN = "<>";

   /**
    * Reads tokens from inFileName and, for every anchor tag whose first
    * attribute is href, writes the link target to outFileName followed by
    * "\r\n", provided that:
    *   - the target starts with "http" or "ftp",
    *   - checkSite is false, or the target does not contain site, and
    *   - the file part of the URL contains no "?" (no query string).
    *
    * Links that cannot be parsed as a URL are reported on stderr and
    * skipped; they no longer abort the whole parse. I/O failures are
    * reported on stderr and end the parse. Both files are always closed.
    *
    * NOTE(review): the output file is opened "rw" and written from offset 0
    * without truncation, so older, longer content is only partly
    * overwritten. Confirm whether append or truncate was intended.
    *
    * @param inFileName   path of the HTML file to scan
    * @param outFileName  path of the file that receives the links
    * @param site         substring identifying the site whose links are
    *                     skipped; only consulted when checkSite is true
    * @param checkSite    true to skip links containing site
    */
   public void parse(String inFileName,String outFileName,String site,boolean checkSite)
   {
      RandomAccessFile inFile = null;
      RandomAccessFile outFile = null;
      try
      {
         inFile = new RandomAccessFile(inFileName,"r");
         outFile = new RandomAccessFile(outFileName,"rw");
         System.out.println("file length "+outFile.length());
         Tokenizer tokenizer = new Tokenizer(inFile);

         String token = nextToken(tokenizer);
         while(!token.equals(END_TOKEN))
         {
            if(!token.equals("<"))
            {
               token = nextToken(tokenizer);
               continue;
            }

            // Tag name follows "<"; only anchor tags are of interest.
            // On a mismatch the token is re-examined by the loop so that
            // a "<" or the end token is not lost.
            token = nextToken(tokenizer);
            if(!token.equalsIgnoreCase("a"))
            {
               continue;
            }

            // Skip the whitespace between the tag name and its attribute.
            do
            {
               token = nextToken(tokenizer);
            }while(token.equals("\n") || token.equals(" "));

            if(!token.equalsIgnoreCase("href"))
            {
               continue;
            }

            // Two tokens are discarded before the link target; presumably
            // "=" and the opening quote (confirm against Tokenizer).
            nextToken(tokenizer);
            nextToken(tokenizer);
            token = nextToken(tokenizer);

            boolean absolute = token.startsWith("http") || token.startsWith("ftp");
            boolean siteAllowed = !checkSite || token.indexOf(site) == -1;
            if(absolute && siteAllowed)
            {
               writeLink(outFile,token);
            }
         }
      }
      catch(FileNotFoundException fe)
      {
         // inFile is opened first, so it is still null only when the
         // input file itself could not be opened.
         if(inFile == null)
         {
            System.err.println("HTML file "+inFileName+" not found \n");
         }
         else
         {
            System.err.println("Output file "+outFileName+" could not be opened \n");
         }
      }
      catch(IOException ie)
      {
         System.err.println(ie);
      }
      finally
      {
         closeAndReport(inFile);
         closeAndReport(outFile);
      }
   }

   /** Fetches the next token from the tokenizer as a fresh String. */
   private static String nextToken(Tokenizer tokenizer) throws IOException
   {
      return new String(tokenizer.getNextToken());
   }

   /**
    * Echoes the file part of the link to stdout and writes the link to
    * outFile unless it carries a query string; malformed links are skipped.
    */
   private static void writeLink(RandomAccessFile outFile,String link) throws IOException
   {
      URL check;
      try
      {
         check = new URL(link);
      }
      catch(java.net.MalformedURLException me)
      {
         // e.g. a relative link such as "httpd.html" passes the prefix
         // test but has no protocol; skip it instead of aborting.
         System.err.println("Skipping malformed link "+link+": "+me.getMessage());
         return;
      }

      System.out.println(check.getFile());
      if(check.getFile().indexOf("?") == -1)
      {
         outFile.writeBytes(link+"\r\n");
      }
   }

   /** Closes the file if it was opened, reporting any failure on stderr. */
   private static void closeAndReport(RandomAccessFile file)
   {
      if(file == null)
      {
         return;
      }
      try
      {
         file.close();
      }
      catch(IOException ie)
      {
         System.err.println(ie);
      }
   }
}
