#!/usr/bin/env python
#Copyright (C) May 2003 - Martin Olveyra molv@users.sourceforge.net
#
#html2lyx v.1.0 http://www.geocities.com/bj0v/software-eng.html
#
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.

import HTMLParser, sys

class FootnoteError(Exception):
    """Error raised when the footnotes section of the document is malformed.

    Parameters:
        footnote -- the footnote state/anchor name at the time of the error.
        string   -- a short string identifying where the problem was found.
    """

    def __init__(self, footnote, string):
        # Forward the values to Exception so that ``args`` is populated.
        Exception.__init__(self, footnote, string)
        self.__footnote = footnote
        self.__string = string

    def __str__(self):
        # __str__ must return a string; the previous implementation wrote the
        # message to stderr and returned None, which makes str(exc) raise
        # TypeError and leaves tracebacks with an unprintable exception.
        return "(name=%s, string=\"%s\")" % (self.__footnote, self.__string)

class Buffer:
    """Ordered list of output lines with support for offset insertions.

    The buffer starts with a single initial string.  Lines are normally
    appended at the end; insert() places a line at a given position shifted
    by the number of insertions performed since the last reset_inserted().
    """

    def __init__(self, string):
        self.__inserted = 0
        self.__bufferlines = [string]

    def append(self, line):
        """Add *line* at the end of the buffer."""
        self.__bufferlines.append(line)

    def insert(self, pos, line):
        """Insert *line* at *pos* plus the count of earlier insertions."""
        target = pos + self.__inserted
        self.__bufferlines.insert(target, line)
        self.__inserted += 1

    def reset_inserted(self):
        """Forget how many lines have been inserted so far."""
        self.__inserted = 0

    def printall(self):
        """Print every stored line, in order, to standard output."""
        for entry in self.__bufferlines:
            print(entry)

    def getsize(self):
        """Return the number of lines currently stored."""
        return len(self.__bufferlines)
	
class TagData:
    "Class that represents the data found inside a tag."

    def __init__(self, data):
        self.__data = data

    def strip_tag(self):
        "Remove the leading tag and the whitespace before and after the text."
        # find() returns -1 when there is no ">", so the slice below then
        # starts at 0 and keeps the whole string.
        tag_end = self.__data.find(">")
        remainder = self.__data[tag_end + 1:]
        self.__data = remainder.strip()

    def get_data(self):
        "Return the text currently held."
        return self.__data

class HTML2LyX(HTMLParser.HTMLParser):
    """HTML parser that accumulates LyX (format 218) output in a Buffer.

    The text between two parser events is not taken from handle_data();
    instead it is cut out of the raw input lines using the positions
    reported by getpos() (see process_buffer).  Output lines are collected
    with saveln() and written with print_buffer().

    Handled tags: title, body, h1, p, br, img, b, i, ul, li, a and the
    non-standard <footnotes> section, whose <a name=...> anchors are turned
    into footnote floats inserted where the matching <a href="#..."> link
    was seen.
    """

    # Replacement text for named entities.  The accented letters are written
    # as escapes for single Latin-1 characters.
    # NOTE(review): assumes the generated LyX file is meant to be Latin-1
    # encoded; the original literals were lost to an encoding problem -
    # confirm.
    _ENTITY_CHARS = {
        "amp": "&",
        "gt": ">",
        "lt": "<",
        "aacute": "\xe1",
        "eacute": "\xe9",
        "iacute": "\xed",
        "oacute": "\xf3",
        "uacute": "\xfa",
        "ntilde": "\xf1",
    }

    # Replacement text for numeric character references (163 = pound sign).
    _CHARREF_CHARS = {
        "163": "\xa3",
    }

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Position (line, column) of the most recent event, and of the event
        # before it; the text between both is what process_buffer() returns.
        self.__pos = (1, 0)
        self.__last_pos = (1, 0)
        self.__datalines = []
        # Stack of the list types currently open ("List" for <ul>).
        self.__listtype = []
        # Maps a local anchor name to the buffer line where it was linked.
        self.__local_anchor_dict = {}
        # 0 outside the footnotes section, 1 right after the section starts,
        # and the anchor name of the footnote while inside one.
        self.__footnote = 0
        self.__buffer = Buffer(
            "#Created with html2lyx http://www.geocities.com/bj0v/html2lyx\n"
            "\\lyxformat 218\n"
            "\\textclass article\n"
            "\\language spanish\n"
            "\\inputencoding auto\n"
            "\\fontscheme pslatex\n"
            "\\graphics default\n"
            "\\paperfontsize default\n"
            "\\spacing single\n"
            "\\papersize Default\n"
            "\\paperpackage widemarginsa4\n"
            "\\use_geometry 0\n"
            "\\use_amsmath 0\n"
            "\\paperorientation portrait\n"
            "\\secnumdepth 3\n"
            "\\tocdepth 3\n"
            "\\paragraph_separation indent\n"
            "\\defskip medskip\n"
            "\\quotes_language english\n"
            "\\quotes_times 2\n"
            "\\papercolumns 1\n"
            "\\papersides 1\n"
            "\\paperpagestyle default\n"
            "\n"
        )

    def feed(self, data):
        """Keep the input split into lines, then parse it."""
        self.__datalines = data.splitlines()
        HTMLParser.HTMLParser.feed(self, data)

    def process_buffer(self, appended=""):
        """Return the text between the previous event and the current one.

        The raw text is cut from the stored input lines (lines of a
        multi-line span are joined with single spaces), everything up to and
        including the first ">" is dropped, *appended* is added at the end
        and surrounding whitespace is stripped.
        """
        begin = self.__last_pos    # start position
        end = self.__pos           # end position
        if begin[0] == end[0]:
            # Both events are on the same line.
            rawdata = self.__datalines[begin[0] - 1][begin[1]:end[1]]
        else:
            pieces = [self.__datalines[begin[0] - 1][begin[1]:]]
            pieces.extend(self.__datalines[begin[0]:end[0] - 1])
            pieces.append(self.__datalines[end[0] - 1][:end[1]])
            rawdata = " ".join(pieces)
        # Drop the leading tag BEFORE adding the replacement text; otherwise
        # an appended ">" (from &gt;) would be mistaken for the end of a tag
        # and the text preceding it would be thrown away.
        tag_end = rawdata.find(">") + 1
        return (rawdata[tag_end:] + appended).strip()

    def saveln(self, string):
        """Store *string* in the output buffer and return the buffer size.

        Inside the footnotes section the line is inserted at the position
        recorded for the current footnote anchor; text whose anchor was never
        linked from the document is discarded.
        """
        if self.__footnote:
            # __footnote is expected to hold the anchor name here.
            try:
                footline = self.__local_anchor_dict[self.__footnote]
            except KeyError:
                # No link pointed at this anchor (or no anchor seen yet).
                pass
            else:
                self.__buffer.insert(footline, string)
        else:
            self.__buffer.append(string)
        return self.__buffer.getsize()

    def print_buffer(self):
        """Print the whole output buffer."""
        self.__buffer.printall()

    def handle_starttag(self, tag, att):
        """Emit the LyX markup that corresponds to an opening tag."""
        if tag == "title":
            self.__pos = self.getpos()
            self.saveln("\\layout Title\n")
            return
        if tag == "body":
            self.__pos = self.getpos()
            self.saveln("\\layout Standard\n")
            return

        # Flush the text that precedes this tag.
        self.__last_pos = self.__pos
        self.__pos = self.getpos()
        self.saveln(self.process_buffer())

        if tag == "h1":
            self.saveln("\\layout Section\n")
        elif tag == "p":
            self.saveln("\\layout Standard\n")
        elif tag == "br":
            self.saveln("\\newline")
        elif tag == "img":
            self.saveln("\\layout Comment\n")
            src = ""
            for a in att:
                # A valueless attribute is reported as None; ignore it.
                if a[0] == "src" and a[1] is not None:
                    src = a[1]
            self.saveln("Imagen: " + src)
        elif tag == "b":
            self.saveln("\\series bold ")
        elif tag == "i":
            self.saveln("\\emph on ")
        elif tag == "ul":
            self.__listtype.append("List")
            if len(self.__listtype) > 1:
                self.saveln("\\begin_deeper")
        elif tag == "li":
            # Guard against an <li> that is not inside any list.
            if self.__listtype and self.__listtype[-1] == "List":
                self.saveln("\\layout Itemize")
        elif tag == "footnotes":
            self.__footnote = 1
        elif tag == "a":
            self._start_anchor(att)

    def _start_anchor(self, att):
        """Record local links and open a footnote float for named anchors."""
        for a in att:
            if a[0] == "href":
                href = a[1]
                # href may be None or empty; only local "#name" links count.
                if href and href[0] == "#":
                    footline = self.__buffer.getsize()
                    self.__local_anchor_dict[href[1:]] = footline
            elif a[0] == "name":
                if self.__footnote:
                    if self.__footnote != 1:
                        # Close the previous footnote before starting a new
                        # one.
                        self.saveln("\\end_float\n")
                    self.__footnote = a[1]
                    self.saveln("\\begin_float footnote")
                    self.saveln("\\layout Standard\n")

    def handle_endtag(self, tag):
        """Emit the LyX markup that corresponds to a closing tag.

        Raises FootnoteError when </body> is reached while the footnotes
        section has been opened but holds no footnote anchor.
        """
        if tag == "a":
            # Do not process the text inside the anchor.
            self.__pos = self.getpos()
            return

        self.__last_pos = self.__pos
        self.__pos = self.getpos()
        self.saveln(self.process_buffer())

        if tag == "body":
            if self.__footnote == 1:
                raise FootnoteError(self.__footnote, "\\body")
            self.saveln("\\the_end")
        elif tag == "b":
            self.saveln("\\series default")
        elif tag == "i":
            self.saveln("\\emph default")
        elif tag == "ul":
            if len(self.__listtype) > 1:
                self.saveln("\\end_deeper")
            # A stray </ul> must not crash on an empty stack.
            if self.__listtype:
                self.__listtype.pop()
        elif tag == "footnotes":
            self.saveln("\\end_float\n")
            self.__footnote = 0

    def handle_comment(self, data):
        """Flush the text that precedes a comment."""
        self.__last_pos = self.__pos
        self.__pos = self.getpos()
        self.saveln(self.process_buffer())

    def handle_entityref(self, name):
        """Flush pending text followed by the character for entity *name*.

        Entities that are not in the table contribute no character.
        """
        self.__last_pos = self.__pos
        self.__pos = self.getpos()
        self.saveln(self.process_buffer(self._ENTITY_CHARS.get(name, "")))
        # Skip "&" + name + ";" so the entity is not read again as text.
        self.movepos(len(name) + 2)

    def handle_charref(self, ref):
        """Flush pending text followed by the character for reference *ref*.

        References that are not in the table contribute no character.
        """
        self.__last_pos = self.__pos
        self.__pos = self.getpos()
        self.saveln(self.process_buffer(self._CHARREF_CHARS.get(ref, "")))
        # Skip "&#" + ref + ";" so the reference is not read again as text.
        self.movepos(len(ref) + 3)

    def movepos(self, n):
        """Advance the current position by *n* columns on the same line."""
        self.__pos = (self.__pos[0], self.__pos[1] + n)

# Script entry: convert the HTML file named on the command line and print
# the resulting LyX document to standard output.
try:
    htmlfile = sys.argv[1]
except IndexError:
    print("Please read documentation on usage")
    sys.exit(1)

# Read the whole input and make sure the file handle is released even if
# reading fails (try/finally keeps this compatible with old interpreters).
infile = open(htmlfile, "r")
try:
    data = infile.read()
finally:
    infile.close()

p = HTML2LyX()
p.feed(data)
p.print_buffer()
