"""Write a de-duplicated word list for every HTML file named in a list file.

Usage: pass the path of a text file containing one filename per line.  For
each entry ending in "htm" or "html" the markup is stripped, the text is
lower-cased, and the distinct words are written one per line to
"<filename>.out".  Entries of any other type are skipped.

This is Python 2 code (it relies on sgmllib and the unicode builtin).
"""
import re
import sys
import os
import string
from urllib import urlretrieve, urlcleanup
from sgmllib import SGMLParser
import formatter
import htmllib
import urllib
import urlparse
import tempfile

# NOTE(review): junk_words is never consulted anywhere in this file.  The
# original assigned ['windows', 'microsoft'] and immediately overwrote it with
# the placeholder below; presumably it was meant as an exclusion list.
junk_words = ['xxxxxx']

flags = re.IGNORECASE | re.UNICODE | re.MULTILINE
# A word is a run of \w characters delimited by whitespace (or the start/end
# of the text) on both sides.  Lookarounds are used instead of matching the
# delimiters so that the whitespace shared by two neighbouring words is not
# consumed by the first one, which would make the second one unmatchable.
reg = re.compile(r"(?<!\S)(\w+)(?!\S)", flags)

# <style> and <script> elements are removed together with their contents.
# DOTALL lets ".*?" run across line breaks inside the element.
_MARKUP_FLAGS = re.IGNORECASE | re.MULTILINE | re.DOTALL
_STYLE_RE = re.compile(r"<style\b.*?</style\s*>", _MARKUP_FLAGS)
_SCRIPT_RE = re.compile(r"<script\b.*?</script\s*>", _MARKUP_FLAGS)

_DIGITS = "0123456789"


# ------- html stripper: ----------------
class html_stripper(SGMLParser):
    """SGML parser that ignores tags and collects the character data."""

    def reset(self):
        """Empty the list of collected text and reset the base parser."""
        self.plaintext = []
        SGMLParser.reset(self)

    def handle_data(self, text):
        """Store one run of character data reported by the parser."""
        self.plaintext.append(text)

    def output(self):
        """Return the collected text runs joined by single spaces."""
        return " ".join(self.plaintext)


# ------- strip HTML tags: ----------------
def strip_html_tags(html):
    """Return the "plain" text of *html*.

    <style> and <script> elements are deleted including their contents, then
    the remaining markup is fed through html_stripper, which keeps only the
    character data.
    """
    # -- script and style removals
    html = _STYLE_RE.sub("", html)
    html = _SCRIPT_RE.sub("", html)

    # -- do regular tag ignoring
    strip = html_stripper()
    strip.feed(html)
    # close() forces the parser to process anything it is still buffering
    # (for example an unterminated construct at the very end of the input).
    strip.close()
    return strip.output()


def filter(infile):
    """Return *infile* unchanged; no filtering is implemented.

    NOTE(review): the name shadows the builtin filter(); it is kept because
    renaming it would change this module's interface.
    """
    return infile


def extension(s):
    """Return the index of the first "." in *s*, or -1 when there is none.

    NOTE(review): despite its name this returns a position, not the
    extension text.
    """
    return s.find(".")


def _unique_words(words):
    """Return the distinct usable words from *words* in first-seen order.

    Words shorter than three characters and words containing any of the
    digits 0-9 are dropped.
    """
    seen = set()
    kept = []
    for word in words:
        if len(word) < 3 or word in seen:
            continue
        # Evaluated per word: a digit in one word must not affect the rest.
        if any(digit in word for digit in _DIGITS):
            continue
        seen.add(word)
        kept.append(word)
    return kept


def main(argv):
    """Process every file named in the list file given as argv[1].

    Returns 0 on success and 2 when the list-file argument is missing.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s LISTFILE\n" % argv[0])
        return 2

    with open(argv[1]) as listing:
        filenames = listing.readlines()

    for filename in filenames:
        # Strip only the line terminator; slicing off the last character
        # would truncate a final entry that has no trailing newline.
        filename = filename.rstrip("\r\n")
        if not filename:
            continue

        if not filename.endswith(("htm", "html")):
            # Conversion of .doc / .txt (and anything else) is not
            # implemented, so there is no text to index for this entry.
            continue

        with open(filename) as source:
            html = source.read()

        # case to lower (input bytes are interpreted as latin-1)
        text = unicode(strip_html_tags(html), 'latin-1').lower()

        words = reg.findall(text)
        if not words:
            continue

        with open(filename + ".out", "w") as outfile:
            for word in _unique_words(words):
                # The words are unicode; encode them back to the input
                # encoding so non-ASCII words can be written to a byte file.
                outfile.write(word.encode('latin-1') + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))