#!/usr/bin/python
USAGE = ''' 
about: 
    recovers text strings from binary dump 
    (e.g. hard-disk without partition tables)

usage: 
    text_identify.py filename

todo:
    command line args
    add more character sets
    optimize for arbitrary char-sets: unicode, numbers, languages...

12/2003
'''


import re, string, time, os, sys

# When true, recovered text is echoed to stdout as it is written and a sample
# of rejected characters is printed at the end of the run.
DEBUG = 0
# Number of bytes read and classified per step (512 noted as an alternative).
CHUNK_SIZE = 160 #512 (in bytes)
# Not used by the visible code: its only reference is a commented-out check
# in the main loop.
BYTES_TO_PROCESS = 0
# Byte offset in the input file at which scanning starts (passed to seek()).
START_FROM = 0
#CHAR_SIZE = 1   #(in bytes)
# Fraction of a chunk's characters that may fall outside the accepted set
# before the chunk is rejected (0.15 == 15%).
MAX_BAD_CHARACTERS_PERCENT = 0.15
# Minimum fraction of a chunk that must be ' ' characters. This marks 'good'
# text with separated words; without it, runs such as "SDAFSDFsdafwaeSEF"
# would be accepted.
MIN_WHITE_SPACE_PERCENT = 0.1

# Named sets of characters accepted as text. The visible script only uses
# 'polish1' + 'hebrew1'; the other entries are defined but not referenced.
# NOTE(review): the non-ASCII literals below look like text decoded with the
# wrong codec, and the file carries no source-encoding declaration -- confirm
# the intended encoding before relying on these sets.
TEXT_CHARS = {'english' : '''ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789\n\r\t!?(),. '"-%:/&'''}
TEXT_CHARS['numeric'] = '''01234567890-+*/#$%()'''
# 'polish1' extends the 'english' set, so 'english' must be defined first.
TEXT_CHARS['polish1'] = '''テ剩延審骊绸鬁€熆''' + TEXT_CHARS['english']
TEXT_CHARS['hebrew1'] = '''噌忏溴骁栝腱铕耱赧鼬'''
TEXT_CHARS['hebrew1_nikud'] = '''谅媚牌侨已商'''
# Unlike the entries above, this one is a list of strings rather than one string.
TEXT_CHARS['hebrew_utf8'] = ['讗', '讘', '讙', '讚', '讛', '讜', '讝', '讞', '讟', '讬', '讻', '诇', '诪', '谞', '住', '注', '驻', '爪', '拽', '专', '砖', '转']

# Sample (at most 15 entries) of characters rejected by compare_chunk();
# printed at the end of the run when DEBUG is set.
debug_bad_chars = []


def compare_chunk(chunk, chars):
    """Decide whether `chunk` looks like readable text.

    chunk -- sequence of characters to classify.
    chars -- container of the characters that count as text.

    Returns False as soon as the number of characters not found in `chars`
    exceeds MAX_BAD_CHARACTERS_PERCENT of the chunk length, or when fewer
    than MIN_WHITE_SPACE_PERCENT of the chunk are ' ' characters; returns
    True otherwise.

    Side effect: the first rejected characters encountered (up to 15 over
    the whole run) are collected in the module-level `debug_bad_chars`.
    """
    total = len(chunk)
    bad_limit = MAX_BAD_CHARACTERS_PERCENT * total
    spaces_needed = MIN_WHITE_SPACE_PERCENT * total

    rejected = 0
    blanks = 0
    for ch in chunk:
        if ch not in chars:
            # keep a small sample of offenders for the debug report
            if len(debug_bad_chars) < 15:
                debug_bad_chars.append(ch)
            rejected += 1
            if rejected > bad_limit:
                return False
        if ch == ' ':
            blanks += 1

    # enough separators means the chunk reads as words, not a random run
    if blanks >= spaces_needed:
        return True
    return False


def append_output(chunk, address, contiguous):
    """Append one recovered text chunk to the output file.

    chunk      -- the text to write.
    address    -- byte offset of the chunk in the input file; written as a
                  header line before the chunk when it starts a new run.
    contiguous -- true when the previous chunk was written as well, in
                  which case the chunk is appended without a header.

    The destination path is read from the module-level `outfile`. When the
    module-level DEBUG flag is set, the same text is echoed to stdout.
    """
    if contiguous:
        # continuation of the previous block: no address header
        s = chunk
    else:
        # start of a new block: prefix it with its address
        s = "\n%d:\n%s" % (address, chunk)

    if DEBUG:
        # write() rather than a trailing-comma print, so the echo matches
        # the file content exactly (no soft space between chunks)
        sys.stdout.write(s)

    # close the handle deterministically instead of relying on garbage
    # collection of the anonymous file object
    with open(outfile, 'a') as out:
        out.write(s)
        

#def split_to_newlines(fname, how_many_newlines)
#    open
#    return new_s
    

def print_output():
    """Placeholder for a batched output stage; currently does nothing.

    Earlier commented-out pseudo-code sketched the idea: depending on an
    OUTPUT_METHOD setting ('screen' or 'file'), walk the collected pairs
    and emit each address followed by its chunk. None of that was ever
    implemented; output is written incrementally by append_output().
    """
    return None


def main(argv=None):
    """Scan a binary file chunk by chunk and dump the text-like chunks.

    argv -- argument list in sys.argv layout (program name first); defaults
            to sys.argv.

    The input file named by argv[1] is read CHUNK_SIZE bytes at a time,
    starting at offset START_FROM. Every chunk accepted by compare_chunk()
    is appended to "<input>.out.txt" via append_output(); the first chunk
    of each run of accepted chunks is preceded by its byte address.

    Returns the process exit status: 0 on success, 2 when no input file
    name was supplied (the usage text is printed to stderr in that case).
    """
    # append_output() reads the destination path from this module global.
    global outfile

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        # previously an IndexError traceback; show the help text instead
        sys.stderr.write(USAGE)
        return 2

    filename = argv[1]
    outfile = filename + ".out.txt"
    text_chars_to_recognize = TEXT_CHARS['polish1'] + TEXT_CHARS['hebrew1']

    # Open the input first so that a bad path fails before an output file
    # is created; 'with' closes both handles even if an error occurs.
    with open(filename, 'rb') as f:
        with open(outfile, 'w') as header:
            header.write("text dump from file:" + filename + "\n\n")

        f.seek(START_FROM)
        address = START_FROM
        # Nothing has been written yet, so the first accepted chunk starts
        # a new run and must get its address header.
        contiguous = False

        while True:
            chunk = f.read(CHUNK_SIZE)
            if not chunk:
                break    # end-of-file
            if compare_chunk(chunk, text_chars_to_recognize):
                append_output(chunk, address, contiguous)
                contiguous = True
            else:
                contiguous = False
            address += CHUNK_SIZE

    if DEBUG:
        # single-argument print(...) behaves the same as a statement or a call
        print("outfile:  %s" % outfile)
        print("\n\n bad chars:")
        for c in debug_bad_chars:
            print("%s %d %s" % (hex(ord(c)), ord(c), c))

    return 0


if __name__ == "__main__":
    sys.exit(main())