#!/usr/bin/python
# -*- coding: utf-8 -*-
# Recover runs of human-readable text from a raw binary dump.
#
# The input file is read in CHUNK_SIZE-byte chunks.  A chunk is kept when
# most of its bytes belong to the selected character set and enough of them
# are spaces; kept chunks are appended to "<input>.out.txt".

USAGE = '''
about:
    recovers text strings from binary dump
    (e.g. hard-disk without partition tables)
usage:
    text_identify.py filename
todo:
    command line args
    add more character sets
    optimize for arbitrary char-sets: unicode, numbers, languages...
12/2003
'''

import re
import string
import time
import os
import sys

DEBUG = 0
CHUNK_SIZE = 160  # 512 (in bytes)
BYTES_TO_PROCESS = 0  # currently unused: the limit check in main() is disabled
START_FROM = 0  # byte offset in the input file at which scanning starts
MAX_BAD_CHARACTERS_PERCENT = 0.15  # non-text chars allowed: %15
# used to indicate a 'good' text with separated words. otherwise, might
# return "SDAFSDFsdafwaeSEF", and so on.
MIN_WHITE_SPACE_PERCENT = 0.1

# NOTE(review): the non-ASCII literals below look mis-encoded in this copy of
# the file (presumably they were single-byte code-page characters).  They are
# kept verbatim; verify them against the original file.  As written they
# contain characters above U+00FF, which can never equal a single decoded
# byte on Python 3, so only the 'english' part of 'polish1' matches there.
TEXT_CHARS = {'english' : '''ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789\n\r\t!?(),. '"-%:/&'''}
TEXT_CHARS['numeric'] = '''01234567890-+*/#$%()'''
TEXT_CHARS['polish1'] = '''テ剩延審骊绸鬁熆''' + TEXT_CHARS['english']
TEXT_CHARS['hebrew1'] = '''噌忏溴骁栝腱铕耱赧鼬'''
TEXT_CHARS['hebrew1_nikud'] = '''谅媚牌侨已商'''
TEXT_CHARS['hebrew_utf8'] = ['讗', '讘', '讙', '讚', '讛', '讜', '讝', '讞', '讟', '讬', '讻', '诇', '诪', '谞', '住', '注', '驻', '爪', '拽', '专', '砖', '转']

# First few rejected characters seen by compare_chunk(); dumped when DEBUG.
debug_bad_chars = []

# Path of the output file.  Set by main(); read by append_output().
outfile = None

# Unused placeholder kept for the unimplemented print_output().
output = []


def _as_text(data):
    """Return *data* as a native str holding exactly one character per byte.

    Python 3 ``bytes`` are decoded as latin-1, which maps every byte value to
    the code point with the same number.  Anything else (including a
    Python 2 ``str``, which already is a byte string) is returned unchanged.
    """
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode('latin-1')
    return data


def _as_bytes(data):
    """Return *data* as a byte string; the inverse of _as_text().

    Raises UnicodeEncodeError for text containing characters above U+00FF.
    """
    if isinstance(data, bytes):
        return data
    return data.encode('latin-1')


def compare_chunk(chunk, chars):
    """Decide whether *chunk* looks like text made of *chars*.

    chunk -- the data to classify: a byte string as returned by a binary
             read, or a native str.
    chars -- the accepted characters: any container supporting ``in``
             (a str, a list, a set, ...).

    Returns False as soon as more than MAX_BAD_CHARACTERS_PERCENT of the
    chunk's characters are outside *chars*, or when fewer than
    MIN_WHITE_SPACE_PERCENT of them are spaces; True otherwise.  An empty
    chunk therefore yields True.

    Side effect: the first 15 rejected characters seen across all calls are
    collected in the module-level ``debug_bad_chars`` list.
    """
    # Work on one-character strings so that the same membership test works
    # whether the chunk came from a Python 2 or a Python 3 binary read.
    chunk = _as_text(chunk)
    chars = _as_text(chars)

    length = len(chunk)
    max_bad_chars = MAX_BAD_CHARACTERS_PERCENT * length
    min_white_spaces = MIN_WHITE_SPACE_PERCENT * length

    bad_counter = 0
    for c in chunk:
        if c not in chars:
            if len(debug_bad_chars) < 15:
                debug_bad_chars.append(c)
            bad_counter += 1
            if bad_counter > max_bad_chars:
                return False

    return chunk.count(' ') >= min_white_spaces


def append_output(chunk, address, contiguous):
    """Append one recognised text chunk to the output file.

    chunk      -- the chunk data (byte string or native str).
    address    -- offset of the chunk in the input file.
    contiguous -- true when the previous chunk was written too; the address
                  header is then omitted so consecutive chunks join up.

    The file named by the module-level ``outfile`` is opened in binary
    append mode so the recovered bytes are written exactly as read.
    """
    data = _as_bytes(chunk)
    if not contiguous:
        # ok, print address here
        data = _as_bytes("\n%d:\n" % address) + data
    if DEBUG:
        sys.stdout.write(_as_text(data))
    with open(outfile, 'ab') as out:
        out.write(data)


def print_output():
    """Placeholder; not implemented.

    The original sketch intended to print (address, chunk) pairs either to
    the screen or to a file, selected by an OUTPUT_METHOD setting.
    """
    pass


def main(argv=None):
    """Scan the file named by the first argument and write its text chunks.

    argv -- list of command-line arguments without the program name;
            defaults to ``sys.argv[1:]``.  Arguments after the first are
            ignored.

    Returns the process exit status: 0 on success, 2 when no file name was
    given (USAGE is then written to stderr).
    """
    global outfile

    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        sys.stderr.write(USAGE)
        return 2

    filename = argv[0]
    outfile = filename + ".out.txt"
    with open(outfile, 'w') as out:
        out.write("text dump from file:" + filename + "\n\n")

    # Built once so that each byte is checked with a hash lookup instead of
    # a scan through the whole character string.
    text_chars_to_recognize = frozenset(
        _as_text(TEXT_CHARS['polish1'] + TEXT_CHARS['hebrew1']))

    # Starts true, so a text chunk at the very start of the scan is written
    # without an address header.
    contiguous = True
    address = START_FROM
    with open(filename, 'rb') as f:
        f.seek(START_FROM)
        while True:
            chunk = f.read(CHUNK_SIZE)
            if not chunk:
                break  # end-of-file
            if compare_chunk(chunk, text_chars_to_recognize):
                append_output(chunk, address, contiguous)
                contiguous = True
            else:
                contiguous = False
            address += CHUNK_SIZE

    if DEBUG:
        print("outfile:  " + outfile)
        print("\n\n bad chars:")
        for c in debug_bad_chars:
            print("%s %d %s" % (hex(ord(c)), ord(c), c))
    return 0


if __name__ == '__main__':
    sys.exit(main())