/* "LZW.java" */ import java.io.*; /** The Limpel, Ziv, Welch (LZW) algorithm compresses any sequence of byte data. * The resulting compressed codes are larger than bytes, * but can each represent a group of bytes, * at least some of the time, * for what is usually an overall saving. *

* LZW compression recognizes sequences that have already appeared in the data, * and replaces each later appearance by a unique code. * The degree of compression depends on how much repetition is found in the data. * LZW compression is effective with data having many and / or lengthy * repeat sequences, * such as text, * and graphics with areas of solid color or patterns. *

* The sequence history should allow for the expected amount of repetition, * without wasting too much space in each code on a history index. * This version is fixed at 12 bit codes, * which can refer to nearly a 2^^12 item sequence history, * less some internal overhead. * This size is considered good for short message blocks, * and is simple to implement. *

* Note! * Other versions of LZW may use other sizes of sequence history, * and other coding conventions. * They are not inter-operable with this algorithm. *

* The original papers: *

Terry Welch, * A Technique for High-Performance Data Compression, * IEEE Computer, June 1984 *
J. Ziv and A. Lempel, * A Universal Algorithm for Sequential Data Compression, * IEEE Transactions on Information Theory, May 1977 *

* This and many similar algorithms have been patented: *

One form of the original LZ78 algorithm was patented (4,464,650) * by its authors Lempel, Ziv, Cohn and Eastman. This patent is owned by Unisys. *
The LZW algorithm used in 'compress' is patented, * both by IBM (4,814,746) and Unisys (4,558,302). *

* * This Java was ported from C code in the copyrighted article, * "LZW Data Compression" by Mark Nelson, * in Dr. Dobb's Journal * October, 1989. *

* There is no need to construct an LZW object, * the LZW.Compress () and LZW.Expand () methods are static. */ public class LZW { /** Note some Java types differ from C types: *

* byte (8), short (16), int (32) are signed, there is no unsigned. * therefore positive (unsigned) values 0-255 need short not byte, * at least while we are actively using them. * OK to --store-- them in byte with casting and masking. *

* char (16) not (8) to support Unicode, * so char is not the same as byte (8). * neither is it the same as short (16). *

* int InputStream.read () * Reads the next byte of data from this input stream. * The value byte is returned as an int in the range 0 to 255. * If no byte is available because the end of the stream has been reached, * the value -1 is returned. * This method blocks until input data is available, * the end of the stream is detected, * or an exception is thrown. */ /* SHIFTCOUNT = BITS-8 used for hashing (why this number?) * MAXVALUE largest below 2^^BITS is reserved as a sentinel. * TABLESIZE must be prime larger than MAXVALUE for hashing to work. * With MAXVALUE = 4094 the next prime is 5021 */ private static final int SHIFTCOUNT = 4, MAXVALUE = (1 << 12) - 1, MAXCODE = MAXVALUE - 1, TABLESIZE = 5021; /** Compress a sequence of raw data bytes. * @param inRaw source of data bytes to be compressed, * e.g. ByteArrayInputStream or BufferedInputStream. * @param outComp packer for destination for compressed data, * either to a stream or an array. * @return number of raw data bytes compressed. * @exception IOException from either underlying stream. */ public static int Compress (InputStream inRaw, CodeOutputPacker outComp) throws IOException { int inctr = 0; /* codes 0-255 represent themselves as possible byte values */ int nextCode = 256; LZWHistoryItem [] history = new LZWHistoryItem [TABLESIZE]; /* get the first data byte then loop for the rest of them. * This first data byte will be saved as Prefix, * and the second data byte as Append, * for nextCode=256. */ /* The value byte is returned as an int in the range 0 to 255. * It does --not-- have to be masked. */ int oneCode = inRaw.read (); inctr++; int oneByte; while (-1 != (oneByte = inRaw.read ())) { inctr++; /* see if the table already has this prefix//char pair */ int index = FindMatch (history, oneCode, oneByte); /* yes at this index, * get the code for it as the --next-- prefix, * so we can try to append the next byte */ if (history [index] != null) oneCode = history [index].Code; /* no, * output the code for the last data string, * and start trying to grow onto the new one. * * this table index is vacant. * The hash table always has some extra room, * because MAXCODE < TABLESIZE. * * if not already at MAXCODE, * add prefix//char pair as nextCode number, * otherwise we cannot save anymore. */ else { outComp.putN (oneCode); if (nextCode <= MAXCODE) history [index] = new LZWHistoryItem (nextCode++, oneCode, oneByte); oneCode = oneByte; } } /* Output last byte directly as a code < 256, * plus an end of buffer sentinel MAXVALUE code, * and flush in case we are on odd half. */ outComp.putN (oneCode); outComp.putN (MAXVALUE); outComp.flush (); return inctr; } /** Compress a sequence of raw data bytes. * @param inRaw array of data bytes to be compressed, * @param outComp packer for destination for compressed data, * either to a stream or an array. * @return number of raw data bytes compressed. * @exception IOException from underlying stream. */ public static int Compress (byte [] inRaw, CodeOutputPacker outComp) throws IOException { return Compress (inRaw, inRaw.length, outComp); } /** Compress a sequence of raw data bytes. * @param inRaw array of data bytes to be compressed, * @param validCount how much of inRaw array to be compressed, * @param outComp packer for destination for compressed data, * either to a stream or an array. * @return number of raw data bytes compressed. * @exception IOException from underlying stream. */ public static int Compress (byte [] inRaw, int validCount, CodeOutputPacker outComp) throws IOException { int inctr = 0; /* codes 0-255 represent themselves as possible byte values */ int nextCode = 256; LZWHistoryItem [] history = new LZWHistoryItem [TABLESIZE]; /* get the first data byte then loop for the rest of them. * This first data byte will be saved as Prefix, * and the second data byte as Append, * for nextCode=256. */ /* The value byte is cosidered signed and gets extended. * It --does-- have to be masked to be considered 0-255. */ int oneCode = inRaw [inctr++]; oneCode &= 0x00ff; // System.out.println ("first="+(char)oneCode); int oneByte; while ((inctr < inRaw.length) && (inctr < validCount)) { oneByte = (byte)inRaw [inctr++]; oneByte &= 0x00ff; // System.out.println ("="+(char)oneByte); /* see if the table already has this prefix//char pair */ int index = FindMatch (history, oneCode, oneByte); /* yes at this index, * get the code for it as the --next-- prefix */ if (history [index] != null) oneCode = history [index].Code; /* no, * output the code for the last data string, * and start trying to grow onto the new one. * * this table index is vacant. * The hash table always has some extra room, * because MAXCODE < TABLESIZE. * * if not already at MAXCODE, * add prefix//char pair as nextCode number, * otherwise we cannot save anymore. */ else { outComp.putN (oneCode); if (nextCode <= MAXCODE) history [index] = new LZWHistoryItem (nextCode++, oneCode, oneByte); oneCode = oneByte; } } /* Output last byte directly as a code < 256, * plus an end of buffer sentinel MAXVALUE code, * and flush in case we are on odd half. */ outComp.putN (oneCode); outComp.putN (MAXVALUE); outComp.flush (); return inctr; } /** Try to find a match for aPrefix//aCharacter, * return index of the match if found, * or a vacant slot. * Always possible to find a vacant slot, * because MAXCODE < TABLESIZE. */ /////// aPrefix 12-bits could use short /////// aCharacter only needs 0-255 unsigned private static int FindMatch (LZWHistoryItem [] history, int aPrefix, int aCharacter) { int offset; /* Hash index (somehow) based on both prefix and new char */ int index = (aCharacter << SHIFTCOUNT) ^ aPrefix; if (index == 0) offset = 1; else offset = TABLESIZE - index; while (true) { /* If either a vacant slot, * or a match (on both parts), * return the index. */ if ((history [index] == null) || ((history [index].Prefix == aPrefix) && (history [index].Append == aCharacter))) return index; /* Collision, try again... * Because TABLESIZE is prime we try everything. */ index -= offset; if (index < 0) index += TABLESIZE; } } /** Expand Compressed codes back to raw data. * This will fail if input was not LZW compressed data. * @param inComp unpacker from source. * @param outExp destination for decoded bytes. * @return count of decoded raw data bytes. * @exception IOException from either underlying stream. * @exception IllegalArgumentException if input was not LZW compressed data. */ public static int Expand (CodeInputUnpacker inComp, OutputStream outExp) throws IOException, IllegalArgumentException { int outctr = 0; int nPushed; LZWByteStack charStack = new LZWByteStack (); /* Will fill in parallel with Compress table, * so that each Code has same meaning in both. */ LZWHistoryItem [] history = new LZWHistoryItem [TABLESIZE]; /* codes 0-255 already represent themselves as possible byte values */ int nextCode = 256; /* get first input code. * Remember 12 bit codes overlap bytes, * so every 2 calls will input 3 bytes. */ int oldCode = inComp.getN (); /* first input code must be a direct byte value 0-255, * (or we have the wrong input!) */ if (oldCode > 255) throw new IllegalArgumentException ("Not LZW compressed data"); /* so move it to output buffer. * Also save it for the table. */ int aChar = oldCode; outExp.write (aChar); outctr++; /* loop while more input, * final code will be sentinel. * surely will fail along the way if not LZW compressed data. */ try { int newCode; while (MAXVALUE != (newCode = inComp.getN ())) { /* To check for the special STRING+CHARACTER+STRING+CHARACTER+STRING case, * if code value larger than yet seen (they increase), * we know it is not in the table, * so get expansion of the old code, * and append this code to it as a character, * by pushing it first onto the stack. */ if (newCode >= nextCode) { charStack.push (aChar); nPushed = 1 + DecodeString (history, charStack, oldCode); } /* newCode is already known in the table so expand it, * or it is an actual byte value < 256 */ else nPushed = DecodeString (history, charStack, newCode); /* This top one will go into history table as the nextCode */ outExp.write (aChar = charStack.pop ()); outctr++; /* The rest (if any more) are output in proper order */ while (1 < nPushed--) { outExp.write (charStack.pop ()); outctr++; } /* Append to history table if we have room, * otherwise a missed chance for better compression, * but nothing will be lost, * because Compressor made the same decision. */ if (nextCode <= MAXCODE) { history [nextCode] = new LZWHistoryItem (nextCode, oldCode, aChar); nextCode++; oldCode = newCode; } } } catch (IOException eio) { throw eio; } catch (Exception e) { e.printStackTrace (); throw new IllegalArgumentException ("Not LZW compressed data"); } return outctr; } /** Expand Compressed codes back to raw data. * This will fail if input was not LZW compressed data. * @param inComp unpacker from source. * @param outExp destination for decoded bytes. * @return count of decoded raw data bytes. * @exception IOException from either underlying stream. * @exception IllegalArgumentException if input was not LZW compressed data. */ public static int Expand (CodeInputUnpacker inComp, byte [] outExp) throws IOException, IllegalArgumentException { int outctr = 0; int nPushed; LZWByteStack charStack = new LZWByteStack (); /* Will fill in parallel with Compress table, * so that each Code has same meaning in both. */ LZWHistoryItem [] history = new LZWHistoryItem [TABLESIZE]; /* codes 0-255 already represent themselves as possible byte values */ int nextCode = 256; /* get first input code. * Remember 12 bit codes overlap bytes, * so every 2 calls will input 3 bytes. */ int oldCode = inComp.getN (); /* first input code must be a direct byte value 0-255, * (or we have the wrong input!) */ if (oldCode > 255) throw new IllegalArgumentException ("Not LZW compressed data"); /* so move it to output buffer. * Also save it for the table. */ int aChar = oldCode; outExp [outctr++] = (byte)aChar; /* loop while more input, * final code will be sentinel. * surely will fail along the way if not LZW compressed data. */ try { int newCode; while (MAXVALUE != (newCode = inComp.getN ())) { /* To check for the special STRING+CHARACTER+STRING+CHARACTER+STRING case, * if code value larger than yet seen (they increase), * we know it is not in the table, * so get expansion of the old code, * and append this code to it as a character, * by pushing it first onto the stack. */ if (newCode >= nextCode) { charStack.push (aChar); nPushed = 1 + DecodeString (history, charStack, oldCode); } /* newCode is already known in the table so expand it, * or it is an actual byte value < 256 */ else nPushed = DecodeString (history, charStack, newCode); /* This top one will go into history table as the nextCode */ outExp [outctr++] = (byte)(aChar = charStack.pop ()); /* The rest (if any more) are output in proper order */ while (1 < nPushed--) outExp [outctr++] = (byte)charStack.pop (); /* Append to history table if we have room, * otherwise a missed chance for better compression, * but nothing will be lost, * because Compressor made the same decision. */ if (nextCode <= MAXCODE) { history [nextCode] = new LZWHistoryItem (nextCode, oldCode, aChar); nextCode++; oldCode = newCode; } } } catch (IOException eio) { throw eio; } catch (Exception e) { throw new IllegalArgumentException ("Not LZW compressed data"); } return outctr; } private static int DecodeString (LZWHistoryItem [] history, LZWByteStack cStack, int code) { int ii = 1; /* codes 0-255 stand for themselves as actual byte values, * higher ones are indices of table entries. * add their Append values to the cStack, * running down Prefix chain until an actual byte value */ while (code > 255) { cStack.push (history [code].Append); code = history [code].Prefix; ii++; } /* finally add actual byte value. * We have added (ii) values in reverse order, * so our caller will have to unstack them. */ cStack.push ((byte)code); return ii; } /** For test and demonstration. * From a built-in string and from any named files. * Test files may be text or binary, * but be prepared for the binary output. */ public static void main (String [] argv) { int nComp, nExp; try { ByteArrayInputStream bais = new ByteArrayInputStream (("1 12 123\n" +"a ab abc abcd abcde abcdef\n" +"b bc bcd bcde bcdef\n" +"a ab abc abcd abcde abcdef\n" +"b bc bcd bcde bcdef\n" +"c cd cde cdef").getBytes ()); CodeOutputPacker baos = new CodeOutputPacker (5000); nComp = LZW.Compress (bais, baos); nExp = LZW.Expand (new CodeInputUnpacker (baos.toByteArray ()), System.out); System.out.println ("\nOK "+nComp+" -> "+nExp); } catch (Exception ex) { ex.printStackTrace (); } System.out.println ("\n-------"); try { byte [] rawbytes = ("1 12 123\n" +"a ab abc abcd abcde abcdef\n" +"b bc bcd bcde bcdef\n" +"a ab abc abcd abcde abcdef\n" +"b bc bcd bcde bcdef\n" +"c cd cde cdef").getBytes (); CodeOutputPacker baos = new CodeOutputPacker (5000); nComp = LZW.Compress (rawbytes, baos); byte [] packedbytes = baos.toByteArray (); nExp = LZW.Expand (new CodeInputUnpacker (packedbytes), System.out); System.out.println ("\nOK "+nComp+" -> "+nExp); } catch (Exception ex) { ex.printStackTrace (); } System.out.println ("\n-------"); for (int argc = 0; argc < argv.length; argc++) { try { BufferedInputStream bis = new BufferedInputStream (new FileInputStream (new File (argv [argc]))); CodeOutputPacker baos = new CodeOutputPacker (250000); nComp = LZW.Compress (bis, baos); nExp = LZW.Expand (new CodeInputUnpacker (baos.toByteArray ()), System.out); System.out.println ("\nOK:"+argv [argc]+" "+nComp+" -> "+nExp); } catch (Exception ex) { ex.printStackTrace (); } System.out.println ("\n-------"); } } } /* Table items */ class LZWHistoryItem { int Code = -1; int Prefix = 0; int Append = 0; public LZWHistoryItem () { } // Code and Prefix 12-bits could use short // Append only needs 0-255 unsigned public LZWHistoryItem (int Code, int Prefix, int Append) { //System.out.println ("new LZWHistoryItem "+Code+" "+Prefix+"="+(char)Append); this.Code = Code; this.Prefix = Prefix; this.Append = Append; } } /* Note that java.util.stack handles Objects not native types, * so it is neither convenient nor efficient here. */ class LZWByteStack { // byte is enough for 0-255 problem is getting unsigned private int [] ints; private int ctr = 0; public LZWByteStack () { this (4000); } public LZWByteStack (int size) { ints = new int [size]; } public void push (int bv) { ints [ctr++] = bv; } public int pop () { return ints [--ctr]; } } /* */ ///////////////////////////////////////////////////////