diff options
-rw-r--r-- | fileset-src.txt | 1 | ||||
-rw-r--r-- | src/org/tukaani/xz/LZMAInputStream.java | 568 | ||||
-rw-r--r-- | src/org/tukaani/xz/lzma/LZMADecoder.java | 14 |
3 files changed, 583 insertions, 0 deletions
diff --git a/fileset-src.txt b/fileset-src.txt index 29a9bdf..fad687e 100644 --- a/fileset-src.txt +++ b/fileset-src.txt @@ -33,6 +33,7 @@ src/org/tukaani/xz/LZMA2Encoder.java src/org/tukaani/xz/LZMA2InputStream.java src/org/tukaani/xz/LZMA2Options.java src/org/tukaani/xz/LZMA2OutputStream.java +src/org/tukaani/xz/LZMAInputStream.java src/org/tukaani/xz/MemoryLimitException.java src/org/tukaani/xz/PowerPCOptions.java src/org/tukaani/xz/RawCoder.java diff --git a/src/org/tukaani/xz/LZMAInputStream.java b/src/org/tukaani/xz/LZMAInputStream.java new file mode 100644 index 0000000..f03fc7f --- /dev/null +++ b/src/org/tukaani/xz/LZMAInputStream.java @@ -0,0 +1,568 @@ +/* + * LZMAInputStream + * + * Authors: Lasse Collin <lasse.collin@tukaani.org> + * Igor Pavlov <http://7-zip.org/> + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +package org.tukaani.xz; + +import java.io.InputStream; +import java.io.DataInputStream; +import java.io.IOException; +import org.tukaani.xz.lz.LZDecoder; +import org.tukaani.xz.rangecoder.RangeDecoderFromStream; +import org.tukaani.xz.lzma.LZMADecoder; + +/** + * Decompresses legacy .lzma files and raw LZMA streams (no .lzma header). + * <p> + * <b>IMPORTANT:</b> In contrast to other classes in this package, this class + * reads data from its input stream one byte at a time. If the input stream + * is for example {@link java.io.FileInputStream}, wrapping it into + * {@link java.io.BufferedInputStream} tends to improve performance a lot. + * This is not automatically done by this class because there may be use + * cases where it is desired that this class won't read any bytes past + * the end of the LZMA stream. + * <p> + * Even when using <code>BufferedInputStream</code>, the performance tends + * to be worse (maybe 10-20 % slower) than with {@link LZMA2InputStream} + * or {@link XZInputStream} (when the .xz file contains LZMA2-compressed data). 
+ * + * @since 1.4 + */ +public class LZMAInputStream extends InputStream { + /** + * Largest dictionary size supported by this implementation. + * <p> + * LZMA allows dictionaries up to one byte less than 4 GiB. This + * implementation supports only 16 bytes less than 2 GiB. This + * limitation is due to Java using signed 32-bit integers for array + * indexing. The limitation shouldn't matter much in practice since such + * huge dictionaries are not normally used. + */ + public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15; + + private InputStream in; + private LZDecoder lz; + private RangeDecoderFromStream rc; + private LZMADecoder lzma; + + private boolean endReached = false; + + /** + * Number of uncompressed bytes left to be decompressed, or -1 if + * the end marker is used. + */ + private long remainingSize; + + private IOException exception = null; + + /** + * Gets approximate decompressor memory requirements as kibibytes for + * the given dictionary size and LZMA properties byte (lc, lp, and pb). 
+ * + * @param dictSize LZMA dictionary size as bytes, should be + * in the range [<code>0</code>, + * <code>DICT_SIZE_MAX</code>] + * + * @param propsByte LZMA properties byte that encodes the values + * of lc, lp, and pb + * + * @return approximate memory requirements as kibibytes (KiB) + * + * @throws UnsupportedOptionsException + * if <code>dictSize</code> is outside + * the range [<code>0</code>, + * <code>DICT_SIZE_MAX</code>] + * + * @throws CorruptedInputException + * if <code>propsByte</code> is invalid + */ + public static int getMemoryUsage(int dictSize, byte propsByte) + throws UnsupportedOptionsException, CorruptedInputException { + if (dictSize < 0 || dictSize > DICT_SIZE_MAX) + throw new UnsupportedOptionsException( + "LZMA dictionary is too big for this implementation"); + + int props = propsByte & 0xFF; + if (props > (4 * 5 + 4) * 9 + 8) + throw new CorruptedInputException("Invalid LZMA properties byte"); + + props %= 9 * 5; + int lp = props / 9; + int lc = props - lp * 9; + + return getMemoryUsage(dictSize, lc, lp); + } + + /** + * Gets approximate decompressor memory requirements as kibibytes for + * the given dictionary size, lc, and lp. Note that pb isn't needed. + * + * @param dictSize LZMA dictionary size as bytes, must be + * in the range [<code>0</code>, + * <code>DICT_SIZE_MAX</code>] + * + * @param lc number of literal context bits, must be + * in the range [0, 8] + * + * @param lp number of literal position bits, must be + * in the range [0, 4] + * + * @return approximate memory requirements as kibibytes (KiB) + */ + public static int getMemoryUsage(int dictSize, int lc, int lp) { + if (lc < 0 || lc > 8 || lp < 0 || lp > 4) + throw new IllegalArgumentException("Invalid lc or lp"); + + // Probability variables have the type "short". There are + // 0x300 (768) probability variables in each literal subcoder. + // The number of literal subcoders is 2^(lc + lp). 
+ // + // Roughly 10 KiB for the base state + LZ decoder's dictionary buffer + // + sizeof(short) * number of probability variables per literal subcoder + // * number of literal subcoders + return 10 + getDictSize(dictSize) / 1024 + + ((2 * 0x300) << (lc + lp)) / 1024; + } + + private static int getDictSize(int dictSize) { + if (dictSize < 0 || dictSize > DICT_SIZE_MAX) + throw new IllegalArgumentException( + "LZMA dictionary is too big for this implementation"); + + // For performance reasons, use a 4 KiB dictionary if something + // smaller was requested. It's a rare situation and the performance + // difference isn't huge, and it starts to matter mostly when the + // dictionary is just a few bytes. But we need to handle the special + // case of dictSize == 0 anyway, which is an allowed value but in + // practice means a one-byte dictionary. + // + // Note that using a dictionary bigger than specified in the headers + // can hide errors if there is a reference to data beyond the original + // dictionary size but is still within 4 KiB. + if (dictSize < 4096) + dictSize = 4096; + + // Round dictionary size upward to a multiple of 16. This way LZMA + // can use LZDecoder.getPos() for calculating LZMA's posMask. + return (dictSize + 15) & ~15; + } + + /** + * Creates a new .lzma file format decompressor without + * a memory usage limit. 
+ * + * @param in input stream from which .lzma data is read; + * it might be a good idea to wrap it in + * <code>BufferedInputStream</code>, see the + * note at the top of this page + * + * @throws CorruptedInputException + * file is corrupt or perhaps not in + * the .lzma format at all + * + * @throws UnsupportedOptionsException + * dictionary size or uncompressed size is too + * big for this implementation + * + * @throws EOFException + * file is truncated or perhaps not in + * the .lzma format at all + * + * @throws IOException may be thrown by <code>in</code> + */ + public LZMAInputStream(InputStream in) throws IOException { + this(in, -1); + } + + /** + * Creates a new .lzma file format decompressor with an optional + * memory usage limit. + * + * @param in input stream from which .lzma data is read; + * it might be a good idea to wrap it in + * <code>BufferedInputStream</code>, see the + * note at the top of this page + * + * @param memoryLimit memory usage limit in kibibytes (KiB) + * or <code>-1</code> to impose no + * memory usage limit + * + * @throws CorruptedInputException + * file is corrupt or perhaps not in + * the .lzma format at all + * + * @throws UnsupportedOptionsException + * dictionary size or uncompressed size is too + * big for this implementation + * + * @throws MemoryLimitException + * memory usage limit was exceeded + * + * @throws EOFException + * file is truncated or perhaps not in + * the .lzma format at all + * + * @throws IOException may be thrown by <code>in</code> + */ + public LZMAInputStream(InputStream in, int memoryLimit) + throws IOException { + DataInputStream inData = new DataInputStream(in); + + // Properties byte (lc, lp, and pb) + byte propsByte = inData.readByte(); + + // Dictionary size is an unsigned 32-bit little endian integer. + int dictSize = 0; + for (int i = 0; i < 4; ++i) + dictSize |= inData.readUnsignedByte() << (8 * i); + + // Uncompressed size is an unsigned 64-bit little endian integer. 
+ // The maximum 64-bit value is a special case (becomes -1 here) + // which indicates that the end marker is used instead of knowing + // the uncompressed size beforehand. + long uncompSize = 0; + for (int i = 0; i < 8; ++i) + uncompSize |= (long)inData.readUnsignedByte() << (8 * i); + + // Check the memory usage limit. + int memoryNeeded = getMemoryUsage(dictSize, propsByte); + if (memoryLimit != -1 && memoryNeeded > memoryLimit) + throw new MemoryLimitException(memoryNeeded, memoryLimit); + + initialize(in, uncompSize, propsByte, dictSize, null); + } + + /** + * Creates a new input stream that decompresses raw LZMA data (no .lzma + * header) from <code>in</code>. + * <p> + * The caller needs to know if the "end of payload marker (EOPM)" alias + * "end of stream marker (EOS marker)" alias "end marker" is present. + * If the end marker isn't used, the caller must know the exact + * uncompressed size of the stream. + * <p> + * The caller also needs to provide the LZMA properties byte that encodes + * the number of literal context bits (lc), literal position bits (lp), + * and position bits (pb). + * <p> + * The dictionary size used when compressing is also needed. Specifying + * a too small dictionary size will prevent decompressing the stream. + * Specifying a too big dictionary is a waste of memory but decompression + * will work. + * <p> + * There is no need to specify a dictionary bigger than + * the uncompressed size of the data even if a bigger dictionary + * was used when compressing. If you know the uncompressed size + * of the data, this might allow saving some memory. 
+ * + * @param in input stream from which compressed + * data is read + * + * @param uncompSize uncompressed size of the LZMA stream or -1 + * if the end marker is used in the LZMA stream + * + * @param propsByte LZMA properties byte that has the encoded + * values for literal context bits (lc), literal + * position bits (lp), and position bits (pb) + * + * @param dictSize dictionary size as bytes, must be in the range + * [<code>0</code>, <code>DICT_SIZE_MAX</code>] + * + * @throws CorruptedInputException + * if <code>propsByte</code> is invalid or + * the first input byte is not 0x00 + * + * @throws UnsupportedOptionsException + * dictionary size or uncompressed size is too + * big for this implementation + * + * + */ + public LZMAInputStream(InputStream in, long uncompSize, byte propsByte, + int dictSize) throws IOException { + initialize(in, uncompSize, propsByte, dictSize, null); + } + + /** + * Creates a new input stream that decompresses raw LZMA data (no .lzma + * header) from <code>in</code> optionally with a preset dictionary. 
+ * + * @param in input stream from which LZMA-compressed + * data is read + * + * @param uncompSize uncompressed size of the LZMA stream or -1 + * if the end marker is used in the LZMA stream + * + * @param propsByte LZMA properties byte that has the encoded + * values for literal context bits (lc), literal + * position bits (lp), and position bits (pb) + * + * @param dictSize dictionary size as bytes, must be in the range + * [<code>0</code>, <code>DICT_SIZE_MAX</code>] + * + * @param presetDict preset dictionary or <code>null</code> + * to use no preset dictionary + * + * @throws CorruptedInputException + * if <code>propsByte</code> is invalid or + * the first input byte is not 0x00 + * + * @throws UnsupportedOptionsException + * dictionary size or uncompressed size is too + * big for this implementation + * + * @throws EOFException file is truncated or corrupt + * + * @throws IOException may be thrown by <code>in</code> + */ + public LZMAInputStream(InputStream in, long uncompSize, byte propsByte, + int dictSize, byte[] presetDict) + throws IOException { + initialize(in, uncompSize, propsByte, dictSize, presetDict); + } + + /** + * Creates a new input stream that decompresses raw LZMA data (no .lzma + * header) from <code>in</code> optionally with a preset dictionary. 
+ * + * @param in input stream from which LZMA-compressed + * data is read + * + * @param uncompSize uncompressed size of the LZMA stream or -1 + * if the end marker is used in the LZMA stream + * + * @param lc number of literal context bits, must be + * in the range [0, 8] + * + * @param lp number of literal position bits, must be + * in the range [0, 4] + * + * @param pb number of position bits, must be + * in the range [0, 4] + * + * @param dictSize dictionary size as bytes, must be in the range + * [<code>0</code>, <code>DICT_SIZE_MAX</code>] + * + * @param presetDict preset dictionary or <code>null</code> + * to use no preset dictionary + * + * @throws CorruptedInputException + * if the first input byte is not 0x00 + * + * @throws EOFException file is truncated or corrupt + * + * @throws IOException may be thrown by <code>in</code> + */ + public LZMAInputStream(InputStream in, long uncompSize, + int lc, int lp, int pb, + int dictSize, byte[] presetDict) + throws IOException { + initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict); + } + + private void initialize(InputStream in, long uncompSize, byte propsByte, + int dictSize, byte[] presetDict) + throws IOException { + // Validate the uncompressed size since the other "initialize" throws + // IllegalArgumentException if uncompSize < -1. + if (uncompSize < -1) + throw new UnsupportedOptionsException( + "Uncompressed size is too big"); + + // Decode the properties byte. In contrast to LZMA2, there is no + // limit of lc + lp <= 4. + int props = propsByte & 0xFF; + if (props > (4 * 5 + 4) * 9 + 8) + throw new CorruptedInputException("Invalid LZMA properties byte"); + + int pb = props / (9 * 5); + props -= pb * 9 * 5; + int lp = props / 9; + int lc = props - lp * 9; + + // Validate the dictionary size since the other "initialize" throws + // IllegalArgumentException if dictSize is not supported. 
+ if (dictSize < 0 || dictSize > DICT_SIZE_MAX) + throw new UnsupportedOptionsException( + "LZMA dictionary is too big for this implementation"); + + initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict); + } + + private void initialize(InputStream in, long uncompSize, + int lc, int lp, int pb, + int dictSize, byte[] presetDict) + throws IOException { + // getDictSize validates dictSize and gives a message in + // the exception too, so skip validating dictSize here. + if (uncompSize < -1 || lc < 0 || lc > 8 || lp < 0 || lp > 4 + || pb < 0 || pb > 4) + throw new IllegalArgumentException(); + + this.in = in; + + // If uncompressed size is known, use it to avoid wasting memory for + // a uselessly large dictionary buffer. + dictSize = getDictSize(dictSize); + if (uncompSize >= 0 && dictSize > uncompSize) + dictSize = getDictSize((int)uncompSize); + + lz = new LZDecoder(getDictSize(dictSize), presetDict); + rc = new RangeDecoderFromStream(in); + lzma = new LZMADecoder(lz, rc, lc, lp, pb); + remainingSize = uncompSize; + } + + /** + * Decompresses the next byte from this input stream. + * <p> + * Reading lots of data with <code>read()</code> from this input stream + * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code> + * if you need to read lots of data one byte at a time. + * + * @return the next decompressed byte, or <code>-1</code> + * to indicate the end of the compressed stream + * + * @throws CorruptedInputException + * + * @throws XZIOException if the stream has been closed + * + * @throws EOFException + * compressed input is truncated or corrupt + * + * @throws IOException may be thrown by <code>in</code> + */ + public int read() throws IOException { + byte[] buf = new byte[1]; + return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); + } + + /** + * Decompresses into an array of bytes. + * <p> + * If <code>len</code> is zero, no bytes are read and <code>0</code> + * is returned. 
Otherwise this will block until <code>len</code> + * bytes have been decompressed, the end of the LZMA stream is reached, + * or an exception is thrown. + * + * @param buf target buffer for uncompressed data + * @param off start offset in <code>buf</code> + * @param len maximum number of uncompressed bytes to read + * + * @return number of bytes read, or <code>-1</code> to indicate + * the end of the compressed stream + * + * @throws CorruptedInputException + * + * @throws XZIOException if the stream has been closed + * + * @throws EOFException compressed input is truncated or corrupt + * + * @throws IOException may be thrown by <code>in</code> + */ + public int read(byte[] buf, int off, int len) throws IOException { + if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) + throw new IndexOutOfBoundsException(); + + if (len == 0) + return 0; + + if (in == null) + throw new XZIOException("Stream closed"); + + if (exception != null) + throw exception; + + if (endReached) + return -1; + + try { + int size = 0; + + while (len > 0) { + // If uncompressed size is known and thus no end marker will + // be present, set the limit so that the uncompressed size + // won't be exceeded. + int copySizeMax = len; + if (remainingSize >= 0 && remainingSize < len) + copySizeMax = (int)remainingSize; + + lz.setLimit(copySizeMax); + + // Decode into the dictionary buffer. + try { + lzma.decode(); + } catch (CorruptedInputException e) { + // The end marker is encoded with a LZMA symbol that + // indicates maximum match distance. This is larger + // than any supported dictionary and thus causes + // CorruptedInputException from LZDecoder.repeat. + if (remainingSize != -1 || !lzma.endMarkerDetected()) + throw e; + + endReached = true; + + // The exception makes lzma.decode() miss the last range + // decoder normalization, so do it here. This might + // cause an IOException if it needs to read a byte + // from the input stream. 
+ rc.normalize(); + } + + // Copy from the dictionary to buf. + int copiedSize = lz.flush(buf, off); + off += copiedSize; + len -= copiedSize; + size += copiedSize; + + if (remainingSize >= 0) { + // Update the number of bytes left to be decompressed. + remainingSize -= copiedSize; + assert remainingSize >= 0; + + if (remainingSize == 0) + endReached = true; + } + + if (endReached) { + // Checking these helps a lot when catching corrupt + // or truncated .lzma files. LZMA Utils doesn't do + // the first check and thus it accepts many invalid + // files that this implementation and XZ Utils don't. + if (!rc.isFinished() || lz.hasPending()) + throw new CorruptedInputException(); + + return size == 0 ? -1 : size; + } + } + + return size; + + } catch (IOException e) { + exception = e; + throw e; + } + } + + /** + * Closes the stream and calls <code>in.close()</code>. + * If the stream was already closed, this does nothing. + * + * @throws IOException if thrown by <code>in.close()</code> + */ + public void close() throws IOException { + if (in != null) { + try { + in.close(); + } finally { + in = null; + } + } + } +} diff --git a/src/org/tukaani/xz/lzma/LZMADecoder.java b/src/org/tukaani/xz/lzma/LZMADecoder.java index 3409389..8eb4ac2 100644 --- a/src/org/tukaani/xz/lzma/LZMADecoder.java +++ b/src/org/tukaani/xz/lzma/LZMADecoder.java @@ -36,6 +36,16 @@ public final class LZMADecoder extends LZMACoder { repLenDecoder.reset(); } + /** + * Returns true if LZMA end marker was detected. It is encoded as + * the maximum match distance which with signed ints becomes -1. This + * function is needed only for LZMA1. LZMA2 doesn't use the end marker + * in the LZMA layer. + */ + public boolean endMarkerDetected() { + return reps[0] == -1; + } + public void decode() throws IOException { lz.repeatPending(); @@ -48,6 +58,10 @@ public final class LZMADecoder extends LZMACoder { int len = rc.decodeBit(isRep, state.get()) == 0 ? 
decodeMatch(posState) : decodeRepMatch(posState); + + // NOTE: With LZMA1 streams that have the end marker, + // this will throw CorruptedInputException. LZMAInputStream + // handles it specially. lz.repeat(reps[0], len); } } |