使用 gzip 压缩输入流

2022-09-01 20:38:02

我想使用Gzip压缩在java中压缩输入流。

假设我们有一个未压缩的输入流(1GB 数据)。.因此,我想要一个来自源的压缩输入流:

public InputStream getCompressedStream(InputStream unCompressedStream) {

    // Not working because it's uncompressing the stream, I want the opposite.
    return new GZIPInputStream(unCompressedStream); 

}

答案 1

DeflaterInputStream不是你想要的,因为它缺少gzip header/trailer,并且使用稍微不同的压缩。

如果从 OutputStream(推送)更改为 InputStream(拉取),则需要执行不同的操作。

GzipOutputStream所做的是:

  • 编写静态 gzip 标头
  • 使用 DeflaterOutputStream 编写一个放气的流。写入流时,根据未压缩的数据构建CRC32校验和,并计算字节数
  • 编写包含 CRC32 校验和和字节数的尾部。

如果要对 InputStreams 执行相同的操作,则需要一个包含以下内容的流:

  • 页眉
  • 放气的内容
  • 拖车

执行此操作的最佳方法是提供3个不同的流并将它们合并为一个。幸运的是,有SequenceInputStream可以为您组合流。

这是我的实现加上一个简单的单元测试:

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.util.Enumeration;
import java.util.zip.CRC32;
import java.util.zip.Deflater;
import java.util.zip.DeflaterInputStream;
import java.util.zip.DeflaterOutputStream;

/**
 * @author mwyraz
 * Wraps an input stream and compresses it's contents. Similiar to DeflateInputStream but adds GZIP-header and trailer
 * See GzipOutputStream for details.
 * LICENSE: Free to use. Contains some lines from GzipOutputStream, so oracle's license might apply as well!
 */
public class GzipCompressingInputStream extends SequenceInputStream
{
    public GzipCompressingInputStream(InputStream in) throws IOException
    {
        this(in,512);
    }
    public GzipCompressingInputStream(InputStream in, int bufferSize) throws IOException
    {
        super(new StatefullGzipStreamEnumerator(in,bufferSize));
    }

    static enum StreamState
    {
        HEADER,
        CONTENT,
        TRAILER
    }

    protected static class StatefullGzipStreamEnumerator implements Enumeration<InputStream>
    {

        protected final InputStream in;
        protected final int bufferSize;
        protected StreamState state;

        public StatefullGzipStreamEnumerator(InputStream in, int bufferSize)
        {
            this.in=in;
            this.bufferSize=bufferSize;
            state=StreamState.HEADER;
        }

        public boolean hasMoreElements()
        {
            return state!=null;
        }
        public InputStream nextElement()
        {
            switch (state)
            {
                case HEADER:
                    state=StreamState.CONTENT;
                    return createHeaderStream();
                case CONTENT:
                    state=StreamState.TRAILER;
                    return createContentStream();
                case TRAILER:
                    state=null;
                    return createTrailerStream();
            }
            return null;
        }

        static final int GZIP_MAGIC = 0x8b1f;
        static final byte[] GZIP_HEADER=new byte[] {
                (byte) GZIP_MAGIC,        // Magic number (short)
                (byte)(GZIP_MAGIC >> 8),  // Magic number (short)
                Deflater.DEFLATED,        // Compression method (CM)
                0,                        // Flags (FLG)
                0,                        // Modification time MTIME (int)
                0,                        // Modification time MTIME (int)
                0,                        // Modification time MTIME (int)
                0,                        // Modification time MTIME (int)
                0,                        // Extra flags (XFLG)
                0                         // Operating system (OS)
        };
        protected InputStream createHeaderStream()
        {
            return new ByteArrayInputStream(GZIP_HEADER);
        }
        protected InternalGzipCompressingInputStream contentStream;
        protected InputStream createContentStream()
        {
            contentStream=new InternalGzipCompressingInputStream(new CRC32InputStream(in), bufferSize);
            return contentStream;
        }
        protected InputStream createTrailerStream()
        {
            return new ByteArrayInputStream(contentStream.createTrailer());
        }
    }

    /**
     * Internal stream without header/trailer  
     */
    protected static class CRC32InputStream extends FilterInputStream
    {
        protected CRC32 crc = new CRC32();
        protected long byteCount;
        public CRC32InputStream(InputStream in)
        {
            super(in);
        }

        @Override
        public int read() throws IOException
        {
            int val=super.read();
            if (val>=0)
            {
                crc.update(val);
                byteCount++;
            }
            return val;
        }
        @Override
        public int read(byte[] b, int off, int len) throws IOException
        {
            len=super.read(b, off, len);
            if (len>=0)
            {
                crc.update(b,off,len);
                byteCount+=len;
            }
            return len;
        }
        public long getCrcValue()
        {
            return crc.getValue();
        }
        public long getByteCount()
        {
            return byteCount;
        }
    }

    /**
     * Internal stream without header/trailer  
     */
    protected static class InternalGzipCompressingInputStream extends DeflaterInputStream
    {
        protected final CRC32InputStream crcIn;
        public InternalGzipCompressingInputStream(CRC32InputStream in, int bufferSize)
        {
            super(in, new Deflater(Deflater.DEFAULT_COMPRESSION, true),bufferSize);
            crcIn=in;
        }
        public void close() throws IOException
        {
            if (in != null)
            {
                try
                {
                    def.end();
                    in.close();
                }
                finally
                {
                    in = null;
                }
            }
        }

        protected final static int TRAILER_SIZE = 8;

        public byte[] createTrailer()
        {
            byte[] trailer= new byte[TRAILER_SIZE];
            writeTrailer(trailer, 0);
            return trailer;
        }

        /*
         * Writes GZIP member trailer to a byte array, starting at a given
         * offset.
         */
        private void writeTrailer(byte[] buf, int offset)
        {
            writeInt((int)crcIn.getCrcValue(), buf, offset); // CRC-32 of uncompr. data
            writeInt((int)crcIn.getByteCount(), buf, offset + 4); // Number of uncompr. bytes
        }

        /*
         * Writes integer in Intel byte order to a byte array, starting at a
         * given offset.
         */
        private void writeInt(int i, byte[] buf, int offset)
        {
            writeShort(i & 0xffff, buf, offset);
            writeShort((i >> 16) & 0xffff, buf, offset + 2);
        }

        /*
         * Writes short integer in Intel byte order to a byte array, starting
         * at a given offset
         */
        private void writeShort(int s, byte[] buf, int offset)
        {
            buf[offset] = (byte)(s & 0xff);
            buf[offset + 1] = (byte)((s >> 8) & 0xff);
        }
    }

}

import static org.junit.Assert.*;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.zip.CRC32;
import java.util.zip.GZIPInputStream;

import org.junit.Test;

public class TestGzipCompressingInputStream
{

    @Test
    public void test() throws Exception
    {
        testCompressor("test1 test2 test3");
        testCompressor("1MB binary data",createTestPattern(1024*1024));
        for (int i=0;i<4096;i++)
        {
            testCompressor(i+" bytes of binary data",createTestPattern(i));
        }
    }

    protected byte[] createTestPattern(int size)
    {
        byte[] data=new byte[size];
        byte pattern=0;
        for (int i=0;i<size;i++)
        {
            data[i]=pattern++;
        }
        return data;
    }

    protected void testCompressor(String data) throws IOException
    {
        testCompressor("String: "+data,data.getBytes());
    }
    protected void testCompressor(String dataInfo, byte[] data) throws IOException
    {
        InputStream uncompressedIn=new ByteArrayInputStream(data);
        InputStream compressedIn=new GzipCompressingInputStream(uncompressedIn);
        InputStream uncompressedOut=new GZIPInputStream(compressedIn);

        byte[] result=StreamHelper.readBinaryStream(uncompressedOut);

        assertTrue("Test failed for: "+dataInfo,Arrays.equals(data,result));

    }

}

答案 2

这是我写的一个版本,其中没有CRC / GZIP Magic cookie,因为它委托给GZIPOutputStream。它还具有内存效率,因为它仅使用足够的内存来缓冲压缩(42MB文件使用45k缓冲区)。性能与压缩到内存相同。

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Compresses an InputStream in a memory-optimal, on-demand way only compressing enough to fill a buffer.
 * 
 * @author Ben La Monica
 */
public class GZIPCompressingInputStream extends InputStream {

    private InputStream in;
    private GZIPOutputStream gz;
    private OutputStream delegate;
    private byte[] buf = new byte[8192];
    private byte[] readBuf = new byte[8192];
    int read = 0;
    int write = 0;

    public GZIPCompressingInputStream(InputStream in) throws IOException {
        this.in = in;
        this.delegate = new OutputStream() {

            private void growBufferIfNeeded(int len) {
                if ((write + len) >= buf.length) {
                    // grow the array if we don't have enough space to fulfill the incoming data
                    byte[] newbuf = new byte[(buf.length + len) * 2];
                    System.arraycopy(buf, 0, newbuf, 0, buf.length);
                    buf = newbuf;
                }
            }

            @Override
            public void write(byte[] b, int off, int len) throws IOException {
                growBufferIfNeeded(len);
                System.arraycopy(b, off, buf, write, len);
                write += len;
            }

            @Override
            public void write(int b) throws IOException {
                growBufferIfNeeded(1);
                buf[write++] = (byte) b;
            }
        };
        this.gz = new GZIPOutputStream(delegate); 
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        compressStream();
        int numBytes = Math.min(len, write-read);
        if (numBytes > 0) {
            System.arraycopy(buf, read, b, off, numBytes);
            read += numBytes;
        } else if (len > 0) {
            // if bytes were requested, but we have none, then we're at the end of the stream
            return -1;
        }
        return numBytes;
    }

    private void compressStream() throws IOException {
        // if the reader has caught up with the writer, then zero the positions out
        if (read == write) {
            read = 0;
            write = 0;
        }

        while (write == 0) {
            // feed the gzip stream data until it spits out a block
            int val = in.read(readBuf);
            if (val == -1) {
                // nothing left to do, we've hit the end of the stream. finalize and break out
                gz.close();
                break;
            } else if (val > 0) {
                gz.write(readBuf, 0, val);
            }
        }
    }

    @Override
    public int read() throws IOException {
        compressStream();
        if (write == 0) {
            // write should not be 0 if we were able to get data from compress stream, must mean we're at the end
            return -1;
        } else {
            // reading a single byte
            return buf[read++] & 0xFF;
        }
    }
}

推荐