...

Source file src/github.com/klauspost/compress/gzip/gunzip.go

Documentation: github.com/klauspost/compress/gzip

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package gzip implements reading and writing of gzip format compressed files,
     6  // as specified in RFC 1952.
     7  package gzip
     8  
     9  import (
    10  	"bufio"
    11  	"compress/gzip"
    12  	"encoding/binary"
    13  	"hash/crc32"
    14  	"io"
    15  	"time"
    16  
    17  	"github.com/klauspost/compress/flate"
    18  )
    19  
    20  const (
    21  	gzipID1     = 0x1f // first magic byte of a gzip member header
    22  	gzipID2     = 0x8b // second magic byte of a gzip member header
    23  	gzipDeflate = 8 // compression method byte; readHeader rejects any other value
    24  	flagText    = 1 << 0 // FLG bit 0; not consulted by readHeader
    25  	flagHdrCrc  = 1 << 1 // FLG bit 1: a 16-bit header CRC follows the optional fields
    26  	flagExtra   = 1 << 2 // FLG bit 2: a length-prefixed "extra" field is present
    27  	flagName    = 1 << 3 // FLG bit 3: a NUL-terminated file name is present
    28  	flagComment = 1 << 4 // FLG bit 4: a NUL-terminated comment is present
    29  )
    30  
    31  var (
    32  	// ErrChecksum is returned when reading GZIP data that has an invalid checksum.
    33  	// It is the same value as the standard library's compress/gzip.ErrChecksum.
    33  	ErrChecksum = gzip.ErrChecksum
    34  	// ErrHeader is returned when reading GZIP data that has an invalid header.
    35  	// It is the same value as the standard library's compress/gzip.ErrHeader.
    35  	ErrHeader = gzip.ErrHeader
    36  )
    37  
    38  var le = binary.LittleEndian // multi-byte header and trailer integers are decoded little-endian
    39  
    40  // noEOF converts io.EOF to io.ErrUnexpectedEOF.
    41  func noEOF(err error) error {
    42  	if err == io.EOF {
    43  		return io.ErrUnexpectedEOF
    44  	}
    45  	return err
    46  }
    47  
    48  // Header holds the metadata that a gzip file stores in its header.
    49  // That header is exposed as the fields of the Writer and Reader structs.
    50  //
    51  // Strings must be UTF-8 encoded and may only contain Unicode code points
    52  // U+0001 through U+00FF, due to limitations of the GZIP file format.
    53  type Header struct {
    54  	Comment string    // comment; read only when the header's comment flag is set
    55  	Extra   []byte    // "extra data"; read only when the header's extra flag is set
    56  	ModTime time.Time // modification time, decoded from a 32-bit Unix-seconds field
    57  	Name    string    // file name; read only when the header's name flag is set
    58  	OS      byte      // operating system type, copied verbatim from the header
    59  }
    60  
    61  // A Reader is an io.Reader that can be read to retrieve
    62  // uncompressed data from a gzip-format compressed file.
    63  //
    64  // In general, a gzip file can be a concatenation of gzip files,
    65  // each with its own header. Reads from the Reader
    66  // return the concatenation of the uncompressed data of each.
    67  // Only the first header is recorded in the Reader fields.
    68  //
    69  // Gzip files store a length and checksum of the uncompressed data.
    70  // The Reader will return an ErrChecksum when Read
    71  // reaches the end of the uncompressed data if it does not
    72  // have the expected length or checksum. Clients should treat data
    73  // returned by Read as tentative until they receive the io.EOF
    74  // marking the end of the data.
    75  type Reader struct {
    76  	Header       // valid after NewReader or Reader.Reset
    77  	r            flate.Reader // source of header, compressed data and trailer bytes
    78  	br           *bufio.Reader // wraps the input when it is not a flate.Reader; kept across Reset
    79  	decompressor io.ReadCloser // DEFLATE decoder reading from r; kept across Reset
    80  	digest       uint32 // CRC-32, IEEE polynomial (section 8)
    81  	size         uint32 // Uncompressed size (section 2.3.1)
    82  	buf          [512]byte // scratch space for header fields and the 8-byte trailer
    83  	err          error // sticky result: once non-nil, Read returns it without reading further
    84  	multistream  bool // whether another member is read after the current one ends
    85  }
    86  
    87  // NewReader creates a new Reader reading the given reader.
    88  // If r does not also implement io.ByteReader,
    89  // the decompressor may read more data than necessary from r.
    90  //
    91  // It is the caller's responsibility to call Close on the Reader when done.
    92  //
    93  // The Reader.Header fields will be valid in the Reader returned.
    94  func NewReader(r io.Reader) (*Reader, error) {
    95  	z := new(Reader)
    96  	if err := z.Reset(r); err != nil {
    97  		return nil, err
    98  	}
    99  	return z, nil
   100  }
   101  
   102  // Reset discards the Reader z's state and makes it equivalent to the
   103  // result of its original state from NewReader, but reading from r instead.
   104  // This permits reusing a Reader rather than allocating a new one.
   105  func (z *Reader) Reset(r io.Reader) error {
   106  	*z = Reader{
   107  		decompressor: z.decompressor,
   108  		multistream:  true,
   109  		br:           z.br,
   110  	}
   111  	if rr, ok := r.(flate.Reader); ok {
   112  		z.r = rr
   113  	} else {
   114  		// Reuse if we can.
   115  		if z.br != nil {
   116  			z.br.Reset(r)
   117  		} else {
   118  			z.br = bufio.NewReader(r)
   119  		}
   120  		z.r = z.br
   121  	}
   122  	z.Header, z.err = z.readHeader()
   123  	return z.err
   124  }
   125  
   126  // Multistream controls whether the reader supports multistream files.
   127  //
   128  // If enabled (the default), the Reader expects the input to be a sequence
   129  // of individually gzipped data streams, each with its own header and
   130  // trailer, ending at EOF. The effect is that the concatenation of a sequence
   131  // of gzipped files is treated as equivalent to the gzip of the concatenation
   132  // of the sequence. This is standard behavior for gzip readers.
   133  //
   134  // Calling Multistream(false) disables this behavior; disabling the behavior
   135  // can be useful when reading file formats that distinguish individual gzip
   136  // data streams or mix gzip data streams with other data streams.
   137  // In this mode, when the Reader reaches the end of the data stream,
   138  // Read returns io.EOF. If the underlying reader implements io.ByteReader,
   139  // it will be left positioned just after the gzip stream.
   140  // To start the next stream, call z.Reset(r) followed by z.Multistream(false).
   141  // If there is no next stream, z.Reset(r) will return io.EOF.
   142  func (z *Reader) Multistream(ok bool) {
   143  	z.multistream = ok
   144  }
   145  
   146  // readString reads a NUL-terminated string from z.r.
   147  // It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and
   148  // will output a string encoded using UTF-8.
   149  // This method always updates z.digest with the data read.
   150  func (z *Reader) readString() (string, error) {
   151  	var err error
   152  	needConv := false
   153  	for i := 0; ; i++ {
   154  		if i >= len(z.buf) {
   155  			return "", ErrHeader
   156  		}
   157  		z.buf[i], err = z.r.ReadByte()
   158  		if err != nil {
   159  			return "", err
   160  		}
   161  		if z.buf[i] > 0x7f {
   162  			needConv = true
   163  		}
   164  		if z.buf[i] == 0 {
   165  			// Digest covers the NUL terminator.
   166  			z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1])
   167  
   168  			// Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1).
   169  			if needConv {
   170  				s := make([]rune, 0, i)
   171  				for _, v := range z.buf[:i] {
   172  					s = append(s, rune(v))
   173  				}
   174  				return string(s), nil
   175  			}
   176  			return string(z.buf[:i]), nil
   177  		}
   178  	}
   179  }
   180  
   181  // readHeader reads the GZIP header according to section 2.3.1.
   182  // This method does not set z.err.
   183  func (z *Reader) readHeader() (hdr Header, err error) {
   184  	if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil {
   185  		// RFC 1952, section 2.2, says the following:
   186  		//	A gzip file consists of a series of "members" (compressed data sets).
   187  		//
   188  		// Other than this, the specification does not clarify whether a
   189  		// "series" is defined as "one or more" or "zero or more". To err on the
   190  		// side of caution, Go interprets this to mean "zero or more".
   191  		// Thus, it is okay to return io.EOF here.
   192  		return hdr, err
   193  	}
   194  	if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate {
   195  		return hdr, ErrHeader
   196  	}
   197  	flg := z.buf[3]
   198  	hdr.ModTime = time.Unix(int64(le.Uint32(z.buf[4:8])), 0)
   199  	// z.buf[8] is XFL and is currently ignored.
   200  	hdr.OS = z.buf[9]
   201  	z.digest = crc32.ChecksumIEEE(z.buf[:10])
   202  
   203  	if flg&flagExtra != 0 {
   204  		if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
   205  			return hdr, noEOF(err)
   206  		}
   207  		z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2])
   208  		data := make([]byte, le.Uint16(z.buf[:2]))
   209  		if _, err = io.ReadFull(z.r, data); err != nil {
   210  			return hdr, noEOF(err)
   211  		}
   212  		z.digest = crc32.Update(z.digest, crc32.IEEETable, data)
   213  		hdr.Extra = data
   214  	}
   215  
   216  	var s string
   217  	if flg&flagName != 0 {
   218  		if s, err = z.readString(); err != nil {
   219  			return hdr, err
   220  		}
   221  		hdr.Name = s
   222  	}
   223  
   224  	if flg&flagComment != 0 {
   225  		if s, err = z.readString(); err != nil {
   226  			return hdr, err
   227  		}
   228  		hdr.Comment = s
   229  	}
   230  
   231  	if flg&flagHdrCrc != 0 {
   232  		if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
   233  			return hdr, noEOF(err)
   234  		}
   235  		digest := le.Uint16(z.buf[:2])
   236  		if digest != uint16(z.digest) {
   237  			return hdr, ErrHeader
   238  		}
   239  	}
   240  
   241  	// Reserved FLG bits must be zero.
   242  	if flg>>5 != 0 {
   243  		return hdr, ErrHeader
   244  	}
   245  
   246  	z.digest = 0
   247  	if z.decompressor == nil {
   248  		z.decompressor = flate.NewReader(z.r)
   249  	} else {
   250  		z.decompressor.(flate.Resetter).Reset(z.r, nil)
   251  	}
   252  	return hdr, nil
   253  }
   254  
   255  // Read implements io.Reader, returning uncompressed bytes from the gzip stream.
        //
        // When a member's compressed data ends, Read checks the trailer (CRC-32 and
        // size) against the data it returned and, in multistream mode, continues
        // with the next member's header. The terminal error, including io.EOF, is
        // kept in z.err and returned by every later call.
   256  func (z *Reader) Read(p []byte) (n int, err error) {
   257  	if z.err != nil {
   258  		return 0, z.err
   259  	}
   260  
   261  	for n == 0 { // keep going across member boundaries until data or an error turns up
   262  		n, z.err = z.decompressor.Read(p)
   263  		z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n])
   264  		z.size += uint32(n) // wraps at 32 bits, like the 32-bit size field it is compared with
   265  		if z.err != io.EOF {
   266  			// In the normal case we return here.
   267  			return n, z.err
   268  		}
   269  
   270  		// Finished file; check checksum and size.
   271  		if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil {
   272  			z.err = noEOF(err) // a missing or short trailer means the stream was truncated
   273  			return n, z.err
   274  		}
   275  		digest := le.Uint32(z.buf[:4])
   276  		size := le.Uint32(z.buf[4:8])
   277  		if digest != z.digest || size != z.size {
   278  			z.err = ErrChecksum
   279  			return n, z.err
   280  		}
   281  		z.digest, z.size = 0, 0
   282  
   283  		// File is ok; check if there is another.
   284  		if !z.multistream {
   285  			return n, io.EOF // z.err still holds io.EOF, so later calls return it too
   286  		}
   287  		z.err = nil // Remove io.EOF
   288  
   289  		if _, z.err = z.readHeader(); z.err != nil { // later headers are validated but their fields discarded
   290  			return n, z.err
   291  		}
   292  	}
   293  
   294  	return n, nil
   295  }
   296  
   297  type crcer interface {
   298  	io.Writer
   299  	Sum32() uint32
   300  	Reset()
   301  }
   302  type crcUpdater struct {
   303  	z *Reader
   304  }
   305  
   306  func (c *crcUpdater) Write(p []byte) (int, error) {
   307  	c.z.digest = crc32.Update(c.z.digest, crc32.IEEETable, p)
   308  	return len(p), nil
   309  }
   310  
   311  func (c *crcUpdater) Sum32() uint32 {
   312  	return c.z.digest
   313  }
   314  
   315  func (c *crcUpdater) Reset() {
   316  	c.z.digest = 0
   317  }
   318  
   319  // WriteTo implements io.WriterTo for io.Copy and friends.
        //
        // It streams the uncompressed data of the current member, and in multistream
        // mode of every following member, into w, checking each member's trailer
        // (CRC-32 and size) as it goes. It returns the number of uncompressed bytes
        // written; reaching the end of the input is reported as a nil error.
   320  func (z *Reader) WriteTo(w io.Writer) (int64, error) {
   321  	total := int64(0)
   322  	crcWriter := crcer(crc32.NewIEEE())
   323  	if z.digest != 0 { // data already hashed (presumably by an earlier Read): keep accumulating into z.digest
   324  		crcWriter = &crcUpdater{z: z}
   325  	}
   326  	for {
   327  		if z.err != nil {
   328  			if z.err == io.EOF {
   329  				return total, nil
   330  			}
   331  			return total, z.err
   332  		}
   333  
   334  		// We write both to output and digest.
   335  		mw := io.MultiWriter(w, crcWriter)
   336  		n, err := z.decompressor.(io.WriterTo).WriteTo(mw)
   337  		total += n
   338  		z.size += uint32(n) // wraps at 32 bits, like the 32-bit size field it is compared with
   339  		if err != nil {
   340  			z.err = err
   341  			return total, z.err
   342  		}
   343  
   344  		// Finished file; check checksum + size.
   345  		if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil {
   346  			if err == io.EOF { // a missing trailer means the stream was truncated
   347  				err = io.ErrUnexpectedEOF
   348  			}
   349  			z.err = err
   350  			return total, err
   351  		}
   352  		z.digest = crcWriter.Sum32()
   353  		digest := le.Uint32(z.buf[:4])
   354  		size := le.Uint32(z.buf[4:8])
   355  		if digest != z.digest || size != z.size {
   356  			z.err = ErrChecksum
   357  			return total, z.err
   358  		}
   359  		z.digest, z.size = 0, 0
   360  
   361  		// File is ok; check if there is another.
   362  		if !z.multistream {
   363  			return total, nil
   364  		}
   365  		crcWriter.Reset()
   366  		z.err = nil // Remove io.EOF
   367  
   368  		if _, z.err = z.readHeader(); z.err != nil {
   369  			if z.err == io.EOF { // no further member: the input ended cleanly
   370  				return total, nil
   371  			}
   372  			return total, z.err
   373  		}
   374  	}
   375  }
   376  
   377  // Close closes the Reader. It does not close the underlying io.Reader.
   378  // In order for the GZIP checksum to be verified, the reader must be
   379  // fully consumed until the io.EOF.
   380  func (z *Reader) Close() error { return z.decompressor.Close() }
   381  

View as plain text