Source file src/github.com/klauspost/compress/s2/s2.go

Documentation: github.com/klauspost/compress/s2

// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package s2 implements the S2 compression format.
//
// S2 is an extension of Snappy. Like Snappy, S2 is aimed at high throughput,
// which is why it features concurrent compression for bigger payloads.
//
// Decoding is compatible with Snappy compressed content,
// but content compressed with S2 cannot be decompressed by Snappy.
//
// For more information on Snappy/S2 differences, see the README at: https://github.com/klauspost/compress/tree/master/s2
//
// There are actually two S2 formats: block and stream. They are related,
// but different: trying to decompress block-compressed data as an S2 stream
// will fail, and vice versa. The block format is the Decode and Encode
// functions and the stream format is the Reader and Writer types.
//
// A "better" compression option is available. This will trade some compression
// speed for a slightly better compression ratio.
//
// The block format, the more common case, is used when the complete size (the
// number of bytes) of the original data is known upfront, at the time
// compression starts. The stream format, also known as the framing format, is
// for when that isn't always true.
//
// Blocks do not offer much data protection, so it is up to you to
// add data validation of decompressed blocks.
//
// Streams perform CRC validation of the decompressed data.
// Stream compression will also be performed on multiple CPU cores concurrently,
// significantly improving throughput.
package s2

import (
	"bytes"
	"hash/crc32"

	"github.com/klauspost/compress/internal/race"
)

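// exampleBlockVsStream is a sketch for illustration only; it is not used
// anywhere in the package. It contrasts the block API (Encode/Decode, whole
// input known up front) with the framed stream API (Writer, which emits
// CRC-protected chunks and can compress concurrently). It relies only on the
// package's exported entry points declared in other files (Encode, Decode,
// EncodeBetter, NewWriter) plus the "bytes" import above; the payload is made up.
func exampleBlockVsStream() (block, stream []byte, err error) {
	src := bytes.Repeat([]byte("compressible payload "), 64)

	// Block format: the complete input is available, so Encode/Decode apply.
	block = Encode(nil, src)
	if _, err = Decode(nil, block); err != nil {
		return nil, nil, err
	}
	// EncodeBetter trades some speed for a slightly better ratio.
	_ = EncodeBetter(nil, src)

	// Stream (framing) format: incremental writes, checksummed chunks.
	var framed bytes.Buffer
	w := NewWriter(&framed)
	if _, err = w.Write(src); err != nil {
		return nil, nil, err
	}
	if err = w.Close(); err != nil {
		return nil, nil, err
	}
	return block, framed.Bytes(), nil
}
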
/*
Each encoded block begins with the varint-encoded length of the decoded data,
followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
first byte of each chunk is broken into its 2 least and 6 most significant bits
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
Zero means a literal tag. All other values mean a copy tag.

For literal tags:
  - If m < 60, the next 1 + m bytes are literal bytes.
  - Otherwise, let n be the little-endian unsigned integer denoted by the next
    m - 59 bytes. The next 1 + n bytes after that are literal bytes.

For copy tags, length bytes are copied from offset bytes ago, in the style of
Lempel-Ziv compression algorithms. In particular:
  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
    of the offset. The next byte is bits 0-7 of the offset.
  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
    The length is 1 + m. The offset is the little-endian unsigned integer
    denoted by the next 2 bytes.
  - For l == 3, the offset ranges in [0, 1<<32) and the length in
    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
    integer denoted by the next 4 bytes.
*/
const (
	tagLiteral = 0x00
	tagCopy1   = 0x01
	tagCopy2   = 0x02
	tagCopy4   = 0x03
)

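// illustrativeCopy1 is a sketch for illustration only; it is not used by the
// package's encoders (those live in the encode*.go files). It makes the
// tagCopy1 layout from the comment above concrete: a copy with length in
// [4, 12) and offset below 1<<11 encodes as two bytes. The tag byte carries
// the tag in bits 0-1, length-4 in bits 2-4 and offset bits 8-10 in bits 5-7;
// the second byte holds offset bits 0-7.
func illustrativeCopy1(dst []byte, offset, length int) int {
	dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
	dst[1] = uint8(offset)
	return 2
}
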
const (
	checksumSize     = 4
	chunkHeaderSize  = 4
	magicChunk       = "\xff\x06\x00\x00" + magicBody
	magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
	magicBodySnappy  = "sNaPpY"
	magicBody        = "S2sTwO"

	// maxBlockSize is the maximum size of the input to encodeBlock.
	//
	// For the framing format (Writer type instead of Encode function),
	// this is the maximum uncompressed size of a block.
	maxBlockSize = 4 << 20

	// minBlockSize is the minimum block size accepted when creating a writer.
	minBlockSize = 4 << 10

	skippableFrameHeader = 4
	maxChunkSize         = 1<<24 - 1 // 16777215

	// Default block size
	defaultBlockSize = 1 << 20

	// maxSnappyBlockSize is the maximum snappy block size.
	maxSnappyBlockSize = 1 << 16

	obufHeaderLen = checksumSize + chunkHeaderSize
)

const (
	chunkTypeCompressedData   = 0x00
	chunkTypeUncompressedData = 0x01
	ChunkTypeIndex            = 0x99
	chunkTypePadding          = 0xfe
	chunkTypeStreamIdentifier = 0xff
)

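// illustrativeChunkHeader is a sketch for illustration only; it is not used by
// the package's writers. In the framing format every chunk starts with a
// 4-byte header (chunkHeaderSize): the chunk type followed by the chunk length
// as a 24-bit little-endian integer, which is why maxChunkSize is 1<<24 - 1.
// Compressed and uncompressed data chunks then begin with a 4-byte masked CRC
// (checksumSize) of the uncompressed data.
func illustrativeChunkHeader(chunkType uint8, chunkLen int) [chunkHeaderSize]byte {
	return [chunkHeaderSize]byte{
		chunkType,
		uint8(chunkLen),
		uint8(chunkLen >> 8),
		uint8(chunkLen >> 16),
	}
}
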
var crcTable = crc32.MakeTable(crc32.Castagnoli)

// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
func crc(b []byte) uint32 {
	race.ReadSlice(b)

	c := crc32.Update(0, crcTable, b)
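	// Mask the CRC as required by the framing format: rotate right by 15 bits
	// and add a constant. Since | and + share precedence in Go and associate
	// left to right, the expression below is ((c>>15) | (c<<17)) + 0xa282ead8.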
	return c>>15 | c<<17 + 0xa282ead8
}

// literalExtraSize returns the extra size (the tag byte plus any length bytes)
// of encoding n literals.
// n should be >= 0 and <= math.MaxUint32.
func literalExtraSize(n int64) int64 {
	if n == 0 {
		return 0
	}
	switch {
	case n < 60:
		return 1
	case n < 1<<8:
		return 2
	case n < 1<<16:
		return 3
	case n < 1<<24:
		return 4
	default:
		return 5
	}
}

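// illustrativeLiteralHeader is a sketch for illustration only; it is not used
// by the package's encoders. It shows where the sizes returned by
// literalExtraSize come from, following the literal-tag layout described at
// the top of this file (the format stores the literal count minus one).
func illustrativeLiteralHeader(dst []byte, litLen int) int {
	n := litLen - 1
	switch {
	case n < 60:
		// The count fits in the tag byte itself (m = n): 1 extra byte.
		dst[0] = uint8(n)<<2 | tagLiteral
		return 1
	case n < 1<<8:
		// m = 60 announces one little-endian length byte: 2 extra bytes.
		dst[0] = 60<<2 | tagLiteral
		dst[1] = uint8(n)
		return 2
	case n < 1<<16:
		// m = 61 announces two little-endian length bytes: 3 extra bytes.
		dst[0] = 61<<2 | tagLiteral
		dst[1] = uint8(n)
		dst[2] = uint8(n >> 8)
		return 3
	default:
		// m = 62 and m = 63 extend the same pattern to 3 and 4 length bytes
		// (4 and 5 extra bytes); omitted here for brevity.
		return 0
	}
}
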
// byter describes anything that can expose its contents as a byte slice,
// such as *bytes.Buffer.
type byter interface {
	Bytes() []byte
}

// Compile-time check that *bytes.Buffer satisfies byter.
var _ byter = &bytes.Buffer{}

