1 // Copyright 2011 The Snappy-Go Authors. All rights reserved. 2 // Copyright (c) 2019 Klaus Post. All rights reserved. 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 // Package s2 implements the S2 compression format. 7 // 8 // S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput, 9 // which is why it features concurrent compression for bigger payloads. 10 // 11 // Decoding is compatible with Snappy compressed content, 12 // but content compressed with S2 cannot be decompressed by Snappy. 13 // 14 // For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2 15 // 16 // There are actually two S2 formats: block and stream. They are related, 17 // but different: trying to decompress block-compressed data as a S2 stream 18 // will fail, and vice versa. The block format is the Decode and Encode 19 // functions and the stream format is the Reader and Writer types. 20 // 21 // A "better" compression option is available. This will trade some compression 22 // speed 23 // 24 // The block format, the more common case, is used when the complete size (the 25 // number of bytes) of the original data is known upfront, at the time 26 // compression starts. The stream format, also known as the framing format, is 27 // for when that isn't always true. 28 // 29 // Blocks to not offer much data protection, so it is up to you to 30 // add data validation of decompressed blocks. 31 // 32 // Streams perform CRC validation of the decompressed data. 33 // Stream compression will also be performed on multiple CPU cores concurrently 34 // significantly improving throughput. 35 package s2 36 37 import ( 38 "bytes" 39 "hash/crc32" 40 41 "github.com/klauspost/compress/internal/race" 42 ) 43 44 /* 45 Each encoded block begins with the varint-encoded length of the decoded data, 46 followed by a sequence of chunks. Chunks begin and end on byte boundaries. The 47 first byte of each chunk is broken into its 2 least and 6 most significant bits 48 called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. 49 Zero means a literal tag. All other values mean a copy tag. 50 51 For literal tags: 52 - If m < 60, the next 1 + m bytes are literal bytes. 53 - Otherwise, let n be the little-endian unsigned integer denoted by the next 54 m - 59 bytes. The next 1 + n bytes after that are literal bytes. 55 56 For copy tags, length bytes are copied from offset bytes ago, in the style of 57 Lempel-Ziv compression algorithms. In particular: 58 - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). 59 The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 60 of the offset. The next byte is bits 0-7 of the offset. 61 - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). 62 The length is 1 + m. The offset is the little-endian unsigned integer 63 denoted by the next 2 bytes. 64 - For l == 3, the offset ranges in [0, 1<<32) and the length in 65 [1, 65). The length is 1 + m. The offset is the little-endian unsigned 66 integer denoted by the next 4 bytes. 67 */ 68 const ( 69 tagLiteral = 0x00 70 tagCopy1 = 0x01 71 tagCopy2 = 0x02 72 tagCopy4 = 0x03 73 ) 74 75 const ( 76 checksumSize = 4 77 chunkHeaderSize = 4 78 magicChunk = "\xff\x06\x00\x00" + magicBody 79 magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy 80 magicBodySnappy = "sNaPpY" 81 magicBody = "S2sTwO" 82 83 // maxBlockSize is the maximum size of the input to encodeBlock. 84 // 85 // For the framing format (Writer type instead of Encode function), 86 // this is the maximum uncompressed size of a block. 87 maxBlockSize = 4 << 20 88 89 // minBlockSize is the minimum size of block setting when creating a writer. 90 minBlockSize = 4 << 10 91 92 skippableFrameHeader = 4 93 maxChunkSize = 1<<24 - 1 // 16777215 94 95 // Default block size 96 defaultBlockSize = 1 << 20 97 98 // maxSnappyBlockSize is the maximum snappy block size. 99 maxSnappyBlockSize = 1 << 16 100 101 obufHeaderLen = checksumSize + chunkHeaderSize 102 ) 103 104 const ( 105 chunkTypeCompressedData = 0x00 106 chunkTypeUncompressedData = 0x01 107 ChunkTypeIndex = 0x99 108 chunkTypePadding = 0xfe 109 chunkTypeStreamIdentifier = 0xff 110 ) 111 112 var crcTable = crc32.MakeTable(crc32.Castagnoli) 113 114 // crc implements the checksum specified in section 3 of 115 // https://github.com/google/snappy/blob/master/framing_format.txt 116 func crc(b []byte) uint32 { 117 race.ReadSlice(b) 118 119 c := crc32.Update(0, crcTable, b) 120 return c>>15 | c<<17 + 0xa282ead8 121 } 122 123 // literalExtraSize returns the extra size of encoding n literals. 124 // n should be >= 0 and <= math.MaxUint32. 125 func literalExtraSize(n int64) int64 { 126 if n == 0 { 127 return 0 128 } 129 switch { 130 case n < 60: 131 return 1 132 case n < 1<<8: 133 return 2 134 case n < 1<<16: 135 return 3 136 case n < 1<<24: 137 return 4 138 default: 139 return 5 140 } 141 } 142 143 type byter interface { 144 Bytes() []byte 145 } 146 147 var _ byter = &bytes.Buffer{} 148