...

Source file src/github.com/klauspost/compress/s2/index.go

Documentation: github.com/klauspost/compress/s2

     1  // Copyright (c) 2022+ Klaus Post. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package s2
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"encoding/json"
    11  	"fmt"
    12  	"io"
    13  	"sort"
    14  )
    15  
    16  const (
    17  	S2IndexHeader   = "s2idx\x00"
    18  	S2IndexTrailer  = "\x00xdi2s"
    19  	maxIndexEntries = 1 << 16
    20  )
    21  
    22  // Index represents an S2/Snappy index.
    23  type Index struct {
    24  	TotalUncompressed int64 // Total Uncompressed size if known. Will be -1 if unknown.
    25  	TotalCompressed   int64 // Total Compressed size if known. Will be -1 if unknown.
    26  	info              []struct {
    27  		compressedOffset   int64
    28  		uncompressedOffset int64
    29  	}
    30  	estBlockUncomp int64
    31  }
    32  
    33  func (i *Index) reset(maxBlock int) {
    34  	i.estBlockUncomp = int64(maxBlock)
    35  	i.TotalCompressed = -1
    36  	i.TotalUncompressed = -1
    37  	if len(i.info) > 0 {
    38  		i.info = i.info[:0]
    39  	}
    40  }
    41  
    42  // allocInfos will allocate an empty slice of infos.
    43  func (i *Index) allocInfos(n int) {
    44  	if n > maxIndexEntries {
    45  		panic("n > maxIndexEntries")
    46  	}
    47  	i.info = make([]struct {
    48  		compressedOffset   int64
    49  		uncompressedOffset int64
    50  	}, 0, n)
    51  }
    52  
    53  // add an uncompressed and compressed pair.
    54  // Entries must be sent in order.
    55  func (i *Index) add(compressedOffset, uncompressedOffset int64) error {
    56  	if i == nil {
    57  		return nil
    58  	}
    59  	lastIdx := len(i.info) - 1
    60  	if lastIdx >= 0 {
    61  		latest := i.info[lastIdx]
    62  		if latest.uncompressedOffset == uncompressedOffset {
    63  			// Uncompressed didn't change, don't add entry,
    64  			// but update start index.
    65  			latest.compressedOffset = compressedOffset
    66  			i.info[lastIdx] = latest
    67  			return nil
    68  		}
    69  		if latest.uncompressedOffset > uncompressedOffset {
    70  			return fmt.Errorf("internal error: Earlier uncompressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
    71  		}
    72  		if latest.compressedOffset > compressedOffset {
    73  			return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.compressedOffset, compressedOffset)
    74  		}
    75  	}
    76  	i.info = append(i.info, struct {
    77  		compressedOffset   int64
    78  		uncompressedOffset int64
    79  	}{compressedOffset: compressedOffset, uncompressedOffset: uncompressedOffset})
    80  	return nil
    81  }
    82  
    83  // Find the offset at or before the wanted (uncompressed) offset.
    84  // If offset is 0 or positive it is the offset from the beginning of the file.
    85  // If the uncompressed size is known, the offset must be within the file.
    86  // If an offset outside the file is requested io.ErrUnexpectedEOF is returned.
    87  // If the offset is negative, it is interpreted as the distance from the end of the file,
    88  // where -1 represents the last byte.
    89  // If offset from the end of the file is requested, but size is unknown,
    90  // ErrUnsupported will be returned.
    91  func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err error) {
    92  	if i.TotalUncompressed < 0 {
    93  		return 0, 0, ErrUnsupported
    94  	}
    95  	if offset < 0 {
    96  		offset = i.TotalUncompressed + offset
    97  		if offset < 0 {
    98  			return 0, 0, io.ErrUnexpectedEOF
    99  		}
   100  	}
   101  	if offset > i.TotalUncompressed {
   102  		return 0, 0, io.ErrUnexpectedEOF
   103  	}
   104  	if len(i.info) > 200 {
   105  		n := sort.Search(len(i.info), func(n int) bool {
   106  			return i.info[n].uncompressedOffset > offset
   107  		})
   108  		if n == 0 {
   109  			n = 1
   110  		}
   111  		return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil
   112  	}
   113  	for _, info := range i.info {
   114  		if info.uncompressedOffset > offset {
   115  			break
   116  		}
   117  		compressedOff = info.compressedOffset
   118  		uncompressedOff = info.uncompressedOffset
   119  	}
   120  	return compressedOff, uncompressedOff, nil
   121  }
   122  
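// A minimal sketch of typical Find usage, assuming indexData holds a
// serialized index (for example the output of IndexStream); the function
// name and parameters are illustrative only.
func exampleFind(indexData []byte, wantOffset int64) (int64, int64, error) {
	var index Index
	if _, err := index.Load(indexData); err != nil {
		return 0, 0, err
	}
	// Find returns the nearest indexed pair at or before wantOffset;
	// a negative wantOffset is counted back from the end of the stream.
	return index.Find(wantOffset)
}
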
   123  // reduce to stay below maxIndexEntries
   124  func (i *Index) reduce() {
   125  	if len(i.info) < maxIndexEntries && i.estBlockUncomp >= 1<<20 {
   126  		return
   127  	}
   128  
   129  	// Algorithm: keep one entry, remove the following removeN entries...
   130  	removeN := (len(i.info) + 1) / maxIndexEntries
   131  	src := i.info
   132  	j := 0
   133  
   134  	// Each block should be at least 1MB, but don't reduce below 1000 entries.
   135  	for i.estBlockUncomp*(int64(removeN)+1) < 1<<20 && len(i.info)/(removeN+1) > 1000 {
   136  		removeN++
   137  	}
   138  	for idx := 0; idx < len(src); idx++ {
   139  		i.info[j] = src[idx]
   140  		j++
   141  		idx += removeN
   142  	}
   143  	i.info = i.info[:j]
   144  	// Update maxblock estimate.
   145  	i.estBlockUncomp += i.estBlockUncomp * int64(removeN)
   146  }
   147  
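// appendTo serializes the index and appends it to b as a skippable frame:
//
//	[1 byte ChunkTypeIndex][3 bytes chunk length (bytes after this header)]
//	[S2IndexHeader "s2idx\x00"]
//	[varint TotalUncompressed][varint TotalCompressed]
//	[varint estBlockUncomp][varint number of entries]
//	[1 byte hasUncompressed flag]
//	[if flag set: one varint per entry, uncompressed offset delta]
//	[one varint per entry, compressed offset delta]
//	[4 bytes little-endian total index size, from ChunkTypeIndex through trailer]
//	[S2IndexTrailer "\x00xdi2s"]
//
// Offsets after the first entry are stored as deltas from a prediction:
// the previous uncompressed offset plus estBlockUncomp, and the previous
// compressed offset plus a running estimate that is corrected by half the
// observed error. Load performs the inverse of this encoding.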
   148  func (i *Index) appendTo(b []byte, uncompTotal, compTotal int64) []byte {
   149  	i.reduce()
   150  	var tmp [binary.MaxVarintLen64]byte
   151  
   152  	initSize := len(b)
   153  	// We make the start a skippable header+size.
   154  	b = append(b, ChunkTypeIndex, 0, 0, 0)
   155  	b = append(b, []byte(S2IndexHeader)...)
   156  	// Total Uncompressed size
   157  	n := binary.PutVarint(tmp[:], uncompTotal)
   158  	b = append(b, tmp[:n]...)
   159  	// Total Compressed size
   160  	n = binary.PutVarint(tmp[:], compTotal)
   161  	b = append(b, tmp[:n]...)
   162  	// Put EstBlockUncomp size
   163  	n = binary.PutVarint(tmp[:], i.estBlockUncomp)
   164  	b = append(b, tmp[:n]...)
   165  	// Put length
   166  	n = binary.PutVarint(tmp[:], int64(len(i.info)))
   167  	b = append(b, tmp[:n]...)
   168  
   169  	// Check if we should add uncompressed offsets
   170  	var hasUncompressed byte
   171  	for idx, info := range i.info {
   172  		if idx == 0 {
   173  			if info.uncompressedOffset != 0 {
   174  				hasUncompressed = 1
   175  				break
   176  			}
   177  			continue
   178  		}
   179  		if info.uncompressedOffset != i.info[idx-1].uncompressedOffset+i.estBlockUncomp {
   180  			hasUncompressed = 1
   181  			break
   182  		}
   183  	}
   184  	b = append(b, hasUncompressed)
   185  
   186  	// Add each entry
   187  	if hasUncompressed == 1 {
   188  		for idx, info := range i.info {
   189  			uOff := info.uncompressedOffset
   190  			if idx > 0 {
   191  				prev := i.info[idx-1]
   192  				uOff -= prev.uncompressedOffset + (i.estBlockUncomp)
   193  			}
   194  			n = binary.PutVarint(tmp[:], uOff)
   195  			b = append(b, tmp[:n]...)
   196  		}
   197  	}
   198  
   199  	// Initial compressed size estimate.
   200  	cPredict := i.estBlockUncomp / 2
   201  
   202  	for idx, info := range i.info {
   203  		cOff := info.compressedOffset
   204  		if idx > 0 {
   205  			prev := i.info[idx-1]
   206  			cOff -= prev.compressedOffset + cPredict
   207  			// Update compressed size prediction, with half the error.
   208  			cPredict += cOff / 2
   209  		}
   210  		n = binary.PutVarint(tmp[:], cOff)
   211  		b = append(b, tmp[:n]...)
   212  	}
   213  
   214  	// Add Total Size.
   215  	// Stored as fixed size for easier reading.
   216  	binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)-initSize+4+len(S2IndexTrailer)))
   217  	b = append(b, tmp[:4]...)
   218  	// Trailer
   219  	b = append(b, []byte(S2IndexTrailer)...)
   220  
   221  	// Update size
   222  	chunkLen := len(b) - initSize - skippableFrameHeader
   223  	b[initSize+1] = uint8(chunkLen >> 0)
   224  	b[initSize+2] = uint8(chunkLen >> 8)
   225  	b[initSize+3] = uint8(chunkLen >> 16)
   226  	//fmt.Printf("chunklen: 0x%x Uncomp:%d, Comp:%d\n", chunkLen, uncompTotal, compTotal)
   227  	return b
   228  }
   229  
   230  // Load a binary index.
   231  // A zero value Index can be used or a previous one can be reused.
   232  func (i *Index) Load(b []byte) ([]byte, error) {
   233  	if len(b) <= 4+len(S2IndexHeader)+len(S2IndexTrailer) {
   234  		return b, io.ErrUnexpectedEOF
   235  	}
   236  	if b[0] != ChunkTypeIndex {
   237  		return b, ErrCorrupt
   238  	}
   239  	chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
   240  	b = b[4:]
   241  
   242  	// Validate we have enough...
   243  	if len(b) < chunkLen {
   244  		return b, io.ErrUnexpectedEOF
   245  	}
   246  	if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
   247  		return b, ErrUnsupported
   248  	}
   249  	b = b[len(S2IndexHeader):]
   250  
   251  	// Total Uncompressed
   252  	if v, n := binary.Varint(b); n <= 0 || v < 0 {
   253  		return b, ErrCorrupt
   254  	} else {
   255  		i.TotalUncompressed = v
   256  		b = b[n:]
   257  	}
   258  
   259  	// Total Compressed
   260  	if v, n := binary.Varint(b); n <= 0 {
   261  		return b, ErrCorrupt
   262  	} else {
   263  		i.TotalCompressed = v
   264  		b = b[n:]
   265  	}
   266  
   267  	// Read EstBlockUncomp
   268  	if v, n := binary.Varint(b); n <= 0 {
   269  		return b, ErrCorrupt
   270  	} else {
   271  		if v < 0 {
   272  			return b, ErrCorrupt
   273  		}
   274  		i.estBlockUncomp = v
   275  		b = b[n:]
   276  	}
   277  
   278  	var entries int
   279  	if v, n := binary.Varint(b); n <= 0 {
   280  		return b, ErrCorrupt
   281  	} else {
   282  		if v < 0 || v > maxIndexEntries {
   283  			return b, ErrCorrupt
   284  		}
   285  		entries = int(v)
   286  		b = b[n:]
   287  	}
   288  	if cap(i.info) < entries {
   289  		i.allocInfos(entries)
   290  	}
   291  	i.info = i.info[:entries]
   292  
   293  	if len(b) < 1 {
   294  		return b, io.ErrUnexpectedEOF
   295  	}
   296  	hasUncompressed := b[0]
   297  	b = b[1:]
   298  	if hasUncompressed&1 != hasUncompressed {
   299  		return b, ErrCorrupt
   300  	}
   301  
   302  	// Add each uncompressed entry
   303  	for idx := range i.info {
   304  		var uOff int64
   305  		if hasUncompressed != 0 {
   306  			// Load delta
   307  			if v, n := binary.Varint(b); n <= 0 {
   308  				return b, ErrCorrupt
   309  			} else {
   310  				uOff = v
   311  				b = b[n:]
   312  			}
   313  		}
   314  
   315  		if idx > 0 {
   316  			prev := i.info[idx-1].uncompressedOffset
   317  			uOff += prev + (i.estBlockUncomp)
   318  			if uOff <= prev {
   319  				return b, ErrCorrupt
   320  			}
   321  		}
   322  		if uOff < 0 {
   323  			return b, ErrCorrupt
   324  		}
   325  		i.info[idx].uncompressedOffset = uOff
   326  	}
   327  
   328  	// Initial compressed size estimate.
   329  	cPredict := i.estBlockUncomp / 2
   330  
   331  	// Add each compressed entry
   332  	for idx := range i.info {
   333  		var cOff int64
   334  		if v, n := binary.Varint(b); n <= 0 {
   335  			return b, ErrCorrupt
   336  		} else {
   337  			cOff = v
   338  			b = b[n:]
   339  		}
   340  
   341  		if idx > 0 {
   342  			// Update compressed size prediction, with half the error.
   343  			cPredictNew := cPredict + cOff/2
   344  
   345  			prev := i.info[idx-1].compressedOffset
   346  			cOff += prev + cPredict
   347  			if cOff <= prev {
   348  				return b, ErrCorrupt
   349  			}
   350  			cPredict = cPredictNew
   351  		}
   352  		if cOff < 0 {
   353  			return b, ErrCorrupt
   354  		}
   355  		i.info[idx].compressedOffset = cOff
   356  	}
   357  	if len(b) < 4+len(S2IndexTrailer) {
   358  		return b, io.ErrUnexpectedEOF
   359  	}
   360  	// Skip size...
   361  	b = b[4:]
   362  
   363  	// Check trailer...
   364  	if !bytes.Equal(b[:len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
   365  		return b, ErrCorrupt
   366  	}
   367  	return b[len(S2IndexTrailer):], nil
   368  }
   369  
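// A minimal sketch of reusing a single Index across several serialized
// indexes, assuming chunks holds one binary index per element; Load
// overwrites the receiver, so its allocations can be reused between calls.
func loadAll(chunks [][]byte) error {
	var index Index
	for _, chunk := range chunks {
		if _, err := index.Load(chunk); err != nil {
			return err
		}
		// index now describes the current chunk and could be queried here.
	}
	return nil
}
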
   370  // LoadStream will load an index from the end of the supplied stream.
   371  // ErrUnsupported will be returned if the signature cannot be found.
   372  // ErrCorrupt will be returned if unexpected values are found.
   373  // io.ErrUnexpectedEOF is returned if there are too few bytes.
   374  // IO errors are returned as-is.
   375  func (i *Index) LoadStream(rs io.ReadSeeker) error {
   376  	// Go to end.
   377  	_, err := rs.Seek(-10, io.SeekEnd)
   378  	if err != nil {
   379  		return err
   380  	}
   381  	var tmp [10]byte
   382  	_, err = io.ReadFull(rs, tmp[:])
   383  	if err != nil {
   384  		return err
   385  	}
   386  	// Check trailer...
   387  	if !bytes.Equal(tmp[4:4+len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
   388  		return ErrUnsupported
   389  	}
   390  	sz := binary.LittleEndian.Uint32(tmp[:4])
   391  	if sz > maxChunkSize+skippableFrameHeader {
   392  		return ErrCorrupt
   393  	}
   394  	_, err = rs.Seek(-int64(sz), io.SeekEnd)
   395  	if err != nil {
   396  		return err
   397  	}
   398  
   399  	// Read index.
   400  	buf := make([]byte, sz)
   401  	_, err = io.ReadFull(rs, buf)
   402  	if err != nil {
   403  		return err
   404  	}
   405  	_, err = i.Load(buf)
   406  	return err
   407  }
   408  
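// A minimal sketch of reading an index directly from the end of a seekable
// S2 stream, assuming rs is an already opened file or other io.ReadSeeker;
// the function name is illustrative only.
func loadIndexFromStream(rs io.ReadSeeker) (*Index, error) {
	var index Index
	if err := index.LoadStream(rs); err != nil {
		return nil, err
	}
	return &index, nil
}
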
   409  // IndexStream will return an index for a stream.
   410  // The stream structure will be checked, but
   411  // data within blocks is not verified.
   412  // The returned index can either be appended to the end of the stream
   413  // or stored separately.
   414  func IndexStream(r io.Reader) ([]byte, error) {
   415  	var i Index
   416  	var buf [maxChunkSize]byte
   417  	var readHeader bool
   418  	for {
   419  		_, err := io.ReadFull(r, buf[:4])
   420  		if err != nil {
   421  			if err == io.EOF {
   422  				return i.appendTo(nil, i.TotalUncompressed, i.TotalCompressed), nil
   423  			}
   424  			return nil, err
   425  		}
   426  		// Start of this chunk.
   427  		startChunk := i.TotalCompressed
   428  		i.TotalCompressed += 4
   429  
   430  		chunkType := buf[0]
   431  		if !readHeader {
   432  			if chunkType != chunkTypeStreamIdentifier {
   433  				return nil, ErrCorrupt
   434  			}
   435  			readHeader = true
   436  		}
   437  		chunkLen := int(buf[1]) | int(buf[2])<<8 | int(buf[3])<<16
   438  		if chunkLen < checksumSize {
   439  			return nil, ErrCorrupt
   440  		}
   441  
   442  		i.TotalCompressed += int64(chunkLen)
   443  		_, err = io.ReadFull(r, buf[:chunkLen])
   444  		if err != nil {
   445  			return nil, io.ErrUnexpectedEOF
   446  		}
   447  		// The chunk types are specified at
   448  		// https://github.com/google/snappy/blob/master/framing_format.txt
   449  		switch chunkType {
   450  		case chunkTypeCompressedData:
   451  			// Section 4.2. Compressed data (chunk type 0x00).
   452  			// Skip checksum.
   453  			dLen, err := DecodedLen(buf[checksumSize:])
   454  			if err != nil {
   455  				return nil, err
   456  			}
   457  			if dLen > maxBlockSize {
   458  				return nil, ErrCorrupt
   459  			}
   460  			if i.estBlockUncomp == 0 {
   461  				// Use first block for estimate...
   462  				i.estBlockUncomp = int64(dLen)
   463  			}
   464  			err = i.add(startChunk, i.TotalUncompressed)
   465  			if err != nil {
   466  				return nil, err
   467  			}
   468  			i.TotalUncompressed += int64(dLen)
   469  			continue
   470  		case chunkTypeUncompressedData:
   471  			n2 := chunkLen - checksumSize
   472  			if n2 > maxBlockSize {
   473  				return nil, ErrCorrupt
   474  			}
   475  			if i.estBlockUncomp == 0 {
   476  				// Use first block for estimate...
   477  				i.estBlockUncomp = int64(n2)
   478  			}
   479  			err = i.add(startChunk, i.TotalUncompressed)
   480  			if err != nil {
   481  				return nil, err
   482  			}
   483  			i.TotalUncompressed += int64(n2)
   484  			continue
   485  		case chunkTypeStreamIdentifier:
   486  			// Section 4.1. Stream identifier (chunk type 0xff).
   487  			if chunkLen != len(magicBody) {
   488  				return nil, ErrCorrupt
   489  			}
   490  
   491  			if string(buf[:len(magicBody)]) != magicBody {
   492  				if string(buf[:len(magicBody)]) != magicBodySnappy {
   493  					return nil, ErrCorrupt
   494  				}
   495  			}
   496  
   497  			continue
   498  		}
   499  
   500  		if chunkType <= 0x7f {
   501  			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
   502  			return nil, ErrUnsupported
   503  		}
   504  		if chunkLen > maxChunkSize {
   505  			return nil, ErrUnsupported
   506  		}
   507  		// Section 4.4 Padding (chunk type 0xfe).
   508  		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
   509  	}
   510  }
   511  
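// A minimal sketch of indexing an existing S2 stream and appending the
// resulting index to it, assuming src re-reads the stream from the start
// and dst is positioned at its end; both names are illustrative only.
func appendIndexTo(src io.Reader, dst io.Writer) error {
	index, err := IndexStream(src)
	if err != nil {
		return err
	}
	_, err = dst.Write(index)
	return err
}
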
   512  // JSON returns the index as JSON text.
   513  func (i *Index) JSON() []byte {
   514  	type offset struct {
   515  		CompressedOffset   int64 `json:"compressed"`
   516  		UncompressedOffset int64 `json:"uncompressed"`
   517  	}
   518  	x := struct {
   519  		TotalUncompressed int64    `json:"total_uncompressed"` // Total Uncompressed size if known. Will be -1 if unknown.
   520  		TotalCompressed   int64    `json:"total_compressed"`   // Total Compressed size if known. Will be -1 if unknown.
   521  		Offsets           []offset `json:"offsets"`
   522  		EstBlockUncomp    int64    `json:"est_block_uncompressed"`
   523  	}{
   524  		TotalUncompressed: i.TotalUncompressed,
   525  		TotalCompressed:   i.TotalCompressed,
   526  		EstBlockUncomp:    i.estBlockUncomp,
   527  	}
   528  	for _, v := range i.info {
   529  		x.Offsets = append(x.Offsets, offset{CompressedOffset: v.compressedOffset, UncompressedOffset: v.uncompressedOffset})
   530  	}
   531  	b, _ := json.MarshalIndent(x, "", "  ")
   532  	return b
   533  }
   534  
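// A minimal sketch of inspecting a serialized index as JSON, assuming
// indexData holds a binary index; the function name is illustrative only.
func dumpIndexJSON(indexData []byte) error {
	var index Index
	if _, err := index.Load(indexData); err != nil {
		return err
	}
	fmt.Printf("%s\n", index.JSON())
	return nil
}
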
   535  // RemoveIndexHeaders will trim all headers and trailers from a given index.
   536  // This is expected to save 20 bytes.
   537  // These can be restored using RestoreIndexHeaders.
   538  // This removes a layer of security, but is the most compact representation.
   539  // Returns nil if the headers contain errors.
   540  // The returned slice references the provided slice.
   541  func RemoveIndexHeaders(b []byte) []byte {
   542  	const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4
   543  	if len(b) <= save {
   544  		return nil
   545  	}
   546  	if b[0] != ChunkTypeIndex {
   547  		return nil
   548  	}
   549  	chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
   550  	b = b[4:]
   551  
   552  	// Validate we have enough...
   553  	if len(b) < chunkLen {
   554  		return nil
   555  	}
   556  	b = b[:chunkLen]
   557  
   558  	if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
   559  		return nil
   560  	}
   561  	b = b[len(S2IndexHeader):]
   562  	if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) {
   563  		return nil
   564  	}
   565  	b = bytes.TrimSuffix(b, []byte(S2IndexTrailer))
   566  
   567  	if len(b) < 4 {
   568  		return nil
   569  	}
   570  	return b[:len(b)-4]
   571  }
   572  
   573  // RestoreIndexHeaders will restore headers removed by RemoveIndexHeaders.
   574  // No error checking is performed on the input.
   575  // If a 0 length slice is sent, it is returned without modification.
   576  func RestoreIndexHeaders(in []byte) []byte {
   577  	if len(in) == 0 {
   578  		return in
   579  	}
   580  	b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4)
   581  	b = append(b, ChunkTypeIndex, 0, 0, 0)
   582  	b = append(b, []byte(S2IndexHeader)...)
   583  	b = append(b, in...)
   584  
   585  	var tmp [4]byte
   586  	binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer)))
   587  	b = append(b, tmp[:4]...)
   588  	// Trailer
   589  	b = append(b, []byte(S2IndexTrailer)...)
   590  
   591  	chunkLen := len(b) - skippableFrameHeader
   592  	b[1] = uint8(chunkLen >> 0)
   593  	b[2] = uint8(chunkLen >> 8)
   594  	b[3] = uint8(chunkLen >> 16)
   595  	return b
   596  }
   597  
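// A minimal sketch of the compact round trip described above: strip the
// framing for storage, then verify by restoring and loading, assuming
// fullIndex holds a complete serialized index; the name is illustrative only.
func compactRoundTrip(fullIndex []byte) ([]byte, error) {
	stripped := RemoveIndexHeaders(fullIndex)
	if stripped == nil {
		return nil, ErrCorrupt
	}
	restored := RestoreIndexHeaders(stripped)
	var index Index
	if _, err := index.Load(restored); err != nil {
		return nil, err
	}
	return stripped, nil
}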
