...

Source file src/github.com/klauspost/compress/s2/lz4convert.go

Documentation: github.com/klauspost/compress/s2

     1  // Copyright (c) 2022 Klaus Post. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package s2
     6  
     7  import (
     8  	"encoding/binary"
     9  	"errors"
    10  	"fmt"
    11  )
    12  
    13  // LZ4Converter provides conversion from LZ4 blocks as defined here:
    14  // https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md
    15  type LZ4Converter struct {
    16  }
    17  
    18  // ErrDstTooSmall is returned when provided destination is too small.
    19  var ErrDstTooSmall = errors.New("s2: destination too small")
    20  
    21  // ConvertBlock will convert an LZ4 block and append it as an S2
    22  // block without block length to dst.
    23  // The uncompressed size is returned as well.
    24  // dst must have capacity to contain the entire compressed block.
    25  func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) {
    26  	if len(src) == 0 {
    27  		return dst, 0, nil
    28  	}
    29  	const debug = false
    30  	const inline = true
    31  	const lz4MinMatch = 4
    32  
    33  	s, d := 0, len(dst)
    34  	dst = dst[:cap(dst)]
    35  	if !debug && hasAmd64Asm {
    36  		res, sz := cvtLZ4BlockAsm(dst[d:], src)
    37  		if res < 0 {
    38  			const (
    39  				errCorrupt     = -1
    40  				errDstTooSmall = -2
    41  			)
    42  			switch res {
    43  			case errCorrupt:
    44  				return nil, 0, ErrCorrupt
    45  			case errDstTooSmall:
    46  				return nil, 0, ErrDstTooSmall
    47  			default:
    48  				return nil, 0, fmt.Errorf("unexpected result: %d", res)
    49  			}
    50  		}
    51  		if d+sz > len(dst) {
    52  			return nil, 0, ErrDstTooSmall
    53  		}
    54  		return dst[:d+sz], res, nil
    55  	}
    56  
    57  	dLimit := len(dst) - 10
    58  	var lastOffset uint16
    59  	var uncompressed int
    60  	if debug {
    61  		fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
    62  	}
    63  
    64  	for {
    65  		if s >= len(src) {
    66  			return dst[:d], 0, ErrCorrupt
    67  		}
    68  		// Read literal info
    69  		token := src[s]
    70  		ll := int(token >> 4)
    71  		ml := int(lz4MinMatch + (token & 0xf))
    72  
    73  		// If upper nibble is 15, literal length is extended
    74  		if token >= 0xf0 {
    75  			for {
    76  				s++
    77  				if s >= len(src) {
    78  					if debug {
    79  						fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
    80  					}
    81  					return dst[:d], 0, ErrCorrupt
    82  				}
    83  				val := src[s]
    84  				ll += int(val)
    85  				if val != 255 {
    86  					break
    87  				}
    88  			}
    89  		}
    90  		// Skip past token
    91  		if s+ll >= len(src) {
    92  			if debug {
    93  				fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
    94  			}
    95  			return nil, 0, ErrCorrupt
    96  		}
    97  		s++
    98  		if ll > 0 {
    99  			if d+ll > dLimit {
   100  				return nil, 0, ErrDstTooSmall
   101  			}
   102  			if debug {
   103  				fmt.Printf("emit %d literals\n", ll)
   104  			}
   105  			d += emitLiteralGo(dst[d:], src[s:s+ll])
   106  			s += ll
   107  			uncompressed += ll
   108  		}
   109  
   110  		// Check if we are done...
   111  		if s == len(src) && ml == lz4MinMatch {
   112  			break
   113  		}
   114  		// 2 byte offset
   115  		if s >= len(src)-2 {
   116  			if debug {
   117  				fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2)
   118  			}
   119  			return nil, 0, ErrCorrupt
   120  		}
   121  		offset := binary.LittleEndian.Uint16(src[s:])
   122  		s += 2
   123  		if offset == 0 {
   124  			if debug {
   125  				fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
   126  			}
   127  			return nil, 0, ErrCorrupt
   128  		}
   129  		if int(offset) > uncompressed {
   130  			if debug {
   131  				fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
   132  			}
   133  			return nil, 0, ErrCorrupt
   134  		}
   135  
   136  		if ml == lz4MinMatch+15 {
   137  			for {
   138  				if s >= len(src) {
   139  					if debug {
   140  						fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
   141  					}
   142  					return nil, 0, ErrCorrupt
   143  				}
   144  				val := src[s]
   145  				s++
   146  				ml += int(val)
   147  				if val != 255 {
   148  					if s >= len(src) {
   149  						if debug {
   150  							fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
   151  						}
   152  						return nil, 0, ErrCorrupt
   153  					}
   154  					break
   155  				}
   156  			}
   157  		}
   158  		if offset == lastOffset {
   159  			if debug {
   160  				fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset)
   161  			}
   162  			if !inline {
   163  				d += emitRepeat16(dst[d:], offset, ml)
   164  			} else {
   165  				length := ml
   166  				dst := dst[d:]
   167  				for len(dst) > 5 {
   168  					// Repeat offset, make length cheaper
   169  					length -= 4
   170  					if length <= 4 {
   171  						dst[0] = uint8(length)<<2 | tagCopy1
   172  						dst[1] = 0
   173  						d += 2
   174  						break
   175  					}
   176  					if length < 8 && offset < 2048 {
   177  						// Encode WITH offset
   178  						dst[1] = uint8(offset)
   179  						dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
   180  						d += 2
   181  						break
   182  					}
   183  					if length < (1<<8)+4 {
   184  						length -= 4
   185  						dst[2] = uint8(length)
   186  						dst[1] = 0
   187  						dst[0] = 5<<2 | tagCopy1
   188  						d += 3
   189  						break
   190  					}
   191  					if length < (1<<16)+(1<<8) {
   192  						length -= 1 << 8
   193  						dst[3] = uint8(length >> 8)
   194  						dst[2] = uint8(length >> 0)
   195  						dst[1] = 0
   196  						dst[0] = 6<<2 | tagCopy1
   197  						d += 4
   198  						break
   199  					}
   200  					const maxRepeat = (1 << 24) - 1
   201  					length -= 1 << 16
   202  					left := 0
   203  					if length > maxRepeat {
   204  						left = length - maxRepeat + 4
   205  						length = maxRepeat - 4
   206  					}
   207  					dst[4] = uint8(length >> 16)
   208  					dst[3] = uint8(length >> 8)
   209  					dst[2] = uint8(length >> 0)
   210  					dst[1] = 0
   211  					dst[0] = 7<<2 | tagCopy1
   212  					if left > 0 {
   213  						d += 5 + emitRepeat16(dst[5:], offset, left)
   214  						break
   215  					}
   216  					d += 5
   217  					break
   218  				}
   219  			}
   220  		} else {
   221  			if debug {
   222  				fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
   223  			}
   224  			if !inline {
   225  				d += emitCopy16(dst[d:], offset, ml)
   226  			} else {
   227  				length := ml
   228  				dst := dst[d:]
   229  				for len(dst) > 5 {
   230  					// Offset no more than 2 bytes.
   231  					if length > 64 {
   232  						off := 3
   233  						if offset < 2048 {
   234  							// emit 8 bytes as tagCopy1, rest as repeats.
   235  							dst[1] = uint8(offset)
   236  							dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
   237  							length -= 8
   238  							off = 2
   239  						} else {
   240  							// Emit a length 60 copy, encoded as 3 bytes.
   241  							// Emit remaining as repeat value (minimum 4 bytes).
   242  							dst[2] = uint8(offset >> 8)
   243  							dst[1] = uint8(offset)
   244  							dst[0] = 59<<2 | tagCopy2
   245  							length -= 60
   246  						}
   247  						// Emit remaining as repeats, at least 4 bytes remain.
   248  						d += off + emitRepeat16(dst[off:], offset, length)
   249  						break
   250  					}
   251  					if length >= 12 || offset >= 2048 {
   252  						// Emit the remaining copy, encoded as 3 bytes.
   253  						dst[2] = uint8(offset >> 8)
   254  						dst[1] = uint8(offset)
   255  						dst[0] = uint8(length-1)<<2 | tagCopy2
   256  						d += 3
   257  						break
   258  					}
   259  					// Emit the remaining copy, encoded as 2 bytes.
   260  					dst[1] = uint8(offset)
   261  					dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
   262  					d += 2
   263  					break
   264  				}
   265  			}
   266  			lastOffset = offset
   267  		}
   268  		uncompressed += ml
   269  		if d > dLimit {
   270  			return nil, 0, ErrDstTooSmall
   271  		}
   272  	}
   273  
   274  	return dst[:d], uncompressed, nil
   275  }
   276  
   277  // ConvertBlockSnappy will convert an LZ4 block and append it
   278  // as a Snappy block without block length to dst.
   279  // The uncompressed size is returned as well.
   280  // dst must have capacity to contain the entire compressed block.
   281  func (l *LZ4Converter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) {
   282  	if len(src) == 0 {
   283  		return dst, 0, nil
   284  	}
   285  	const debug = false
   286  	const lz4MinMatch = 4
   287  
   288  	s, d := 0, len(dst)
   289  	dst = dst[:cap(dst)]
   290  	// Use assembly when possible
   291  	if !debug && hasAmd64Asm {
   292  		res, sz := cvtLZ4BlockSnappyAsm(dst[d:], src)
   293  		if res < 0 {
   294  			const (
   295  				errCorrupt     = -1
   296  				errDstTooSmall = -2
   297  			)
   298  			switch res {
   299  			case errCorrupt:
   300  				return nil, 0, ErrCorrupt
   301  			case errDstTooSmall:
   302  				return nil, 0, ErrDstTooSmall
   303  			default:
   304  				return nil, 0, fmt.Errorf("unexpected result: %d", res)
   305  			}
   306  		}
   307  		if d+sz > len(dst) {
   308  			return nil, 0, ErrDstTooSmall
   309  		}
   310  		return dst[:d+sz], res, nil
   311  	}
   312  
   313  	dLimit := len(dst) - 10
   314  	var uncompressed int
   315  	if debug {
   316  		fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
   317  	}
   318  
   319  	for {
   320  		if s >= len(src) {
   321  			return nil, 0, ErrCorrupt
   322  		}
   323  		// Read literal info
   324  		token := src[s]
   325  		ll := int(token >> 4)
   326  		ml := int(lz4MinMatch + (token & 0xf))
   327  
   328  		// If upper nibble is 15, literal length is extended
   329  		if token >= 0xf0 {
   330  			for {
   331  				s++
   332  				if s >= len(src) {
   333  					if debug {
   334  						fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
   335  					}
   336  					return nil, 0, ErrCorrupt
   337  				}
   338  				val := src[s]
   339  				ll += int(val)
   340  				if val != 255 {
   341  					break
   342  				}
   343  			}
   344  		}
   345  		// Skip past token
   346  		if s+ll >= len(src) {
   347  			if debug {
   348  				fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
   349  			}
   350  			return nil, 0, ErrCorrupt
   351  		}
   352  		s++
   353  		if ll > 0 {
   354  			if d+ll > dLimit {
   355  				return nil, 0, ErrDstTooSmall
   356  			}
   357  			if debug {
   358  				fmt.Printf("emit %d literals\n", ll)
   359  			}
   360  			d += emitLiteralGo(dst[d:], src[s:s+ll])
   361  			s += ll
   362  			uncompressed += ll
   363  		}
   364  
   365  		// Check if we are done...
   366  		if s == len(src) && ml == lz4MinMatch {
   367  			break
   368  		}
   369  		// 2 byte offset
   370  		if s >= len(src)-2 {
   371  			if debug {
   372  				fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2)
   373  			}
   374  			return nil, 0, ErrCorrupt
   375  		}
   376  		offset := binary.LittleEndian.Uint16(src[s:])
   377  		s += 2
   378  		if offset == 0 {
   379  			if debug {
   380  				fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
   381  			}
   382  			return nil, 0, ErrCorrupt
   383  		}
   384  		if int(offset) > uncompressed {
   385  			if debug {
   386  				fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
   387  			}
   388  			return nil, 0, ErrCorrupt
   389  		}
   390  
   391  		if ml == lz4MinMatch+15 {
   392  			for {
   393  				if s >= len(src) {
   394  					if debug {
   395  						fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
   396  					}
   397  					return nil, 0, ErrCorrupt
   398  				}
   399  				val := src[s]
   400  				s++
   401  				ml += int(val)
   402  				if val != 255 {
   403  					if s >= len(src) {
   404  						if debug {
   405  							fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
   406  						}
   407  						return nil, 0, ErrCorrupt
   408  					}
   409  					break
   410  				}
   411  			}
   412  		}
   413  		if debug {
   414  			fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
   415  		}
   416  		length := ml
   417  		// d += emitCopyNoRepeat(dst[d:], int(offset), ml)
   418  		for length > 0 {
   419  			if d >= dLimit {
   420  				return nil, 0, ErrDstTooSmall
   421  			}
   422  
   423  			// Offset no more than 2 bytes.
   424  			if length > 64 {
   425  				// Emit a length 64 copy, encoded as 3 bytes.
   426  				dst[d+2] = uint8(offset >> 8)
   427  				dst[d+1] = uint8(offset)
   428  				dst[d+0] = 63<<2 | tagCopy2
   429  				length -= 64
   430  				d += 3
   431  				continue
   432  			}
   433  			if length >= 12 || offset >= 2048 || length < 4 {
   434  				// Emit the remaining copy, encoded as 3 bytes.
   435  				dst[d+2] = uint8(offset >> 8)
   436  				dst[d+1] = uint8(offset)
   437  				dst[d+0] = uint8(length-1)<<2 | tagCopy2
   438  				d += 3
   439  				break
   440  			}
   441  			// Emit the remaining copy, encoded as 2 bytes.
   442  			dst[d+1] = uint8(offset)
   443  			dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
   444  			d += 2
   445  			break
   446  		}
   447  		uncompressed += ml
   448  		if d > dLimit {
   449  			return nil, 0, ErrDstTooSmall
   450  		}
   451  	}
   452  
   453  	return dst[:d], uncompressed, nil
   454  }
   455  
   456  // emitRepeat writes a repeat chunk and returns the number of bytes written.
   457  // Length must be at least 4 and < 1<<24
   458  func emitRepeat16(dst []byte, offset uint16, length int) int {
   459  	// Repeat offset, make length cheaper
   460  	length -= 4
   461  	if length <= 4 {
   462  		dst[0] = uint8(length)<<2 | tagCopy1
   463  		dst[1] = 0
   464  		return 2
   465  	}
   466  	if length < 8 && offset < 2048 {
   467  		// Encode WITH offset
   468  		dst[1] = uint8(offset)
   469  		dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
   470  		return 2
   471  	}
   472  	if length < (1<<8)+4 {
   473  		length -= 4
   474  		dst[2] = uint8(length)
   475  		dst[1] = 0
   476  		dst[0] = 5<<2 | tagCopy1
   477  		return 3
   478  	}
   479  	if length < (1<<16)+(1<<8) {
   480  		length -= 1 << 8
   481  		dst[3] = uint8(length >> 8)
   482  		dst[2] = uint8(length >> 0)
   483  		dst[1] = 0
   484  		dst[0] = 6<<2 | tagCopy1
   485  		return 4
   486  	}
   487  	const maxRepeat = (1 << 24) - 1
   488  	length -= 1 << 16
   489  	left := 0
   490  	if length > maxRepeat {
   491  		left = length - maxRepeat + 4
   492  		length = maxRepeat - 4
   493  	}
   494  	dst[4] = uint8(length >> 16)
   495  	dst[3] = uint8(length >> 8)
   496  	dst[2] = uint8(length >> 0)
   497  	dst[1] = 0
   498  	dst[0] = 7<<2 | tagCopy1
   499  	if left > 0 {
   500  		return 5 + emitRepeat16(dst[5:], offset, left)
   501  	}
   502  	return 5
   503  }
   504  
   505  // emitCopy writes a copy chunk and returns the number of bytes written.
   506  //
   507  // It assumes that:
   508  //
   509  //	dst is long enough to hold the encoded bytes
   510  //	1 <= offset && offset <= math.MaxUint16
   511  //	4 <= length && length <= math.MaxUint32
   512  func emitCopy16(dst []byte, offset uint16, length int) int {
   513  	// Offset no more than 2 bytes.
   514  	if length > 64 {
   515  		off := 3
   516  		if offset < 2048 {
   517  			// emit 8 bytes as tagCopy1, rest as repeats.
   518  			dst[1] = uint8(offset)
   519  			dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
   520  			length -= 8
   521  			off = 2
   522  		} else {
   523  			// Emit a length 60 copy, encoded as 3 bytes.
   524  			// Emit remaining as repeat value (minimum 4 bytes).
   525  			dst[2] = uint8(offset >> 8)
   526  			dst[1] = uint8(offset)
   527  			dst[0] = 59<<2 | tagCopy2
   528  			length -= 60
   529  		}
   530  		// Emit remaining as repeats, at least 4 bytes remain.
   531  		return off + emitRepeat16(dst[off:], offset, length)
   532  	}
   533  	if length >= 12 || offset >= 2048 {
   534  		// Emit the remaining copy, encoded as 3 bytes.
   535  		dst[2] = uint8(offset >> 8)
   536  		dst[1] = uint8(offset)
   537  		dst[0] = uint8(length-1)<<2 | tagCopy2
   538  		return 3
   539  	}
   540  	// Emit the remaining copy, encoded as 2 bytes.
   541  	dst[1] = uint8(offset)
   542  	dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
   543  	return 2
   544  }
   545  
   546  // emitLiteral writes a literal chunk and returns the number of bytes written.
   547  //
   548  // It assumes that:
   549  //
   550  //	dst is long enough to hold the encoded bytes
   551  //	0 <= len(lit) && len(lit) <= math.MaxUint32
   552  func emitLiteralGo(dst, lit []byte) int {
   553  	if len(lit) == 0 {
   554  		return 0
   555  	}
   556  	i, n := 0, uint(len(lit)-1)
   557  	switch {
   558  	case n < 60:
   559  		dst[0] = uint8(n)<<2 | tagLiteral
   560  		i = 1
   561  	case n < 1<<8:
   562  		dst[1] = uint8(n)
   563  		dst[0] = 60<<2 | tagLiteral
   564  		i = 2
   565  	case n < 1<<16:
   566  		dst[2] = uint8(n >> 8)
   567  		dst[1] = uint8(n)
   568  		dst[0] = 61<<2 | tagLiteral
   569  		i = 3
   570  	case n < 1<<24:
   571  		dst[3] = uint8(n >> 16)
   572  		dst[2] = uint8(n >> 8)
   573  		dst[1] = uint8(n)
   574  		dst[0] = 62<<2 | tagLiteral
   575  		i = 4
   576  	default:
   577  		dst[4] = uint8(n >> 24)
   578  		dst[3] = uint8(n >> 16)
   579  		dst[2] = uint8(n >> 8)
   580  		dst[1] = uint8(n)
   581  		dst[0] = 63<<2 | tagLiteral
   582  		i = 5
   583  	}
   584  	return i + copy(dst[i:], lit)
   585  }
   586  

View as plain text