...

Source file src/github.com/klauspost/compress/s2/encode_better.go

Documentation: github.com/klauspost/compress/s2

     1  // Copyright 2016 The Snappy-Go Authors. All rights reserved.
     2  // Copyright (c) 2019 Klaus Post. All rights reserved.
     3  // Use of this source code is governed by a BSD-style
     4  // license that can be found in the LICENSE file.
     5  
     6  package s2
     7  
     8  import (
     9  	"bytes"
    10  	"fmt"
    11  	"math/bits"
    12  )
    13  
    14  // hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
    15  // Preferably h should be a constant and should always be <32.
    16  func hash4(u uint64, h uint8) uint32 {
    17  	const prime4bytes = 2654435761
    18  	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
    19  }
    20  
    21  // hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
    22  // Preferably h should be a constant and should always be <64.
    23  func hash5(u uint64, h uint8) uint32 {
    24  	const prime5bytes = 889523592379
    25  	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
    26  }
    27  
    28  // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
    29  // Preferably h should be a constant and should always be <64.
    30  func hash7(u uint64, h uint8) uint32 {
    31  	const prime7bytes = 58295818150454627
    32  	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
    33  }
    34  
    35  // hash8 returns the hash of u to fit in a hash table with h bits.
    36  // Preferably h should be a constant and should always be <64.
    37  func hash8(u uint64, h uint8) uint32 {
    38  	const prime8bytes = 0xcf1bbcdcb7a56463
    39  	return uint32((u * prime8bytes) >> ((64 - h) & 63))
    40  }
    41  
// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterGo(dst, src []byte) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 17
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits
	)

	// lTable maps hash7 of 7 bytes -> most recent position (long matches).
	// sTable maps hash4 of 4 bytes -> most recent position (short matches).
	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We initialize repeat to 0, so we never match on first attempt
	repeat := 0

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check.
			// The step grows the longer we go without finding a match
			// (skip heuristic), so incompressible data is scanned quickly.
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				goto emitRemainder
			}
			// Probe both tables at the current position and record s in each.
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if cv == valLong {
				break
			}
			if cv == valShort {
				candidateL = candidateS
				break
			}

			// Check repeat at offset checkRep.
			const checkRep = 1
			// Minimum length of a repeat. Tested with various values.
			// While 4-5 offers improvements in some, 6 reduces
			// regressions significantly.
			const wantRepeatBytes = 6
			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
			// NOTE: this repeat-match branch is intentionally disabled via the
			// leading `false &&` — the condition short-circuits and the body is
			// never executed. Kept in place for experimentation.
			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				candidate := s - repeat + wantRepeatBytes + checkRep
				s += wantRepeatBytes + checkRep
				for s < len(src) {
					if len(src)-s < 8 {
						// Tail: compare byte by byte once fewer than 8 bytes remain.
						if src[s] == src[candidate] {
							s++
							candidate++
							continue
						}
						break
					}
					// Compare 8 bytes at a time; the first differing byte is
					// found via the trailing zero count of the XOR.
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}
				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
				d += emitRepeat(dst[d:], repeat, s-base)
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}
				// Index in-between
				index0 := base + 1
				index1 := s - 2

				for index0 < index1 {
					cv0 := load64(src, index0)
					cv1 := load64(src, index1)
					lTable[hash7(cv0, lTableBits)] = uint32(index0)
					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

					lTable[hash7(cv1, lTableBits)] = uint32(index1)
					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
					index0 += 2
					index1 -= 2
				}

				cv = load64(src, s)
				continue
			}

			// Long likely matches 7, so take that.
			if uint32(cv) == uint32(valLong) {
				break
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				// Byte-wise tail compare near the end of src.
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		// Large-offset short matches are likely to encode no smaller than the
		// literals they replace (unless they match the current repeat offset),
		// so skip them and resume searching just past nextS.
		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal or worse to the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		// Emit pending literals, then the match — as a repeat if the offset
		// equals the last used offset, otherwise as a copy.
		d += emitLiteral(dst[d:], src[nextEmit:base])
		if repeat == offset {
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Do we have space for more, if not bail.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		// lTable could be postponed, but very minor difference.
		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index large values sparsely in between.
		// We do two starting from different offsets for speed.
		index2 := (index0 + index1 + 1) >> 1
		for index2 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
			index0 += 2
			index2 += 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		// Flush whatever is left of src as one final literal run.
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
   299  
// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 16
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits
	)

	// lTable maps hash7 of 7 bytes -> most recent position (long matches).
	// sTable maps hash4 of 4 bytes -> most recent position (short matches).
	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We initialize repeat to 0, so we never match on first attempt
	repeat := 0
	// maxSkip caps how far the skip heuristic may jump in one step.
	const maxSkip = 100

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check.
			// Step grows with the distance since the last emit, but is
			// clamped to maxSkip.
			nextS = (s-nextEmit)>>7 + 1
			if nextS > maxSkip {
				nextS = s + maxSkip
			} else {
				nextS += s
			}

			if nextS > sLimit {
				goto emitRemainder
			}
			// Probe both tables at the current position and record s in each.
			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			if uint32(cv) == load32(src, candidateL) {
				break
			}

			// Check our short candidate
			if uint32(cv) == load32(src, candidateS) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				// Byte-wise tail compare near the end of src.
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			// Compare 8 bytes at a time; the first differing byte is found
			// via the trailing zero count of the XOR.
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		// Large-offset short matches are unlikely to encode smaller than the
		// literals they replace, so skip them and resume just past nextS.
		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is equal or worse to the encoding.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		// Emit pending literals, then the copy. emitCopyNoRepeat is used here
		// (no repeat codes in the output); repeat is still tracked for the
		// skip condition above.
		d += emitLiteral(dst[d:], src[nextEmit:base])
		d += emitCopyNoRepeat(dst[d:], offset, s-base)
		repeat = offset

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Do we have space for more, if not bail.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index large values sparsely in between.
		// We do two starting from different offsets for speed.
		index2 := (index0 + index1 + 1) >> 1
		for index2 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
			index0 += 2
			index2 += 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		// Flush whatever is left of src as one final literal run.
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
   485  
   486  // encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
   487  // assumes that the varint-encoded length of the decompressed bytes has already
   488  // been written.
   489  //
   490  // It also assumes that:
   491  //
   492  //	len(dst) >= MaxEncodedLen(len(src)) &&
   493  //	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
   494  func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
   495  	// sLimit is when to stop looking for offset/length copies. The inputMargin
   496  	// lets us use a fast path for emitLiteral in the main loop, while we are
   497  	// looking for copies.
   498  	// Initialize the hash tables.
   499  	const (
   500  		// Long hash matches.
   501  		lTableBits    = 17
   502  		maxLTableSize = 1 << lTableBits
   503  
   504  		// Short hash matches.
   505  		sTableBits    = 14
   506  		maxSTableSize = 1 << sTableBits
   507  
   508  		maxAhead = 8 // maximum bytes ahead without checking sLimit
   509  
   510  		debug = false
   511  	)
   512  
   513  	sLimit := len(src) - inputMargin
   514  	if sLimit > MaxDictSrcOffset-maxAhead {
   515  		sLimit = MaxDictSrcOffset - maxAhead
   516  	}
   517  	if len(src) < minNonLiteralBlockSize {
   518  		return 0
   519  	}
   520  
   521  	dict.initBetter()
   522  
   523  	var lTable [maxLTableSize]uint32
   524  	var sTable [maxSTableSize]uint32
   525  
   526  	// Bail if we can't compress to at least this.
   527  	dstLimit := len(src) - len(src)>>5 - 6
   528  
   529  	// nextEmit is where in src the next emitLiteral should start from.
   530  	nextEmit := 0
   531  
   532  	// The encoded form must start with a literal, as there are no previous
   533  	// bytes to copy, so we start looking for hash matches at s == 1.
   534  	s := 0
   535  	cv := load64(src, s)
   536  
   537  	// We initialize repeat to 0, so we never match on first attempt
   538  	repeat := len(dict.dict) - dict.repeat
   539  
   540  	// While in dict
   541  searchDict:
   542  	for {
   543  		candidateL := 0
   544  		nextS := 0
   545  		for {
   546  			// Next src position to check
   547  			nextS = s + (s-nextEmit)>>7 + 1
   548  			if nextS > sLimit {
   549  				break searchDict
   550  			}
   551  			hashL := hash7(cv, lTableBits)
   552  			hashS := hash4(cv, sTableBits)
   553  			candidateL = int(lTable[hashL])
   554  			candidateS := int(sTable[hashS])
   555  			dictL := int(dict.betterTableLong[hashL])
   556  			dictS := int(dict.betterTableShort[hashS])
   557  			lTable[hashL] = uint32(s)
   558  			sTable[hashS] = uint32(s)
   559  
   560  			valLong := load64(src, candidateL)
   561  			valShort := load64(src, candidateS)
   562  
   563  			// If long matches at least 8 bytes, use that.
   564  			if s != 0 {
   565  				if cv == valLong {
   566  					goto emitMatch
   567  				}
   568  				if cv == valShort {
   569  					candidateL = candidateS
   570  					goto emitMatch
   571  				}
   572  			}
   573  
   574  			// Check dict repeat.
   575  			if repeat >= s+4 {
   576  				candidate := len(dict.dict) - repeat + s
   577  				if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) {
   578  					// Extend back
   579  					base := s
   580  					for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
   581  						i--
   582  						base--
   583  					}
   584  					d += emitLiteral(dst[d:], src[nextEmit:base])
   585  					if debug && nextEmit != base {
   586  						fmt.Println("emitted ", base-nextEmit, "literals")
   587  					}
   588  					s += 4
   589  					candidate += 4
   590  					for candidate < len(dict.dict)-8 && s <= len(src)-8 {
   591  						if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
   592  							s += bits.TrailingZeros64(diff) >> 3
   593  							break
   594  						}
   595  						s += 8
   596  						candidate += 8
   597  					}
   598  					d += emitRepeat(dst[d:], repeat, s-base)
   599  					if debug {
   600  						fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
   601  					}
   602  					nextEmit = s
   603  					if s >= sLimit {
   604  						break searchDict
   605  					}
   606  					// Index in-between
   607  					index0 := base + 1
   608  					index1 := s - 2
   609  
   610  					cv = load64(src, s)
   611  					for index0 < index1 {
   612  						cv0 := load64(src, index0)
   613  						cv1 := load64(src, index1)
   614  						lTable[hash7(cv0, lTableBits)] = uint32(index0)
   615  						sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
   616  
   617  						lTable[hash7(cv1, lTableBits)] = uint32(index1)
   618  						sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
   619  						index0 += 2
   620  						index1 -= 2
   621  					}
   622  					continue
   623  				}
   624  			}
   625  			// Don't try to find match at s==0
   626  			if s == 0 {
   627  				cv = load64(src, nextS)
   628  				s = nextS
   629  				continue
   630  			}
   631  
   632  			// Long likely matches 7, so take that.
   633  			if uint32(cv) == uint32(valLong) {
   634  				goto emitMatch
   635  			}
   636  
   637  			// Long dict...
   638  			if uint32(cv) == load32(dict.dict, dictL) {
   639  				candidateL = dictL
   640  				goto emitDict
   641  			}
   642  
   643  			// Check our short candidate
   644  			if uint32(cv) == uint32(valShort) {
   645  				// Try a long candidate at s+1
   646  				hashL = hash7(cv>>8, lTableBits)
   647  				candidateL = int(lTable[hashL])
   648  				lTable[hashL] = uint32(s + 1)
   649  				if uint32(cv>>8) == load32(src, candidateL) {
   650  					s++
   651  					goto emitMatch
   652  				}
   653  				// Use our short candidate.
   654  				candidateL = candidateS
   655  				goto emitMatch
   656  			}
   657  			if uint32(cv) == load32(dict.dict, dictS) {
   658  				// Try a long candidate at s+1
   659  				hashL = hash7(cv>>8, lTableBits)
   660  				candidateL = int(lTable[hashL])
   661  				lTable[hashL] = uint32(s + 1)
   662  				if uint32(cv>>8) == load32(src, candidateL) {
   663  					s++
   664  					goto emitMatch
   665  				}
   666  				candidateL = dictS
   667  				goto emitDict
   668  			}
   669  			cv = load64(src, nextS)
   670  			s = nextS
   671  		}
   672  	emitDict:
   673  		{
   674  			if debug {
   675  				if load32(dict.dict, candidateL) != load32(src, s) {
   676  					panic("dict emit mismatch")
   677  				}
   678  			}
   679  			// Extend backwards.
   680  			// The top bytes will be rechecked to get the full match.
   681  			for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] {
   682  				candidateL--
   683  				s--
   684  			}
   685  
   686  			// Bail if we exceed the maximum size.
   687  			if d+(s-nextEmit) > dstLimit {
   688  				return 0
   689  			}
   690  
   691  			// A 4-byte match has been found. We'll later see if more than 4 bytes
   692  			// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
   693  			// them as literal bytes.
   694  
   695  			d += emitLiteral(dst[d:], src[nextEmit:s])
   696  			if debug && nextEmit != s {
   697  				fmt.Println("emitted ", s-nextEmit, "literals")
   698  			}
   699  			{
   700  				// Invariant: we have a 4-byte match at s, and no need to emit any
   701  				// literal bytes prior to s.
   702  				base := s
   703  				offset := s + (len(dict.dict)) - candidateL
   704  
   705  				// Extend the 4-byte match as long as possible.
   706  				s += 4
   707  				candidateL += 4
   708  				for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 {
   709  					if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 {
   710  						s += bits.TrailingZeros64(diff) >> 3
   711  						break
   712  					}
   713  					s += 8
   714  					candidateL += 8
   715  				}
   716  
   717  				if repeat == offset {
   718  					if debug {
   719  						fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
   720  					}
   721  					d += emitRepeat(dst[d:], offset, s-base)
   722  				} else {
   723  					if debug {
   724  						fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
   725  					}
   726  					// Matches longer than 64 are split.
   727  					if s <= sLimit || s-base < 8 {
   728  						d += emitCopy(dst[d:], offset, s-base)
   729  					} else {
   730  						// Split to ensure we don't start a copy within next block.
   731  						d += emitCopy(dst[d:], offset, 4)
   732  						d += emitRepeat(dst[d:], offset, s-base-4)
   733  					}
   734  					repeat = offset
   735  				}
   736  				if false {
   737  					// Validate match.
   738  					if s <= candidateL {
   739  						panic("s <= candidate")
   740  					}
   741  					a := src[base:s]
   742  					b := dict.dict[base-repeat : base-repeat+(s-base)]
   743  					if !bytes.Equal(a, b) {
   744  						panic("mismatch")
   745  					}
   746  				}
   747  
   748  				nextEmit = s
   749  				if s >= sLimit {
   750  					break searchDict
   751  				}
   752  
   753  				if d > dstLimit {
   754  					// Do we have space for more, if not bail.
   755  					return 0
   756  				}
   757  
   758  				// Index short & long
   759  				index0 := base + 1
   760  				index1 := s - 2
   761  
   762  				cv0 := load64(src, index0)
   763  				cv1 := load64(src, index1)
   764  				lTable[hash7(cv0, lTableBits)] = uint32(index0)
   765  				sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
   766  
   767  				lTable[hash7(cv1, lTableBits)] = uint32(index1)
   768  				sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
   769  				index0 += 1
   770  				index1 -= 1
   771  				cv = load64(src, s)
   772  
   773  				// index every second long in between.
   774  				for index0 < index1 {
   775  					lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
   776  					lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
   777  					index0 += 2
   778  					index1 -= 2
   779  				}
   780  			}
   781  			continue
   782  		}
   783  	emitMatch:
   784  
   785  		// Extend backwards
   786  		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
   787  			candidateL--
   788  			s--
   789  		}
   790  
   791  		// Bail if we exceed the maximum size.
   792  		if d+(s-nextEmit) > dstLimit {
   793  			return 0
   794  		}
   795  
   796  		base := s
   797  		offset := base - candidateL
   798  
   799  		// Extend the 4-byte match as long as possible.
   800  		s += 4
   801  		candidateL += 4
   802  		for s < len(src) {
   803  			if len(src)-s < 8 {
   804  				if src[s] == src[candidateL] {
   805  					s++
   806  					candidateL++
   807  					continue
   808  				}
   809  				break
   810  			}
   811  			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
   812  				s += bits.TrailingZeros64(diff) >> 3
   813  				break
   814  			}
   815  			s += 8
   816  			candidateL += 8
   817  		}
   818  
   819  		if offset > 65535 && s-base <= 5 && repeat != offset {
   820  			// Bail if the match is equal or worse to the encoding.
   821  			s = nextS + 1
   822  			if s >= sLimit {
   823  				goto emitRemainder
   824  			}
   825  			cv = load64(src, s)
   826  			continue
   827  		}
   828  
   829  		d += emitLiteral(dst[d:], src[nextEmit:base])
   830  		if debug && nextEmit != s {
   831  			fmt.Println("emitted ", s-nextEmit, "literals")
   832  		}
   833  		if repeat == offset {
   834  			if debug {
   835  				fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s)
   836  			}
   837  			d += emitRepeat(dst[d:], offset, s-base)
   838  		} else {
   839  			if debug {
   840  				fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s)
   841  			}
   842  			d += emitCopy(dst[d:], offset, s-base)
   843  			repeat = offset
   844  		}
   845  
   846  		nextEmit = s
   847  		if s >= sLimit {
   848  			goto emitRemainder
   849  		}
   850  
   851  		if d > dstLimit {
   852  			// Do we have space for more, if not bail.
   853  			return 0
   854  		}
   855  
   856  		// Index short & long
   857  		index0 := base + 1
   858  		index1 := s - 2
   859  
   860  		cv0 := load64(src, index0)
   861  		cv1 := load64(src, index1)
   862  		lTable[hash7(cv0, lTableBits)] = uint32(index0)
   863  		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
   864  
   865  		lTable[hash7(cv1, lTableBits)] = uint32(index1)
   866  		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
   867  		index0 += 1
   868  		index1 -= 1
   869  		cv = load64(src, s)
   870  
   871  		// Index large values sparsely in between.
   872  		// We do two starting from different offsets for speed.
   873  		index2 := (index0 + index1 + 1) >> 1
   874  		for index2 < index1 {
   875  			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
   876  			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
   877  			index0 += 2
   878  			index2 += 2
   879  		}
   880  	}
   881  
   882  	// Search without dict:
   883  	if repeat > s {
   884  		repeat = 0
   885  	}
   886  
   887  	// No more dict
   888  	sLimit = len(src) - inputMargin
   889  	if s >= sLimit {
   890  		goto emitRemainder
   891  	}
   892  	cv = load64(src, s)
   893  	if debug {
   894  		fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
   895  	}
   896  	for {
   897  		candidateL := 0
   898  		nextS := 0
   899  		for {
   900  			// Next src position to check
   901  			nextS = s + (s-nextEmit)>>7 + 1
   902  			if nextS > sLimit {
   903  				goto emitRemainder
   904  			}
   905  			hashL := hash7(cv, lTableBits)
   906  			hashS := hash4(cv, sTableBits)
   907  			candidateL = int(lTable[hashL])
   908  			candidateS := int(sTable[hashS])
   909  			lTable[hashL] = uint32(s)
   910  			sTable[hashS] = uint32(s)
   911  
   912  			valLong := load64(src, candidateL)
   913  			valShort := load64(src, candidateS)
   914  
   915  			// If long matches at least 8 bytes, use that.
   916  			if cv == valLong {
   917  				break
   918  			}
   919  			if cv == valShort {
   920  				candidateL = candidateS
   921  				break
   922  			}
   923  
   924  			// Check repeat at offset checkRep.
   925  			const checkRep = 1
   926  			// Minimum length of a repeat. Tested with various values.
   927  			// While 4-5 offers improvements in some, 6 reduces
   928  			// regressions significantly.
   929  			const wantRepeatBytes = 6
   930  			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
   931  			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
   932  				base := s + checkRep
   933  				// Extend back
   934  				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
   935  					i--
   936  					base--
   937  				}
   938  				d += emitLiteral(dst[d:], src[nextEmit:base])
   939  
   940  				// Extend forward
   941  				candidate := s - repeat + wantRepeatBytes + checkRep
   942  				s += wantRepeatBytes + checkRep
   943  				for s < len(src) {
   944  					if len(src)-s < 8 {
   945  						if src[s] == src[candidate] {
   946  							s++
   947  							candidate++
   948  							continue
   949  						}
   950  						break
   951  					}
   952  					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
   953  						s += bits.TrailingZeros64(diff) >> 3
   954  						break
   955  					}
   956  					s += 8
   957  					candidate += 8
   958  				}
   959  				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
   960  				d += emitRepeat(dst[d:], repeat, s-base)
   961  				nextEmit = s
   962  				if s >= sLimit {
   963  					goto emitRemainder
   964  				}
   965  				// Index in-between
   966  				index0 := base + 1
   967  				index1 := s - 2
   968  
   969  				for index0 < index1 {
   970  					cv0 := load64(src, index0)
   971  					cv1 := load64(src, index1)
   972  					lTable[hash7(cv0, lTableBits)] = uint32(index0)
   973  					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
   974  
   975  					lTable[hash7(cv1, lTableBits)] = uint32(index1)
   976  					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
   977  					index0 += 2
   978  					index1 -= 2
   979  				}
   980  
   981  				cv = load64(src, s)
   982  				continue
   983  			}
   984  
   985  			// Long likely matches 7, so take that.
   986  			if uint32(cv) == uint32(valLong) {
   987  				break
   988  			}
   989  
   990  			// Check our short candidate
   991  			if uint32(cv) == uint32(valShort) {
   992  				// Try a long candidate at s+1
   993  				hashL = hash7(cv>>8, lTableBits)
   994  				candidateL = int(lTable[hashL])
   995  				lTable[hashL] = uint32(s + 1)
   996  				if uint32(cv>>8) == load32(src, candidateL) {
   997  					s++
   998  					break
   999  				}
  1000  				// Use our short candidate.
  1001  				candidateL = candidateS
  1002  				break
  1003  			}
  1004  
  1005  			cv = load64(src, nextS)
  1006  			s = nextS
  1007  		}
  1008  
  1009  		// Extend backwards
  1010  		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
  1011  			candidateL--
  1012  			s--
  1013  		}
  1014  
  1015  		// Bail if we exceed the maximum size.
  1016  		if d+(s-nextEmit) > dstLimit {
  1017  			return 0
  1018  		}
  1019  
  1020  		base := s
  1021  		offset := base - candidateL
  1022  
  1023  		// Extend the 4-byte match as long as possible.
  1024  		s += 4
  1025  		candidateL += 4
  1026  		for s < len(src) {
  1027  			if len(src)-s < 8 {
  1028  				if src[s] == src[candidateL] {
  1029  					s++
  1030  					candidateL++
  1031  					continue
  1032  				}
  1033  				break
  1034  			}
  1035  			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
  1036  				s += bits.TrailingZeros64(diff) >> 3
  1037  				break
  1038  			}
  1039  			s += 8
  1040  			candidateL += 8
  1041  		}
  1042  
  1043  		if offset > 65535 && s-base <= 5 && repeat != offset {
  1044  			// Bail if the match is equal or worse to the encoding.
  1045  			s = nextS + 1
  1046  			if s >= sLimit {
  1047  				goto emitRemainder
  1048  			}
  1049  			cv = load64(src, s)
  1050  			continue
  1051  		}
  1052  
  1053  		d += emitLiteral(dst[d:], src[nextEmit:base])
  1054  		if repeat == offset {
  1055  			d += emitRepeat(dst[d:], offset, s-base)
  1056  		} else {
  1057  			d += emitCopy(dst[d:], offset, s-base)
  1058  			repeat = offset
  1059  		}
  1060  
  1061  		nextEmit = s
  1062  		if s >= sLimit {
  1063  			goto emitRemainder
  1064  		}
  1065  
  1066  		if d > dstLimit {
  1067  			// Do we have space for more, if not bail.
  1068  			return 0
  1069  		}
  1070  
  1071  		// Index short & long
  1072  		index0 := base + 1
  1073  		index1 := s - 2
  1074  
  1075  		cv0 := load64(src, index0)
  1076  		cv1 := load64(src, index1)
  1077  		lTable[hash7(cv0, lTableBits)] = uint32(index0)
  1078  		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
  1079  
  1080  		lTable[hash7(cv1, lTableBits)] = uint32(index1)
  1081  		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
  1082  		index0 += 1
  1083  		index1 -= 1
  1084  		cv = load64(src, s)
  1085  
  1086  		// Index large values sparsely in between.
  1087  		// We do two starting from different offsets for speed.
  1088  		index2 := (index0 + index1 + 1) >> 1
  1089  		for index2 < index1 {
  1090  			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
  1091  			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
  1092  			index0 += 2
  1093  			index2 += 2
  1094  		}
  1095  	}
  1096  
  1097  emitRemainder:
  1098  	if nextEmit < len(src) {
  1099  		// Bail if we exceed the maximum size.
  1100  		if d+len(src)-nextEmit > dstLimit {
  1101  			return 0
  1102  		}
  1103  		d += emitLiteral(dst[d:], src[nextEmit:])
  1104  	}
  1105  	return d
  1106  }
  1107  

View as plain text