...

Source file src/github.com/klauspost/compress/flate/token.go

Documentation: github.com/klauspost/compress/flate

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package flate
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  )
    14  
    15  const (
    16  	// bits 0-16  	xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits
    17  	// bits 16-22	offsetcode - 5 bits
    18  	// bits 22-30   xlength = length - MIN_MATCH_LENGTH - 8 bits
    19  	// bits 30-32   type   0 = literal  1=EOF  2=Match   3=Unused - 2 bits
    20  	lengthShift         = 22
    21  	offsetMask          = 1<<lengthShift - 1
    22  	typeMask            = 3 << 30
    23  	literalType         = 0 << 30
    24  	matchType           = 1 << 30
    25  	matchOffsetOnlyMask = 0xffff
    26  )
    27  
    28  // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
    29  // is lengthCodes[length - MIN_MATCH_LENGTH]
    30  var lengthCodes = [256]uint8{
    31  	0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
    32  	9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
    33  	13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
    34  	15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
    35  	17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
    36  	18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
    37  	19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
    38  	20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    39  	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
    40  	21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
    41  	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
    42  	22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
    43  	23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
    44  	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    45  	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    46  	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    47  	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    48  	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    49  	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    50  	25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
    51  	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    52  	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    53  	26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
    54  	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
    55  	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
    56  	27, 27, 27, 27, 27, 28,
    57  }
    58  
    59  // lengthCodes1 is length codes, but starting at 1.
    60  var lengthCodes1 = [256]uint8{
    61  	1, 2, 3, 4, 5, 6, 7, 8, 9, 9,
    62  	10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
    63  	14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
    64  	16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
    65  	18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
    66  	19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
    67  	20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
    68  	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
    69  	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
    70  	22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
    71  	23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
    72  	23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
    73  	24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
    74  	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    75  	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    76  	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
    77  	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    78  	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    79  	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    80  	26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
    81  	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
    82  	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
    83  	27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
    84  	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
    85  	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
    86  	28, 28, 28, 28, 28, 29,
    87  }
    88  
    89  var offsetCodes = [256]uint32{
    90  	0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
    91  	8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
    92  	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    93  	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
    94  	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    95  	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    96  	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
    97  	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
    98  	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
    99  	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
   100  	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
   101  	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
   102  	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   103  	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   104  	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   105  	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   106  }
   107  
   108  // offsetCodes14 are offsetCodes, but with 14 added.
   109  var offsetCodes14 = [256]uint32{
   110  	14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
   111  	22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
   112  	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
   113  	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
   114  	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
   115  	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
   116  	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
   117  	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
   118  	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
   119  	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
   120  	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
   121  	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
   122  	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
   123  	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
   124  	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
   125  	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
   126  }
   127  
   128  type token uint32
   129  
   130  type tokens struct {
   131  	extraHist [32]uint16  // codes 256->maxnumlit
   132  	offHist   [32]uint16  // offset codes
   133  	litHist   [256]uint16 // codes 0->255
   134  	nFilled   int
   135  	n         uint16 // Must be able to contain maxStoreBlockSize
   136  	tokens    [maxStoreBlockSize + 1]token
   137  }
   138  
   139  func (t *tokens) Reset() {
   140  	if t.n == 0 {
   141  		return
   142  	}
   143  	t.n = 0
   144  	t.nFilled = 0
   145  	for i := range t.litHist[:] {
   146  		t.litHist[i] = 0
   147  	}
   148  	for i := range t.extraHist[:] {
   149  		t.extraHist[i] = 0
   150  	}
   151  	for i := range t.offHist[:] {
   152  		t.offHist[i] = 0
   153  	}
   154  }
   155  
   156  func (t *tokens) Fill() {
   157  	if t.n == 0 {
   158  		return
   159  	}
   160  	for i, v := range t.litHist[:] {
   161  		if v == 0 {
   162  			t.litHist[i] = 1
   163  			t.nFilled++
   164  		}
   165  	}
   166  	for i, v := range t.extraHist[:literalCount-256] {
   167  		if v == 0 {
   168  			t.nFilled++
   169  			t.extraHist[i] = 1
   170  		}
   171  	}
   172  	for i, v := range t.offHist[:offsetCodeCount] {
   173  		if v == 0 {
   174  			t.offHist[i] = 1
   175  		}
   176  	}
   177  }
   178  
   179  func indexTokens(in []token) tokens {
   180  	var t tokens
   181  	t.indexTokens(in)
   182  	return t
   183  }
   184  
   185  func (t *tokens) indexTokens(in []token) {
   186  	t.Reset()
   187  	for _, tok := range in {
   188  		if tok < matchType {
   189  			t.AddLiteral(tok.literal())
   190  			continue
   191  		}
   192  		t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask)
   193  	}
   194  }
   195  
   196  // emitLiteral writes a literal chunk and returns the number of bytes written.
   197  func emitLiteral(dst *tokens, lit []byte) {
   198  	for _, v := range lit {
   199  		dst.tokens[dst.n] = token(v)
   200  		dst.litHist[v]++
   201  		dst.n++
   202  	}
   203  }
   204  
   205  func (t *tokens) AddLiteral(lit byte) {
   206  	t.tokens[t.n] = token(lit)
   207  	t.litHist[lit]++
   208  	t.n++
   209  }
   210  
   211  // from https://stackoverflow.com/a/28730362
   212  func mFastLog2(val float32) float32 {
   213  	ux := int32(math.Float32bits(val))
   214  	log2 := (float32)(((ux >> 23) & 255) - 128)
   215  	ux &= -0x7f800001
   216  	ux += 127 << 23
   217  	uval := math.Float32frombits(uint32(ux))
   218  	log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759
   219  	return log2
   220  }
   221  
   222  // EstimatedBits will return an minimum size estimated by an *optimal*
   223  // compression of the block.
   224  // The size of the block
   225  func (t *tokens) EstimatedBits() int {
   226  	shannon := float32(0)
   227  	bits := int(0)
   228  	nMatches := 0
   229  	total := int(t.n) + t.nFilled
   230  	if total > 0 {
   231  		invTotal := 1.0 / float32(total)
   232  		for _, v := range t.litHist[:] {
   233  			if v > 0 {
   234  				n := float32(v)
   235  				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
   236  			}
   237  		}
   238  		// Just add 15 for EOB
   239  		shannon += 15
   240  		for i, v := range t.extraHist[1 : literalCount-256] {
   241  			if v > 0 {
   242  				n := float32(v)
   243  				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
   244  				bits += int(lengthExtraBits[i&31]) * int(v)
   245  				nMatches += int(v)
   246  			}
   247  		}
   248  	}
   249  	if nMatches > 0 {
   250  		invTotal := 1.0 / float32(nMatches)
   251  		for i, v := range t.offHist[:offsetCodeCount] {
   252  			if v > 0 {
   253  				n := float32(v)
   254  				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
   255  				bits += int(offsetExtraBits[i&31]) * int(v)
   256  			}
   257  		}
   258  	}
   259  	return int(shannon) + bits
   260  }
   261  
   262  // AddMatch adds a match to the tokens.
   263  // This function is very sensitive to inlining and right on the border.
   264  func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
   265  	if debugDeflate {
   266  		if xlength >= maxMatchLength+baseMatchLength {
   267  			panic(fmt.Errorf("invalid length: %v", xlength))
   268  		}
   269  		if xoffset >= maxMatchOffset+baseMatchOffset {
   270  			panic(fmt.Errorf("invalid offset: %v", xoffset))
   271  		}
   272  	}
   273  	oCode := offsetCode(xoffset)
   274  	xoffset |= oCode << 16
   275  
   276  	t.extraHist[lengthCodes1[uint8(xlength)]]++
   277  	t.offHist[oCode&31]++
   278  	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
   279  	t.n++
   280  }
   281  
   282  // AddMatchLong adds a match to the tokens, potentially longer than max match length.
   283  // Length should NOT have the base subtracted, only offset should.
   284  func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
   285  	if debugDeflate {
   286  		if xoffset >= maxMatchOffset+baseMatchOffset {
   287  			panic(fmt.Errorf("invalid offset: %v", xoffset))
   288  		}
   289  	}
   290  	oc := offsetCode(xoffset)
   291  	xoffset |= oc << 16
   292  	for xlength > 0 {
   293  		xl := xlength
   294  		if xl > 258 {
   295  			// We need to have at least baseMatchLength left over for next loop.
   296  			if xl > 258+baseMatchLength {
   297  				xl = 258
   298  			} else {
   299  				xl = 258 - baseMatchLength
   300  			}
   301  		}
   302  		xlength -= xl
   303  		xl -= baseMatchLength
   304  		t.extraHist[lengthCodes1[uint8(xl)]]++
   305  		t.offHist[oc&31]++
   306  		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
   307  		t.n++
   308  	}
   309  }
   310  
   311  func (t *tokens) AddEOB() {
   312  	t.tokens[t.n] = token(endBlockMarker)
   313  	t.extraHist[0]++
   314  	t.n++
   315  }
   316  
   317  func (t *tokens) Slice() []token {
   318  	return t.tokens[:t.n]
   319  }
   320  
   321  // VarInt returns the tokens as varint encoded bytes.
   322  func (t *tokens) VarInt() []byte {
   323  	var b = make([]byte, binary.MaxVarintLen32*int(t.n))
   324  	var off int
   325  	for _, v := range t.tokens[:t.n] {
   326  		off += binary.PutUvarint(b[off:], uint64(v))
   327  	}
   328  	return b[:off]
   329  }
   330  
   331  // FromVarInt restores t to the varint encoded tokens provided.
   332  // Any data in t is removed.
   333  func (t *tokens) FromVarInt(b []byte) error {
   334  	var buf = bytes.NewReader(b)
   335  	var toks []token
   336  	for {
   337  		r, err := binary.ReadUvarint(buf)
   338  		if err == io.EOF {
   339  			break
   340  		}
   341  		if err != nil {
   342  			return err
   343  		}
   344  		toks = append(toks, token(r))
   345  	}
   346  	t.indexTokens(toks)
   347  	return nil
   348  }
   349  
   350  // Returns the type of a token
   351  func (t token) typ() uint32 { return uint32(t) & typeMask }
   352  
   353  // Returns the literal of a literal token
   354  func (t token) literal() uint8 { return uint8(t) }
   355  
   356  // Returns the extra offset of a match token
   357  func (t token) offset() uint32 { return uint32(t) & offsetMask }
   358  
   359  func (t token) length() uint8 { return uint8(t >> lengthShift) }
   360  
   361  // Convert length to code.
   362  func lengthCode(len uint8) uint8 { return lengthCodes[len] }
   363  
   364  // Returns the offset code corresponding to a specific offset
   365  func offsetCode(off uint32) uint32 {
   366  	if false {
   367  		if off < uint32(len(offsetCodes)) {
   368  			return offsetCodes[off&255]
   369  		} else if off>>7 < uint32(len(offsetCodes)) {
   370  			return offsetCodes[(off>>7)&255] + 14
   371  		} else {
   372  			return offsetCodes[(off>>14)&255] + 28
   373  		}
   374  	}
   375  	if off < uint32(len(offsetCodes)) {
   376  		return offsetCodes[uint8(off)]
   377  	}
   378  	return offsetCodes14[uint8(off>>7)]
   379  }
   380  

View as plain text