...

Source file src/github.com/klauspost/compress/huff0/decompress_amd64.go

Documentation: github.com/klauspost/compress/huff0

     1  //go:build amd64 && !appengine && !noasm && gc
     2  // +build amd64,!appengine,!noasm,gc
     3  
     4  // This file contains the specialisation of Decoder.Decompress4X
     5  // and Decoder.Decompress1X that use an asm implementation of their main loops.
     6  package huff0
     7  
     8  import (
     9  	"errors"
    10  	"fmt"
    11  
    12  	"github.com/klauspost/compress/internal/cpuinfo"
    13  )
    14  
    15  // decompress4x_main_loop_amd64 is an amd64 assembler implementation
    16  // of the Decompress4X main loop, used when tablelog > 8.
    17  //
    18  //go:noescape
    19  func decompress4x_main_loop_amd64(ctx *decompress4xContext)
    20  
    21  // decompress4x_8b_main_loop_amd64 is an amd64 assembler implementation
    22  // of the Decompress4X main loop, used when tablelog <= 8. It decodes
    23  // 4 entries per loop.
    24  //
    25  //go:noescape
    26  func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
    27  
    28  // fallback8BitSize is the output capacity below which Decompress4X uses the Go version for 8-bit tables, as it is faster there.
    29  const fallback8BitSize = 800
    30  
    31  type decompress4xContext struct { // in/out arguments of the asm 4X main loops; NOTE(review): layout is presumably mirrored in the asm — confirm before reordering
    32  	pbr      *[4]bitReaderShifted // bit readers of the four streams
    33  	peekBits uint8                // (64 - tableLog) & 63, see bitReaderShifted.peekBitsFast()
    34  	out      *byte                // first byte of the output buffer
    35  	dstEvery int                  // distance in bytes between the output start of consecutive streams
    36  	tbl      *dEntrySingle        // first entry of the decoding table
    37  	decoded  int                  // read back after the asm call; Decompress4X advances each stream's output by decoded/4
    38  	limit    *byte                // the first stream stops decoding here, to avoid writing OOB on the last one
    39  }
    40  
    41  // Decompress4X decodes data that was encoded as four separate bitstreams (4X).
    42  // src must end exactly where the encoded block ends.
    43  // The output size is taken from the *capacity* of dst, not its length, so
    44  // cap(dst) must equal the size of the uncompressed data exactly.
    45  func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
    46  	if len(d.dt.single) == 0 {
    47  		return nil, errors.New("no table loaded")
    48  	}
    49  	if len(src) < 6+(4*1) {
    50  		return nil, errors.New("input too small")
    51  	}
    52    
    53  	use8BitTables := d.actualTableLog <= 8
    54  	if use8BitTables && cap(dst) < fallback8BitSize {
    55  		return d.decompress4X8bit(dst, src)
    56  	}
    57    
    58  	// The first 6 bytes are a "jump table": three little-endian 16 bit
    59  	// lengths for streams 0-2. Stream 3 is whatever follows them.
    60  	var br [4]bitReaderShifted
    61  	pos := 6
    62  	for i := range br[:3] {
    63  		size := int(src[2*i]) | int(src[2*i+1])<<8
    64  		end := pos + size
    65  		if end >= len(src) {
    66  			return nil, errors.New("truncated input (or invalid offset)")
    67  		}
    68  		if err := br[i].init(src[pos:end]); err != nil {
    69  			return nil, err
    70  		}
    71  		pos = end
    72  	}
    73  	if err := br[3].init(src[pos:]); err != nil {
    74  		return nil, err
    75  	}
    76    
    77  	// Stream i writes its output starting at dst[i*dstEvery].
    78  	dstSize := cap(dst)
    79  	dst = dst[:dstSize]
    80  	dstEvery := (dstSize + 3) / 4
    81  	out := dst
    82    
    83  	const tlSize = 1 << tableLogMax
    84  	const tlMask = tlSize - 1
    85  	single := d.dt.single[:tlSize]
    86    
    87  	decoded := 0
    88    
    89  	// Run the asm main loop only when the output is larger than 16 bytes
    90  	// and all four bit readers report off >= 4.
    91  	asmOK := br[0].off >= 4 && br[1].off >= 4 && br[2].off >= 4 && br[3].off >= 4
    92  	if len(out) > 4*4 && asmOK {
    93  		ctx := decompress4xContext{
    94  			pbr:      &br,
    95  			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
    96  			out:      &out[0],
    97  			dstEvery: dstEvery,
    98  			tbl:      &single[0],
    99  			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
   100  		}
   101  		if !use8BitTables {
   102  			decompress4x_main_loop_amd64(&ctx)
   103  		} else {
   104  			decompress4x_8b_main_loop_amd64(&ctx)
   105  		}
   106    
   107  		// ctx.decoded counts the bytes of all four streams together.
   108  		decoded = ctx.decoded
   109  		out = out[decoded/4:]
   110  	}
   111    
   112  	// Finish each stream in Go, one symbol per iteration.
   113  	remainBytes := dstEvery - decoded/4
   114  	for i := range br {
   115  		r := &br[i]
   116  		first := dstEvery * i
   117  		offset := first
   118  		endsAt := first + remainBytes
   119  		if endsAt > len(out) {
   120  			endsAt = len(out)
   121  		}
   122  		for bitsLeft := r.remaining(); bitsLeft > 0; offset++ {
   123  			r.fill()
   124  			if offset >= endsAt {
   125  				return nil, errors.New("corruption detected: stream overrun 4")
   126  			}
   127  			// Look up the next symbol, then consume its bits.
   128  			entry := single[r.peekBitsFast(d.actualTableLog)&tlMask].entry
   129  			nBits := uint8(entry)
   130  			r.advance(nBits)
   131  			bitsLeft -= uint(nBits)
   132  			out[offset] = uint8(entry >> 8)
   133  		}
   134  		if offset != endsAt {
   135  			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
   136  		}
   137  		decoded += offset - first
   138  		if err := r.close(); err != nil {
   139  			return nil, err
   140  		}
   141  	}
   142  	if decoded != dstSize {
   143  		return nil, errors.New("corruption detected: short output block")
   144  	}
   145  	return dst, nil
   146  }
   147  
   148  // decompress1x_main_loop_amd64 is an amd64 assembler implementation
   149  // of the Decompress1X main loop, used when BMI2 is not available.
   150  //
   151  //go:noescape
   152  func decompress1x_main_loop_amd64(ctx *decompress1xContext)
   153  
   154  // decompress1x_main_loop_bmi2 is an amd64 with BMI2 assembler implementation
   155  // of the Decompress1X main loop, used when cpuinfo.HasBMI2() is true.
   156  //
   157  //go:noescape
   158  func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
   159  
   160  type decompress1xContext struct { // in/out arguments of the asm 1X main loops; NOTE(review): layout is presumably mirrored in the asm — confirm before reordering
   161  	pbr      *bitReaderShifted // bit reader for the input stream
   162  	peekBits uint8             // (64 - tableLog) & 63, see bitReaderShifted.peekBitsFast()
   163  	out      *byte             // first byte of the output buffer
   164  	outCap   int               // capacity of the output buffer in bytes
   165  	tbl      *dEntrySingle     // first entry of the decoding table
   166  	decoded  int               // read back after the asm call: bytes written, or error_max_decoded_size_exeeded
   167  }
   168  
   169  // error_max_decoded_size_exeeded is reported by the asm implementations in decompress1xContext.decoded; Decompress1X maps it to ErrMaxDecodedSizeExceeded.
   170  const error_max_decoded_size_exeeded = -1
   171  
   172  // Decompress1X will decompress a 1X encoded stream into dst and return the decoded bytes.
   173  // The cap of dst is the maximum decompressed size; ErrMaxDecodedSizeExceeded is returned if src decodes to more.
   174  // The length of the supplied input must match the end of a block exactly.
   175  func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
   176  	if len(d.dt.single) == 0 {
   177  		return nil, errors.New("no table loaded")
   178  	}
   179  	var br bitReaderShifted
   180  	if err := br.init(src); err != nil {
   181  		return dst, err
   182  	}
   183  	maxDecodedSize := cap(dst)
   184  	dst = dst[:maxDecodedSize] // full length so that &dst[0] can be handed to the asm loop
   185    
   186  	const tlSize = 1 << tableLogMax
   187  	const tlMask = tlSize - 1
   188    
   189  	if maxDecodedSize >= 4 {
   190  		ctx := decompress1xContext{
   191  			pbr:      &br,
   192  			out:      &dst[0],
   193  			outCap:   maxDecodedSize,
   194  			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
   195  			tbl:      &d.dt.single[0],
   196  		}
   197  		if cpuinfo.HasBMI2() {
   198  			decompress1x_main_loop_bmi2(&ctx)
   199  		} else {
   200  			decompress1x_main_loop_amd64(&ctx)
   201  		}
   202  		if ctx.decoded == error_max_decoded_size_exeeded {
   203  			return nil, ErrMaxDecodedSizeExceeded
   204  		}
   205  		dst = dst[:ctx.decoded]
   206  	} else {
   207  		// No asm pass ran: nothing is decoded yet, so the loop below must append from the start.
   208  		dst = dst[:0]
   209  	}
   210    
   211  	// Original note: "br < 8, so uint8 is fine". NOTE(review): presumably br.off < 8 here — confirm.
   212  	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
   213  	for bitsLeft > 0 {
   214  		br.fill()
   215  		if len(dst) >= maxDecodedSize {
   216  			_ = br.close() // the size error takes precedence over any close error
   217  			return nil, ErrMaxDecodedSizeExceeded
   218  		}
   219  		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
   220  		nBits := uint8(v.entry)
   221  		br.advance(nBits)
   222  		bitsLeft -= nBits
   223  		dst = append(dst, uint8(v.entry>>8))
   224  	}
   225  	return dst, br.close()
   226  }
   227  

View as plain text