...

Source file src/github.com/apache/arrow/go/v15/parquet/internal/encoding/boolean_encoder.go

Documentation: github.com/apache/arrow/go/v15/parquet/internal/encoding

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"encoding/binary"
    21  
    22  	"github.com/apache/arrow/go/v15/arrow/bitutil"
    23  	"github.com/apache/arrow/go/v15/parquet"
    24  	"github.com/apache/arrow/go/v15/parquet/internal/debug"
    25  	"github.com/apache/arrow/go/v15/parquet/internal/utils"
    26  )
    27  
// Sizing of the scratch bitmap that PlainBooleanEncoder fills before handing
// the bytes to its sink: boolBufSize is the capacity in bytes and boolsInBuf
// is the same capacity counted in bits (one bit per encoded bool).
const (
	boolBufSize = 1024
	boolsInBuf  = boolBufSize * 8
)
    32  
// PlainBooleanEncoder encodes bools as a bitmap as per the Plain Encoding.
//
// Values are first packed into bitsBuffer, a scratch bitmap of boolBufSize
// bytes, through the bitmap writer wr. Both fields are created lazily by Put
// and are nil until then. Filled scratch bytes are passed on via the embedded
// encoder's append method.
type PlainBooleanEncoder struct {
	encoder
	bitsBuffer []byte
	wr         utils.BitmapWriter
}
    39  
// Type for the PlainBooleanEncoder is parquet.Types.Boolean.
// It reads no encoder state, so it is usable on a zero value.
func (PlainBooleanEncoder) Type() parquet.Type {
	return parquet.Types.Boolean
}
    44  
    45  // Put encodes the contents of in into the underlying data buffer.
    46  func (enc *PlainBooleanEncoder) Put(in []bool) {
    47  	if enc.bitsBuffer == nil {
    48  		enc.bitsBuffer = make([]byte, boolBufSize)
    49  	}
    50  	if enc.wr == nil {
    51  		enc.wr = utils.NewBitmapWriter(enc.bitsBuffer, 0, boolsInBuf)
    52  	}
    53  	if len(in) == 0 {
    54  		return
    55  	}
    56  
    57  	n := enc.wr.AppendBools(in)
    58  	for n < len(in) {
    59  		enc.wr.Finish()
    60  		enc.append(enc.bitsBuffer)
    61  		enc.wr.Reset(0, boolsInBuf)
    62  		in = in[n:]
    63  		n = enc.wr.AppendBools(in)
    64  	}
    65  }
    66  
    67  // PutSpaced will use the validBits bitmap to determine which values are nulls
    68  // and can be left out from the slice, and the encoded without those nulls.
    69  func (enc *PlainBooleanEncoder) PutSpaced(in []bool, validBits []byte, validBitsOffset int64) {
    70  	bufferOut := make([]bool, len(in))
    71  	nvalid := spacedCompress(in, bufferOut, validBits, validBitsOffset)
    72  	enc.Put(bufferOut[:nvalid])
    73  }
    74  
    75  // EstimatedDataEncodedSize returns the current number of bytes that have
    76  // been buffered so far
    77  func (enc *PlainBooleanEncoder) EstimatedDataEncodedSize() int64 {
    78  	return int64(enc.sink.Len() + int(bitutil.BytesForBits(int64(enc.wr.Pos()))))
    79  }
    80  
    81  // FlushValues returns the buffered data, the responsibility is on the caller
    82  // to release the buffer memory
    83  func (enc *PlainBooleanEncoder) FlushValues() (Buffer, error) {
    84  	if enc.wr.Pos() > 0 {
    85  		toFlush := int(enc.wr.Pos())
    86  		enc.append(enc.bitsBuffer[:bitutil.BytesForBits(int64(toFlush))])
    87  	}
    88  
    89  	enc.wr.Reset(0, boolsInBuf)
    90  
    91  	return enc.sink.Finish(), nil
    92  }
    93  
// rleLengthInBytes is the number of bytes RleBooleanEncoder reserves at the
// start of its output for the little-endian uint32 holding the encoded length.
const rleLengthInBytes = 4
    95  
// RleBooleanEncoder encodes bools with the RLE encoder from the utils package
// using a bit width of 1. Values are only collected by Put/PutSpaced; the
// actual encoding happens in FlushValues, which prefixes the encoded bytes
// with their length as a 4-byte little-endian integer.
type RleBooleanEncoder struct {
	encoder

	// bufferedValues holds the values received so far that FlushValues encodes.
	bufferedValues []bool
}
   101  
// Type for the RleBooleanEncoder is parquet.Types.Boolean.
func (RleBooleanEncoder) Type() parquet.Type {
	return parquet.Types.Boolean
}
   105  
// Put copies the values of in onto the end of the pending values. Nothing is
// encoded here; the encoding is deferred to FlushValues.
func (enc *RleBooleanEncoder) Put(in []bool) {
	enc.bufferedValues = append(enc.bufferedValues, in...)
}
   109  
   110  func (enc *RleBooleanEncoder) PutSpaced(in []bool, validBits []byte, validBitsOffset int64) {
   111  	bufferOut := make([]bool, len(in))
   112  	nvalid := spacedCompress(in, bufferOut, validBits, validBitsOffset)
   113  	enc.Put(bufferOut[:nvalid])
   114  }
   115  
// EstimatedDataEncodedSize returns the size FlushValues reserves for the
// currently pending values: the length prefix (rleLengthInBytes) plus
// maxRleBufferSize. It is computed from the number of pending values rather
// than from bytes actually encoded, since encoding only happens on flush.
func (enc *RleBooleanEncoder) EstimatedDataEncodedSize() int64 {
	return rleLengthInBytes + int64(enc.maxRleBufferSize())
}
   119  
   120  func (enc *RleBooleanEncoder) maxRleBufferSize() int {
   121  	return utils.MaxRLEBufferSize(1, len(enc.bufferedValues)) +
   122  		utils.MinRLEBufferSize(1)
   123  }
   124  
   125  func (enc *RleBooleanEncoder) FlushValues() (Buffer, error) {
   126  	rleBufferSizeMax := enc.maxRleBufferSize()
   127  	enc.sink.SetOffset(rleLengthInBytes)
   128  	enc.sink.Reserve(rleBufferSizeMax)
   129  
   130  	rleEncoder := utils.NewRleEncoder(enc.sink, 1)
   131  	for _, v := range enc.bufferedValues {
   132  		if v {
   133  			rleEncoder.Put(1)
   134  		} else {
   135  			rleEncoder.Put(0)
   136  		}
   137  	}
   138  	n := rleEncoder.Flush()
   139  	debug.Assert(n <= rleBufferSizeMax, "num encoded bytes larger than expected max")
   140  	buf := enc.sink.Finish()
   141  	binary.LittleEndian.PutUint32(buf.Bytes(), uint32(n))
   142  
   143  	return buf, nil
   144  }
   145  

View as plain text