...

Source file src/github.com/apache/arrow/go/v15/parquet/metadata/statistics_test.go

Documentation: github.com/apache/arrow/go/v15/parquet/metadata

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metadata_test

import (
	"math"
	"reflect"
	"testing"

	"github.com/apache/arrow/go/v15/arrow/bitutil"
	"github.com/apache/arrow/go/v15/arrow/float16"
	"github.com/apache/arrow/go/v15/arrow/memory"
	"github.com/apache/arrow/go/v15/parquet"
	"github.com/apache/arrow/go/v15/parquet/metadata"
	"github.com/apache/arrow/go/v15/parquet/schema"
	"github.com/stretchr/testify/assert"
)

// NOTE(zeroshade): tests will be added and updated after merging the "file" package
// since the tests that I wrote relied on the file writer/reader for ease of use.

func newFloat16Node(name string, rep parquet.Repetition, fieldID int32) *schema.PrimitiveNode {
	return schema.MustPrimitive(schema.NewPrimitiveNodeLogical(name, rep, schema.Float16LogicalType{}, parquet.Types.FixedLenByteArray, 2, fieldID))
}

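// Illustrative sketch (hypothetical test name, not one of the original cases):
// the Float16 logical type is carried as a 2-byte little-endian
// FixedLenByteArray, so statistics values for such columns round-trip through
// the arrow/float16 package.
func TestFloat16LEBytesRoundtripSketch(t *testing.T) {
	var buf parquet.FixedLenByteArray = float16.New(1.5).ToLEBytes()
	assert.Len(t, buf, 2)
	assert.Equal(t, float32(1.5), float16.FromLEBytes(buf).Float32())
}
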
func TestCheckNaNs(t *testing.T) {
	const (
		numvals = 8
		min     = -4.0
		max     = 3.0
	)
	var (
		nan                              = math.NaN()
		f16Min parquet.FixedLenByteArray = float16.New(float32(min)).ToLEBytes()
		f16Max parquet.FixedLenByteArray = float16.New(float32(max)).ToLEBytes()
	)

	allNans := []float64{nan, nan, nan, nan, nan, nan, nan, nan}
	allNansf32 := make([]float32, numvals)
	allNansf16 := make([]parquet.FixedLenByteArray, numvals)
	for idx, v := range allNans {
		allNansf32[idx] = float32(v)
		allNansf16[idx] = float16.New(float32(v)).ToLEBytes()
	}

	someNans := []float64{nan, max, -3.0, -1.0, nan, 2.0, min, nan}
	someNansf32 := make([]float32, numvals)
	someNansf16 := make([]parquet.FixedLenByteArray, numvals)
	for idx, v := range someNans {
		someNansf32[idx] = float32(v)
		someNansf16[idx] = float16.New(float32(v)).ToLEBytes()
	}

	validBitmap := []byte{0x7F}       // 0b01111111
	validBitmapNoNaNs := []byte{0x6E} // 0b01101110

	assertUnsetMinMax := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte) {
		if bitmap == nil {
			switch s := stats.(type) {
			case *metadata.Float32Statistics:
				s.Update(values.([]float32), 0)
			case *metadata.Float64Statistics:
				s.Update(values.([]float64), 0)
			case *metadata.Float16Statistics:
				s.Update(values.([]parquet.FixedLenByteArray), 0)
			}
			assert.False(t, stats.HasMinMax())
		} else {
			nvalues := reflect.ValueOf(values).Len()
			nullCount := bitutil.CountSetBits(bitmap, 0, nvalues)
			switch s := stats.(type) {
			case *metadata.Float32Statistics:
				s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
			case *metadata.Float64Statistics:
				s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
			case *metadata.Float16Statistics:
				s.UpdateSpaced(values.([]parquet.FixedLenByteArray), bitmap, 0, int64(nullCount))
			}
			assert.False(t, stats.HasMinMax())
		}
	}

	assertMinMaxAre := func(stats metadata.TypedStatistics, values interface{}, expectedMin, expectedMax interface{}) {
		switch s := stats.(type) {
		case *metadata.Float32Statistics:
			s.Update(values.([]float32), 0)
			assert.True(t, stats.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		case *metadata.Float64Statistics:
			s.Update(values.([]float64), 0)
			assert.True(t, stats.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		case *metadata.Float16Statistics:
			s.Update(values.([]parquet.FixedLenByteArray), 0)
			assert.True(t, stats.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		}
	}

	assertMinMaxAreSpaced := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte, expectedMin, expectedMax interface{}) {
		nvalues := reflect.ValueOf(values).Len()
		nullCount := bitutil.CountSetBits(bitmap, 0, nvalues)
		switch s := stats.(type) {
		case *metadata.Float32Statistics:
			s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
			assert.True(t, s.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		case *metadata.Float64Statistics:
			s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
			assert.True(t, s.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		case *metadata.Float16Statistics:
			s.UpdateSpaced(values.([]parquet.FixedLenByteArray), bitmap, 0, int64(nullCount))
			assert.True(t, s.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		}
	}

	f32Col := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
	f64Col := schema.NewColumn(schema.NewFloat64Node("f", parquet.Repetitions.Optional, -1), 1, 1)
	f16Col := schema.NewColumn(newFloat16Node("f", parquet.Repetitions.Required, -1), 1, 1)
	// test values
	someNanStats := metadata.NewStatistics(f64Col, memory.DefaultAllocator)
	someNanStatsf32 := metadata.NewStatistics(f32Col, memory.DefaultAllocator)
	someNanStatsf16 := metadata.NewStatistics(f16Col, memory.DefaultAllocator)
	// ingesting only nans should not yield a min or max
	assertUnsetMinMax(someNanStats, allNans, nil)
	assertUnsetMinMax(someNanStatsf32, allNansf32, nil)
	assertUnsetMinMax(someNanStatsf16, allNansf16, nil)
	// ingesting a mix should yield a valid min/max
	assertMinMaxAre(someNanStats, someNans, min, max)
	assertMinMaxAre(someNanStatsf32, someNansf32, float32(min), float32(max))
	assertMinMaxAre(someNanStatsf16, someNansf16, f16Min, f16Max)
	// ingesting only nans after a valid min/max should have no effect
	assertMinMaxAre(someNanStats, allNans, min, max)
	assertMinMaxAre(someNanStatsf32, allNansf32, float32(min), float32(max))
	assertMinMaxAre(someNanStatsf16, allNansf16, f16Min, f16Max)

	someNanStats = metadata.NewStatistics(f64Col, memory.DefaultAllocator)
	someNanStatsf32 = metadata.NewStatistics(f32Col, memory.DefaultAllocator)
	someNanStatsf16 = metadata.NewStatistics(f16Col, memory.DefaultAllocator)
	assertUnsetMinMax(someNanStats, allNans, validBitmap)
	assertUnsetMinMax(someNanStatsf32, allNansf32, validBitmap)
	assertUnsetMinMax(someNanStatsf16, allNansf16, validBitmap)
	// nans should not pollute min/max when excluded via null bitmap
	assertMinMaxAreSpaced(someNanStats, someNans, validBitmapNoNaNs, min, max)
	assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmapNoNaNs, float32(min), float32(max))
	assertMinMaxAreSpaced(someNanStatsf16, someNansf16, validBitmapNoNaNs, f16Min, f16Max)
	// ingesting nans with a null bitmap should not change the result
	assertMinMaxAreSpaced(someNanStats, someNans, validBitmap, min, max)
	assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmap, float32(min), float32(max))
	assertMinMaxAreSpaced(someNanStatsf16, someNansf16, validBitmap, f16Min, f16Max)
}

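// Illustrative usage sketch (hypothetical test name), mirroring the behavior
// asserted above: NaN inputs are skipped when accumulating min/max, so a batch
// mixing NaNs with finite values still yields finite bounds.
func TestFloat64StatisticsSkipNaNSketch(t *testing.T) {
	col := schema.NewColumn(schema.NewFloat64Node("f", parquet.Repetitions.Optional, -1), 1, 1)
	stats := metadata.NewStatistics(col, memory.DefaultAllocator).(*metadata.Float64Statistics)
	stats.Update([]float64{math.NaN(), -4.0, 3.0, math.NaN()}, 0)
	assert.True(t, stats.HasMinMax())
	assert.Equal(t, -4.0, stats.Min())
	assert.Equal(t, 3.0, stats.Max())
}
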
func TestCheckNegativeZeroStats(t *testing.T) {
	assertMinMaxZeroesSign := func(stats metadata.TypedStatistics, values interface{}) {
		switch s := stats.(type) {
		case *metadata.Float32Statistics:
			s.Update(values.([]float32), 0)
			assert.True(t, s.HasMinMax())
			var zero float32
			assert.Equal(t, zero, s.Min())
			assert.True(t, math.Signbit(float64(s.Min())))
			assert.Equal(t, zero, s.Max())
			assert.False(t, math.Signbit(float64(s.Max())))
		case *metadata.Float64Statistics:
			s.Update(values.([]float64), 0)
			assert.True(t, s.HasMinMax())
			var zero float64
			assert.Equal(t, zero, s.Min())
			assert.True(t, math.Signbit(s.Min()))
			assert.Equal(t, zero, s.Max())
			assert.False(t, math.Signbit(s.Max()))
		case *metadata.Float16Statistics:
			s.Update(values.([]parquet.FixedLenByteArray), 0)
			assert.True(t, s.HasMinMax())
			var zero float64
			min := float64(float16.FromLEBytes(s.Min()).Float32())
			max := float64(float16.FromLEBytes(s.Max()).Float32())
			assert.Equal(t, zero, min)
			assert.True(t, math.Signbit(min))
			assert.Equal(t, zero, max)
			assert.False(t, math.Signbit(max))
		}
	}

	fcol := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
	dcol := schema.NewColumn(schema.NewFloat64Node("d", parquet.Repetitions.Optional, -1), 1, 1)
	hcol := schema.NewColumn(newFloat16Node("h", parquet.Repetitions.Optional, -1), 1, 1)

	var f32zero float32
	var f64zero float64
	var f16PosZero parquet.FixedLenByteArray = float16.New(+f32zero).ToLEBytes()
	var f16NegZero parquet.FixedLenByteArray = float16.New(-f32zero).ToLEBytes()

	assert.False(t, float16.FromLEBytes(f16PosZero).Signbit())
	assert.True(t, float16.FromLEBytes(f16NegZero).Signbit())
	{
		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
		hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
		assertMinMaxZeroesSign(fstats, []float32{-f32zero, f32zero})
		assertMinMaxZeroesSign(dstats, []float64{-f64zero, f64zero})
		assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16NegZero, f16PosZero})
	}
	{
		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
		hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
		assertMinMaxZeroesSign(fstats, []float32{f32zero, -f32zero})
		assertMinMaxZeroesSign(dstats, []float64{f64zero, -f64zero})
		assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16PosZero, f16NegZero})
	}
	{
		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
		hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
		assertMinMaxZeroesSign(fstats, []float32{-f32zero, -f32zero})
		assertMinMaxZeroesSign(dstats, []float64{-f64zero, -f64zero})
		assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16NegZero, f16NegZero})
	}
	{
		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
		hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
		assertMinMaxZeroesSign(fstats, []float32{f32zero, f32zero})
		assertMinMaxZeroesSign(dstats, []float64{f64zero, f64zero})
		assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16PosZero, f16PosZero})
	}
}

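// Illustrative usage sketch (hypothetical test name), restating what the test
// above verifies: -0.0 and +0.0 compare equal, but the accumulated bounds keep
// the negative sign on the minimum and the positive sign on the maximum.
func TestFloat64StatisticsSignedZeroSketch(t *testing.T) {
	col := schema.NewColumn(schema.NewFloat64Node("d", parquet.Repetitions.Optional, -1), 1, 1)
	stats := metadata.NewStatistics(col, memory.DefaultAllocator).(*metadata.Float64Statistics)
	negZero := math.Copysign(0, -1)
	stats.Update([]float64{0, negZero}, 0)
	assert.True(t, stats.HasMinMax())
	assert.True(t, math.Signbit(stats.Min()))
	assert.False(t, math.Signbit(stats.Max()))
}
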
func TestBooleanStatisticsEncoding(t *testing.T) {
	n := schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1)
	descr := schema.NewColumn(n, 0, 0)
	s := metadata.NewStatistics(descr, nil)
	bs := s.(*metadata.BooleanStatistics)
	bs.SetMinMax(false, true)
	maxEnc := bs.EncodeMax()
	minEnc := bs.EncodeMin()
	assert.Equal(t, []byte{1}, maxEnc)
	assert.Equal(t, []byte{0}, minEnc)
}

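// Illustrative usage sketch (hypothetical test name): encoding a boolean bound
// presumably always yields a single byte, 0x01 for true and 0x00 for false,
// regardless of whether it is the minimum or the maximum.
func TestBooleanStatisticsEncodingSketch(t *testing.T) {
	descr := schema.NewColumn(schema.NewBooleanNode("b", parquet.Repetitions.Required, -1), 0, 0)
	bs := metadata.NewStatistics(descr, nil).(*metadata.BooleanStatistics)
	bs.SetMinMax(true, true)
	assert.Equal(t, []byte{1}, bs.EncodeMin())
	assert.Equal(t, []byte{1}, bs.EncodeMax())
}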
