1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package metadata_test
18
19 import (
20 "math"
21 "reflect"
22 "testing"
23
24 "github.com/apache/arrow/go/v15/arrow/bitutil"
25 "github.com/apache/arrow/go/v15/arrow/float16"
26 "github.com/apache/arrow/go/v15/arrow/memory"
27 "github.com/apache/arrow/go/v15/parquet"
28 "github.com/apache/arrow/go/v15/parquet/metadata"
29 "github.com/apache/arrow/go/v15/parquet/schema"
30 "github.com/stretchr/testify/assert"
31 )
32
33
34
35
36 func newFloat16Node(name string, rep parquet.Repetition, fieldID int32) *schema.PrimitiveNode {
37 return schema.MustPrimitive(schema.NewPrimitiveNodeLogical(name, rep, schema.Float16LogicalType{}, parquet.Types.FixedLenByteArray, 2, fieldID))
38 }
39
40 func TestCheckNaNs(t *testing.T) {
41 const (
42 numvals = 8
43 min = -4.0
44 max = 3.0
45 )
46 var (
47 nan = math.NaN()
48 f16Min parquet.FixedLenByteArray = float16.New(float32(min)).ToLEBytes()
49 f16Max parquet.FixedLenByteArray = float16.New(float32(max)).ToLEBytes()
50 )
51
52 allNans := []float64{nan, nan, nan, nan, nan, nan, nan, nan}
53 allNansf32 := make([]float32, numvals)
54 allNansf16 := make([]parquet.FixedLenByteArray, numvals)
55 for idx, v := range allNans {
56 allNansf32[idx] = float32(v)
57 allNansf16[idx] = float16.New(float32(v)).ToLEBytes()
58 }
59
60 someNans := []float64{nan, max, -3.0, -1.0, nan, 2.0, min, nan}
61 someNansf32 := make([]float32, numvals)
62 someNansf16 := make([]parquet.FixedLenByteArray, numvals)
63 for idx, v := range someNans {
64 someNansf32[idx] = float32(v)
65 someNansf16[idx] = float16.New(float32(v)).ToLEBytes()
66 }
67
68 validBitmap := []byte{0x7F}
69 validBitmapNoNaNs := []byte{0x6E}
70
71 assertUnsetMinMax := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte) {
72 if bitmap == nil {
73 switch s := stats.(type) {
74 case *metadata.Float32Statistics:
75 s.Update(values.([]float32), 0)
76 case *metadata.Float64Statistics:
77 s.Update(values.([]float64), 0)
78 case *metadata.Float16Statistics:
79 s.Update(values.([]parquet.FixedLenByteArray), 0)
80 }
81 assert.False(t, stats.HasMinMax())
82 } else {
83 nvalues := reflect.ValueOf(values).Len()
84 nullCount := bitutil.CountSetBits(bitmap, 0, nvalues)
85 switch s := stats.(type) {
86 case *metadata.Float32Statistics:
87 s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
88 case *metadata.Float64Statistics:
89 s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
90 case *metadata.Float16Statistics:
91 s.UpdateSpaced(values.([]parquet.FixedLenByteArray), bitmap, 0, int64(nullCount))
92 }
93 assert.False(t, stats.HasMinMax())
94 }
95 }
96
97 assertMinMaxAre := func(stats metadata.TypedStatistics, values interface{}, expectedMin, expectedMax interface{}) {
98 switch s := stats.(type) {
99 case *metadata.Float32Statistics:
100 s.Update(values.([]float32), 0)
101 assert.True(t, stats.HasMinMax())
102 assert.Equal(t, expectedMin, s.Min())
103 assert.Equal(t, expectedMax, s.Max())
104 case *metadata.Float64Statistics:
105 s.Update(values.([]float64), 0)
106 assert.True(t, stats.HasMinMax())
107 assert.Equal(t, expectedMin, s.Min())
108 assert.Equal(t, expectedMax, s.Max())
109 case *metadata.Float16Statistics:
110 s.Update(values.([]parquet.FixedLenByteArray), 0)
111 assert.True(t, stats.HasMinMax())
112 assert.Equal(t, expectedMin, s.Min())
113 assert.Equal(t, expectedMax, s.Max())
114 }
115 }
116
117 assertMinMaxAreSpaced := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte, expectedMin, expectedMax interface{}) {
118 nvalues := reflect.ValueOf(values).Len()
119 nullCount := bitutil.CountSetBits(bitmap, 0, nvalues)
120 switch s := stats.(type) {
121 case *metadata.Float32Statistics:
122 s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
123 assert.True(t, s.HasMinMax())
124 assert.Equal(t, expectedMin, s.Min())
125 assert.Equal(t, expectedMax, s.Max())
126 case *metadata.Float64Statistics:
127 s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
128 assert.True(t, s.HasMinMax())
129 assert.Equal(t, expectedMin, s.Min())
130 assert.Equal(t, expectedMax, s.Max())
131 case *metadata.Float16Statistics:
132 s.UpdateSpaced(values.([]parquet.FixedLenByteArray), bitmap, 0, int64(nullCount))
133 assert.True(t, s.HasMinMax())
134 assert.Equal(t, expectedMin, s.Min())
135 assert.Equal(t, expectedMax, s.Max())
136 }
137 }
138
139 f32Col := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
140 f64Col := schema.NewColumn(schema.NewFloat64Node("f", parquet.Repetitions.Optional, -1), 1, 1)
141 f16Col := schema.NewColumn(newFloat16Node("f", parquet.Repetitions.Required, -1), 1, 1)
142
143 someNanStats := metadata.NewStatistics(f64Col, memory.DefaultAllocator)
144 someNanStatsf32 := metadata.NewStatistics(f32Col, memory.DefaultAllocator)
145 someNanStatsf16 := metadata.NewStatistics(f16Col, memory.DefaultAllocator)
146
147 assertUnsetMinMax(someNanStats, allNans, nil)
148 assertUnsetMinMax(someNanStatsf32, allNansf32, nil)
149 assertUnsetMinMax(someNanStatsf16, allNansf16, nil)
150
151 assertMinMaxAre(someNanStats, someNans, min, max)
152 assertMinMaxAre(someNanStatsf32, someNansf32, float32(min), float32(max))
153 assertMinMaxAre(someNanStatsf16, someNansf16, f16Min, f16Max)
154
155 assertMinMaxAre(someNanStats, allNans, min, max)
156 assertMinMaxAre(someNanStatsf32, allNansf32, float32(min), float32(max))
157 assertMinMaxAre(someNanStatsf16, allNansf16, f16Min, f16Max)
158
159 someNanStats = metadata.NewStatistics(f64Col, memory.DefaultAllocator)
160 someNanStatsf32 = metadata.NewStatistics(f32Col, memory.DefaultAllocator)
161 someNanStatsf16 = metadata.NewStatistics(f16Col, memory.DefaultAllocator)
162 assertUnsetMinMax(someNanStats, allNans, validBitmap)
163 assertUnsetMinMax(someNanStatsf32, allNansf32, validBitmap)
164 assertUnsetMinMax(someNanStatsf16, allNansf16, validBitmap)
165
166 assertMinMaxAreSpaced(someNanStats, someNans, validBitmapNoNaNs, min, max)
167 assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmapNoNaNs, float32(min), float32(max))
168 assertMinMaxAreSpaced(someNanStatsf16, someNansf16, validBitmapNoNaNs, f16Min, f16Max)
169
170 assertMinMaxAreSpaced(someNanStats, someNans, validBitmap, min, max)
171 assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmap, float32(min), float32(max))
172 assertMinMaxAreSpaced(someNanStatsf16, someNansf16, validBitmap, f16Min, f16Max)
173 }
174
175 func TestCheckNegativeZeroStats(t *testing.T) {
176 assertMinMaxZeroesSign := func(stats metadata.TypedStatistics, values interface{}) {
177 switch s := stats.(type) {
178 case *metadata.Float32Statistics:
179 s.Update(values.([]float32), 0)
180 assert.True(t, s.HasMinMax())
181 var zero float32
182 assert.Equal(t, zero, s.Min())
183 assert.True(t, math.Signbit(float64(s.Min())))
184 assert.Equal(t, zero, s.Max())
185 assert.False(t, math.Signbit(float64(s.Max())))
186 case *metadata.Float64Statistics:
187 s.Update(values.([]float64), 0)
188 assert.True(t, s.HasMinMax())
189 var zero float64
190 assert.Equal(t, zero, s.Min())
191 assert.True(t, math.Signbit(s.Min()))
192 assert.Equal(t, zero, s.Max())
193 assert.False(t, math.Signbit(s.Max()))
194 case *metadata.Float16Statistics:
195 s.Update(values.([]parquet.FixedLenByteArray), 0)
196 assert.True(t, s.HasMinMax())
197 var zero float64
198 min := float64(float16.FromLEBytes(s.Min()).Float32())
199 max := float64(float16.FromLEBytes(s.Max()).Float32())
200 assert.Equal(t, zero, min)
201 assert.True(t, math.Signbit(min))
202 assert.Equal(t, zero, max)
203 assert.False(t, math.Signbit(max))
204 }
205 }
206
207 fcol := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
208 dcol := schema.NewColumn(schema.NewFloat64Node("d", parquet.Repetitions.Optional, -1), 1, 1)
209 hcol := schema.NewColumn(newFloat16Node("h", parquet.Repetitions.Optional, -1), 1, 1)
210
211 var f32zero float32
212 var f64zero float64
213 var f16PosZero parquet.FixedLenByteArray = float16.New(+f32zero).ToLEBytes()
214 var f16NegZero parquet.FixedLenByteArray = float16.New(-f32zero).ToLEBytes()
215
216 assert.False(t, float16.FromLEBytes(f16PosZero).Signbit())
217 assert.True(t, float16.FromLEBytes(f16NegZero).Signbit())
218 {
219 fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
220 dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
221 hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
222 assertMinMaxZeroesSign(fstats, []float32{-f32zero, f32zero})
223 assertMinMaxZeroesSign(dstats, []float64{-f64zero, f64zero})
224 assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16NegZero, f16PosZero})
225 }
226 {
227 fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
228 dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
229 hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
230 assertMinMaxZeroesSign(fstats, []float32{f32zero, -f32zero})
231 assertMinMaxZeroesSign(dstats, []float64{f64zero, -f64zero})
232 assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16PosZero, f16NegZero})
233 }
234 {
235 fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
236 dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
237 hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
238 assertMinMaxZeroesSign(fstats, []float32{-f32zero, -f32zero})
239 assertMinMaxZeroesSign(dstats, []float64{-f64zero, -f64zero})
240 assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16NegZero, f16NegZero})
241 }
242 {
243 fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
244 dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
245 hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
246 assertMinMaxZeroesSign(fstats, []float32{f32zero, f32zero})
247 assertMinMaxZeroesSign(dstats, []float64{f64zero, f64zero})
248 assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16PosZero, f16PosZero})
249 }
250 }
251
252 func TestBooleanStatisticsEncoding(t *testing.T) {
253 n := schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1)
254 descr := schema.NewColumn(n, 0, 0)
255 s := metadata.NewStatistics(descr, nil)
256 bs := s.(*metadata.BooleanStatistics)
257 bs.SetMinMax(false, true)
258 maxEnc := bs.EncodeMax()
259 minEnc := bs.EncodeMin()
260 assert.Equal(t, []byte{1}, maxEnc)
261 assert.Equal(t, []byte{0}, minEnc)
262 }
263
View as plain text