1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package schema_test
18
19 import (
20 "os"
21 "testing"
22
23 "github.com/apache/arrow/go/v15/parquet"
24 format "github.com/apache/arrow/go/v15/parquet/internal/gen-go/parquet"
25 "github.com/apache/arrow/go/v15/parquet/schema"
26 "github.com/apache/thrift/lib/go/thrift"
27 "github.com/stretchr/testify/assert"
28 "github.com/stretchr/testify/suite"
29 )
30
31 func TestColumnPath(t *testing.T) {
32 p := parquet.ColumnPath([]string{"toplevel", "leaf"})
33 assert.Equal(t, "toplevel.leaf", p.String())
34
35 p2 := parquet.ColumnPathFromString("toplevel.leaf")
36 assert.Equal(t, "toplevel.leaf", p2.String())
37
38 extend := p2.Extend("anotherlevel")
39 assert.Equal(t, "toplevel.leaf.anotherlevel", extend.String())
40 }
41
42 func NewPrimitive(name string, repetition format.FieldRepetitionType, typ format.Type, fieldID int32) *format.SchemaElement {
43 ret := &format.SchemaElement{
44 Name: name,
45 RepetitionType: format.FieldRepetitionTypePtr(repetition),
46 Type: format.TypePtr(typ),
47 }
48 if fieldID >= 0 {
49 ret.FieldID = &fieldID
50 }
51 return ret
52 }
53
54 func NewGroup(name string, repetition format.FieldRepetitionType, numChildren, fieldID int32) *format.SchemaElement {
55 ret := &format.SchemaElement{
56 Name: name,
57 RepetitionType: format.FieldRepetitionTypePtr(repetition),
58 NumChildren: &numChildren,
59 }
60 if fieldID >= 0 {
61 ret.FieldID = &fieldID
62 }
63 return ret
64 }
65
66 func TestSchemaNodes(t *testing.T) {
67 suite.Run(t, new(PrimitiveNodeTestSuite))
68 suite.Run(t, new(GroupNodeTestSuite))
69 suite.Run(t, new(SchemaConverterSuite))
70 }
71
// PrimitiveNodeTestSuite groups the tests for primitive schema nodes. It is
// run through suite.Run from TestSchemaNodes.
type PrimitiveNodeTestSuite struct {
	suite.Suite

	name    string      // node name used by the thrift conversion cases; assigned in SetupTest
	fieldID int32       // field id used by the thrift conversion cases; assigned in SetupTest
	node    schema.Node // node produced by the most recent call to convert
}
79
80 func (p *PrimitiveNodeTestSuite) SetupTest() {
81 p.name = "name"
82 p.fieldID = 5
83 }
84
85 func (p *PrimitiveNodeTestSuite) convert(elt *format.SchemaElement) {
86 p.node = schema.MustPrimitive(schema.PrimitiveNodeFromThrift(elt))
87 p.IsType(&schema.PrimitiveNode{}, p.node)
88 }
89
90 func (p *PrimitiveNodeTestSuite) TestAttrs() {
91 node1 := schema.NewInt32Node("foo" , parquet.Repetitions.Repeated, -1 )
92 node2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("bar" , parquet.Repetitions.Optional, parquet.Types.ByteArray,
93 schema.ConvertedTypes.UTF8, 0 , 0 , 0 , -1 ))
94
95 p.Equal("foo", node1.Name())
96 p.Equal(schema.Primitive, node1.Type())
97 p.Equal(schema.Primitive, node2.Type())
98
99 p.Equal(parquet.Repetitions.Repeated, node1.RepetitionType())
100 p.Equal(parquet.Repetitions.Optional, node2.RepetitionType())
101
102 p.Equal(parquet.Types.Int32, node1.PhysicalType())
103 p.Equal(parquet.Types.ByteArray, node2.PhysicalType())
104
105 p.Equal(schema.ConvertedTypes.None, node1.ConvertedType())
106 p.Equal(schema.ConvertedTypes.UTF8, node2.ConvertedType())
107 }
108
109 func (p *PrimitiveNodeTestSuite) TestFromParquet() {
110 p.Run("Optional Int32", func() {
111 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_INT32, p.fieldID)
112 p.convert(elt)
113
114 p.Equal(p.name, p.node.Name())
115 p.Equal(p.fieldID, p.node.FieldID())
116 p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType())
117 p.Equal(parquet.Types.Int32, p.node.(*schema.PrimitiveNode).PhysicalType())
118 p.Equal(schema.ConvertedTypes.None, p.node.ConvertedType())
119 })
120
121 p.Run("LogicalType", func() {
122 elt := NewPrimitive(p.name, format.FieldRepetitionType_REQUIRED, format.Type_BYTE_ARRAY, p.fieldID)
123 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_UTF8)
124 p.convert(elt)
125
126 p.Equal(parquet.Repetitions.Required, p.node.RepetitionType())
127 p.Equal(parquet.Types.ByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
128 p.Equal(schema.ConvertedTypes.UTF8, p.node.ConvertedType())
129 })
130
131 p.Run("FixedLenByteArray", func() {
132 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID)
133 elt.TypeLength = thrift.Int32Ptr(16)
134 p.convert(elt)
135
136 p.Equal(p.name, p.node.Name())
137 p.Equal(p.fieldID, p.node.FieldID())
138 p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType())
139 p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
140 p.Equal(16, p.node.(*schema.PrimitiveNode).TypeLength())
141 })
142
143 p.Run("convertedtype::decimal", func() {
144 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID)
145 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_DECIMAL)
146 elt.TypeLength = thrift.Int32Ptr(6)
147 elt.Scale = thrift.Int32Ptr(2)
148 elt.Precision = thrift.Int32Ptr(12)
149
150 p.convert(elt)
151 p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
152 p.Equal(schema.ConvertedTypes.Decimal, p.node.ConvertedType())
153 p.Equal(6, p.node.(*schema.PrimitiveNode).TypeLength())
154 p.EqualValues(2, p.node.(*schema.PrimitiveNode).DecimalMetadata().Scale)
155 p.EqualValues(12, p.node.(*schema.PrimitiveNode).DecimalMetadata().Precision)
156 })
157 }
158
159 func (p *PrimitiveNodeTestSuite) TestEquals() {
160 const fieldID = -1
161 node1 := schema.NewInt32Node("foo" , parquet.Repetitions.Required, fieldID)
162 node2 := schema.NewInt64Node("foo" , parquet.Repetitions.Required, fieldID)
163 node3 := schema.NewInt32Node("bar" , parquet.Repetitions.Required, fieldID)
164 node4 := schema.NewInt32Node("foo" , parquet.Repetitions.Optional, fieldID)
165 node5 := schema.NewInt32Node("foo" , parquet.Repetitions.Required, fieldID)
166
167 p.True(node1.Equals(node1))
168 p.False(node1.Equals(node2))
169 p.False(node1.Equals(node3))
170 p.False(node1.Equals(node4))
171 p.True(node1.Equals(node5))
172
173 flba1 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" , parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
174 schema.ConvertedTypes.Decimal, 12 , 4 , 2 , fieldID))
175 flba2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" , parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
176 schema.ConvertedTypes.Decimal, 1 , 4 , 2 , fieldID))
177 flba2.SetTypeLength(12)
178
179 flba3 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" , parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
180 schema.ConvertedTypes.Decimal, 1 , 4 , 2 , fieldID))
181 flba3.SetTypeLength(16)
182
183 flba4 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" , parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
184 schema.ConvertedTypes.Decimal, 12 , 4 , 0 , fieldID))
185 flba5 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" , parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
186 schema.ConvertedTypes.None, 12 , 4 , 0 , fieldID))
187
188 p.True(flba1.Equals(flba2))
189 p.False(flba1.Equals(flba3))
190 p.False(flba1.Equals(flba4))
191 p.False(flba1.Equals(flba5))
192 }
193
194 func (p *PrimitiveNodeTestSuite) TestPhysicalLogicalMapping() {
195 tests := []struct {
196 typ parquet.Type
197 cnv schema.ConvertedType
198 typLen int
199 precision int
200 scale int
201 shouldErr bool
202 }{
203 {parquet.Types.Int32, schema.ConvertedTypes.Int32, 0 , 0 , 0 , false},
204 {parquet.Types.ByteArray, schema.ConvertedTypes.JSON, 0 , 0 , 0 , false},
205 {parquet.Types.Int32, schema.ConvertedTypes.JSON, 0 , 0 , 0 , true},
206 {parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0 , 0 , 0 , false},
207 {parquet.Types.Int32, schema.ConvertedTypes.Int64, 0 , 0 , 0 , true},
208 {parquet.Types.ByteArray, schema.ConvertedTypes.Int8, 0 , 0 , 0 , true},
209 {parquet.Types.ByteArray, schema.ConvertedTypes.Interval, 0 , 0 , 0 , true},
210 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Enum, 0 , 0 , 0 , true},
211 {parquet.Types.ByteArray, schema.ConvertedTypes.Enum, 0 , 0 , 0 , false},
212 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 , 2 , 4 , true},
213 {parquet.Types.Float, schema.ConvertedTypes.Decimal, 0 , 2 , 4 , true},
214 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 , 4 , 0 , true},
215 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 , 4 , -1 , true},
216 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 , 2 , 4 , true},
217 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 , 6 , 4 , false},
218 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 12 , 0 , 0 , false},
219 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 10 , 0 , 0 , true},
220 }
221 for _, tt := range tests {
222 p.Run(tt.typ.String(), func() {
223 _, err := schema.NewPrimitiveNodeConverted("foo" , parquet.Repetitions.Required, tt.typ, tt.cnv, tt.typLen, tt.precision, tt.scale, -1 )
224 if tt.shouldErr {
225 p.Error(err)
226 } else {
227 p.NoError(err)
228 }
229 })
230 }
231 }
232
// GroupNodeTestSuite groups the tests for group schema nodes. It carries no
// state of its own; each test builds its field lists through fields1 or
// fields2.
type GroupNodeTestSuite struct {
	suite.Suite
}
236
237 func (g *GroupNodeTestSuite) fields1() []schema.Node {
238 return schema.FieldList{
239 schema.NewInt32Node("one" , parquet.Repetitions.Required, -1 ),
240 schema.NewInt64Node("two" , parquet.Repetitions.Optional, -1 ),
241 schema.NewFloat64Node("three" , parquet.Repetitions.Optional, -1 ),
242 }
243 }
244
245 func (g *GroupNodeTestSuite) fields2() []schema.Node {
246 return schema.FieldList{
247 schema.NewInt32Node("duplicate" , parquet.Repetitions.Required, -1 ),
248 schema.NewInt64Node("unique" , parquet.Repetitions.Optional, -1 ),
249 schema.NewFloat64Node("duplicate" , parquet.Repetitions.Optional, -1 ),
250 }
251 }
252
253 func (g *GroupNodeTestSuite) TestAttrs() {
254 fields := g.fields1()
255
256 node1 := schema.MustGroup(schema.NewGroupNode("foo" , parquet.Repetitions.Repeated, fields, -1 ))
257 node2 := schema.MustGroup(schema.NewGroupNodeConverted("bar" , parquet.Repetitions.Optional, fields, schema.ConvertedTypes.List, -1 ))
258
259 g.Equal("foo", node1.Name())
260 g.Equal(schema.Group, node1.Type())
261 g.Equal(len(fields), node1.NumFields())
262 g.Equal(parquet.Repetitions.Repeated, node1.RepetitionType())
263 g.Equal(parquet.Repetitions.Optional, node2.RepetitionType())
264
265 g.Equal(schema.ConvertedTypes.None, node1.ConvertedType())
266 g.Equal(schema.ConvertedTypes.List, node2.ConvertedType())
267 }
268
269 func (g *GroupNodeTestSuite) TestEquals() {
270 f1 := g.fields1()
271 f2 := g.fields1()
272
273 group1 := schema.Must(schema.NewGroupNode("group" , parquet.Repetitions.Repeated, f1, -1 ))
274 group2 := schema.Must(schema.NewGroupNode("group" , parquet.Repetitions.Repeated, f2, -1 ))
275 group3 := schema.Must(schema.NewGroupNode("group2" , parquet.Repetitions.Repeated, f2, -1 ))
276
277 f2 = append(f2, schema.NewFloat32Node("four" , parquet.Repetitions.Optional, -1 ))
278 group4 := schema.Must(schema.NewGroupNode("group" , parquet.Repetitions.Repeated, f2, -1 ))
279 group5 := schema.Must(schema.NewGroupNode("group" , parquet.Repetitions.Repeated, g.fields1(), -1 ))
280
281 g.True(group1.Equals(group1))
282 g.True(group1.Equals(group2))
283 g.False(group1.Equals(group3))
284 g.False(group1.Equals(group4))
285 g.False(group5.Equals(group4))
286 }
287
288 func (g *GroupNodeTestSuite) TestFieldIndex() {
289 fields := g.fields1()
290 group := schema.MustGroup(schema.NewGroupNode("group" , parquet.Repetitions.Required, fields, -1 ))
291 for idx, field := range fields {
292 f := group.Field(idx)
293 g.Same(field, f)
294 g.Equal(idx, group.FieldIndexByField(f))
295 g.Equal(idx, group.FieldIndexByName(field.Name()))
296 }
297
298
299 nonFieldAlien := schema.NewInt32Node("alien" , parquet.Repetitions.Required, -1 )
300 nonFieldFamiliar := schema.NewInt32Node("one" , parquet.Repetitions.Repeated, -1 )
301 g.Less(group.FieldIndexByField(nonFieldAlien), 0)
302 g.Less(group.FieldIndexByField(nonFieldFamiliar), 0)
303 }
304
305 func (g *GroupNodeTestSuite) TestFieldIndexDuplicateName() {
306 fields := g.fields2()
307 group := schema.MustGroup(schema.NewGroupNode("group" , parquet.Repetitions.Required, fields, -1 ))
308 for idx, field := range fields {
309 f := group.Field(idx)
310 g.Same(f, field)
311 g.Equal(idx, group.FieldIndexByField(f))
312 }
313 }
314
// SchemaConverterSuite groups the tests that convert a flat list of thrift
// schema elements into a schema node tree.
type SchemaConverterSuite struct {
	suite.Suite

	name string      // name given to the root group in several tests; assigned in SetupSuite
	node schema.Node // root node produced by the most recent call to convert
}
321
322 func (s *SchemaConverterSuite) SetupSuite() {
323 s.name = "parquet_schema"
324 }
325
326 func (s *SchemaConverterSuite) convert(elems []*format.SchemaElement) {
327 s.node = schema.Must(schema.FromParquet(elems))
328 s.Equal(schema.Group, s.node.Type())
329 }
330
331 func (s *SchemaConverterSuite) checkParentConsistency(groupRoot *schema.GroupNode) bool {
332
333 for i := 0; i < groupRoot.NumFields(); i++ {
334 field := groupRoot.Field(i)
335 if field.Parent() != groupRoot {
336 return false
337 }
338 if field.Type() == schema.Group {
339 if !s.checkParentConsistency(field.(*schema.GroupNode)) {
340 return false
341 }
342 }
343 }
344 return true
345 }
346
347 func (s *SchemaConverterSuite) TestNestedExample() {
348 elements := make([]*format.SchemaElement, 0)
349 elements = append(elements,
350 NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 , 0 ),
351 NewPrimitive("a" , format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 ),
352 NewGroup("bag" , format.FieldRepetitionType_OPTIONAL, 1 , 2 ))
353 elt := NewGroup("b" , format.FieldRepetitionType_REPEATED, 1 , 3 )
354 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_LIST)
355 elements = append(elements, elt, NewPrimitive("item" , format.FieldRepetitionType_OPTIONAL, format.Type_INT64, 4 ))
356
357 s.convert(elements)
358
359
360 fields := make([]schema.Node, 0)
361 fields = append(fields, schema.NewInt32Node("a" , parquet.Repetitions.Required, 1 ))
362
363
364 item := schema.NewInt64Node("item" , parquet.Repetitions.Optional, 4 )
365 list := schema.MustGroup(schema.NewGroupNodeConverted("b" , parquet.Repetitions.Repeated, schema.FieldList{item}, schema.ConvertedTypes.List, 3 ))
366 bag := schema.MustGroup(schema.NewGroupNode("bag" , parquet.Repetitions.Optional, schema.FieldList{list}, 2 ))
367 fields = append(fields, bag)
368
369 sc := schema.MustGroup(schema.NewGroupNode(s.name, parquet.Repetitions.Repeated, fields, 0 ))
370 s.True(sc.Equals(s.node))
371 s.Nil(s.node.Parent())
372 s.True(s.checkParentConsistency(s.node.(*schema.GroupNode)))
373 }
374
375 func (s *SchemaConverterSuite) TestZeroColumns() {
376 elements := []*format.SchemaElement{NewGroup("schema" , format.FieldRepetitionType_REPEATED, 0 , 0 )}
377 s.NotPanics(func() { s.convert(elements) })
378 }
379
380 func (s *SchemaConverterSuite) TestInvalidRoot() {
381
382
383
384 elements := []*format.SchemaElement{NewPrimitive("not-a-group" , format.FieldRepetitionType_REQUIRED,
385 format.Type_INT32, 0 ), format.NewSchemaElement()}
386 s.Panics(func() { s.convert(elements) })
387
388
389
390
391
392 elements = []*format.SchemaElement{
393 NewGroup("not-repeated" , format.FieldRepetitionType_REQUIRED, 1 , 0 ),
394 NewPrimitive("a" , format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 )}
395 s.NotPanics(func() { s.convert(elements) })
396
397 elements[0] = NewGroup("not-repeated" , format.FieldRepetitionType_OPTIONAL, 1 , 0 )
398 s.NotPanics(func() { s.convert(elements) })
399 }
400
401 func (s *SchemaConverterSuite) TestNotEnoughChildren() {
402 s.Panics(func() {
403 s.convert([]*format.SchemaElement{NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 , 0 )})
404 })
405 }
406
// TestColumnDesc checks the accessors and the String rendering of column
// descriptors created with schema.NewColumn, first for an optional UTF8 byte
// array node and then for a decimal fixed-length byte array node.
//
// NOTE(review): the expected raw strings are compared byte-for-byte with
// Column.String output, so any leading whitespace inside them matters —
// confirm the per-line indentation against the implementation before editing.
func TestColumnDesc(t *testing.T) {
	n := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name", parquet.Repetitions.Optional, parquet.Types.ByteArray,
		schema.ConvertedTypes.UTF8, 0, 0, 0, -1))
	// The second and third arguments come back from MaxDefinitionLevel and
	// MaxRepetitionLevel respectively (asserted below).
	descr := schema.NewColumn(n, 4, 1)

	assert.Equal(t, "name", descr.Name())
	assert.EqualValues(t, 4, descr.MaxDefinitionLevel())
	assert.EqualValues(t, 1, descr.MaxRepetitionLevel())
	assert.Equal(t, parquet.Types.ByteArray, descr.PhysicalType())
	// A byte array column built with type length 0 reports -1 here.
	assert.Equal(t, -1, descr.TypeLength())

	expectedDesc := `column descriptor = {
name: name,
path: ,
physical_type: BYTE_ARRAY,
converted_type: UTF8,
logical_type: String,
max_definition_level: 4,
max_repetition_level: 1,
}`
	assert.Equal(t, expectedDesc, descr.String())

	// Decimal FLBA: type length 12, precision 10, scale 4.
	n = schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name", parquet.Repetitions.Optional, parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 12, 10, 4, -1))
	descr2 := schema.NewColumn(n, 4, 1)

	assert.Equal(t, parquet.Types.FixedLenByteArray, descr2.PhysicalType())
	assert.Equal(t, 12, descr2.TypeLength())

	// The decimal descriptor additionally lists length, precision and scale.
	expectedDesc = `column descriptor = {
name: name,
path: ,
physical_type: FIXED_LEN_BYTE_ARRAY,
converted_type: DECIMAL,
logical_type: Decimal(precision=10, scale=4),
max_definition_level: 4,
max_repetition_level: 1,
length: 12,
precision: 10,
scale: 4,
}`
	assert.Equal(t, expectedDesc, descr2.String())
}
449
450 func TestSchemaDescriptor(t *testing.T) {
451 t.Run("Equals", func(t *testing.T) {
452 inta := schema.NewInt32Node("a" , parquet.Repetitions.Required, -1 )
453 intb := schema.NewInt64Node("b" , parquet.Repetitions.Optional, -1 )
454 intb2 := schema.NewInt64Node("b2" , parquet.Repetitions.Optional, -1 )
455 intc := schema.NewByteArrayNode("c" , parquet.Repetitions.Repeated, -1 )
456
457 item1 := schema.NewInt64Node("item1" , parquet.Repetitions.Required, -1 )
458 item2 := schema.NewBooleanNode("item2" , parquet.Repetitions.Optional, -1 )
459 item3 := schema.NewInt32Node("item3" , parquet.Repetitions.Repeated, -1 )
460 list := schema.MustGroup(schema.NewGroupNodeConverted("records" , parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 ))
461
462 bag := schema.MustGroup(schema.NewGroupNode("bag" , parquet.Repetitions.Optional, schema.FieldList{list}, -1 ))
463 bag2 := schema.MustGroup(schema.NewGroupNode("bag" , parquet.Repetitions.Required, schema.FieldList{list}, -1 ))
464
465 descr1 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 )))
466 assert.True(t, descr1.Equals(descr1))
467
468 descr2 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag2}, -1 )))
469 assert.False(t, descr1.Equals(descr2))
470
471 descr3 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, schema.FieldList{inta, intb2, intc, bag}, -1 )))
472 assert.False(t, descr1.Equals(descr3))
473
474 descr4 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("SCHEMA" , parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 )))
475 assert.True(t, descr1.Equals(descr4))
476
477 descr5 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag, intb2}, -1 )))
478 assert.False(t, descr1.Equals(descr5))
479
480 col1 := schema.NewColumn(inta, 5 , 1 )
481 col2 := schema.NewColumn(inta, 6 , 1 )
482 col3 := schema.NewColumn(inta, 5 , 2 )
483
484 assert.True(t, col1.Equals(col1))
485 assert.False(t, col1.Equals(col2))
486 assert.False(t, col2.Equals(col3))
487 })
488
489 t.Run("BuildTree", func(t *testing.T) {
490 inta := schema.NewInt32Node("a" , parquet.Repetitions.Required, -1 )
491 fields := schema.FieldList{inta}
492 fields = append(fields,
493 schema.NewInt64Node("b" , parquet.Repetitions.Optional, -1 ),
494 schema.NewByteArrayNode("c" , parquet.Repetitions.Repeated, -1 ))
495
496 item1 := schema.NewInt64Node("item1" , parquet.Repetitions.Required, -1 )
497 item2 := schema.NewBooleanNode("item2" , parquet.Repetitions.Optional, -1 )
498 item3 := schema.NewInt32Node("item3" , parquet.Repetitions.Repeated, -1 )
499 list := schema.MustGroup(schema.NewGroupNodeConverted("records" , parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 ))
500 bag := schema.MustGroup(schema.NewGroupNode("bag" , parquet.Repetitions.Optional, schema.FieldList{list}, -1 ))
501 fields = append(fields, bag)
502
503 sc := schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, fields, -1 ))
504 descr := schema.NewSchema(sc)
505
506 const nleaves = 6
507 assert.Equal(t, nleaves, descr.NumColumns())
508
509
510
511
512
513
514
515
516
517
518 var (
519 exMaxDefLevels = [...]int16{0, 1, 1, 2, 3, 3}
520 exMaxRepLevels = [...]int16{0, 0, 1, 1, 1, 2}
521 )
522
523 for i := 0; i < nleaves; i++ {
524 col := descr.Column(i)
525 assert.Equal(t, exMaxDefLevels[i], col.MaxDefinitionLevel())
526 assert.Equal(t, exMaxRepLevels[i], col.MaxRepetitionLevel())
527 }
528
529 assert.Equal(t, "a", descr.Column(0).Path())
530 assert.Equal(t, "b", descr.Column(1).Path())
531 assert.Equal(t, "c", descr.Column(2).Path())
532 assert.Equal(t, "bag.records.item1", descr.Column(3).Path())
533 assert.Equal(t, "bag.records.item2", descr.Column(4).Path())
534 assert.Equal(t, "bag.records.item3", descr.Column(5).Path())
535
536 for i := 0; i < nleaves; i++ {
537 col := descr.Column(i)
538 assert.Equal(t, i, descr.ColumnIndexByNode(col.SchemaNode()))
539 }
540
541 nonColumnAlien := schema.NewInt32Node("alien" , parquet.Repetitions.Required, -1 )
542 nonColumnFamiliar := schema.NewInt32Node("a" , parquet.Repetitions.Repeated, -1 )
543 assert.Less(t, descr.ColumnIndexByNode(nonColumnAlien), 0)
544 assert.Less(t, descr.ColumnIndexByNode(nonColumnFamiliar), 0)
545
546 assert.Same(t, inta, descr.ColumnRoot(0))
547 assert.Same(t, bag, descr.ColumnRoot(3))
548 assert.Same(t, bag, descr.ColumnRoot(4))
549 assert.Same(t, bag, descr.ColumnRoot(5))
550
551 assert.Same(t, sc, descr.Root())
552 })
553
554 t.Run("HasRepeatedFields", func(t *testing.T) {
555 inta := schema.NewInt32Node("a" , parquet.Repetitions.Required, -1 )
556 fields := schema.FieldList{inta}
557 fields = append(fields,
558 schema.NewInt64Node("b" , parquet.Repetitions.Optional, -1 ),
559 schema.NewByteArrayNode("c" , parquet.Repetitions.Repeated, -1 ))
560
561 sc := schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, fields, -1 ))
562 descr := schema.NewSchema(sc)
563 assert.True(t, descr.HasRepeatedFields())
564
565 item1 := schema.NewInt64Node("item1" , parquet.Repetitions.Required, -1 )
566 item2 := schema.NewBooleanNode("item2" , parquet.Repetitions.Optional, -1 )
567 item3 := schema.NewInt32Node("item3" , parquet.Repetitions.Repeated, -1 )
568 list := schema.MustGroup(schema.NewGroupNodeConverted("records" , parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 ))
569 bag := schema.MustGroup(schema.NewGroupNode("bag" , parquet.Repetitions.Optional, schema.FieldList{list}, -1 ))
570 fields = append(fields, bag)
571
572 sc = schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, fields, -1 ))
573 descr = schema.NewSchema(sc)
574 assert.True(t, descr.HasRepeatedFields())
575
576 itemKey := schema.NewInt64Node("key" , parquet.Repetitions.Required, -1 )
577 itemValue := schema.NewBooleanNode("value" , parquet.Repetitions.Optional, -1 )
578 sc = schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, append(fields, schema.FieldList{
579 schema.MustGroup(schema.NewGroupNode("my_map" , parquet.Repetitions.Optional, schema.FieldList{
580 schema.MustGroup(schema.NewGroupNodeConverted("map" , parquet.Repetitions.Repeated, schema.FieldList{itemKey, itemValue}, schema.ConvertedTypes.Map, -1 )),
581 }, -1 )),
582 }...), -1 ))
583 descr = schema.NewSchema(sc)
584 assert.True(t, descr.HasRepeatedFields())
585 })
586 }
587
588 func ExamplePrintSchema() {
589 fields := schema.FieldList{schema.NewInt32Node("a" , parquet.Repetitions.Required, 1 )}
590 item1 := schema.NewInt64Node("item1" , parquet.Repetitions.Optional, 4 )
591 item2 := schema.NewBooleanNode("item2" , parquet.Repetitions.Required, 5 )
592 list := schema.MustGroup(schema.NewGroupNodeConverted("b" , parquet.Repetitions.Repeated, schema.FieldList{item1, item2}, schema.ConvertedTypes.List, 3 ))
593 bag := schema.MustGroup(schema.NewGroupNode("bag" , parquet.Repetitions.Optional, schema.FieldList{list}, 2 ))
594 fields = append(fields, bag)
595
596 fields = append(fields,
597 schema.MustPrimitive(schema.NewPrimitiveNodeConverted("c" , parquet.Repetitions.Required, parquet.Types.Int32, schema.ConvertedTypes.Decimal, 0 , 3 , 2 , 6 )),
598 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("d" , parquet.Repetitions.Required, schema.NewDecimalLogicalType(10 , 5 ), parquet.Types.Int64, -1 , 7 )))
599
600 sc := schema.MustGroup(schema.NewGroupNode("schema" , parquet.Repetitions.Repeated, fields, 0 ))
601 schema.PrintSchema(sc, os.Stdout, 2)
602
603
604
605
606
607
608
609
610
611
612
613
614
615 }
616
617 func TestPanicSchemaNodeCreation(t *testing.T) {
618 assert.Panics(t, func() {
619 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("map" , parquet.Repetitions.Required, schema.MapLogicalType{}, parquet.Types.Int64, -1 , -1 ))
620 }, "nested logical type on non-group node")
621
622 assert.Panics(t, func() {
623 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("string" , parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.Boolean, -1 , -1 ))
624 }, "incompatible primitive type")
625
626 assert.Panics(t, func() {
627 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("interval" , parquet.Repetitions.Required, schema.IntervalLogicalType{}, parquet.Types.FixedLenByteArray, 11 , -1 ))
628 }, "incompatible primitive length")
629
630 assert.Panics(t, func() {
631 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("decimal" , parquet.Repetitions.Required, schema.NewDecimalLogicalType(16, 6), parquet.Types.Int32, -1 , -1 ))
632 }, "primitive too small for given precision")
633
634 assert.Panics(t, func() {
635 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("uuid" , parquet.Repetitions.Required, schema.UUIDLogicalType{}, parquet.Types.FixedLenByteArray, 64 , -1 ))
636 }, "incompatible primitive length")
637
638 assert.Panics(t, func() {
639 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("float16" , parquet.Repetitions.Required, schema.Float16LogicalType{}, parquet.Types.FixedLenByteArray, 4 , -1 ))
640 }, "incompatible primitive length")
641
642 assert.Panics(t, func() {
643 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("negative_len" , parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, -16 , -1 ))
644 }, "non-positive length for fixed length binary")
645
646 assert.Panics(t, func() {
647 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("zero_len" , parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, 0 , -1 ))
648 }, "non-positive length for fixed length binary")
649
650 assert.Panics(t, func() {
651 schema.MustGroup(schema.NewGroupNodeLogical("list" , parquet.Repetitions.Repeated, schema.FieldList{}, schema.JSONLogicalType{}, -1 ))
652 }, "non-nested logical type on group node")
653 }
654
655 func TestNullLogicalConvertsToNone(t *testing.T) {
656 var (
657 empty schema.LogicalType
658 n schema.Node
659 )
660 assert.NotPanics(t, func() {
661 n = schema.MustPrimitive(schema.NewPrimitiveNodeLogical("value" , parquet.Repetitions.Required, empty, parquet.Types.Double, -1 , -1 ))
662 })
663 assert.True(t, n.LogicalType().IsNone())
664 assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType())
665 assert.NotPanics(t, func() {
666 n = schema.MustGroup(schema.NewGroupNodeLogical("items" , parquet.Repetitions.Repeated, schema.FieldList{}, empty, -1 ))
667 })
668 assert.True(t, n.LogicalType().IsNone())
669 assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType())
670 }
671
View as plain text