
Source file src/github.com/apache/arrow/go/v15/parquet/schema/schema_test.go

Documentation: github.com/apache/arrow/go/v15/parquet/schema

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    17  package schema_test
    19  import (
    20  	"os"
    21  	"testing"
    23  	"github.com/apache/arrow/go/v15/parquet"
    24  	format "github.com/apache/arrow/go/v15/parquet/internal/gen-go/parquet"
    25  	"github.com/apache/arrow/go/v15/parquet/schema"
    26  	"github.com/apache/thrift/lib/go/thrift"
    27  	"github.com/stretchr/testify/assert"
    28  	"github.com/stretchr/testify/suite"
    29  )
    31  func TestColumnPath(t *testing.T) {
    32  	p := parquet.ColumnPath([]string{"toplevel", "leaf"})
    33  	assert.Equal(t, "toplevel.leaf", p.String())
    35  	p2 := parquet.ColumnPathFromString("toplevel.leaf")
    36  	assert.Equal(t, "toplevel.leaf", p2.String())
    38  	extend := p2.Extend("anotherlevel")
    39  	assert.Equal(t, "toplevel.leaf.anotherlevel", extend.String())
    40  }
    42  func NewPrimitive(name string, repetition format.FieldRepetitionType, typ format.Type, fieldID int32) *format.SchemaElement {
    43  	ret := &format.SchemaElement{
    44  		Name:           name,
    45  		RepetitionType: format.FieldRepetitionTypePtr(repetition),
    46  		Type:           format.TypePtr(typ),
    47  	}
    48  	if fieldID >= 0 {
    49  		ret.FieldID = &fieldID
    50  	}
    51  	return ret
    52  }
    54  func NewGroup(name string, repetition format.FieldRepetitionType, numChildren, fieldID int32) *format.SchemaElement {
    55  	ret := &format.SchemaElement{
    56  		Name:           name,
    57  		RepetitionType: format.FieldRepetitionTypePtr(repetition),
    58  		NumChildren:    &numChildren,
    59  	}
    60  	if fieldID >= 0 {
    61  		ret.FieldID = &fieldID
    62  	}
    63  	return ret
    64  }
    66  func TestSchemaNodes(t *testing.T) {
    67  	suite.Run(t, new(PrimitiveNodeTestSuite))
    68  	suite.Run(t, new(GroupNodeTestSuite))
    69  	suite.Run(t, new(SchemaConverterSuite))
    70  }
// PrimitiveNodeTestSuite groups the tests that exercise primitive schema nodes.
type PrimitiveNodeTestSuite struct {
	suite.Suite
	// name and fieldID are assigned in SetupTest and used when building
	// thrift elements in the conversion tests.
	name    string
	fieldID int32
	// node holds the result of the most recent call to convert.
	node    schema.Node
}
    80  func (p *PrimitiveNodeTestSuite) SetupTest() {
    81  	p.name = "name"
    82  	p.fieldID = 5
    83  }
    85  func (p *PrimitiveNodeTestSuite) convert(elt *format.SchemaElement) {
    86  	p.node = schema.MustPrimitive(schema.PrimitiveNodeFromThrift(elt))
    87  	p.IsType(&schema.PrimitiveNode{}, p.node)
    88  }
    90  func (p *PrimitiveNodeTestSuite) TestAttrs() {
    91  	node1 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
    92  	node2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("bar" /* name */, parquet.Repetitions.Optional, parquet.Types.ByteArray,
    93  		schema.ConvertedTypes.UTF8, 0 /* type len */, 0 /* precision */, 0 /* scale */, -1 /* fieldID */))
    95  	p.Equal("foo", node1.Name())
    96  	p.Equal(schema.Primitive, node1.Type())
    97  	p.Equal(schema.Primitive, node2.Type())
    99  	p.Equal(parquet.Repetitions.Repeated, node1.RepetitionType())
   100  	p.Equal(parquet.Repetitions.Optional, node2.RepetitionType())
   102  	p.Equal(parquet.Types.Int32, node1.PhysicalType())
   103  	p.Equal(parquet.Types.ByteArray, node2.PhysicalType())
   105  	p.Equal(schema.ConvertedTypes.None, node1.ConvertedType())
   106  	p.Equal(schema.ConvertedTypes.UTF8, node2.ConvertedType())
   107  }
   109  func (p *PrimitiveNodeTestSuite) TestFromParquet() {
   110  	p.Run("Optional Int32", func() {
   111  		elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_INT32, p.fieldID)
   112  		p.convert(elt)
   114  		p.Equal(p.name, p.node.Name())
   115  		p.Equal(p.fieldID, p.node.FieldID())
   116  		p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType())
   117  		p.Equal(parquet.Types.Int32, p.node.(*schema.PrimitiveNode).PhysicalType())
   118  		p.Equal(schema.ConvertedTypes.None, p.node.ConvertedType())
   119  	})
   121  	p.Run("LogicalType", func() {
   122  		elt := NewPrimitive(p.name, format.FieldRepetitionType_REQUIRED, format.Type_BYTE_ARRAY, p.fieldID)
   123  		elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_UTF8)
   124  		p.convert(elt)
   126  		p.Equal(parquet.Repetitions.Required, p.node.RepetitionType())
   127  		p.Equal(parquet.Types.ByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
   128  		p.Equal(schema.ConvertedTypes.UTF8, p.node.ConvertedType())
   129  	})
   131  	p.Run("FixedLenByteArray", func() {
   132  		elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID)
   133  		elt.TypeLength = thrift.Int32Ptr(16)
   134  		p.convert(elt)
   136  		p.Equal(p.name, p.node.Name())
   137  		p.Equal(p.fieldID, p.node.FieldID())
   138  		p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType())
   139  		p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
   140  		p.Equal(16, p.node.(*schema.PrimitiveNode).TypeLength())
   141  	})
   143  	p.Run("convertedtype::decimal", func() {
   144  		elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID)
   145  		elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_DECIMAL)
   146  		elt.TypeLength = thrift.Int32Ptr(6)
   147  		elt.Scale = thrift.Int32Ptr(2)
   148  		elt.Precision = thrift.Int32Ptr(12)
   150  		p.convert(elt)
   151  		p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType())
   152  		p.Equal(schema.ConvertedTypes.Decimal, p.node.ConvertedType())
   153  		p.Equal(6, p.node.(*schema.PrimitiveNode).TypeLength())
   154  		p.EqualValues(2, p.node.(*schema.PrimitiveNode).DecimalMetadata().Scale)
   155  		p.EqualValues(12, p.node.(*schema.PrimitiveNode).DecimalMetadata().Precision)
   156  	})
   157  }
   159  func (p *PrimitiveNodeTestSuite) TestEquals() {
   160  	const fieldID = -1
   161  	node1 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Required, fieldID)
   162  	node2 := schema.NewInt64Node("foo" /* name */, parquet.Repetitions.Required, fieldID)
   163  	node3 := schema.NewInt32Node("bar" /* name */, parquet.Repetitions.Required, fieldID)
   164  	node4 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Optional, fieldID)
   165  	node5 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Required, fieldID)
   167  	p.True(node1.Equals(node1))
   168  	p.False(node1.Equals(node2))
   169  	p.False(node1.Equals(node3))
   170  	p.False(node1.Equals(node4))
   171  	p.True(node1.Equals(node5))
   173  	flba1 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
   174  		schema.ConvertedTypes.Decimal, 12 /* type len */, 4 /* precision */, 2 /* scale */, fieldID))
   175  	flba2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
   176  		schema.ConvertedTypes.Decimal, 1 /* type len */, 4 /* precision */, 2 /* scale */, fieldID))
   177  	flba2.SetTypeLength(12)
   179  	flba3 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
   180  		schema.ConvertedTypes.Decimal, 1 /* type len */, 4 /* precision */, 2 /* scale */, fieldID))
   181  	flba3.SetTypeLength(16)
   183  	flba4 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
   184  		schema.ConvertedTypes.Decimal, 12 /* type len */, 4 /* precision */, 0 /* scale */, fieldID))
   185  	flba5 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray,
   186  		schema.ConvertedTypes.None, 12 /* type len */, 4 /* precision */, 0 /* scale */, fieldID))
   188  	p.True(flba1.Equals(flba2))
   189  	p.False(flba1.Equals(flba3))
   190  	p.False(flba1.Equals(flba4))
   191  	p.False(flba1.Equals(flba5))
   192  }
   194  func (p *PrimitiveNodeTestSuite) TestPhysicalLogicalMapping() {
   195  	tests := []struct {
   196  		typ       parquet.Type
   197  		cnv       schema.ConvertedType
   198  		typLen    int
   199  		precision int
   200  		scale     int
   201  		shouldErr bool
   202  	}{
   203  		{parquet.Types.Int32, schema.ConvertedTypes.Int32, 0 /* type len */, 0 /* precision */, 0 /* scale */, false},
   204  		{parquet.Types.ByteArray, schema.ConvertedTypes.JSON, 0 /* type len */, 0 /* precision */, 0 /* scale */, false},
   205  		{parquet.Types.Int32, schema.ConvertedTypes.JSON, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
   206  		{parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0 /* type len */, 0 /* precision */, 0 /* scale */, false},
   207  		{parquet.Types.Int32, schema.ConvertedTypes.Int64, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
   208  		{parquet.Types.ByteArray, schema.ConvertedTypes.Int8, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
   209  		{parquet.Types.ByteArray, schema.ConvertedTypes.Interval, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
   210  		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Enum, 0 /* type len */, 0 /* precision */, 0 /* scale */, true},
   211  		{parquet.Types.ByteArray, schema.ConvertedTypes.Enum, 0 /* type len */, 0 /* precision */, 0 /* scale */, false},
   212  		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 /* type len */, 2 /* precision */, 4 /* scale */, true},
   213  		{parquet.Types.Float, schema.ConvertedTypes.Decimal, 0 /* type len */, 2 /* precision */, 4 /* scale */, true},
   214  		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 /* type len */, 4 /* precision */, 0 /* scale */, true},
   215  		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 4 /* precision */, -1 /* scale */, true},
   216  		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 2 /* precision */, 4 /* scale */, true},
   217  		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 6 /* precision */, 4 /* scale */, false},
   218  		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 12 /* type len */, 0 /* precision */, 0 /* scale */, false},
   219  		{parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 10 /* type len */, 0 /* precision */, 0 /* scale */, true},
   220  	}
   221  	for _, tt := range tests {
   222  		p.Run(tt.typ.String(), func() {
   223  			_, err := schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, tt.typ, tt.cnv, tt.typLen, tt.precision, tt.scale, -1 /* fieldID */)
   224  			if tt.shouldErr {
   225  				p.Error(err)
   226  			} else {
   227  				p.NoError(err)
   228  			}
   229  		})
   230  	}
   231  }
// GroupNodeTestSuite groups the tests that exercise group schema nodes.
// It carries no state of its own beyond the embedded testify suite.
type GroupNodeTestSuite struct {
	suite.Suite
}
   237  func (g *GroupNodeTestSuite) fields1() []schema.Node {
   238  	return schema.FieldList{
   239  		schema.NewInt32Node("one" /* name */, parquet.Repetitions.Required, -1 /* fieldID */),
   240  		schema.NewInt64Node("two" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
   241  		schema.NewFloat64Node("three" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
   242  	}
   243  }
   245  func (g *GroupNodeTestSuite) fields2() []schema.Node {
   246  	return schema.FieldList{
   247  		schema.NewInt32Node("duplicate" /* name */, parquet.Repetitions.Required, -1 /* fieldID */),
   248  		schema.NewInt64Node("unique" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
   249  		schema.NewFloat64Node("duplicate" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
   250  	}
   251  }
   253  func (g *GroupNodeTestSuite) TestAttrs() {
   254  	fields := g.fields1()
   256  	node1 := schema.MustGroup(schema.NewGroupNode("foo" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */))
   257  	node2 := schema.MustGroup(schema.NewGroupNodeConverted("bar" /* name */, parquet.Repetitions.Optional, fields, schema.ConvertedTypes.List, -1 /* fieldID */))
   259  	g.Equal("foo", node1.Name())
   260  	g.Equal(schema.Group, node1.Type())
   261  	g.Equal(len(fields), node1.NumFields())
   262  	g.Equal(parquet.Repetitions.Repeated, node1.RepetitionType())
   263  	g.Equal(parquet.Repetitions.Optional, node2.RepetitionType())
   265  	g.Equal(schema.ConvertedTypes.None, node1.ConvertedType())
   266  	g.Equal(schema.ConvertedTypes.List, node2.ConvertedType())
   267  }
   269  func (g *GroupNodeTestSuite) TestEquals() {
   270  	f1 := g.fields1()
   271  	f2 := g.fields1()
   273  	group1 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f1, -1 /* fieldID */))
   274  	group2 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */))
   275  	group3 := schema.Must(schema.NewGroupNode("group2" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */))
   277  	f2 = append(f2, schema.NewFloat32Node("four" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */))
   278  	group4 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */))
   279  	group5 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, g.fields1(), -1 /* fieldID */))
   281  	g.True(group1.Equals(group1))
   282  	g.True(group1.Equals(group2))
   283  	g.False(group1.Equals(group3))
   284  	g.False(group1.Equals(group4))
   285  	g.False(group5.Equals(group4))
   286  }
   288  func (g *GroupNodeTestSuite) TestFieldIndex() {
   289  	fields := g.fields1()
   290  	group := schema.MustGroup(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Required, fields, -1 /* fieldID */))
   291  	for idx, field := range fields {
   292  		f := group.Field(idx)
   293  		g.Same(field, f)
   294  		g.Equal(idx, group.FieldIndexByField(f))
   295  		g.Equal(idx, group.FieldIndexByName(field.Name()))
   296  	}
   298  	// Non field nodes
   299  	nonFieldAlien := schema.NewInt32Node("alien" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   300  	nonFieldFamiliar := schema.NewInt32Node("one" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
   301  	g.Less(group.FieldIndexByField(nonFieldAlien), 0)
   302  	g.Less(group.FieldIndexByField(nonFieldFamiliar), 0)
   303  }
   305  func (g *GroupNodeTestSuite) TestFieldIndexDuplicateName() {
   306  	fields := g.fields2()
   307  	group := schema.MustGroup(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Required, fields, -1 /* fieldID */))
   308  	for idx, field := range fields {
   309  		f := group.Field(idx)
   310  		g.Same(f, field)
   311  		g.Equal(idx, group.FieldIndexByField(f))
   312  	}
   313  }
// SchemaConverterSuite groups the tests that convert flattened thrift schema
// elements into a schema node tree.
type SchemaConverterSuite struct {
	suite.Suite
	// name is the root group name used by the tests; assigned in SetupSuite.
	name string
	// node holds the result of the most recent call to convert.
	node schema.Node
}
   322  func (s *SchemaConverterSuite) SetupSuite() {
   323  	s.name = "parquet_schema"
   324  }
   326  func (s *SchemaConverterSuite) convert(elems []*format.SchemaElement) {
   327  	s.node = schema.Must(schema.FromParquet(elems))
   328  	s.Equal(schema.Group, s.node.Type())
   329  }
   331  func (s *SchemaConverterSuite) checkParentConsistency(groupRoot *schema.GroupNode) bool {
   332  	// each node should have the group as parent
   333  	for i := 0; i < groupRoot.NumFields(); i++ {
   334  		field := groupRoot.Field(i)
   335  		if field.Parent() != groupRoot {
   336  			return false
   337  		}
   338  		if field.Type() == schema.Group {
   339  			if !s.checkParentConsistency(field.(*schema.GroupNode)) {
   340  				return false
   341  			}
   342  		}
   343  	}
   344  	return true
   345  }
   347  func (s *SchemaConverterSuite) TestNestedExample() {
   348  	elements := make([]*format.SchemaElement, 0)
   349  	elements = append(elements,
   350  		NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */),
   351  		NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */),
   352  		NewGroup("bag" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 2 /* fieldID */))
   353  	elt := NewGroup("b" /* name */, format.FieldRepetitionType_REPEATED, 1 /* numChildren */, 3 /* fieldID */)
   354  	elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_LIST)
   355  	elements = append(elements, elt, NewPrimitive("item" /* name */, format.FieldRepetitionType_OPTIONAL, format.Type_INT64, 4 /* fieldID */))
   357  	s.convert(elements)
   359  	// construct the expected schema
   360  	fields := make([]schema.Node, 0)
   361  	fields = append(fields, schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */))
   363  	// 3-level list encoding
   364  	item := schema.NewInt64Node("item" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */)
   365  	list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item}, schema.ConvertedTypes.List, 3 /* fieldID */))
   366  	bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */))
   367  	fields = append(fields, bag)
   369  	sc := schema.MustGroup(schema.NewGroupNode(s.name, parquet.Repetitions.Repeated, fields, 0 /* fieldID */))
   370  	s.True(sc.Equals(s.node))
   371  	s.Nil(s.node.Parent())
   372  	s.True(s.checkParentConsistency(s.node.(*schema.GroupNode)))
   373  }
   375  func (s *SchemaConverterSuite) TestZeroColumns() {
   376  	elements := []*format.SchemaElement{NewGroup("schema" /* name */, format.FieldRepetitionType_REPEATED, 0 /* numChildren */, 0 /* fieldID */)}
   377  	s.NotPanics(func() { s.convert(elements) })
   378  }
   380  func (s *SchemaConverterSuite) TestInvalidRoot() {
   381  	// According to the Parquet spec, the first element in the list<SchemaElement>
   382  	// is a group whose children (and their descendants) contain all of the rest of
   383  	// the flattened schema elements. If the first element is not a group, it is malformed
   384  	elements := []*format.SchemaElement{NewPrimitive("not-a-group" /* name */, format.FieldRepetitionType_REQUIRED,
   385  		format.Type_INT32, 0 /* fieldID */), format.NewSchemaElement()}
   386  	s.Panics(func() { s.convert(elements) })
   388  	// While the parquet spec indicates that the root group should have REPEATED
   389  	// repetition type, some implementations may return REQUIRED or OPTIONAL
   390  	// groups as the first element. These tests check that this is okay as a
   391  	// practicality matter
   392  	elements = []*format.SchemaElement{
   393  		NewGroup("not-repeated" /* name */, format.FieldRepetitionType_REQUIRED, 1 /* numChildren */, 0 /* fieldID */),
   394  		NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */)}
   395  	s.NotPanics(func() { s.convert(elements) })
   397  	elements[0] = NewGroup("not-repeated" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 0 /* fieldID */)
   398  	s.NotPanics(func() { s.convert(elements) })
   399  }
   401  func (s *SchemaConverterSuite) TestNotEnoughChildren() {
   402  	s.Panics(func() {
   403  		s.convert([]*format.SchemaElement{NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */)})
   404  	})
   405  }
   407  func TestColumnDesc(t *testing.T) {
   408  	n := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name" /* name */, parquet.Repetitions.Optional, parquet.Types.ByteArray,
   409  		schema.ConvertedTypes.UTF8, 0 /* type len */, 0 /* precision */, 0 /* scale */, -1 /* fieldID */))
   410  	descr := schema.NewColumn(n, 4, 1)
   412  	assert.Equal(t, "name", descr.Name())
   413  	assert.EqualValues(t, 4, descr.MaxDefinitionLevel())
   414  	assert.EqualValues(t, 1, descr.MaxRepetitionLevel())
   415  	assert.Equal(t, parquet.Types.ByteArray, descr.PhysicalType())
   416  	assert.Equal(t, -1, descr.TypeLength())
   418  	expectedDesc := `column descriptor = {
   419    name: name,
   420    path: ,
   421    physical_type: BYTE_ARRAY,
   422    converted_type: UTF8,
   423    logical_type: String,
   424    max_definition_level: 4,
   425    max_repetition_level: 1,
   426  }`
   427  	assert.Equal(t, expectedDesc, descr.String())
   429  	n = schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name" /* name */, parquet.Repetitions.Optional, parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 12 /* type len */, 10 /* precision */, 4 /* scale */, -1 /* fieldID */))
   430  	descr2 := schema.NewColumn(n, 4, 1)
   432  	assert.Equal(t, parquet.Types.FixedLenByteArray, descr2.PhysicalType())
   433  	assert.Equal(t, 12, descr2.TypeLength())
   435  	expectedDesc = `column descriptor = {
   436    name: name,
   437    path: ,
   438    physical_type: FIXED_LEN_BYTE_ARRAY,
   439    converted_type: DECIMAL,
   440    logical_type: Decimal(precision=10, scale=4),
   441    max_definition_level: 4,
   442    max_repetition_level: 1,
   443    length: 12,
   444    precision: 10,
   445    scale: 4,
   446  }`
   447  	assert.Equal(t, expectedDesc, descr2.String())
   448  }
   450  func TestSchemaDescriptor(t *testing.T) {
   451  	t.Run("Equals", func(t *testing.T) {
   452  		inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   453  		intb := schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
   454  		intb2 := schema.NewInt64Node("b2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
   455  		intc := schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
   457  		item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   458  		item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
   459  		item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
   460  		list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */))
   462  		bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */))
   463  		bag2 := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Required, schema.FieldList{list}, -1 /* fieldID */))
   465  		descr1 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 /* fieldID */)))
   466  		assert.True(t, descr1.Equals(descr1))
   468  		descr2 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag2}, -1 /* fieldID */)))
   469  		assert.False(t, descr1.Equals(descr2))
   471  		descr3 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb2, intc, bag}, -1 /* fieldID */)))
   472  		assert.False(t, descr1.Equals(descr3))
   474  		descr4 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("SCHEMA" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 /* fieldID */)))
   475  		assert.True(t, descr1.Equals(descr4))
   477  		descr5 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag, intb2}, -1 /* fieldID */)))
   478  		assert.False(t, descr1.Equals(descr5))
   480  		col1 := schema.NewColumn(inta, 5 /* maxDefLvl */, 1 /* maxRepLvl */)
   481  		col2 := schema.NewColumn(inta, 6 /* maxDefLvl */, 1 /* maxRepLvl */)
   482  		col3 := schema.NewColumn(inta, 5 /* maxDefLvl */, 2 /* maxRepLvl */)
   484  		assert.True(t, col1.Equals(col1))
   485  		assert.False(t, col1.Equals(col2))
   486  		assert.False(t, col2.Equals(col3))
   487  	})
   489  	t.Run("BuildTree", func(t *testing.T) {
   490  		inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   491  		fields := schema.FieldList{inta}
   492  		fields = append(fields,
   493  			schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
   494  			schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */))
   496  		item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   497  		item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
   498  		item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
   499  		list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */))
   500  		bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */))
   501  		fields = append(fields, bag)
   503  		sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */))
   504  		descr := schema.NewSchema(sc)
   506  		const nleaves = 6
   507  		assert.Equal(t, nleaves, descr.NumColumns())
   509  		//                             mdef mrep
   510  		// required int32 a            0    0
   511  		// optional int64 b            1    0
   512  		// repeated byte_array c       1    1
   513  		// optional group bag          1    0
   514  		//   repeated group records    2    1
   515  		//     required int64 item1    2    1
   516  		//     optional boolean item2  3    1
   517  		//     repeated int32 item3    3    2
   518  		var (
   519  			exMaxDefLevels = [...]int16{0, 1, 1, 2, 3, 3}
   520  			exMaxRepLevels = [...]int16{0, 0, 1, 1, 1, 2}
   521  		)
   523  		for i := 0; i < nleaves; i++ {
   524  			col := descr.Column(i)
   525  			assert.Equal(t, exMaxDefLevels[i], col.MaxDefinitionLevel())
   526  			assert.Equal(t, exMaxRepLevels[i], col.MaxRepetitionLevel())
   527  		}
   529  		assert.Equal(t, "a", descr.Column(0).Path())
   530  		assert.Equal(t, "b", descr.Column(1).Path())
   531  		assert.Equal(t, "c", descr.Column(2).Path())
   532  		assert.Equal(t, "bag.records.item1", descr.Column(3).Path())
   533  		assert.Equal(t, "bag.records.item2", descr.Column(4).Path())
   534  		assert.Equal(t, "bag.records.item3", descr.Column(5).Path())
   536  		for i := 0; i < nleaves; i++ {
   537  			col := descr.Column(i)
   538  			assert.Equal(t, i, descr.ColumnIndexByNode(col.SchemaNode()))
   539  		}
   541  		nonColumnAlien := schema.NewInt32Node("alien" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   542  		nonColumnFamiliar := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
   543  		assert.Less(t, descr.ColumnIndexByNode(nonColumnAlien), 0)
   544  		assert.Less(t, descr.ColumnIndexByNode(nonColumnFamiliar), 0)
   546  		assert.Same(t, inta, descr.ColumnRoot(0))
   547  		assert.Same(t, bag, descr.ColumnRoot(3))
   548  		assert.Same(t, bag, descr.ColumnRoot(4))
   549  		assert.Same(t, bag, descr.ColumnRoot(5))
   551  		assert.Same(t, sc, descr.Root())
   552  	})
   554  	t.Run("HasRepeatedFields", func(t *testing.T) {
   555  		inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   556  		fields := schema.FieldList{inta}
   557  		fields = append(fields,
   558  			schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */),
   559  			schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */))
   561  		sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */))
   562  		descr := schema.NewSchema(sc)
   563  		assert.True(t, descr.HasRepeatedFields())
   565  		item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   566  		item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
   567  		item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)
   568  		list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */))
   569  		bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */))
   570  		fields = append(fields, bag)
   572  		sc = schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */))
   573  		descr = schema.NewSchema(sc)
   574  		assert.True(t, descr.HasRepeatedFields())
   576  		itemKey := schema.NewInt64Node("key" /* name */, parquet.Repetitions.Required, -1 /* fieldID */)
   577  		itemValue := schema.NewBooleanNode("value" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)
   578  		sc = schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, append(fields, schema.FieldList{
   579  			schema.MustGroup(schema.NewGroupNode("my_map" /* name */, parquet.Repetitions.Optional, schema.FieldList{
   580  				schema.MustGroup(schema.NewGroupNodeConverted("map" /* name */, parquet.Repetitions.Repeated, schema.FieldList{itemKey, itemValue}, schema.ConvertedTypes.Map, -1 /* fieldID */)),
   581  			}, -1 /* fieldID */)),
   582  		}...), -1 /* fieldID */))
   583  		descr = schema.NewSchema(sc)
   584  		assert.True(t, descr.HasRepeatedFields())
   585  	})
   586  }
   588  func ExamplePrintSchema() {
   589  	fields := schema.FieldList{schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */)}
   590  	item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */)
   591  	item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Required, 5 /* fieldID */)
   592  	list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2}, schema.ConvertedTypes.List, 3 /* fieldID */))
   593  	bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */))
   594  	fields = append(fields, bag)
   596  	fields = append(fields,
   597  		schema.MustPrimitive(schema.NewPrimitiveNodeConverted("c" /* name */, parquet.Repetitions.Required, parquet.Types.Int32, schema.ConvertedTypes.Decimal, 0 /* type len */, 3 /* precision */, 2 /* scale */, 6 /* fieldID */)),
   598  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("d" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(10 /* precision */, 5 /* scale */), parquet.Types.Int64, -1 /* type len */, 7 /* fieldID */)))
   600  	sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, 0 /* fieldID */))
   601  	schema.PrintSchema(sc, os.Stdout, 2)
   603  	// Output:
   604  	// repeated group field_id=0 schema {
   605  	//   required int32 field_id=1 a;
   606  	//   optional group field_id=2 bag {
   607  	//     repeated group field_id=3 b (List) {
   608  	//       optional int64 field_id=4 item1;
   609  	//       required boolean field_id=5 item2;
   610  	//     }
   611  	//   }
   612  	//   required int32 field_id=6 c (Decimal(precision=3, scale=2));
   613  	//   required int64 field_id=7 d (Decimal(precision=10, scale=5));
   614  	// }
   615  }
   617  func TestPanicSchemaNodeCreation(t *testing.T) {
   618  	assert.Panics(t, func() {
   619  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("map" /* name */, parquet.Repetitions.Required, schema.MapLogicalType{}, parquet.Types.Int64, -1 /* type len */, -1 /* fieldID */))
   620  	}, "nested logical type on non-group node")
   622  	assert.Panics(t, func() {
   623  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("string" /* name */, parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.Boolean, -1 /* type len */, -1 /* fieldID */))
   624  	}, "incompatible primitive type")
   626  	assert.Panics(t, func() {
   627  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("interval" /* name */, parquet.Repetitions.Required, schema.IntervalLogicalType{}, parquet.Types.FixedLenByteArray, 11 /* type len */, -1 /* fieldID */))
   628  	}, "incompatible primitive length")
   630  	assert.Panics(t, func() {
   631  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("decimal" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(16, 6), parquet.Types.Int32, -1 /* type len */, -1 /* fieldID */))
   632  	}, "primitive too small for given precision")
   634  	assert.Panics(t, func() {
   635  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("uuid" /* name */, parquet.Repetitions.Required, schema.UUIDLogicalType{}, parquet.Types.FixedLenByteArray, 64 /* type len */, -1 /* fieldID */))
   636  	}, "incompatible primitive length")
   638  	assert.Panics(t, func() {
   639  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("float16" /* name */, parquet.Repetitions.Required, schema.Float16LogicalType{}, parquet.Types.FixedLenByteArray, 4 /* type len */, -1 /* fieldID */))
   640  	}, "incompatible primitive length")
   642  	assert.Panics(t, func() {
   643  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("negative_len" /* name */, parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, -16 /* type len */, -1 /* fieldID */))
   644  	}, "non-positive length for fixed length binary")
   646  	assert.Panics(t, func() {
   647  		schema.MustPrimitive(schema.NewPrimitiveNodeLogical("zero_len" /* name */, parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, 0 /* type len */, -1 /* fieldID */))
   648  	}, "non-positive length for fixed length binary")
   650  	assert.Panics(t, func() {
   651  		schema.MustGroup(schema.NewGroupNodeLogical("list" /* name */, parquet.Repetitions.Repeated, schema.FieldList{}, schema.JSONLogicalType{}, -1 /* fieldID */))
   652  	}, "non-nested logical type on group node")
   653  }
   655  func TestNullLogicalConvertsToNone(t *testing.T) {
   656  	var (
   657  		empty schema.LogicalType
   658  		n     schema.Node
   659  	)
   660  	assert.NotPanics(t, func() {
   661  		n = schema.MustPrimitive(schema.NewPrimitiveNodeLogical("value" /* name */, parquet.Repetitions.Required, empty, parquet.Types.Double, -1 /* type len */, -1 /* fieldID */))
   662  	})
   663  	assert.True(t, n.LogicalType().IsNone())
   664  	assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType())
   665  	assert.NotPanics(t, func() {
   666  		n = schema.MustGroup(schema.NewGroupNodeLogical("items" /* name */, parquet.Repetitions.Repeated, schema.FieldList{}, empty, -1 /* fieldID */))
   667  	})
   668  	assert.True(t, n.LogicalType().IsNone())
   669  	assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType())
   670  }

View as plain text