...

Source file src/github.com/apache/arrow/go/v15/parquet/schema/reflection_test.go

Documentation: github.com/apache/arrow/go/v15/parquet/schema

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package schema_test
    18  
    19  import (
    20  	"log"
    21  	"os"
    22  	"reflect"
    23  	"testing"
    24  
    25  	"github.com/apache/arrow/go/v15/arrow/float16"
    26  	"github.com/apache/arrow/go/v15/parquet"
    27  	"github.com/apache/arrow/go/v15/parquet/schema"
    28  	"github.com/stretchr/testify/assert"
    29  )
    30  
    31  func ExampleNewSchemaFromStruct_primitives() {
    32  	type Schema struct {
    33  		Bool              bool
    34  		Int8              int8
    35  		Uint16            uint16
    36  		Int32             int32
    37  		Int64             int64
    38  		Int96             parquet.Int96
    39  		Float             float32
    40  		Double            float64
    41  		ByteArray         string
    42  		FixedLenByteArray [10]byte
    43  	}
    44  
    45  	sc, err := schema.NewSchemaFromStruct(Schema{})
    46  	if err != nil {
    47  		log.Fatal(err)
    48  	}
    49  
    50  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
    51  
    52  	// Output:
    53  	// repeated group field_id=-1 Schema {
    54  	//   required boolean field_id=-1 Bool;
    55  	//   required int32 field_id=-1 Int8 (Int(bitWidth=8, isSigned=true));
    56  	//   required int32 field_id=-1 Uint16 (Int(bitWidth=16, isSigned=false));
    57  	//   required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true));
    58  	//   required int64 field_id=-1 Int64 (Int(bitWidth=64, isSigned=true));
    59  	//   required int96 field_id=-1 Int96;
    60  	//   required float field_id=-1 Float;
    61  	//   required double field_id=-1 Double;
    62  	//   required byte_array field_id=-1 ByteArray;
    63  	//   required fixed_len_byte_array field_id=-1 FixedLenByteArray;
    64  	// }
    65  }
    66  
    67  func ExampleNewSchemaFromStruct_convertedtypes() {
    68  	type ConvertedSchema struct {
    69  		Utf8           string        `parquet:"name=utf8, converted=UTF8"`
    70  		Uint32         uint32        `parquet:"converted=INT_32"`
    71  		Date           int32         `parquet:"name=date, converted=date"`
    72  		TimeMilli      int32         `parquet:"name=timemilli, converted=TIME_MILLIS"`
    73  		TimeMicro      int64         `parquet:"name=timemicro, converted=time_micros"`
    74  		TimeStampMilli int64         `parquet:"converted=timestamp_millis"`
    75  		TimeStampMicro int64         `parquet:"converted=timestamp_micros"`
    76  		Interval       parquet.Int96 `parquet:"converted=INTERVAL"`
    77  		Decimal1       int32         `parquet:"converted=decimal, scale=2, precision=9"`
    78  		Decimal2       int64         `parquet:"converted=decimal, scale=2, precision=18"`
    79  		Decimal3       [12]byte      `parquet:"converted=decimal, scale=2, precision=10"`
    80  		Decimal4       string        `parquet:"converted=decimal, scale=2, precision=20"`
    81  	}
    82  
    83  	sc, err := schema.NewSchemaFromStruct(&ConvertedSchema{})
    84  	if err != nil {
    85  		log.Fatal(err)
    86  	}
    87  
    88  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
    89  
    90  	// Output:
    91  	// repeated group field_id=-1 ConvertedSchema {
    92  	//   required byte_array field_id=-1 utf8 (String);
    93  	//   required int32 field_id=-1 Uint32 (Int(bitWidth=32, isSigned=true));
    94  	//   required int32 field_id=-1 date (Date);
    95  	//   required int32 field_id=-1 timemilli (Time(isAdjustedToUTC=true, timeUnit=milliseconds));
    96  	//   required int64 field_id=-1 timemicro (Time(isAdjustedToUTC=true, timeUnit=microseconds));
    97  	//   required int64 field_id=-1 TimeStampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=true, force_set_converted_type=false));
    98  	//   required int64 field_id=-1 TimeStampMicro (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false));
    99  	//   required int96 field_id=-1 Interval;
   100  	//   required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2));
   101  	//   required int64 field_id=-1 Decimal2 (Decimal(precision=18, scale=2));
   102  	//   required fixed_len_byte_array field_id=-1 Decimal3 (Decimal(precision=10, scale=2));
   103  	//   required byte_array field_id=-1 Decimal4 (Decimal(precision=20, scale=2));
   104  	// }
   105  }
   106  
   107  func ExampleNewSchemaFromStruct_repetition() {
   108  	type RepetitionSchema struct {
   109  		List     []int64 `parquet:"fieldid=1"`
   110  		Repeated []int64 `parquet:"repetition=repeated, fieldid=2"`
   111  		Optional *int64  `parquet:"fieldid=3"`
   112  		Required *int64  `parquet:"repetition=REQUIRED, fieldid=4"`
   113  		Opt      int64   `parquet:"repetition=OPTIONAL, fieldid=5"`
   114  	}
   115  
   116  	sc, err := schema.NewSchemaFromStruct(RepetitionSchema{})
   117  	if err != nil {
   118  		log.Fatal(err)
   119  	}
   120  
   121  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
   122  
   123  	// Output:
   124  	// repeated group field_id=-1 RepetitionSchema {
   125  	//   required group field_id=1 List (List) {
   126  	//     repeated group field_id=-1 list {
   127  	//       required int64 field_id=-1 element (Int(bitWidth=64, isSigned=true));
   128  	//     }
   129  	//   }
   130  	//   repeated int64 field_id=2 Repeated (Int(bitWidth=64, isSigned=true));
   131  	//   optional int64 field_id=3 Optional (Int(bitWidth=64, isSigned=true));
   132  	//   required int64 field_id=4 Required (Int(bitWidth=64, isSigned=true));
   133  	//   optional int64 field_id=5 Opt (Int(bitWidth=64, isSigned=true));
   134  	// }
   135  }
   136  
   137  func ExampleNewSchemaFromStruct_logicaltypes() {
   138  	type LogicalTypes struct {
   139  		String                []byte   `parquet:"logical=String"`
   140  		Enum                  string   `parquet:"logical=enum"`
   141  		Date                  int32    `parquet:"logical=date"`
   142  		Decimal1              int32    `parquet:"logical=decimal, precision=9, scale=2"`
   143  		Decimal2              int32    `parquet:"logical=decimal, logical.precision=9, scale=2"`
   144  		Decimal3              int32    `parquet:"logical=decimal, precision=5, logical.precision=9, scale=1, logical.scale=3"`
   145  		TimeMilliUTC          int32    `parquet:"logical=TIME, logical.unit=millis"`
   146  		TimeMilli             int32    `parquet:"logical=Time, logical.unit=millis, logical.isadjustedutc=false"`
   147  		TimeMicros            int64    `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=false"`
   148  		TimeMicrosUTC         int64    `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=true"`
   149  		TimeNanos             int64    `parquet:"logical=time, logical.unit=nanos"`
   150  		TimestampMilli        int64    `parquet:"logical=timestamp, logical.unit=millis"`
   151  		TimestampMicrosNotUTC int64    `parquet:"logical=timestamp, logical.unit=micros, logical.isadjustedutc=false"`
   152  		TimestampNanos        int64    `parquet:"logical=timestamp, logical.unit=nanos"`
   153  		JSON                  string   `parquet:"logical=json"`
   154  		BSON                  []byte   `parquet:"logical=BSON"`
   155  		UUID                  [16]byte `parquet:"logical=uuid"`
   156  		Float16               [2]byte  `parquet:"logical=float16"`
   157  		Float16Optional       *[2]byte `parquet:"logical=float16"`
   158  		Float16Num            float16.Num
   159  	}
   160  
   161  	sc, err := schema.NewSchemaFromStruct(LogicalTypes{})
   162  	if err != nil {
   163  		log.Fatal(err)
   164  	}
   165  
   166  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
   167  
   168  	// Output:
   169  	// repeated group field_id=-1 LogicalTypes {
   170  	//   required byte_array field_id=-1 String (String);
   171  	//   required byte_array field_id=-1 Enum (Enum);
   172  	//   required int32 field_id=-1 Date (Date);
   173  	//   required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2));
   174  	//   required int32 field_id=-1 Decimal2 (Decimal(precision=9, scale=2));
   175  	//   required int32 field_id=-1 Decimal3 (Decimal(precision=9, scale=3));
   176  	//   required int32 field_id=-1 TimeMilliUTC (Time(isAdjustedToUTC=true, timeUnit=milliseconds));
   177  	//   required int32 field_id=-1 TimeMilli (Time(isAdjustedToUTC=false, timeUnit=milliseconds));
   178  	//   required int64 field_id=-1 TimeMicros (Time(isAdjustedToUTC=false, timeUnit=microseconds));
   179  	//   required int64 field_id=-1 TimeMicrosUTC (Time(isAdjustedToUTC=true, timeUnit=microseconds));
   180  	//   required int64 field_id=-1 TimeNanos (Time(isAdjustedToUTC=true, timeUnit=nanoseconds));
   181  	//   required int64 field_id=-1 TimestampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false));
   182  	//   required int64 field_id=-1 TimestampMicrosNotUTC (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
   183  	//   required int64 field_id=-1 TimestampNanos (Timestamp(isAdjustedToUTC=true, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false));
   184  	//   required byte_array field_id=-1 JSON (JSON);
   185  	//   required byte_array field_id=-1 BSON (BSON);
   186  	//   required fixed_len_byte_array field_id=-1 UUID (UUID);
   187  	//   required fixed_len_byte_array field_id=-1 Float16 (Float16);
   188  	//   optional fixed_len_byte_array field_id=-1 Float16Optional (Float16);
   189  	//   required fixed_len_byte_array field_id=-1 Float16Num (Float16);
   190  	// }
   191  }
   192  
   193  func ExampleNewSchemaFromStruct_physicaltype() {
   194  	type ChangeTypes struct {
   195  		Int32        int64  `parquet:"type=int32"`
   196  		FixedLen     string `parquet:"type=fixed_len_byte_array, length=10"`
   197  		SliceAsFixed []byte `parquet:"type=fixed_len_byte_array, length=12"`
   198  		Int          int    `parquet:"type=int32"`
   199  	}
   200  
   201  	sc, err := schema.NewSchemaFromStruct(ChangeTypes{})
   202  	if err != nil {
   203  		log.Fatal(err)
   204  	}
   205  
   206  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
   207  
   208  	// Output:
   209  	// repeated group field_id=-1 ChangeTypes {
   210  	//   required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true));
   211  	//   required fixed_len_byte_array field_id=-1 FixedLen;
   212  	//   required fixed_len_byte_array field_id=-1 SliceAsFixed;
   213  	//   required int32 field_id=-1 Int (Int(bitWidth=32, isSigned=true));
   214  	// }
   215  }
   216  
   217  func ExampleNewSchemaFromStruct_nestedtypes() {
   218  	type Other struct {
   219  		OptionalMap *map[string]*string `parquet:"valuerepetition=required, keylogical=String, valueconverted=BSON"`
   220  	}
   221  
   222  	type MyMap map[int32]string
   223  
   224  	type Nested struct {
   225  		SimpleMap     map[int32]string
   226  		FixedLenMap   map[string][]byte `parquet:"keytype=fixed_len_byte_array, keyfieldid=10, valuefieldid=11, keylength=10"`
   227  		DecimalMap    map[int32]string  `parquet:"logical=map, keyconverted=DECIMAL, keyscale=3, keyprecision=7, valuetype=fixed_len_byte_array, valuelength=4, valuelogical=decimal, valuelogical.precision=9, valuescale=2"`
   228  		OtherList     []*Other
   229  		OtherRepeated []Other  `parquet:"repetition=repeated"`
   230  		DateArray     [5]int32 `parquet:"valuelogical=date, logical=list"`
   231  		DateMap       MyMap    `parquet:"keylogical=TIME, keylogical.unit=MILLIS, keylogical.isadjustedutc=false, valuelogical=enum"`
   232  	}
   233  
   234  	sc, err := schema.NewSchemaFromStruct(Nested{})
   235  	if err != nil {
   236  		log.Fatal(err)
   237  	}
   238  
   239  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
   240  
   241  	// Output:
   242  	// repeated group field_id=-1 Nested {
   243  	//   required group field_id=-1 SimpleMap (Map) {
   244  	//     repeated group field_id=-1 key_value {
   245  	//       required int32 field_id=-1 key (Int(bitWidth=32, isSigned=true));
   246  	//       required byte_array field_id=-1 value;
   247  	//     }
   248  	//   }
   249  	//   required group field_id=-1 FixedLenMap (Map) {
   250  	//     repeated group field_id=-1 key_value {
   251  	//       required fixed_len_byte_array field_id=10 key;
   252  	//       required byte_array field_id=11 value;
   253  	//     }
   254  	//   }
   255  	//   required group field_id=-1 DecimalMap (Map) {
   256  	//     repeated group field_id=-1 key_value {
   257  	//       required int32 field_id=-1 key (Decimal(precision=7, scale=3));
   258  	//       required fixed_len_byte_array field_id=-1 value (Decimal(precision=9, scale=2));
   259  	//     }
   260  	//   }
   261  	//   required group field_id=-1 OtherList (List) {
   262  	//     repeated group field_id=-1 list {
   263  	//       optional group field_id=-1 element {
   264  	//         optional group field_id=-1 OptionalMap (Map) {
   265  	//           repeated group field_id=-1 key_value {
   266  	//             required byte_array field_id=-1 key (String);
   267  	//             required byte_array field_id=-1 value (BSON);
   268  	//           }
   269  	//         }
   270  	//       }
   271  	//     }
   272  	//   }
   273  	//   repeated group field_id=-1 OtherRepeated {
   274  	//     optional group field_id=-1 OptionalMap (Map) {
   275  	//       repeated group field_id=-1 key_value {
   276  	//         required byte_array field_id=-1 key (String);
   277  	//         required byte_array field_id=-1 value (BSON);
   278  	//       }
   279  	//     }
   280  	//   }
   281  	//   required group field_id=-1 DateArray (List) {
   282  	//     repeated group field_id=-1 list {
   283  	//       required int32 field_id=-1 element (Date);
   284  	//     }
   285  	//   }
   286  	//   required group field_id=-1 DateMap (Map) {
   287  	//     repeated group field_id=-1 key_value {
   288  	//       required int32 field_id=-1 key (Time(isAdjustedToUTC=false, timeUnit=milliseconds));
   289  	//       required byte_array field_id=-1 value (Enum);
   290  	//     }
   291  	//   }
   292  	// }
   293  }
   294  
   295  func TestStructFromSchema(t *testing.T) {
   296  	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
   297  		schema.NewBooleanNode("bool", parquet.Repetitions.Required, -1),
   298  		schema.NewInt32Node("int32", parquet.Repetitions.Optional, -1),
   299  		schema.NewInt64Node("int64", parquet.Repetitions.Repeated, -1),
   300  		schema.NewInt96Node("int96", parquet.Repetitions.Required, -1),
   301  		schema.NewFloat32Node("float", parquet.Repetitions.Required, -1),
   302  		schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1),
   303  		schema.NewFixedLenByteArrayNode("fixedLen", parquet.Repetitions.Required, 10, -1),
   304  	}, -1)
   305  	assert.NoError(t, err)
   306  
   307  	sc := schema.NewSchema(root)
   308  
   309  	typ, err := schema.NewStructFromSchema(sc)
   310  	assert.NoError(t, err)
   311  
   312  	assert.Equal(t, reflect.Struct, typ.Kind())
   313  	assert.Equal(t, "struct { bool bool; int32 *int32; int64 []int64; int96 parquet.Int96; float float32; bytearray parquet.ByteArray; fixedLen parquet.FixedLenByteArray }",
   314  		typ.String())
   315  }
   316  
   317  func TestStructFromSchemaWithNesting(t *testing.T) {
   318  	type Other struct {
   319  		List     *[]*float32
   320  		Excluded int32 `parquet:"-"`
   321  	}
   322  
   323  	type Nested struct {
   324  		Nest         []int32
   325  		OptionalNest []*int64
   326  		Mapped       map[string]float32
   327  		Other        []Other
   328  		Other2       Other
   329  	}
   330  
   331  	sc, err := schema.NewSchemaFromStruct(Nested{})
   332  	assert.NoError(t, err)
   333  
   334  	typ, err := schema.NewStructFromSchema(sc)
   335  	assert.NoError(t, err)
   336  	assert.Equal(t, "struct { Nest []int32; OptionalNest []*int64; Mapped map[string]float32; Other []struct { List *[]*float32 }; Other2 struct { List *[]*float32 } }",
   337  		typ.String())
   338  }
   339  
   340  func TestStructFromSchemaBackwardsCompatList(t *testing.T) {
   341  	tests := []struct {
   342  		name     string
   343  		n        schema.Node
   344  		expected string
   345  	}{
   346  		{"proper list", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required,
   347  			schema.FieldList{
   348  				schema.MustGroup(schema.NewGroupNode("list", parquet.Repetitions.Repeated, schema.FieldList{schema.NewBooleanNode("element", parquet.Repetitions.Optional, -1)}, -1)),
   349  			}, schema.NewListLogicalType(), -1)), "struct { my_list []*bool }"},
   350  		{"backward nullable list nonnull ints", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
   351  			schema.NewInt32Node("element", parquet.Repetitions.Repeated, -1),
   352  		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]int32 }"},
   353  		{"backward nullable list tuple string int", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
   354  			schema.MustGroup(schema.NewGroupNode("element", parquet.Repetitions.Repeated, schema.FieldList{
   355  				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
   356  				schema.NewInt32Node("num", parquet.Repetitions.Required, -1),
   357  			}, -1)),
   358  		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string; num int32 } }"},
   359  		{"list tuple string", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required, schema.FieldList{
   360  			schema.MustGroup(schema.NewGroupNode("array", parquet.Repetitions.Repeated, schema.FieldList{
   361  				schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1),
   362  			}, -1)),
   363  		}, schema.NewListLogicalType(), -1)), "struct { my_list []struct { str parquet.ByteArray } }"},
   364  		{"list tuple string my_list_tuple", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
   365  			schema.MustGroup(schema.NewGroupNode("my_list_tuple", parquet.Repetitions.Repeated, schema.FieldList{
   366  				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
   367  			}, -1)),
   368  		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string } }"},
   369  	}
   370  
   371  	for _, tt := range tests {
   372  		t.Run(tt.name, func(t *testing.T) {
   373  			typ, err := schema.NewStructFromSchema(schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1))))
   374  			assert.NoError(t, err)
   375  			assert.Equal(t, tt.expected, typ.String())
   376  		})
   377  	}
   378  }
   379  
   380  func TestStructFromSchemaMaps(t *testing.T) {
   381  	tests := []struct {
   382  		name     string
   383  		n        schema.Node
   384  		expected string
   385  	}{
   386  		{"map string int", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Required, schema.FieldList{
   387  			schema.MustGroup(schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, schema.FieldList{
   388  				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("key", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
   389  				schema.NewInt32Node("value", parquet.Repetitions.Optional, -1),
   390  			}, -1)),
   391  		}, schema.MapLogicalType{}, -1)), "struct { my_map map[string]*int32 }"},
   392  		{"nullable map string, int, required values", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Optional, schema.FieldList{
   393  			schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{
   394  				schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1),
   395  				schema.NewInt32Node("num", parquet.Repetitions.Required, -1),
   396  			}, -1)),
   397  		}, schema.MapLogicalType{}, -1)), "struct { my_map *map[string]int32 }"},
   398  		{"map_key_value with missing value", schema.MustGroup(schema.NewGroupNodeConverted("my_map", parquet.Repetitions.Optional, schema.FieldList{
   399  			schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{
   400  				schema.NewByteArrayNode("key", parquet.Repetitions.Required, -1),
   401  			}, -1)),
   402  		}, schema.ConvertedTypes.MapKeyValue, -1)), "struct { my_map *map[string]bool }"},
   403  	}
   404  	for _, tt := range tests {
   405  		t.Run(tt.name, func(t *testing.T) {
   406  			typ, err := schema.NewStructFromSchema(schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1))))
   407  			assert.NoError(t, err)
   408  			assert.Equal(t, tt.expected, typ.String())
   409  		})
   410  	}
   411  }
   412  

View as plain text