1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package pqarrow_test
18
19 import (
20 "encoding/base64"
21 "testing"
22
23 "github.com/apache/arrow/go/v15/arrow"
24 "github.com/apache/arrow/go/v15/arrow/flight"
25 "github.com/apache/arrow/go/v15/arrow/ipc"
26 "github.com/apache/arrow/go/v15/arrow/memory"
27 "github.com/apache/arrow/go/v15/internal/types"
28 "github.com/apache/arrow/go/v15/parquet"
29 "github.com/apache/arrow/go/v15/parquet/metadata"
30 "github.com/apache/arrow/go/v15/parquet/pqarrow"
31 "github.com/apache/arrow/go/v15/parquet/schema"
32 "github.com/stretchr/testify/assert"
33 "github.com/stretchr/testify/require"
34 )
35
36 func TestGetOriginSchemaBase64(t *testing.T) {
37 uuidType := types.NewUUIDType()
38 md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
39 extMd := arrow.NewMetadata([]string{ipc.ExtensionMetadataKeyName, ipc.ExtensionTypeKeyName, "PARQUET:field_id"}, []string{uuidType.Serialize(), uuidType.ExtensionName(), "-1"})
40 origArrSc := arrow.NewSchema([]arrow.Field{
41 {Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md},
42 {Name: "f2", Type: arrow.PrimitiveTypes.Int64, Metadata: md},
43 {Name: "uuid", Type: uuidType, Metadata: extMd},
44 }, nil)
45
46 arrSerializedSc := flight.SerializeSchema(origArrSc, memory.DefaultAllocator)
47 if err := arrow.RegisterExtensionType(uuidType); err != nil {
48 t.Fatal(err)
49 }
50 defer arrow.UnregisterExtensionType(uuidType.ExtensionName())
51 pqschema, err := pqarrow.ToParquet(origArrSc, nil, pqarrow.DefaultWriterProps())
52 require.NoError(t, err)
53
54 tests := []struct {
55 name string
56 enc *base64.Encoding
57 }{
58 {"raw", base64.RawStdEncoding},
59 {"std", base64.StdEncoding},
60 }
61
62 for _, tt := range tests {
63 t.Run(tt.name, func(t *testing.T) {
64 kv := metadata.NewKeyValueMetadata()
65 kv.Append("ARROW:schema", tt.enc.EncodeToString(arrSerializedSc))
66 arrsc, err := pqarrow.FromParquet(pqschema, nil, kv)
67 assert.NoError(t, err)
68 assert.True(t, origArrSc.Equal(arrsc))
69 })
70 }
71 }
72
73 func TestGetOriginSchemaUnregisteredExtension(t *testing.T) {
74 uuidType := types.NewUUIDType()
75 if err := arrow.RegisterExtensionType(uuidType); err != nil {
76 t.Fatal(err)
77 }
78
79 md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
80 origArrSc := arrow.NewSchema([]arrow.Field{
81 {Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md},
82 {Name: "f2", Type: arrow.PrimitiveTypes.Int64, Metadata: md},
83 {Name: "uuid", Type: uuidType, Metadata: md},
84 }, nil)
85 pqschema, err := pqarrow.ToParquet(origArrSc, nil, pqarrow.DefaultWriterProps())
86 require.NoError(t, err)
87
88 arrSerializedSc := flight.SerializeSchema(origArrSc, memory.DefaultAllocator)
89 kv := metadata.NewKeyValueMetadata()
90 kv.Append("ARROW:schema", base64.StdEncoding.EncodeToString(arrSerializedSc))
91
92 arrow.UnregisterExtensionType(uuidType.ExtensionName())
93 arrsc, err := pqarrow.FromParquet(pqschema, nil, kv)
94 require.NoError(t, err)
95
96 extMd := arrow.NewMetadata([]string{ipc.ExtensionMetadataKeyName, ipc.ExtensionTypeKeyName, "PARQUET:field_id"},
97 []string{uuidType.Serialize(), uuidType.ExtensionName(), "-1"})
98 expArrSc := arrow.NewSchema([]arrow.Field{
99 {Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md},
100 {Name: "f2", Type: arrow.PrimitiveTypes.Int64, Metadata: md},
101 {Name: "uuid", Type: uuidType.StorageType(), Metadata: extMd},
102 }, nil)
103
104 assert.Truef(t, expArrSc.Equal(arrsc), "expected: %s\ngot: %s", expArrSc, arrsc)
105 }
106
107 func TestToParquetWriterConfig(t *testing.T) {
108 origSc := arrow.NewSchema([]arrow.Field{
109 {Name: "f1", Type: arrow.BinaryTypes.String},
110 {Name: "f2", Type: arrow.PrimitiveTypes.Int64},
111 }, nil)
112
113 tests := []struct {
114 name string
115 rootRepetition parquet.Repetition
116 }{
117 {"test1", parquet.Repetitions.Required},
118 {"test2", parquet.Repetitions.Repeated},
119 }
120
121 for _, tt := range tests {
122 t.Run(tt.name, func(t *testing.T) {
123
124 pqschema, err := pqarrow.ToParquet(origSc,
125 parquet.NewWriterProperties(
126 parquet.WithRootName(tt.name),
127 parquet.WithRootRepetition(tt.rootRepetition),
128 ),
129 pqarrow.DefaultWriterProps())
130 require.NoError(t, err)
131
132 assert.Equal(t, tt.name, pqschema.Root().Name())
133 assert.Equal(t, tt.rootRepetition, pqschema.Root().RepetitionType())
134 })
135 }
136 }
137
138 func TestConvertArrowFlatPrimitives(t *testing.T) {
139 parquetFields := make(schema.FieldList, 0)
140 arrowFields := make([]arrow.Field, 0)
141
142 parquetFields = append(parquetFields, schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1))
143 arrowFields = append(arrowFields, arrow.Field{Name: "boolean", Type: arrow.FixedWidthTypes.Boolean, Nullable: false})
144
145 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int8", parquet.Repetitions.Required,
146 schema.NewIntLogicalType(8, true), parquet.Types.Int32, 0, -1)))
147 arrowFields = append(arrowFields, arrow.Field{Name: "int8", Type: arrow.PrimitiveTypes.Int8, Nullable: false})
148
149 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint8", parquet.Repetitions.Required,
150 schema.NewIntLogicalType(8, false), parquet.Types.Int32, 0, -1)))
151 arrowFields = append(arrowFields, arrow.Field{Name: "uint8", Type: arrow.PrimitiveTypes.Uint8, Nullable: false})
152
153 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int16", parquet.Repetitions.Required,
154 schema.NewIntLogicalType(16, true), parquet.Types.Int32, 0, -1)))
155 arrowFields = append(arrowFields, arrow.Field{Name: "int16", Type: arrow.PrimitiveTypes.Int16, Nullable: false})
156
157 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint16", parquet.Repetitions.Required,
158 schema.NewIntLogicalType(16, false), parquet.Types.Int32, 0, -1)))
159 arrowFields = append(arrowFields, arrow.Field{Name: "uint16", Type: arrow.PrimitiveTypes.Uint16, Nullable: false})
160
161 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int32", parquet.Repetitions.Required,
162 schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)))
163 arrowFields = append(arrowFields, arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32, Nullable: false})
164
165 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint32", parquet.Repetitions.Required,
166 schema.NewIntLogicalType(32, false), parquet.Types.Int32, 0, -1)))
167 arrowFields = append(arrowFields, arrow.Field{Name: "uint32", Type: arrow.PrimitiveTypes.Uint32, Nullable: false})
168
169 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int64", parquet.Repetitions.Required,
170 schema.NewIntLogicalType(64, true), parquet.Types.Int64, 0, -1)))
171 arrowFields = append(arrowFields, arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64, Nullable: false})
172
173 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint64", parquet.Repetitions.Required,
174 schema.NewIntLogicalType(64, false), parquet.Types.Int64, 0, -1)))
175 arrowFields = append(arrowFields, arrow.Field{Name: "uint64", Type: arrow.PrimitiveTypes.Uint64, Nullable: false})
176
177 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp", parquet.Repetitions.Required,
178 parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0, 0, 0, -1)))
179 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: false})
180
181 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp[us]", parquet.Repetitions.Required,
182 parquet.Types.Int64, schema.ConvertedTypes.TimestampMicros, 0, 0, 0, -1)))
183 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[us]", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: false})
184
185 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date", parquet.Repetitions.Required,
186 schema.DateLogicalType{}, parquet.Types.Int32, 0, -1)))
187 arrowFields = append(arrowFields, arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date32, Nullable: false})
188
189 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date64", parquet.Repetitions.Required,
190 schema.NewTimestampLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1)))
191 arrowFields = append(arrowFields, arrow.Field{Name: "date64", Type: arrow.FixedWidthTypes.Date64, Nullable: false})
192
193 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time32", parquet.Repetitions.Required,
194 schema.NewTimeLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int32, 0, -1)))
195 arrowFields = append(arrowFields, arrow.Field{Name: "time32", Type: arrow.FixedWidthTypes.Time32ms, Nullable: false})
196
197 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time64", parquet.Repetitions.Required,
198 schema.NewTimeLogicalType(true, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
199 arrowFields = append(arrowFields, arrow.Field{Name: "time64", Type: arrow.FixedWidthTypes.Time64us, Nullable: false})
200
201 parquetFields = append(parquetFields, schema.NewInt96Node("timestamp96", parquet.Repetitions.Required, -1))
202 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp96", Type: arrow.FixedWidthTypes.Timestamp_ns, Nullable: false})
203
204 parquetFields = append(parquetFields, schema.NewFloat32Node("float", parquet.Repetitions.Optional, -1))
205 arrowFields = append(arrowFields, arrow.Field{Name: "float", Type: arrow.PrimitiveTypes.Float32, Nullable: true})
206
207 parquetFields = append(parquetFields, schema.NewFloat64Node("double", parquet.Repetitions.Optional, -1))
208 arrowFields = append(arrowFields, arrow.Field{Name: "double", Type: arrow.PrimitiveTypes.Float64, Nullable: true})
209
210 parquetFields = append(parquetFields, schema.NewByteArrayNode("binary", parquet.Repetitions.Optional, -1))
211 arrowFields = append(arrowFields, arrow.Field{Name: "binary", Type: arrow.BinaryTypes.Binary, Nullable: true})
212
213 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("string", parquet.Repetitions.Optional,
214 schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)))
215 arrowFields = append(arrowFields, arrow.Field{Name: "string", Type: arrow.BinaryTypes.String, Nullable: true})
216
217 parquetFields = append(parquetFields, schema.NewFixedLenByteArrayNode("flba-binary", parquet.Repetitions.Optional, 12, -1))
218 arrowFields = append(arrowFields, arrow.Field{Name: "flba-binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 12}, Nullable: true})
219
220 arrowSchema := arrow.NewSchema(arrowFields, nil)
221 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
222
223 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
224 assert.NoError(t, err)
225 assert.True(t, parquetSchema.Equals(result))
226 for i := 0; i < parquetSchema.NumColumns(); i++ {
227 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
228 }
229 }
230
231 func TestConvertArrowParquetLists(t *testing.T) {
232 parquetFields := make(schema.FieldList, 0)
233 arrowFields := make([]arrow.Field, 0)
234
235 parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list",
236 parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Required, -1)))
237
238 arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String)})
239
240 parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list",
241 parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Optional, -1)))
242
243 arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String), Nullable: true})
244
245 arrowSchema := arrow.NewSchema(arrowFields, nil)
246 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
247
248 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
249 assert.NoError(t, err)
250 assert.True(t, parquetSchema.Equals(result), parquetSchema.String(), result.String())
251 for i := 0; i < parquetSchema.NumColumns(); i++ {
252 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
253 }
254 }
255
256 func TestConvertArrowDecimals(t *testing.T) {
257 parquetFields := make(schema.FieldList, 0)
258 arrowFields := make([]arrow.Field, 0)
259
260 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_8_4", parquet.Repetitions.Required,
261 schema.NewDecimalLogicalType(8, 4), parquet.Types.FixedLenByteArray, 4, -1)))
262 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_8_4", Type: &arrow.Decimal128Type{Precision: 8, Scale: 4}})
263
264 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_20_4", parquet.Repetitions.Required,
265 schema.NewDecimalLogicalType(20, 4), parquet.Types.FixedLenByteArray, 9, -1)))
266 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_20_4", Type: &arrow.Decimal128Type{Precision: 20, Scale: 4}})
267
268 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_77_4", parquet.Repetitions.Required,
269 schema.NewDecimalLogicalType(77, 4), parquet.Types.FixedLenByteArray, 34, -1)))
270 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_77_4", Type: &arrow.Decimal128Type{Precision: 77, Scale: 4}})
271
272 arrowSchema := arrow.NewSchema(arrowFields, nil)
273 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
274
275 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
276 assert.NoError(t, err)
277 assert.True(t, parquetSchema.Equals(result))
278 for i := 0; i < parquetSchema.NumColumns(); i++ {
279 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
280 }
281 }
282
283 func TestConvertArrowFloat16(t *testing.T) {
284 parquetFields := make(schema.FieldList, 0)
285 arrowFields := make([]arrow.Field, 0)
286
287 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("float16", parquet.Repetitions.Required,
288 schema.Float16LogicalType{}, parquet.Types.FixedLenByteArray, 2, -1)))
289 arrowFields = append(arrowFields, arrow.Field{Name: "float16", Type: &arrow.Float16Type{}})
290
291 arrowSchema := arrow.NewSchema(arrowFields, nil)
292 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
293
294 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
295 assert.NoError(t, err)
296 assert.True(t, parquetSchema.Equals(result))
297 for i := 0; i < parquetSchema.NumColumns(); i++ {
298 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
299 }
300 }
301
302 func TestCoerceTImestampV1(t *testing.T) {
303 parquetFields := make(schema.FieldList, 0)
304 arrowFields := make([]arrow.Field, 0)
305
306 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required,
307 schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
308 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "EST"}})
309
310 arrowSchema := arrow.NewSchema(arrowFields, nil)
311 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
312
313 result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties(pqarrow.WithCoerceTimestamps(arrow.Microsecond)))
314 assert.NoError(t, err)
315 assert.True(t, parquetSchema.Equals(result))
316 for i := 0; i < parquetSchema.NumColumns(); i++ {
317 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
318 }
319 }
320
321 func TestAutoCoerceTImestampV1(t *testing.T) {
322 parquetFields := make(schema.FieldList, 0)
323 arrowFields := make([]arrow.Field, 0)
324
325 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required,
326 schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
327 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "EST"}})
328
329 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp[ms]", parquet.Repetitions.Required,
330 schema.NewTimestampLogicalTypeForce(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1)))
331 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[ms]", Type: &arrow.TimestampType{Unit: arrow.Second}})
332
333 arrowSchema := arrow.NewSchema(arrowFields, nil)
334 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
335
336 result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties())
337 assert.NoError(t, err)
338 assert.True(t, parquetSchema.Equals(result))
339 for i := 0; i < parquetSchema.NumColumns(); i++ {
340 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
341 }
342 }
343
344 func TestConvertArrowStruct(t *testing.T) {
345 parquetFields := make(schema.FieldList, 0)
346 arrowFields := make([]arrow.Field, 0)
347
348 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("leaf1", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)))
349 parquetFields = append(parquetFields, schema.Must(schema.NewGroupNode("outerGroup", parquet.Repetitions.Required, schema.FieldList{
350 schema.Must(schema.NewPrimitiveNodeLogical("leaf2", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)),
351 schema.Must(schema.NewGroupNode("innerGroup", parquet.Repetitions.Required, schema.FieldList{
352 schema.Must(schema.NewPrimitiveNodeLogical("leaf3", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)),
353 }, -1)),
354 }, -1)))
355
356 arrowFields = append(arrowFields, arrow.Field{Name: "leaf1", Type: arrow.PrimitiveTypes.Int32, Nullable: true})
357 arrowFields = append(arrowFields, arrow.Field{Name: "outerGroup", Type: arrow.StructOf(
358 arrow.Field{Name: "leaf2", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
359 arrow.Field{Name: "innerGroup", Type: arrow.StructOf(
360 arrow.Field{Name: "leaf3", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
361 )},
362 )})
363
364 arrowSchema := arrow.NewSchema(arrowFields, nil)
365 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
366
367 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties())
368 assert.NoError(t, err)
369 assert.True(t, parquetSchema.Equals(result))
370 for i := 0; i < parquetSchema.NumColumns(); i++ {
371 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
372 }
373 }
374
375 func TestListStructBackwardCompatible(t *testing.T) {
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403 pqSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("root", parquet.Repetitions.Required, schema.FieldList{
404 schema.Must(schema.NewGroupNodeLogical("answers", parquet.Repetitions.Optional, schema.FieldList{
405 schema.Must(schema.NewGroupNode("array", parquet.Repetitions.Repeated, schema.FieldList{
406 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("type", parquet.Repetitions.Optional,
407 schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
408 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("rdata", parquet.Repetitions.Optional,
409 schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
410 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("class", parquet.Repetitions.Optional,
411 schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
412 }, -1)),
413 }, schema.NewListLogicalType(), -1)),
414 }, -1)))
415
416 meta := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
417
418 arrowSchema := arrow.NewSchema(
419 []arrow.Field{
420 {Name: "answers", Type: arrow.ListOfField(arrow.Field{
421 Name: "array", Type: arrow.StructOf(
422 arrow.Field{Name: "type", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
423 arrow.Field{Name: "rdata", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
424 arrow.Field{Name: "class", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
425 ), Nullable: true}), Nullable: true, Metadata: meta},
426 }, nil)
427
428 arrsc, err := pqarrow.FromParquet(pqSchema, nil, metadata.KeyValueMetadata{})
429 assert.NoError(t, err)
430 assert.True(t, arrowSchema.Equal(arrsc))
431 }
432
433
434
435 func TestUnsupportedTypes(t *testing.T) {
436 unsupportedTypes := []struct {
437 typ arrow.DataType
438 }{
439
440 {typ: &arrow.DurationType{}},
441 {typ: &arrow.DayTimeIntervalType{}},
442 {typ: &arrow.MonthIntervalType{}},
443 {typ: &arrow.MonthDayNanoIntervalType{}},
444 {typ: &arrow.DenseUnionType{}},
445 {typ: &arrow.SparseUnionType{}},
446 }
447 for _, tc := range unsupportedTypes {
448 t.Run(tc.typ.ID().String(), func(t *testing.T) {
449 arrowFields := make([]arrow.Field, 0)
450 arrowFields = append(arrowFields, arrow.Field{Name: "unsupported", Type: tc.typ, Nullable: true})
451 arrowSchema := arrow.NewSchema(arrowFields, nil)
452 _, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties())
453 assert.ErrorIs(t, err, arrow.ErrNotImplemented)
454 assert.ErrorContains(t, err, "support for "+tc.typ.ID().String())
455 })
456 }
457 }
458
View as plain text