...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package parquet
18
19 import (
20 "encoding/binary"
21 "io"
22 "reflect"
23 "strings"
24 "time"
25 "unsafe"
26
27 "github.com/apache/arrow/go/v15/arrow"
28 format "github.com/apache/arrow/go/v15/parquet/internal/gen-go/parquet"
29 )
30
const (
	// julianUnixEpoch is the Julian day number that corresponds to the Unix
	// epoch; Int96.ToTime subtracts it from the stored Julian day.
	julianUnixEpoch int64 = 2_440_588
	// nanosPerDay is the number of nanoseconds in one 24-hour day.
	nanosPerDay int64 = 24 * 60 * 60 * 1_000_000_000

	// Int96SizeBytes is the number of bytes that make up a single Int96 value.
	Int96SizeBytes int = 12
)
37
var (
	// Int96Traits is the package-level helper value whose methods size and
	// reinterpret buffers of Int96 values.
	Int96Traits int96Traits

	// ByteArrayTraits is the package-level helper value whose methods size and
	// reinterpret buffers of ByteArray values.
	ByteArrayTraits byteArrayTraits

	// FixedLenByteArrayTraits is the package-level helper value whose methods
	// size and reinterpret buffers of FixedLenByteArray values.
	FixedLenByteArrayTraits fixedLenByteArrayTraits

	// ByteArraySizeBytes is the in-memory size of a ByteArray value itself
	// (the slice header, not the bytes it refers to), obtained via reflection.
	ByteArraySizeBytes int = int(reflect.TypeOf(ByteArray{}).Size())

	// FixedLenByteArraySizeBytes is the in-memory size of a FixedLenByteArray
	// value itself (the slice header, not the bytes it refers to), obtained
	// via reflection.
	FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size())
)
50
51
52
53
54
// ReaderAtSeeker is the combination of io.ReaderAt and io.Seeker: a source
// that supports reads at explicit offsets as well as seeking.
type ReaderAtSeeker interface {
	io.ReaderAt
	io.Seeker
}
59
60
61 func NewInt96(v [3]uint32) (out Int96) {
62 binary.LittleEndian.PutUint32(out[0:], v[0])
63 binary.LittleEndian.PutUint32(out[4:], v[1])
64 binary.LittleEndian.PutUint32(out[8:], v[2])
65 return
66 }
67
68
69
70 type Int96 [12]byte
71
72
73 func (i96 *Int96) SetNanoSeconds(nanos int64) {
74 binary.LittleEndian.PutUint64(i96[:8], uint64(nanos))
75 }
76
77
78
79 func (i96 Int96) String() string {
80 return i96.ToTime().String()
81 }
82
83
84 func (i96 Int96) ToTime() time.Time {
85 nanos := binary.LittleEndian.Uint64(i96[:8])
86 jdays := binary.LittleEndian.Uint32(i96[8:])
87
88 nanos = (uint64(jdays)-uint64(julianUnixEpoch))*uint64(nanosPerDay) + nanos
89 t := time.Unix(0, int64(nanos))
90 return t.UTC()
91 }
92
93 type int96Traits struct{}
94
95 func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n }
96
97 func (int96Traits) CastFromBytes(b []byte) []Int96 {
98 h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
99
100 var res []Int96
101 s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
102 s.Data = h.Data
103 s.Len = h.Len / Int96SizeBytes
104 s.Cap = h.Cap / Int96SizeBytes
105
106 return res
107 }
108
109 func (int96Traits) CastToBytes(b []Int96) []byte {
110 h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
111
112 var res []byte
113 s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
114 s.Data = h.Data
115 s.Len = h.Len * Int96SizeBytes
116 s.Cap = h.Cap * Int96SizeBytes
117
118 return res
119 }
120
121
// ByteArray is a named byte-slice type with a few convenience methods.
type ByteArray []byte

// Len reports how many bytes the ByteArray holds.
func (b ByteArray) Len() int {
	size := len(b)
	return size
}

// String returns the bytes viewed as a string without copying them: the
// slice header is reinterpreted as a string header, so the result shares
// memory with b.
func (b ByteArray) String() string {
	view := (*string)(unsafe.Pointer(&b))
	return *view
}

// Bytes returns the ByteArray as a plain []byte sharing the same memory.
func (b ByteArray) Bytes() []byte {
	return []byte(b)
}
137
138 type byteArrayTraits struct{}
139
140 func (byteArrayTraits) BytesRequired(n int) int {
141 return ByteArraySizeBytes * n
142 }
143
144 func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray {
145 h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
146
147 var res []ByteArray
148 s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
149 s.Data = h.Data
150 s.Len = h.Len / ByteArraySizeBytes
151 s.Cap = h.Cap / ByteArraySizeBytes
152
153 return res
154 }
155
156
// FixedLenByteArray is a named byte-slice type with a few convenience
// methods.
type FixedLenByteArray []byte

// Len reports how many bytes the FixedLenByteArray holds.
func (b FixedLenByteArray) Len() int {
	size := len(b)
	return size
}

// String returns the bytes viewed as a string without copying them: the
// slice header is reinterpreted as a string header, so the result shares
// memory with b.
func (b FixedLenByteArray) String() string {
	view := (*string)(unsafe.Pointer(&b))
	return *view
}

// Bytes returns the FixedLenByteArray as a plain []byte sharing the same
// memory.
func (b FixedLenByteArray) Bytes() []byte {
	return []byte(b)
}
172
173 type fixedLenByteArrayTraits struct{}
174
175 func (fixedLenByteArrayTraits) BytesRequired(n int) int {
176 return FixedLenByteArraySizeBytes * n
177 }
178
179 func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray {
180 h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
181
182 var res []FixedLenByteArray
183 s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
184 s.Data = h.Data
185 s.Len = h.Len / FixedLenByteArraySizeBytes
186 s.Cap = h.Cap / FixedLenByteArraySizeBytes
187
188 return res
189 }
190
191
192
193
194
195
196
type (
	// Type is this package's name for the generated format.Type enum; the
	// supported values are collected in Types.
	Type format.Type

	// Cipher identifies an encryption algorithm; see AesGcm and AesCtr.
	Cipher int

	// ColumnOrder is a pointer to the generated format.ColumnOrder; the
	// predefined values are collected in ColumnOrders.
	ColumnOrder *format.ColumnOrder

	// Version identifies a format version; see V1_0, V2_4, V2_6 and
	// V2_LATEST.
	Version int8

	// DataPageVersion identifies a data page version; see DataPageV1 and
	// DataPageV2.
	DataPageVersion int8

	// Encoding is this package's name for the generated format.Encoding
	// enum; the supported values are collected in Encodings.
	Encoding format.Encoding

	// Repetition is this package's name for the generated
	// format.FieldRepetitionType enum; the supported values are collected in
	// Repetitions.
	Repetition format.FieldRepetitionType

	// ColumnPath is a column's path expressed as a list of name components.
	// Its String form joins the components with ".".
	ColumnPath []string
)
215
216 func (c ColumnPath) String() string {
217 if c == nil {
218 return ""
219 }
220 return strings.Join(c, ".")
221 }
222
223
224 func (c ColumnPath) Extend(s string) ColumnPath {
225 p := make([]string, len(c), len(c)+1)
226 copy(p, c)
227 return append(p, s)
228 }
229
230
231 func ColumnPathFromString(s string) ColumnPath {
232 return strings.Split(s, ".")
233 }
234
235
// Values of Cipher. They are numbered from zero in declaration order.
const (
	// AesGcm is the first Cipher value (0).
	AesGcm Cipher = iota
	// AesCtr is the second Cipher value (1).
	AesCtr
)
240
241
242
243
244
245
246
247
// Values of Version. They are numbered from zero in declaration order, so a
// later version compares greater than an earlier one.
const (
	// V1_0 is the first Version value (0).
	V1_0 Version = iota

	// V2_4 is the second Version value (1).
	V2_4

	// V2_6 is the third Version value (2).
	V2_6

	// V2_LATEST is an alias for the newest version declared here, currently
	// V2_6.
	V2_LATEST = V2_6
)
273
274
// Values of DataPageVersion. They are numbered from zero in declaration
// order.
const (
	// DataPageV1 is the first DataPageVersion value (0).
	DataPageV1 DataPageVersion = iota
	// DataPageV2 is the second DataPageVersion value (1).
	DataPageV2
)
279
280 func (e Encoding) String() string {
281 return format.Encoding(e).String()
282 }
283
var (
	// Types collects the supported Type values so they can be referenced by
	// name, e.g. Types.Int32. Each field other than Undefined is the
	// corresponding value of the generated format.Type enum.
	Types = struct {
		Boolean           Type
		Int32             Type
		Int64             Type
		Int96             Type
		Float             Type
		Double            Type
		ByteArray         Type
		FixedLenByteArray Type

		// Undefined is a sentinel that does not correspond to any generated
		// format.Type value; it is set to one past FIXED_LEN_BYTE_ARRAY.
		Undefined Type
	}{
		Boolean:           Type(format.Type_BOOLEAN),
		Int32:             Type(format.Type_INT32),
		Int64:             Type(format.Type_INT64),
		Int96:             Type(format.Type_INT96),
		Float:             Type(format.Type_FLOAT),
		Double:            Type(format.Type_DOUBLE),
		ByteArray:         Type(format.Type_BYTE_ARRAY),
		FixedLenByteArray: Type(format.Type_FIXED_LEN_BYTE_ARRAY),
		Undefined:         Type(format.Type_FIXED_LEN_BYTE_ARRAY + 1),
	}

	// Encodings collects the supported Encoding values so they can be
	// referenced by name, e.g. Encodings.Plain. Each field is the
	// corresponding value of the generated format.Encoding enum.
	Encodings = struct {
		Plain                Encoding
		PlainDict            Encoding
		RLE                  Encoding
		RLEDict              Encoding
		BitPacked            Encoding
		DeltaByteArray       Encoding
		DeltaBinaryPacked    Encoding
		DeltaLengthByteArray Encoding
	}{
		Plain:                Encoding(format.Encoding_PLAIN),
		PlainDict:            Encoding(format.Encoding_PLAIN_DICTIONARY),
		RLE:                  Encoding(format.Encoding_RLE),
		RLEDict:              Encoding(format.Encoding_RLE_DICTIONARY),
		BitPacked:            Encoding(format.Encoding_BIT_PACKED),
		DeltaByteArray:       Encoding(format.Encoding_DELTA_BYTE_ARRAY),
		DeltaBinaryPacked:    Encoding(format.Encoding_DELTA_BINARY_PACKED),
		DeltaLengthByteArray: Encoding(format.Encoding_DELTA_LENGTH_BYTE_ARRAY),
	}

	// ColumnOrders collects the predefined ColumnOrder values. Undefined is a
	// freshly constructed format.ColumnOrder; TypeDefinedOrder is a
	// format.ColumnOrder whose TYPE_ORDER field is populated.
	ColumnOrders = struct {
		Undefined        ColumnOrder
		TypeDefinedOrder ColumnOrder
	}{
		Undefined:        format.NewColumnOrder(),
		TypeDefinedOrder: &format.ColumnOrder{TYPE_ORDER: format.NewTypeDefinedOrder()},
	}

	// DefaultColumnOrder is the column order used by default. It is the same
	// pointer as ColumnOrders.TypeDefinedOrder, not a copy.
	DefaultColumnOrder = ColumnOrders.TypeDefinedOrder

	// Repetitions collects the supported Repetition values so they can be
	// referenced by name, e.g. Repetitions.Required. Undefined is a sentinel
	// set to one past the generated REPEATED value.
	Repetitions = struct {
		Required  Repetition
		Optional  Repetition
		Repeated  Repetition
		Undefined Repetition
	}{
		Required:  Repetition(format.FieldRepetitionType_REQUIRED),
		Optional:  Repetition(format.FieldRepetitionType_OPTIONAL),
		Repeated:  Repetition(format.FieldRepetitionType_REPEATED),
		Undefined: Repetition(format.FieldRepetitionType_REPEATED + 1),
	}
)
363
364 func (t Type) String() string {
365 switch t {
366 case Types.Undefined:
367 return "UNDEFINED"
368 default:
369 return format.Type(t).String()
370 }
371 }
372
373 func (r Repetition) String() string {
374 return strings.ToLower(format.FieldRepetitionType(r).String())
375 }
376
377
378
379 func (t Type) ByteSize() int {
380 switch t {
381 case Types.Boolean:
382 return 1
383 case Types.Int32:
384 return arrow.Int32SizeBytes
385 case Types.Int64:
386 return arrow.Int64SizeBytes
387 case Types.Int96:
388 return Int96SizeBytes
389 case Types.Float:
390 return arrow.Float32SizeBytes
391 case Types.Double:
392 return arrow.Float64SizeBytes
393 case Types.ByteArray:
394 return ByteArraySizeBytes
395 case Types.FixedLenByteArray:
396 return FixedLenByteArraySizeBytes
397 }
398 panic("no bytesize info for type")
399 }
400
View as plain text