1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package pqarrow_test
18
19 import (
20 "bytes"
21 "context"
22 "testing"
23 "unsafe"
24
25 "github.com/apache/arrow/go/v15/arrow"
26 "github.com/apache/arrow/go/v15/arrow/array"
27 "github.com/apache/arrow/go/v15/arrow/memory"
28 "github.com/apache/arrow/go/v15/parquet"
29 "github.com/apache/arrow/go/v15/parquet/file"
30 "github.com/apache/arrow/go/v15/parquet/pqarrow"
31 "golang.org/x/exp/rand"
32 "gonum.org/v1/gonum/stat/distuv"
33 )
34
const (
	// alternateOrNA is a sentinel percentage. Passed as truePct to the
	// random generators it selects a strictly alternating 0/1 pattern
	// instead of random sampling; passed as nullPct to tableFromVec it is
	// the only value accepted for non-nullable columns.
	alternateOrNA = -1

	// SIZELEN is the number of values placed in every benchmarked column.
	SIZELEN = 1024 * 1024
)
37
38 func randomUint8(size, truePct int, sampleVals [2]uint8, seed uint64) []uint8 {
39 ret := make([]uint8, size)
40 if truePct == alternateOrNA {
41 for idx := range ret {
42 ret[idx] = uint8(idx % 2)
43 }
44 return ret
45 }
46
47 dist := distuv.Bernoulli{
48 P: float64(truePct) / 100.0,
49 Src: rand.NewSource(seed),
50 }
51
52 for idx := range ret {
53 ret[idx] = sampleVals[int(dist.Rand())]
54 }
55 return ret
56 }
57
58 func randomInt32(size, truePct int, sampleVals [2]int32, seed uint64) []int32 {
59 ret := make([]int32, size)
60 if truePct == alternateOrNA {
61 for idx := range ret {
62 ret[idx] = int32(idx % 2)
63 }
64 return ret
65 }
66
67 dist := distuv.Bernoulli{
68 P: float64(truePct) / 100.0,
69 Src: rand.NewSource(seed),
70 }
71
72 for idx := range ret {
73 ret[idx] = sampleVals[int(dist.Rand())]
74 }
75 return ret
76 }
77
78 func tableFromVec(dt arrow.DataType, size int, data interface{}, nullable bool, nullPct int) arrow.Table {
79 if !nullable && nullPct != alternateOrNA {
80 panic("bad check")
81 }
82
83 var valid []bool
84 if nullable {
85
86 validBytes := randomUint8(size, nullPct, [2]uint8{1, 0}, 500)
87 valid = *(*[]bool)(unsafe.Pointer(&validBytes))
88 }
89
90 bldr := array.NewBuilder(memory.DefaultAllocator, dt)
91 defer bldr.Release()
92
93 switch v := data.(type) {
94 case []int32:
95 bldr.(*array.Int32Builder).AppendValues(v, valid)
96 case []int64:
97 bldr.(*array.Int64Builder).AppendValues(v, valid)
98 case []float32:
99 bldr.(*array.Float32Builder).AppendValues(v, valid)
100 case []float64:
101 bldr.(*array.Float64Builder).AppendValues(v, valid)
102 }
103
104 arr := bldr.NewArray()
105
106 field := arrow.Field{Name: "column", Type: dt, Nullable: nullable}
107 sc := arrow.NewSchema([]arrow.Field{field}, nil)
108 col := arrow.NewColumnFromArr(field, arr)
109 defer col.Release()
110 return array.NewTable(sc, []arrow.Column{col}, int64(size))
111 }
112
113 func BenchmarkWriteColumn(b *testing.B) {
114 int32Values := make([]int32, SIZELEN)
115 int64Values := make([]int64, SIZELEN)
116 float32Values := make([]float32, SIZELEN)
117 float64Values := make([]float64, SIZELEN)
118 for i := 0; i < SIZELEN; i++ {
119 int32Values[i] = 128
120 int64Values[i] = 128
121 float32Values[i] = 128
122 float64Values[i] = 128
123 }
124
125 tests := []struct {
126 name string
127 dt arrow.DataType
128 values interface{}
129 nullable bool
130 nbytes int64
131 }{
132 {"int32 not nullable", arrow.PrimitiveTypes.Int32, int32Values, false, int64(arrow.Int32Traits.BytesRequired(SIZELEN))},
133 {"int32 nullable", arrow.PrimitiveTypes.Int32, int32Values, true, int64(arrow.Int32Traits.BytesRequired(SIZELEN))},
134 {"int64 not nullable", arrow.PrimitiveTypes.Int64, int64Values, false, int64(arrow.Int64Traits.BytesRequired(SIZELEN))},
135 {"int64 nullable", arrow.PrimitiveTypes.Int64, int64Values, true, int64(arrow.Int64Traits.BytesRequired(SIZELEN))},
136 {"float32 not nullable", arrow.PrimitiveTypes.Float32, float32Values, false, int64(arrow.Float32Traits.BytesRequired(SIZELEN))},
137 {"float32 nullable", arrow.PrimitiveTypes.Float32, float32Values, true, int64(arrow.Float32Traits.BytesRequired(SIZELEN))},
138 {"float64 not nullable", arrow.PrimitiveTypes.Float64, float64Values, false, int64(arrow.Float64Traits.BytesRequired(SIZELEN))},
139 {"float64 nullable", arrow.PrimitiveTypes.Float64, float64Values, true, int64(arrow.Float64Traits.BytesRequired(SIZELEN))},
140 }
141
142 props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false))
143 arrProps := pqarrow.DefaultWriterProps()
144
145 for _, tt := range tests {
146 b.Run(tt.name, func(b *testing.B) {
147 tbl := tableFromVec(tt.dt, SIZELEN, tt.values, tt.nullable, alternateOrNA)
148 b.Cleanup(func() { tbl.Release() })
149 var buf bytes.Buffer
150 buf.Grow(int(tt.nbytes))
151 b.ResetTimer()
152 b.SetBytes(tt.nbytes)
153
154 for i := 0; i < b.N; i++ {
155 buf.Reset()
156 err := pqarrow.WriteTable(tbl, &buf, SIZELEN, props, arrProps)
157 if err != nil {
158 b.Error(err)
159 }
160 }
161 })
162 }
163 }
164
165 func benchReadTable(b *testing.B, name string, tbl arrow.Table, nbytes int64) {
166 props := parquet.NewWriterProperties(parquet.WithDictionaryDefault(false))
167 arrProps := pqarrow.DefaultWriterProps()
168
169 var buf bytes.Buffer
170 if err := pqarrow.WriteTable(tbl, &buf, SIZELEN, props, arrProps); err != nil {
171 b.Error(err)
172 }
173 ctx := context.Background()
174
175 b.ResetTimer()
176 b.Run(name, func(b *testing.B) {
177 b.SetBytes(nbytes)
178
179 for i := 0; i < b.N; i++ {
180 pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
181 if err != nil {
182 b.Error(err)
183 }
184
185 reader, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
186 if err != nil {
187 b.Error(err)
188 }
189
190 tbl, err := reader.ReadTable(ctx)
191 if err != nil {
192 b.Error(err)
193 }
194 defer tbl.Release()
195 }
196 })
197 }
198
199 func BenchmarkReadColumnInt32(b *testing.B) {
200 tests := []struct {
201 name string
202 nullable bool
203 nullPct int
204 fvPct int
205 }{
206 {"int32 not null 1pct", false, alternateOrNA, 1},
207 {"int32 not null 10pct", false, alternateOrNA, 10},
208 {"int32 not null 50pct", false, alternateOrNA, 50},
209 {"int32 nullable alt", true, alternateOrNA, 0},
210 {"int32 nullable 1pct 1pct", true, 1, 1},
211 {"int32 nullable 10pct 10pct", true, 10, 10},
212 {"int32 nullable 25pct 5pct", true, 25, 5},
213 {"int32 nullable 50pct 50pct", true, 50, 50},
214 {"int32 nullable 50pct 0pct", true, 50, 0},
215 {"int32 nullable 99pct 50pct", true, 99, 50},
216 {"int32 nullable 99pct 0pct", true, 99, 0},
217 }
218
219 for _, tt := range tests {
220 values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500)
221 tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct)
222 benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN)))
223 }
224 }
225
226 func BenchmarkReadColumnInt64(b *testing.B) {
227 tests := []struct {
228 name string
229 nullable bool
230 nullPct int
231 fvPct int
232 }{
233 {"int64 not null 1pct", false, alternateOrNA, 1},
234 {"int64 not null 10pct", false, alternateOrNA, 10},
235 {"int64 not null 50pct", false, alternateOrNA, 50},
236 {"int64 nullable alt", true, alternateOrNA, 0},
237 {"int64 nullable 1pct 1pct", true, 1, 1},
238 {"int64 nullable 5pct 5pct", true, 5, 5},
239 {"int64 nullable 10pct 5pct", true, 10, 5},
240 {"int64 nullable 25pct 10pct", true, 25, 10},
241 {"int64 nullable 30pct 10pct", true, 30, 10},
242 {"int64 nullable 35pct 10pct", true, 35, 10},
243 {"int64 nullable 45pct 25pct", true, 45, 25},
244 {"int64 nullable 50pct 50pct", true, 50, 50},
245 {"int64 nullable 50pct 1pct", true, 50, 1},
246 {"int64 nullable 75pct 1pct", true, 75, 1},
247 {"int64 nullable 99pct 50pct", true, 99, 50},
248 {"int64 nullable 99pct 0pct", true, 99, 0},
249 }
250
251 for _, tt := range tests {
252 values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500)
253 tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct)
254 benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN)))
255 }
256 }
257
258 func BenchmarkReadColumnFloat64(b *testing.B) {
259 tests := []struct {
260 name string
261 nullable bool
262 nullPct int
263 fvPct int
264 }{
265 {"double not null 1pct", false, alternateOrNA, 0},
266 {"double not null 20pct", false, alternateOrNA, 20},
267 {"double nullable alt", true, alternateOrNA, 0},
268 {"double nullable 10pct 50pct", true, 10, 50},
269 {"double nullable 25pct 25pct", true, 25, 25},
270 }
271
272 for _, tt := range tests {
273 values := randomInt32(SIZELEN, tt.fvPct, [2]int32{127, 128}, 500)
274 tbl := tableFromVec(arrow.PrimitiveTypes.Int32, SIZELEN, values, tt.nullable, tt.nullPct)
275 benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN)))
276 }
277 }
278
View as plain text