1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package testutils
20
21 import (
22 "encoding/binary"
23 "math"
24 "time"
25 "unsafe"
26
27 "github.com/apache/arrow/go/v15/arrow"
28 "github.com/apache/arrow/go/v15/arrow/array"
29 "github.com/apache/arrow/go/v15/arrow/bitutil"
30 "github.com/apache/arrow/go/v15/arrow/endian"
31 "github.com/apache/arrow/go/v15/arrow/float16"
32 "github.com/apache/arrow/go/v15/arrow/memory"
33 "github.com/apache/arrow/go/v15/parquet"
34 "github.com/apache/arrow/go/v15/parquet/pqarrow"
35
36 "golang.org/x/exp/rand"
37 "gonum.org/v1/gonum/stat/distuv"
38 )
39
40
41
42 type RandomArrayGenerator struct {
43 seed uint64
44 extra uint64
45 src rand.Source
46 seedRand *rand.Rand
47 }
48
49
50 func NewRandomArrayGenerator(seed uint64) RandomArrayGenerator {
51 src := rand.NewSource(seed)
52 return RandomArrayGenerator{seed, 0, src, rand.New(src)}
53 }
54
55
56
57
58
59 func (r *RandomArrayGenerator) GenerateBitmap(buffer []byte, n int64, prob float64) int64 {
60 count := int64(0)
61 r.extra++
62
63
64
65 dist := distuv.Bernoulli{P: prob, Src: rand.NewSource(r.seed + r.extra)}
66 for i := 0; int64(i) < n; i++ {
67 if dist.Rand() != float64(0.0) {
68 bitutil.SetBit(buffer, i)
69 } else {
70 count++
71 }
72 }
73
74 return count
75 }
76
77
78
79
80
81
82 func (r *RandomArrayGenerator) ByteArray(size int64, minLen, maxLen int32, nullProb float64) arrow.Array {
83 if nullProb < 0 || nullProb > 1 {
84 panic("null prob must be between 0 and 1")
85 }
86
87 lengths := r.Int32(size, minLen, maxLen, nullProb)
88 defer lengths.Release()
89
90 r.extra++
91 dist := rand.New(rand.NewSource(r.seed + r.extra))
92 bldr := array.NewStringBuilder(memory.DefaultAllocator)
93 defer bldr.Release()
94
95 strbuf := make([]byte, maxLen)
96
97 for i := 0; int64(i) < size; i++ {
98 if lengths.IsValid(i) {
99 l := lengths.Value(i)
100 for j := int32(0); j < l; j++ {
101 strbuf[j] = byte(dist.Int31n(int32('z')-int32('A')+1) + int32('A'))
102 }
103 val := strbuf[:l]
104 bldr.Append(*(*string)(unsafe.Pointer(&val)))
105 } else {
106 bldr.AppendNull()
107 }
108 }
109
110 return bldr.NewArray()
111 }
112
113
114
115 func (r *RandomArrayGenerator) Uint8(size int64, min, max uint8, prob float64) arrow.Array {
116 buffers := make([]*memory.Buffer, 2)
117 nullCount := int64(0)
118
119 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
120 buffers[0].Resize(int(bitutil.BytesForBits(size)))
121 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, prob)
122
123 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
124 buffers[1].Resize(int(size * int64(arrow.Uint8SizeBytes)))
125
126 r.extra++
127 dist := rand.New(rand.NewSource(r.seed + r.extra))
128 out := arrow.Uint8Traits.CastFromBytes(buffers[1].Bytes())
129 for i := int64(0); i < size; i++ {
130 out[i] = uint8(dist.Intn(int(max-min+1))) + min
131 }
132
133 return array.NewUint8Data(array.NewData(arrow.PrimitiveTypes.Uint8, int(size), buffers, nil, int(nullCount), 0))
134 }
135
136
137
138 func (r *RandomArrayGenerator) Int32(size int64, min, max int32, pctNull float64) *array.Int32 {
139 buffers := make([]*memory.Buffer, 2)
140 nullCount := int64(0)
141
142 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
143 buffers[0].Resize(int(bitutil.BytesForBits(size)))
144 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull)
145
146 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
147 buffers[1].Resize(arrow.Int32Traits.BytesRequired(int(size)))
148
149 r.extra++
150 dist := rand.New(rand.NewSource(r.seed + r.extra))
151 out := arrow.Int32Traits.CastFromBytes(buffers[1].Bytes())
152 for i := int64(0); i < size; i++ {
153 out[i] = dist.Int31n(max-min+1) + min
154 }
155 return array.NewInt32Data(array.NewData(arrow.PrimitiveTypes.Int32, int(size), buffers, nil, int(nullCount), 0))
156 }
157
158
159
160 func (r *RandomArrayGenerator) Int64(size int64, min, max int64, pctNull float64) *array.Int64 {
161 buffers := make([]*memory.Buffer, 2)
162 nullCount := int64(0)
163
164 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
165 buffers[0].Resize(int(bitutil.BytesForBits(size)))
166 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull)
167
168 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
169 buffers[1].Resize(arrow.Int64Traits.BytesRequired(int(size)))
170
171 r.extra++
172 dist := rand.New(rand.NewSource(r.seed + r.extra))
173 out := arrow.Int64Traits.CastFromBytes(buffers[1].Bytes())
174 for i := int64(0); i < size; i++ {
175 out[i] = dist.Int63n(max-min+1) + min
176 }
177 return array.NewInt64Data(array.NewData(arrow.PrimitiveTypes.Int64, int(size), buffers, nil, int(nullCount), 0))
178 }
179
180
181
182 func (r *RandomArrayGenerator) Float64(size int64, pctNull float64) *array.Float64 {
183 buffers := make([]*memory.Buffer, 2)
184 nullCount := int64(0)
185
186 buffers[0] = memory.NewResizableBuffer(memory.DefaultAllocator)
187 buffers[0].Resize(int(bitutil.BytesForBits(size)))
188 nullCount = r.GenerateBitmap(buffers[0].Bytes(), size, 1-pctNull)
189
190 buffers[1] = memory.NewResizableBuffer(memory.DefaultAllocator)
191 buffers[1].Resize(arrow.Float64Traits.BytesRequired(int(size)))
192
193 r.extra++
194 dist := rand.New(rand.NewSource(r.seed + r.extra))
195 out := arrow.Float64Traits.CastFromBytes(buffers[1].Bytes())
196 for i := int64(0); i < size; i++ {
197 out[i] = dist.NormFloat64()
198 }
199 return array.NewFloat64Data(array.NewData(arrow.PrimitiveTypes.Float64, int(size), buffers, nil, int(nullCount), 0))
200 }
201
202 func (r *RandomArrayGenerator) StringWithRepeats(mem memory.Allocator, sz, unique int64, minLen, maxLen int32, nullProb float64) *array.String {
203 if unique > sz {
204 panic("invalid config for random StringWithRepeats")
205 }
206
207
208 arr := r.ByteArray(unique, minLen, maxLen, 0)
209 defer arr.Release()
210 dict := arr.(*array.String)
211
212
213 idArray := r.Int64(sz, 0, unique-1, nullProb)
214 defer idArray.Release()
215
216 bldr := array.NewStringBuilder(mem)
217 defer bldr.Release()
218
219 for i := int64(0); i < sz; i++ {
220 if idArray.IsValid(int(i)) {
221 idx := idArray.Value(int(i))
222 bldr.Append(dict.Value(int(idx)))
223 } else {
224 bldr.AppendNull()
225 }
226 }
227
228 return bldr.NewStringArray()
229 }
230
231
232
233 func FillRandomInt8(seed uint64, min, max int8, out []int8) {
234 r := rand.New(rand.NewSource(seed))
235 for idx := range out {
236 out[idx] = int8(r.Intn(int(max-min+1))) + min
237 }
238 }
239
240
241
242 func FillRandomUint8(seed uint64, min, max uint8, out []uint8) {
243 r := rand.New(rand.NewSource(seed))
244 for idx := range out {
245 out[idx] = uint8(r.Intn(int(max-min+1))) + min
246 }
247 }
248
249
250
251 func FillRandomInt16(seed uint64, min, max int16, out []int16) {
252 r := rand.New(rand.NewSource(seed))
253 for idx := range out {
254 out[idx] = int16(r.Intn(int(max-min+1))) + min
255 }
256 }
257
258
259
260 func FillRandomUint16(seed uint64, min, max uint16, out []uint16) {
261 r := rand.New(rand.NewSource(seed))
262 for idx := range out {
263 out[idx] = uint16(r.Intn(int(max-min+1))) + min
264 }
265 }
266
267
268
269 func FillRandomInt32(seed uint64, out []int32) {
270 r := rand.New(rand.NewSource(seed))
271 for idx := range out {
272 out[idx] = int32(r.Uint32())
273 }
274 }
275
276
277
278 func FillRandomInt32Max(seed uint64, max int32, out []int32) {
279 r := rand.New(rand.NewSource(seed))
280 for idx := range out {
281 out[idx] = r.Int31n(max)
282 }
283 }
284
285
286
287 func FillRandomUint32Max(seed uint64, max uint32, out []uint32) {
288 r := rand.New(rand.NewSource(seed))
289 for idx := range out {
290 out[idx] = uint32(r.Uint64n(uint64(max)))
291 }
292 }
293
294
295
296 func FillRandomInt64Max(seed uint64, max int64, out []int64) {
297 r := rand.New(rand.NewSource(seed))
298 for idx := range out {
299 out[idx] = r.Int63n(max)
300 }
301 }
302
303
304
305 func FillRandomUint32(seed uint64, out []uint32) {
306 r := rand.New(rand.NewSource(seed))
307 for idx := range out {
308 out[idx] = r.Uint32()
309 }
310 }
311
312
313
314 func FillRandomUint64(seed uint64, out []uint64) {
315 r := rand.New(rand.NewSource(seed))
316 for idx := range out {
317 out[idx] = r.Uint64()
318 }
319 }
320
321
322
323 func FillRandomUint64Max(seed uint64, max uint64, out []uint64) {
324 r := rand.New(rand.NewSource(seed))
325 for idx := range out {
326 out[idx] = r.Uint64n(max)
327 }
328 }
329
330
331
332 func FillRandomInt64(seed uint64, out []int64) {
333 r := rand.New(rand.NewSource(seed))
334 for idx := range out {
335 out[idx] = int64(r.Uint64())
336 }
337 }
338
339
340
341
342 func FillRandomInt96(seed uint64, out []parquet.Int96) {
343 r := rand.New(rand.NewSource(seed))
344 for idx := range out {
345 *(*int32)(unsafe.Pointer(&out[idx][0])) = int32(r.Uint32())
346 *(*int32)(unsafe.Pointer(&out[idx][4])) = int32(r.Uint32())
347 *(*int32)(unsafe.Pointer(&out[idx][8])) = int32(r.Uint32())
348 }
349 }
350
351
352
353 func randFloat32(r *rand.Rand) float32 {
354 for {
355 f := math.Float32frombits(r.Uint32())
356 if !math.IsNaN(float64(f)) {
357 return f
358 }
359 }
360 }
361
362
363
364 func randFloat64(r *rand.Rand) float64 {
365 for {
366 f := math.Float64frombits(r.Uint64())
367 if !math.IsNaN(f) {
368 return f
369 }
370 }
371 }
372
373
374
375 func randFloat16(r *rand.Rand) float16.Num {
376 for {
377 f := float16.FromBits(uint16(r.Uint64n(math.MaxUint16 + 1)))
378 if !f.IsNaN() {
379 return f
380 }
381 }
382 }
383
384
385
386 func FillRandomFloat32(seed uint64, out []float32) {
387 r := rand.New(rand.NewSource(seed))
388 for idx := range out {
389 out[idx] = randFloat32(r)
390 }
391 }
392
393
394
395 func FillRandomFloat64(seed uint64, out []float64) {
396 r := rand.New(rand.NewSource(seed))
397 for idx := range out {
398 out[idx] = randFloat64(r)
399 }
400 }
401
402
403
404 func FillRandomFloat16(seed uint64, out []float16.Num) {
405 r := rand.New(rand.NewSource(seed))
406 for idx := range out {
407 out[idx] = randFloat16(r)
408 }
409 }
410
411
412
413
414 func FillRandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer) {
415 const (
416 maxByteArrayLen = 12
417 minByteArrayLen = 2
418 )
419 RandomByteArray(seed, out, heap, minByteArrayLen, maxByteArrayLen)
420 }
421
422
423
424
425 func FillRandomFixedByteArray(seed uint64, out []parquet.FixedLenByteArray, heap *memory.Buffer, size int) {
426 heap.Resize(len(out) * size)
427
428 buf := heap.Bytes()
429 r := rand.New(rand.NewSource(seed))
430 for idx := range out {
431 r.Read(buf[:size])
432 out[idx] = buf[:size]
433 buf = buf[size:]
434 }
435 }
436
437
438
439
440 func FillRandomBooleans(p float64, seed uint64, out []bool) {
441 dist := distuv.Bernoulli{P: p, Src: rand.NewSource(seed)}
442 for idx := range out {
443 out[idx] = dist.Rand() != float64(0.0)
444 }
445 }
446
447
448
449
450
451 func fillRandomIsValid(seed uint64, pctNull float64, out []bool) {
452 r := rand.New(rand.NewSource(seed))
453 for idx := range out {
454 out[idx] = r.Float64() > pctNull
455 }
456 }
457
458
459
460
461
462
463
464
465
466
467
468 func InitValues(values interface{}, heap *memory.Buffer) {
469 switch arr := values.(type) {
470 case []bool:
471 fillRandomIsValid(uint64(time.Now().Unix()), 0.5, arr)
472 case []int32:
473 FillRandomInt32(0, arr)
474 case []int64:
475 FillRandomInt64(0, arr)
476 case []float32:
477 FillRandomFloat32(0, arr)
478 case []float64:
479 FillRandomFloat64(0, arr)
480 case []float16.Num:
481 FillRandomFloat16(0, arr)
482 case []parquet.Int96:
483 FillRandomInt96(0, arr)
484 case []parquet.ByteArray:
485 FillRandomByteArray(0, arr, heap)
486 case []parquet.FixedLenByteArray:
487 FillRandomFixedByteArray(0, arr, heap, 12)
488 }
489 }
490
491
492
493
494 func RandomByteArray(seed uint64, out []parquet.ByteArray, heap *memory.Buffer, minlen, maxlen int) {
495 heap.Resize(len(out) * (maxlen + arrow.Uint32SizeBytes))
496
497 buf := heap.Bytes()
498 r := rand.New(rand.NewSource(seed))
499 for idx := range out {
500 length := r.Intn(maxlen-minlen+1) + minlen
501 r.Read(buf[:length])
502 out[idx] = buf[:length]
503
504 buf = buf[length:]
505 }
506 }
507
508
509
510
511 func RandomDecimals(n int64, seed uint64, precision int32) []byte {
512 r := rand.New(rand.NewSource(seed))
513 nreqBytes := pqarrow.DecimalSize(precision)
514 byteWidth := 32
515 if precision <= 38 {
516 byteWidth = 16
517 }
518
519 out := make([]byte, int(int64(byteWidth)*n))
520 for i := int64(0); i < n; i++ {
521 start := int(i) * byteWidth
522 r.Read(out[start : start+int(nreqBytes)])
523
524
525 if out[start+int(nreqBytes)-1]&byte(0x80) != 0 {
526 memory.Set(out[start+int(nreqBytes):start+byteWidth], 0xFF)
527 }
528
529
530 if endian.IsBigEndian {
531 for j := 0; j+8 <= byteWidth; j += 8 {
532 v := binary.LittleEndian.Uint64(out[start+j : start+j+8])
533 binary.BigEndian.PutUint64(out[start+j:start+j+8], v)
534 }
535 }
536 }
537 return out
538 }
539
View as plain text