1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package encoding
18
19 import (
20 "math"
21 "unsafe"
22
23 "github.com/apache/arrow/go/v15/arrow"
24 "github.com/apache/arrow/go/v15/arrow/array"
25 "github.com/apache/arrow/go/v15/arrow/memory"
26 "github.com/apache/arrow/go/v15/internal/hashing"
27 "github.com/apache/arrow/go/v15/parquet"
28 )
29
30
31
32
33
34
35
36
// MemoTable is the set of operations a dictionary memo table has to offer:
// looking values up, inserting them, tracking a null entry, and copying the
// stored values back out.
//
// Values travel as interface{}; which concrete types are accepted is decided
// by each implementation.
type MemoTable interface {
	// Reset returns the table to its empty state.
	Reset()

	// Size reports how many entries the table holds. The implementations in
	// this file count the null entry once it has been inserted.
	Size() int

	// CopyValues copies the stored values into out. The concrete type that
	// out must have is implementation specific.
	CopyValues(out interface{})

	// CopyValuesSubset behaves like CopyValues but begins at the entry with
	// index start.
	CopyValuesSubset(start int, out interface{})

	// WriteOut writes the stored values into the byte slice out.
	WriteOut(out []byte)
	// WriteOutSubset behaves like WriteOut but begins at the entry with
	// index start.
	WriteOutSubset(start int, out []byte)

	// Get looks val up without modifying the table. It returns the index of
	// val and true when val is present, and false otherwise.
	Get(val interface{}) (int, bool)

	// GetOrInsert returns the index of val, adding val to the table when it
	// is not there yet. existed reports whether val was already present.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)

	// GetNull returns the index of the null entry and whether one exists.
	GetNull() (int, bool)

	// GetOrInsertNull returns the index of the null entry, creating it when
	// it does not exist yet. existed reports whether it was already present.
	GetOrInsertNull() (idx int, existed bool)
}
65
// NumericMemoTable extends MemoTable with the "LE" flavours of the raw byte
// writers.
//
// NOTE(review): no implementation of these methods is visible in this file;
// "LE" presumably stands for little-endian output - confirm against the
// implementing types.
type NumericMemoTable interface {
	MemoTable

	// WriteOutLE is the LE counterpart of WriteOut.
	WriteOutLE(out []byte)

	// WriteOutSubsetLE is the LE counterpart of WriteOutSubset, beginning at
	// the entry with index start.
	WriteOutSubsetLE(start int, out []byte)
}
77
78
79
// BinaryMemoTable extends MemoTable with the operations needed for
// variable-length binary values: access to the total byte size, the offsets,
// the individual values, and reference counting.
type BinaryMemoTable interface {
	MemoTable

	// ValuesSize returns the total number of value bytes held by the table.
	ValuesSize() int

	// CopyOffsets writes the offsets of all entries into out.
	CopyOffsets(out []int32)

	// CopyOffsetsSubset behaves like CopyOffsets but begins at the entry
	// with index start.
	CopyOffsetsSubset(start int, out []int32)

	// CopyFixedWidthValues copies values of the given width into out,
	// beginning at the entry with index start.
	// NOTE(review): the exact handling of the null entry is implementation
	// defined; the map-based implementation in this file leaves this method
	// empty.
	CopyFixedWidthValues(start int, width int, out []byte)

	// VisitValues calls visitFn once per stored value, in index order,
	// beginning at the entry with index start.
	VisitValues(start int, visitFn func([]byte))

	// Retain adds a reference to the resources backing the table.
	Retain()

	// Release drops a reference to the resources backing the table.
	Release()
}
106
107
108 func NewInt32Dictionary() MemoTable {
109 return hashing.NewInt32MemoTable(0)
110 }
111
112
113 func NewInt64Dictionary() MemoTable {
114 return hashing.NewInt64MemoTable(0)
115 }
116
117
118 func NewFloat32Dictionary() MemoTable {
119 return hashing.NewFloat32MemoTable(0)
120 }
121
122
123 func NewFloat64Dictionary() MemoTable {
124 return hashing.NewFloat64MemoTable(0)
125 }
126
127
128
129 func NewBinaryDictionary(mem memory.Allocator) BinaryMemoTable {
130 return hashing.NewBinaryMemoTable(0, -1, array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary))
131 }
132
// keyNotFound is the sentinel index used by the tables in this file to mean
// "no such entry"; it aliases the constant exported by the hashing package.
const keyNotFound = hashing.KeyNotFound
134
135
136
137
138
139 func NewBinaryMemoTable(mem memory.Allocator) BinaryMemoTable {
140 return &binaryMemoTableImpl{
141 table: make(map[string]int),
142 nullIndex: keyNotFound,
143 builder: array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary),
144 }
145 }
146
// binaryMemoTableImpl is a BinaryMemoTable built on a Go map.
type binaryMemoTableImpl struct {
	// table maps each inserted value to its memo index.
	table map[string]int
	// builder accumulates the inserted values (and the null entry) in
	// insertion order.
	builder *array.BinaryBuilder
	// nullIndex is the memo index of the null entry, or keyNotFound when no
	// null has been inserted.
	nullIndex int
}
152
153 func (m *binaryMemoTableImpl) Reset() {
154 m.table = make(map[string]int)
155 m.nullIndex = keyNotFound
156 m.builder.NewArray().Release()
157 }
158
159 func (m *binaryMemoTableImpl) CopyValues(out interface{}) {
160 m.CopyValuesSubset(0, out)
161 }
162
163 func (m *binaryMemoTableImpl) GetNull() (int, bool) {
164 return m.nullIndex, m.nullIndex != keyNotFound
165 }
166
167 func (m *binaryMemoTableImpl) ValuesSize() int {
168 return m.builder.DataLen()
169 }
170
171 func (m *binaryMemoTableImpl) Size() int {
172 sz := len(m.table)
173 if _, ok := m.GetNull(); ok {
174 sz++
175 }
176 return sz
177 }
178
179 func (m *binaryMemoTableImpl) valAsString(val interface{}) string {
180 switch v := val.(type) {
181 case string:
182 return v
183 case []byte:
184 return *(*string)(unsafe.Pointer(&v))
185 case parquet.ByteArray:
186 return *(*string)(unsafe.Pointer(&v))
187 case parquet.FixedLenByteArray:
188 return *(*string)(unsafe.Pointer(&v))
189 default:
190 panic("invalid type for value in binarymemotable")
191 }
192 }
193
194 func (m *binaryMemoTableImpl) Get(val interface{}) (int, bool) {
195 key := m.valAsString(val)
196 if p, ok := m.table[key]; ok {
197 return p, true
198 }
199 return keyNotFound, false
200 }
201
202 func (m *binaryMemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) {
203 key := m.valAsString(val)
204 idx, found = m.table[key]
205 if !found {
206 idx = m.Size()
207 m.builder.AppendString(key)
208 m.table[key] = idx
209 }
210 return
211 }
212
213 func (m *binaryMemoTableImpl) GetOrInsertNull() (idx int, found bool) {
214 idx, found = m.GetNull()
215 if !found {
216 idx = m.Size()
217 m.nullIndex = idx
218 m.builder.AppendNull()
219 }
220 return
221 }
222
223 func (m *binaryMemoTableImpl) findOffset(idx int) uintptr {
224 val := m.builder.Value(idx)
225 for len(val) == 0 {
226 idx++
227 if idx >= m.builder.Len() {
228 break
229 }
230 val = m.builder.Value(idx)
231 }
232 if len(val) != 0 {
233 return uintptr(unsafe.Pointer(&val[0]))
234 }
235 return uintptr(m.builder.DataLen()) + m.findOffset(0)
236 }
237
238 func (m *binaryMemoTableImpl) CopyValuesSubset(start int, out interface{}) {
239 var (
240 first = m.findOffset(0)
241 offset = m.findOffset(int(start))
242 length = m.builder.DataLen() - int(offset-first)
243 )
244
245 outval := out.([]byte)
246 copy(outval, m.builder.Value(start)[0:length])
247 }
248
249 func (m *binaryMemoTableImpl) WriteOut(out []byte) {
250 m.CopyValues(out)
251 }
252
253 func (m *binaryMemoTableImpl) WriteOutSubset(start int, out []byte) {
254 m.CopyValuesSubset(start, out)
255 }
256
// CopyFixedWidthValues is part of the BinaryMemoTable interface. This
// implementation has an empty body: it ignores start and width and writes
// nothing to out.
//
// NOTE(review): this looks like an unimplemented stub rather than intended
// behavior - confirm that no caller relies on this table for fixed-width
// output.
func (m *binaryMemoTableImpl) CopyFixedWidthValues(start, width int, out []byte) {

}
260
261 func (m *binaryMemoTableImpl) CopyOffsetsSubset(start int, out []int32) {
262 if m.builder.Len() <= start {
263 return
264 }
265
266 first := m.findOffset(0)
267 delta := m.findOffset(start)
268 for i := start; i < m.Size(); i++ {
269 offset := int32(m.findOffset(i) - delta)
270 out[i-start] = offset
271 }
272
273 out[m.Size()-start] = int32(m.builder.DataLen() - int(delta) - int(first))
274 }
275
276 func (m *binaryMemoTableImpl) CopyOffsets(out []int32) {
277 m.CopyOffsetsSubset(0, out)
278 }
279
280 func (m *binaryMemoTableImpl) VisitValues(start int, visitFn func([]byte)) {
281 for i := int(start); i < m.Size(); i++ {
282 visitFn(m.builder.Value(i))
283 }
284 }
285
286 func (m *binaryMemoTableImpl) Release() {
287 m.builder.Release()
288 }
289
290 func (m *binaryMemoTableImpl) Retain() {
291 m.builder.Retain()
292 }
293
294
295
296
297
298 func NewFloat64MemoTable(memory.Allocator) MemoTable {
299 return &float64MemoTableImpl{
300 table: make(map[float64]struct {
301 value float64
302 memoIndex int
303 }),
304 nullIndex: keyNotFound,
305 nanIndex: keyNotFound,
306 }
307 }
308
// float64MemoTableImpl is a MemoTable for float64 values built on a Go map.
type float64MemoTableImpl struct {
	// table maps each inserted non-NaN value to a record holding the value
	// and its memo index.
	table map[float64]struct {
		value     float64
		memoIndex int
	}
	// nullIndex is the memo index of the null entry, or keyNotFound when no
	// null has been inserted.
	nullIndex int
	// nanIndex is the memo index of the NaN entry, or keyNotFound when no
	// NaN has been inserted. NaN is tracked here rather than in the map.
	nanIndex int
}
317
318 func (m *float64MemoTableImpl) Reset() {
319 m.table = make(map[float64]struct {
320 value float64
321 memoIndex int
322 })
323 m.nullIndex = keyNotFound
324 m.nanIndex = keyNotFound
325 }
326
327 func (m *float64MemoTableImpl) GetNull() (int, bool) {
328 return m.nullIndex, m.nullIndex != keyNotFound
329 }
330
331 func (m *float64MemoTableImpl) Size() int {
332 sz := len(m.table)
333 if _, ok := m.GetNull(); ok {
334 sz++
335 }
336 if m.nanIndex != keyNotFound {
337 sz++
338 }
339 return sz
340 }
341
342 func (m *float64MemoTableImpl) GetOrInsertNull() (idx int, found bool) {
343 idx, found = m.GetNull()
344 if !found {
345 idx = m.Size()
346 m.nullIndex = idx
347 }
348 return
349 }
350
351 func (m *float64MemoTableImpl) Get(val interface{}) (int, bool) {
352 v := val.(float64)
353 if p, ok := m.table[v]; ok {
354 return p.memoIndex, true
355 }
356 if math.IsNaN(v) && m.nanIndex != keyNotFound {
357 return m.nanIndex, true
358 }
359 return keyNotFound, false
360 }
361
362 func (m *float64MemoTableImpl) GetOrInsert(val interface{}) (idx int, found bool, err error) {
363 v := val.(float64)
364 if math.IsNaN(v) {
365 if m.nanIndex == keyNotFound {
366 idx = m.Size()
367 m.nanIndex = idx
368 } else {
369 idx = m.nanIndex
370 found = true
371 }
372 return
373 }
374
375 p, ok := m.table[v]
376 if ok {
377 idx = p.memoIndex
378 } else {
379 idx = m.Size()
380 p.value = v
381 p.memoIndex = idx
382 m.table[v] = p
383 found = true
384 }
385 return
386 }
387
388 func (m *float64MemoTableImpl) CopyValues(out interface{}) {
389 m.CopyValuesSubset(0, out)
390 }
391
392 func (m *float64MemoTableImpl) CopyValuesSubset(start int, out interface{}) {
393 outval := out.([]float64)
394 for _, v := range m.table {
395 idx := v.memoIndex - start
396 if idx >= 0 {
397 outval[idx] = v.value
398 }
399 }
400 if m.nanIndex != keyNotFound {
401 outval[m.nanIndex] = math.NaN()
402 }
403 }
404
405 func (m *float64MemoTableImpl) WriteOut(out []byte) {
406 m.CopyValuesSubset(0, arrow.Float64Traits.CastFromBytes(out))
407 }
408
409 func (m *float64MemoTableImpl) WriteOutSubset(start int, out []byte) {
410 m.CopyValuesSubset(start, arrow.Float64Traits.CastFromBytes(out))
411 }
412
View as plain text