1 package bbolt
2
3 import (
4 "bytes"
5 "fmt"
6 "sort"
7 "unsafe"
8 )
9
10
11 type node struct {
12 bucket *Bucket
13 isLeaf bool
14 unbalanced bool
15 spilled bool
16 key []byte
17 pgid pgid
18 parent *node
19 children nodes
20 inodes inodes
21 }
22
23
24 func (n *node) root() *node {
25 if n.parent == nil {
26 return n
27 }
28 return n.parent.root()
29 }
30
31
32 func (n *node) minKeys() int {
33 if n.isLeaf {
34 return 1
35 }
36 return 2
37 }
38
39
40 func (n *node) size() int {
41 sz, elsz := pageHeaderSize, n.pageElementSize()
42 for i := 0; i < len(n.inodes); i++ {
43 item := &n.inodes[i]
44 sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
45 }
46 return int(sz)
47 }
48
49
50
51
52 func (n *node) sizeLessThan(v uintptr) bool {
53 sz, elsz := pageHeaderSize, n.pageElementSize()
54 for i := 0; i < len(n.inodes); i++ {
55 item := &n.inodes[i]
56 sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
57 if sz >= v {
58 return false
59 }
60 }
61 return true
62 }
63
64
65 func (n *node) pageElementSize() uintptr {
66 if n.isLeaf {
67 return leafPageElementSize
68 }
69 return branchPageElementSize
70 }
71
72
73 func (n *node) childAt(index int) *node {
74 if n.isLeaf {
75 panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
76 }
77 return n.bucket.node(n.inodes[index].pgid, n)
78 }
79
80
81 func (n *node) childIndex(child *node) int {
82 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
83 return index
84 }
85
86
87 func (n *node) numChildren() int {
88 return len(n.inodes)
89 }
90
91
92 func (n *node) nextSibling() *node {
93 if n.parent == nil {
94 return nil
95 }
96 index := n.parent.childIndex(n)
97 if index >= n.parent.numChildren()-1 {
98 return nil
99 }
100 return n.parent.childAt(index + 1)
101 }
102
103
104 func (n *node) prevSibling() *node {
105 if n.parent == nil {
106 return nil
107 }
108 index := n.parent.childIndex(n)
109 if index == 0 {
110 return nil
111 }
112 return n.parent.childAt(index - 1)
113 }
114
115
116 func (n *node) put(oldKey, newKey, value []byte, pgId pgid, flags uint32) {
117 if pgId >= n.bucket.tx.meta.pgid {
118 panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.pgid))
119 } else if len(oldKey) <= 0 {
120 panic("put: zero-length old key")
121 } else if len(newKey) <= 0 {
122 panic("put: zero-length new key")
123 }
124
125
126 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
127
128
129 exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
130 if !exact {
131 n.inodes = append(n.inodes, inode{})
132 copy(n.inodes[index+1:], n.inodes[index:])
133 }
134
135 inode := &n.inodes[index]
136 inode.flags = flags
137 inode.key = newKey
138 inode.value = value
139 inode.pgid = pgId
140 _assert(len(inode.key) > 0, "put: zero-length inode key")
141 }
142
143
144 func (n *node) del(key []byte) {
145
146 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
147
148
149 if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
150 return
151 }
152
153
154 n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
155
156
157 n.unbalanced = true
158 }
159
160
161 func (n *node) read(p *page) {
162 n.pgid = p.id
163 n.isLeaf = ((p.flags & leafPageFlag) != 0)
164 n.inodes = make(inodes, int(p.count))
165
166 for i := 0; i < int(p.count); i++ {
167 inode := &n.inodes[i]
168 if n.isLeaf {
169 elem := p.leafPageElement(uint16(i))
170 inode.flags = elem.flags
171 inode.key = elem.key()
172 inode.value = elem.value()
173 } else {
174 elem := p.branchPageElement(uint16(i))
175 inode.pgid = elem.pgid
176 inode.key = elem.key()
177 }
178 _assert(len(inode.key) > 0, "read: zero-length inode key")
179 }
180
181
182 if len(n.inodes) > 0 {
183 n.key = n.inodes[0].key
184 _assert(len(n.key) > 0, "read: zero-length node key")
185 } else {
186 n.key = nil
187 }
188 }
189
190
191
192
193 func (n *node) write(p *page) {
194 _assert(p.count == 0 && p.flags == 0, "node cannot be written into a not empty page")
195
196
197 if n.isLeaf {
198 p.flags = leafPageFlag
199 } else {
200 p.flags = branchPageFlag
201 }
202
203 if len(n.inodes) >= 0xFFFF {
204 panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
205 }
206 p.count = uint16(len(n.inodes))
207
208
209 if p.count == 0 {
210 return
211 }
212
213
214
215 off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
216 for i, item := range n.inodes {
217 _assert(len(item.key) > 0, "write: zero-length inode key")
218
219
220
221 sz := len(item.key) + len(item.value)
222 b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz)
223 off += uintptr(sz)
224
225
226 if n.isLeaf {
227 elem := p.leafPageElement(uint16(i))
228 elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
229 elem.flags = item.flags
230 elem.ksize = uint32(len(item.key))
231 elem.vsize = uint32(len(item.value))
232 } else {
233 elem := p.branchPageElement(uint16(i))
234 elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
235 elem.ksize = uint32(len(item.key))
236 elem.pgid = item.pgid
237 _assert(elem.pgid != p.id, "write: circular dependency occurred")
238 }
239
240
241 l := copy(b, item.key)
242 copy(b[l:], item.value)
243 }
244
245
246 }
247
248
249
250 func (n *node) split(pageSize uintptr) []*node {
251 var nodes []*node
252
253 node := n
254 for {
255
256 a, b := node.splitTwo(pageSize)
257 nodes = append(nodes, a)
258
259
260 if b == nil {
261 break
262 }
263
264
265 node = b
266 }
267
268 return nodes
269 }
270
271
272
273 func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
274
275
276 if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
277 return n, nil
278 }
279
280
281 var fillPercent = n.bucket.FillPercent
282 if fillPercent < minFillPercent {
283 fillPercent = minFillPercent
284 } else if fillPercent > maxFillPercent {
285 fillPercent = maxFillPercent
286 }
287 threshold := int(float64(pageSize) * fillPercent)
288
289
290 splitIndex, _ := n.splitIndex(threshold)
291
292
293
294 if n.parent == nil {
295 n.parent = &node{bucket: n.bucket, children: []*node{n}}
296 }
297
298
299 next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
300 n.parent.children = append(n.parent.children, next)
301
302
303 next.inodes = n.inodes[splitIndex:]
304 n.inodes = n.inodes[:splitIndex]
305
306
307 n.bucket.tx.stats.IncSplit(1)
308
309 return n, next
310 }
311
312
313
314
315 func (n *node) splitIndex(threshold int) (index, sz uintptr) {
316 sz = pageHeaderSize
317
318
319 for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
320 index = uintptr(i)
321 inode := n.inodes[i]
322 elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))
323
324
325
326 if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
327 break
328 }
329
330
331 sz += elsize
332 }
333
334 return
335 }
336
337
338
339 func (n *node) spill() error {
340 var tx = n.bucket.tx
341 if n.spilled {
342 return nil
343 }
344
345
346
347
348 sort.Sort(n.children)
349 for i := 0; i < len(n.children); i++ {
350 if err := n.children[i].spill(); err != nil {
351 return err
352 }
353 }
354
355
356 n.children = nil
357
358
359 var nodes = n.split(uintptr(tx.db.pageSize))
360 for _, node := range nodes {
361
362 if node.pgid > 0 {
363 tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
364 node.pgid = 0
365 }
366
367
368 p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
369 if err != nil {
370 return err
371 }
372
373
374 if p.id >= tx.meta.pgid {
375 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
376 }
377 node.pgid = p.id
378 node.write(p)
379 node.spilled = true
380
381
382 if node.parent != nil {
383 var key = node.key
384 if key == nil {
385 key = node.inodes[0].key
386 }
387
388 node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
389 node.key = node.inodes[0].key
390 _assert(len(node.key) > 0, "spill: zero-length node key")
391 }
392
393
394 tx.stats.IncSpill(1)
395 }
396
397
398
399 if n.parent != nil && n.parent.pgid == 0 {
400 n.children = nil
401 return n.parent.spill()
402 }
403
404 return nil
405 }
406
407
408
409 func (n *node) rebalance() {
410 if !n.unbalanced {
411 return
412 }
413 n.unbalanced = false
414
415
416 n.bucket.tx.stats.IncRebalance(1)
417
418
419 var threshold = n.bucket.tx.db.pageSize / 4
420 if n.size() > threshold && len(n.inodes) > n.minKeys() {
421 return
422 }
423
424
425 if n.parent == nil {
426
427 if !n.isLeaf && len(n.inodes) == 1 {
428
429 child := n.bucket.node(n.inodes[0].pgid, n)
430 n.isLeaf = child.isLeaf
431 n.inodes = child.inodes[:]
432 n.children = child.children
433
434
435 for _, inode := range n.inodes {
436 if child, ok := n.bucket.nodes[inode.pgid]; ok {
437 child.parent = n
438 }
439 }
440
441
442 child.parent = nil
443 delete(n.bucket.nodes, child.pgid)
444 child.free()
445 }
446
447 return
448 }
449
450
451 if n.numChildren() == 0 {
452 n.parent.del(n.key)
453 n.parent.removeChild(n)
454 delete(n.bucket.nodes, n.pgid)
455 n.free()
456 n.parent.rebalance()
457 return
458 }
459
460 _assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
461
462
463 var target *node
464 var useNextSibling = (n.parent.childIndex(n) == 0)
465 if useNextSibling {
466 target = n.nextSibling()
467 } else {
468 target = n.prevSibling()
469 }
470
471
472 if useNextSibling {
473
474 for _, inode := range target.inodes {
475 if child, ok := n.bucket.nodes[inode.pgid]; ok {
476 child.parent.removeChild(child)
477 child.parent = n
478 child.parent.children = append(child.parent.children, child)
479 }
480 }
481
482
483 n.inodes = append(n.inodes, target.inodes...)
484 n.parent.del(target.key)
485 n.parent.removeChild(target)
486 delete(n.bucket.nodes, target.pgid)
487 target.free()
488 } else {
489
490 for _, inode := range n.inodes {
491 if child, ok := n.bucket.nodes[inode.pgid]; ok {
492 child.parent.removeChild(child)
493 child.parent = target
494 child.parent.children = append(child.parent.children, child)
495 }
496 }
497
498
499 target.inodes = append(target.inodes, n.inodes...)
500 n.parent.del(n.key)
501 n.parent.removeChild(n)
502 delete(n.bucket.nodes, n.pgid)
503 n.free()
504 }
505
506
507 n.parent.rebalance()
508 }
509
510
511
512 func (n *node) removeChild(target *node) {
513 for i, child := range n.children {
514 if child == target {
515 n.children = append(n.children[:i], n.children[i+1:]...)
516 return
517 }
518 }
519 }
520
521
522
523 func (n *node) dereference() {
524 if n.key != nil {
525 key := make([]byte, len(n.key))
526 copy(key, n.key)
527 n.key = key
528 _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
529 }
530
531 for i := range n.inodes {
532 inode := &n.inodes[i]
533
534 key := make([]byte, len(inode.key))
535 copy(key, inode.key)
536 inode.key = key
537 _assert(len(inode.key) > 0, "dereference: zero-length inode key")
538
539 value := make([]byte, len(inode.value))
540 copy(value, inode.value)
541 inode.value = value
542 }
543
544
545 for _, child := range n.children {
546 child.dereference()
547 }
548
549
550 n.bucket.tx.stats.IncNodeDeref(1)
551 }
552
553
554 func (n *node) free() {
555 if n.pgid != 0 {
556 n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
557 n.pgid = 0
558 }
559 }
560
561
562
587
588 func compareKeys(left, right []byte) int {
589 return bytes.Compare(left, right)
590 }
591
592 type nodes []*node
593
594 func (s nodes) Len() int { return len(s) }
595 func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
596 func (s nodes) Less(i, j int) bool {
597 return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
598 }
599
600
601
602
603 type inode struct {
604 flags uint32
605 pgid pgid
606 key []byte
607 value []byte
608 }
609
610 type inodes []inode
611
View as plain text