node.go

Documentation: go.etcd.io/bbolt

     1  package bbolt
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"sort"
     7  	"unsafe"
     8  )
     9  
    10  // node represents an in-memory, deserialized page.
    11  type node struct {
    12  	bucket     *Bucket
    13  	isLeaf     bool
    14  	unbalanced bool
    15  	spilled    bool
    16  	key        []byte
    17  	pgid       pgid
    18  	parent     *node
    19  	children   nodes
    20  	inodes     inodes
    21  }
    22  
    23  // root returns the top-level node this node is attached to.
    24  func (n *node) root() *node {
    25  	if n.parent == nil {
    26  		return n
    27  	}
    28  	return n.parent.root()
    29  }
    30  
    31  // minKeys returns the minimum number of inodes this node should have.
    32  func (n *node) minKeys() int {
    33  	if n.isLeaf {
    34  		return 1
    35  	}
    36  	return 2
    37  }
    38  
    39  // size returns the size of the node after serialization.
    40  func (n *node) size() int {
    41  	sz, elsz := pageHeaderSize, n.pageElementSize()
    42  	for i := 0; i < len(n.inodes); i++ {
    43  		item := &n.inodes[i]
    44  		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
    45  	}
    46  	return int(sz)
    47  }
    48  
    49  // sizeLessThan returns true if the node is less than a given size.
    50  // This is an optimization to avoid calculating a large node when we only need
    51  // to know if it fits inside a certain page size.
    52  func (n *node) sizeLessThan(v uintptr) bool {
    53  	sz, elsz := pageHeaderSize, n.pageElementSize()
    54  	for i := 0; i < len(n.inodes); i++ {
    55  		item := &n.inodes[i]
    56  		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
    57  		if sz >= v {
    58  			return false
    59  		}
    60  	}
    61  	return true
    62  }
    63  
    64  // pageElementSize returns the size of each page element based on the type of node.
    65  func (n *node) pageElementSize() uintptr {
    66  	if n.isLeaf {
    67  		return leafPageElementSize
    68  	}
    69  	return branchPageElementSize
    70  }
    71  
    72  // childAt returns the child node at a given index.
    73  func (n *node) childAt(index int) *node {
    74  	if n.isLeaf {
    75  		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
    76  	}
    77  	return n.bucket.node(n.inodes[index].pgid, n)
    78  }
    79  
    80  // childIndex returns the index of a given child node.
    81  func (n *node) childIndex(child *node) int {
    82  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
    83  	return index
    84  }
    85  
    86  // numChildren returns the number of children.
    87  func (n *node) numChildren() int {
    88  	return len(n.inodes)
    89  }
    90  
    91  // nextSibling returns the next node with the same parent.
    92  func (n *node) nextSibling() *node {
    93  	if n.parent == nil {
    94  		return nil
    95  	}
    96  	index := n.parent.childIndex(n)
    97  	if index >= n.parent.numChildren()-1 {
    98  		return nil
    99  	}
   100  	return n.parent.childAt(index + 1)
   101  }
   102  
   103  // prevSibling returns the previous node with the same parent.
   104  func (n *node) prevSibling() *node {
   105  	if n.parent == nil {
   106  		return nil
   107  	}
   108  	index := n.parent.childIndex(n)
   109  	if index == 0 {
   110  		return nil
   111  	}
   112  	return n.parent.childAt(index - 1)
   113  }
   114  
   115  // put inserts a key/value.
   116  func (n *node) put(oldKey, newKey, value []byte, pgId pgid, flags uint32) {
   117  	if pgId >= n.bucket.tx.meta.pgid {
   118  		panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.pgid))
   119  	} else if len(oldKey) <= 0 {
   120  		panic("put: zero-length old key")
   121  	} else if len(newKey) <= 0 {
   122  		panic("put: zero-length new key")
   123  	}
   124  
   125  	// Find insertion index.
   126  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
   127  
   128  	// Add capacity and shift nodes if we don't have an exact match and need to insert.
   129  	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
   130  	if !exact {
   131  		n.inodes = append(n.inodes, inode{})
   132  		copy(n.inodes[index+1:], n.inodes[index:])
   133  	}
   134  
   135  	inode := &n.inodes[index]
   136  	inode.flags = flags
   137  	inode.key = newKey
   138  	inode.value = value
   139  	inode.pgid = pgId
   140  	_assert(len(inode.key) > 0, "put: zero-length inode key")
   141  }
   142  
   143  // del removes a key from the node.
   144  func (n *node) del(key []byte) {
   145  	// Find index of key.
   146  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
   147  
   148  	// Exit if the key isn't found.
   149  	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
   150  		return
   151  	}
   152  
   153  	// Delete inode from the node.
   154  	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
   155  
   156  	// Mark the node as needing rebalancing.
   157  	n.unbalanced = true
   158  }
   159  
   160  // read initializes the node from a page.
   161  func (n *node) read(p *page) {
   162  	n.pgid = p.id
   163  	n.isLeaf = ((p.flags & leafPageFlag) != 0)
   164  	n.inodes = make(inodes, int(p.count))
   165  
   166  	for i := 0; i < int(p.count); i++ {
   167  		inode := &n.inodes[i]
   168  		if n.isLeaf {
   169  			elem := p.leafPageElement(uint16(i))
   170  			inode.flags = elem.flags
   171  			inode.key = elem.key()
   172  			inode.value = elem.value()
   173  		} else {
   174  			elem := p.branchPageElement(uint16(i))
   175  			inode.pgid = elem.pgid
   176  			inode.key = elem.key()
   177  		}
   178  		_assert(len(inode.key) > 0, "read: zero-length inode key")
   179  	}
   180  
   181  	// Save first key so we can find the node in the parent when we spill.
   182  	if len(n.inodes) > 0 {
   183  		n.key = n.inodes[0].key
   184  		_assert(len(n.key) > 0, "read: zero-length node key")
   185  	} else {
   186  		n.key = nil
   187  	}
   188  }
   189  
   190  // write writes the items onto one or more pages.
   191  // The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set
   192  // and the rest should be zeroed.
   193  func (n *node) write(p *page) {
   194  	_assert(p.count == 0 && p.flags == 0, "node cannot be written into a not empty page")
   195  
   196  	// Initialize page.
   197  	if n.isLeaf {
   198  		p.flags = leafPageFlag
   199  	} else {
   200  		p.flags = branchPageFlag
   201  	}
   202  
   203  	if len(n.inodes) >= 0xFFFF {
   204  		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
   205  	}
   206  	p.count = uint16(len(n.inodes))
   207  
   208  	// Stop here if there are no items to write.
   209  	if p.count == 0 {
   210  		return
   211  	}
   212  
   213  	// Loop over each item and write it to the page.
   214  	// off tracks the offset into p of the start of the next data.
   215  	off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
   216  	for i, item := range n.inodes {
   217  		_assert(len(item.key) > 0, "write: zero-length inode key")
   218  
   219  		// Create a slice to write into of needed size and advance
   220  		// byte pointer for next iteration.
   221  		sz := len(item.key) + len(item.value)
   222  		b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz)
   223  		off += uintptr(sz)
   224  
   225  		// Write the page element.
   226  		if n.isLeaf {
   227  			elem := p.leafPageElement(uint16(i))
   228  			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
   229  			elem.flags = item.flags
   230  			elem.ksize = uint32(len(item.key))
   231  			elem.vsize = uint32(len(item.value))
   232  		} else {
   233  			elem := p.branchPageElement(uint16(i))
   234  			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
   235  			elem.ksize = uint32(len(item.key))
   236  			elem.pgid = item.pgid
   237  			_assert(elem.pgid != p.id, "write: circular dependency occurred")
   238  		}
   239  
   240  		// Write data for the element to the end of the page.
   241  		l := copy(b, item.key)
   242  		copy(b[l:], item.value)
   243  	}
   244  
   245  	// DEBUG ONLY: n.dump()
   246  }
   247  
   248  // split breaks up a node into multiple smaller nodes, if appropriate.
   249  // This should only be called from the spill() function.
   250  func (n *node) split(pageSize uintptr) []*node {
   251  	var nodes []*node
   252  
   253  	node := n
   254  	for {
   255  		// Split node into two.
   256  		a, b := node.splitTwo(pageSize)
   257  		nodes = append(nodes, a)
   258  
   259  		// If we can't split then exit the loop.
   260  		if b == nil {
   261  			break
   262  		}
   263  
   264  		// Set node to b so it gets split on the next iteration.
   265  		node = b
   266  	}
   267  
   268  	return nodes
   269  }
   270  
   271  // splitTwo breaks up a node into two smaller nodes, if appropriate.
   272  // This should only be called from the split() function.
   273  func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
   274  	// Ignore the split if the page doesn't have at least enough nodes for
   275  	// two pages or if the nodes can fit in a single page.
   276  	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
   277  		return n, nil
   278  	}
   279  
   280  	// Determine the threshold before starting a new node.
   281  	var fillPercent = n.bucket.FillPercent
   282  	if fillPercent < minFillPercent {
   283  		fillPercent = minFillPercent
   284  	} else if fillPercent > maxFillPercent {
   285  		fillPercent = maxFillPercent
   286  	}
   287  	threshold := int(float64(pageSize) * fillPercent)
   288  
   289  	// Determine split position and sizes of the two pages.
   290  	splitIndex, _ := n.splitIndex(threshold)
   291  
   292  	// Split node into two separate nodes.
   293  	// If there's no parent then we'll need to create one.
   294  	if n.parent == nil {
   295  		n.parent = &node{bucket: n.bucket, children: []*node{n}}
   296  	}
   297  
   298  	// Create a new node and add it to the parent.
   299  	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
   300  	n.parent.children = append(n.parent.children, next)
   301  
   302  	// Split inodes across two nodes.
   303  	next.inodes = n.inodes[splitIndex:]
   304  	n.inodes = n.inodes[:splitIndex]
   305  
   306  	// Update the statistics.
   307  	n.bucket.tx.stats.IncSplit(1)
   308  
   309  	return n, next
   310  }
   311  
   312  // splitIndex finds the position where a page will fill a given threshold.
   313  // It returns the index as well as the size of the first page.
   314  // This is only be called from split().
   315  func (n *node) splitIndex(threshold int) (index, sz uintptr) {
   316  	sz = pageHeaderSize
   317  
   318  	// Loop until we only have the minimum number of keys required for the second page.
   319  	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
   320  		index = uintptr(i)
   321  		inode := n.inodes[i]
   322  		elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))
   323  
   324  		// If we have at least the minimum number of keys and adding another
   325  		// node would put us over the threshold then exit and return.
   326  		if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
   327  			break
   328  		}
   329  
   330  		// Add the element size to the total size.
   331  		sz += elsize
   332  	}
   333  
   334  	return
   335  }
   336  
   337  // spill writes the nodes to dirty pages and splits nodes as it goes.
   338  // Returns an error if dirty pages cannot be allocated.
   339  func (n *node) spill() error {
   340  	var tx = n.bucket.tx
   341  	if n.spilled {
   342  		return nil
   343  	}
   344  
   345  	// Spill child nodes first. Child nodes can materialize sibling nodes in
   346  	// the case of split-merge so we cannot use a range loop. We have to check
   347  	// the children size on every loop iteration.
   348  	sort.Sort(n.children)
   349  	for i := 0; i < len(n.children); i++ {
   350  		if err := n.children[i].spill(); err != nil {
   351  			return err
   352  		}
   353  	}
   354  
   355  	// We no longer need the child list because it's only used for spill tracking.
   356  	n.children = nil
   357  
   358  	// Split nodes into appropriate sizes. The first node will always be n.
   359  	var nodes = n.split(uintptr(tx.db.pageSize))
   360  	for _, node := range nodes {
   361  		// Add node's page to the freelist if it's not new.
   362  		if node.pgid > 0 {
   363  			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
   364  			node.pgid = 0
   365  		}
   366  
   367  		// Allocate contiguous space for the node.
   368  		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
   369  		if err != nil {
   370  			return err
   371  		}
   372  
   373  		// Write the node.
   374  		if p.id >= tx.meta.pgid {
   375  			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
   376  		}
   377  		node.pgid = p.id
   378  		node.write(p)
   379  		node.spilled = true
   380  
   381  		// Insert into parent inodes.
   382  		if node.parent != nil {
   383  			var key = node.key
   384  			if key == nil {
   385  				key = node.inodes[0].key
   386  			}
   387  
   388  			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
   389  			node.key = node.inodes[0].key
   390  			_assert(len(node.key) > 0, "spill: zero-length node key")
   391  		}
   392  
   393  		// Update the statistics.
   394  		tx.stats.IncSpill(1)
   395  	}
   396  
   397  	// If the root node split and created a new root then we need to spill that
   398  	// as well. We'll clear out the children to make sure it doesn't try to respill.
   399  	if n.parent != nil && n.parent.pgid == 0 {
   400  		n.children = nil
   401  		return n.parent.spill()
   402  	}
   403  
   404  	return nil
   405  }
   406  
   407  // rebalance attempts to combine the node with sibling nodes if the node fill
   408  // size is below a threshold or if there are not enough keys.
   409  func (n *node) rebalance() {
   410  	if !n.unbalanced {
   411  		return
   412  	}
   413  	n.unbalanced = false
   414  
   415  	// Update statistics.
   416  	n.bucket.tx.stats.IncRebalance(1)
   417  
   418  	// Ignore if node is above threshold (25%) and has enough keys.
   419  	var threshold = n.bucket.tx.db.pageSize / 4
   420  	if n.size() > threshold && len(n.inodes) > n.minKeys() {
   421  		return
   422  	}
   423  
   424  	// Root node has special handling.
   425  	if n.parent == nil {
   426  		// If root node is a branch and only has one node then collapse it.
   427  		if !n.isLeaf && len(n.inodes) == 1 {
   428  			// Move root's child up.
   429  			child := n.bucket.node(n.inodes[0].pgid, n)
   430  			n.isLeaf = child.isLeaf
   431  			n.inodes = child.inodes[:]
   432  			n.children = child.children
   433  
   434  			// Reparent all child nodes being moved.
   435  			for _, inode := range n.inodes {
   436  				if child, ok := n.bucket.nodes[inode.pgid]; ok {
   437  					child.parent = n
   438  				}
   439  			}
   440  
   441  			// Remove old child.
   442  			child.parent = nil
   443  			delete(n.bucket.nodes, child.pgid)
   444  			child.free()
   445  		}
   446  
   447  		return
   448  	}
   449  
   450  	// If node has no keys then just remove it.
   451  	if n.numChildren() == 0 {
   452  		n.parent.del(n.key)
   453  		n.parent.removeChild(n)
   454  		delete(n.bucket.nodes, n.pgid)
   455  		n.free()
   456  		n.parent.rebalance()
   457  		return
   458  	}
   459  
   460  	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
   461  
   462  	// Destination node is right sibling if idx == 0, otherwise left sibling.
   463  	var target *node
   464  	var useNextSibling = (n.parent.childIndex(n) == 0)
   465  	if useNextSibling {
   466  		target = n.nextSibling()
   467  	} else {
   468  		target = n.prevSibling()
   469  	}
   470  
   471  	// If both this node and the target node are too small then merge them.
   472  	if useNextSibling {
   473  		// Reparent all child nodes being moved.
   474  		for _, inode := range target.inodes {
   475  			if child, ok := n.bucket.nodes[inode.pgid]; ok {
   476  				child.parent.removeChild(child)
   477  				child.parent = n
   478  				child.parent.children = append(child.parent.children, child)
   479  			}
   480  		}
   481  
   482  		// Copy over inodes from target and remove target.
   483  		n.inodes = append(n.inodes, target.inodes...)
   484  		n.parent.del(target.key)
   485  		n.parent.removeChild(target)
   486  		delete(n.bucket.nodes, target.pgid)
   487  		target.free()
   488  	} else {
   489  		// Reparent all child nodes being moved.
   490  		for _, inode := range n.inodes {
   491  			if child, ok := n.bucket.nodes[inode.pgid]; ok {
   492  				child.parent.removeChild(child)
   493  				child.parent = target
   494  				child.parent.children = append(child.parent.children, child)
   495  			}
   496  		}
   497  
   498  		// Copy over inodes to target and remove node.
   499  		target.inodes = append(target.inodes, n.inodes...)
   500  		n.parent.del(n.key)
   501  		n.parent.removeChild(n)
   502  		delete(n.bucket.nodes, n.pgid)
   503  		n.free()
   504  	}
   505  
   506  	// Either this node or the target node was deleted from the parent so rebalance it.
   507  	n.parent.rebalance()
   508  }
   509  
   510  // removes a node from the list of in-memory children.
   511  // This does not affect the inodes.
   512  func (n *node) removeChild(target *node) {
   513  	for i, child := range n.children {
   514  		if child == target {
   515  			n.children = append(n.children[:i], n.children[i+1:]...)
   516  			return
   517  		}
   518  	}
   519  }
   520  
   521  // dereference causes the node to copy all its inode key/value references to heap memory.
   522  // This is required when the mmap is reallocated so inodes are not pointing to stale data.
   523  func (n *node) dereference() {
   524  	if n.key != nil {
   525  		key := make([]byte, len(n.key))
   526  		copy(key, n.key)
   527  		n.key = key
   528  		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
   529  	}
   530  
   531  	for i := range n.inodes {
   532  		inode := &n.inodes[i]
   533  
   534  		key := make([]byte, len(inode.key))
   535  		copy(key, inode.key)
   536  		inode.key = key
   537  		_assert(len(inode.key) > 0, "dereference: zero-length inode key")
   538  
   539  		value := make([]byte, len(inode.value))
   540  		copy(value, inode.value)
   541  		inode.value = value
   542  	}
   543  
   544  	// Recursively dereference children.
   545  	for _, child := range n.children {
   546  		child.dereference()
   547  	}
   548  
   549  	// Update statistics.
   550  	n.bucket.tx.stats.IncNodeDeref(1)
   551  }
   552  
   553  // free adds the node's underlying page to the freelist.
   554  func (n *node) free() {
   555  	if n.pgid != 0 {
   556  		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
   557  		n.pgid = 0
   558  	}
   559  }
   560  
   561  // dump writes the contents of the node to STDERR for debugging purposes.
   562  /*
   563  func (n *node) dump() {
   564  	// Write node header.
   565  	var typ = "branch"
   566  	if n.isLeaf {
   567  		typ = "leaf"
   568  	}
   569  	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
   570  
   571  	// Write out abbreviated version of each item.
   572  	for _, item := range n.inodes {
   573  		if n.isLeaf {
   574  			if item.flags&bucketLeafFlag != 0 {
   575  				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
   576  				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
   577  			} else {
   578  				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
   579  			}
   580  		} else {
   581  			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
   582  		}
   583  	}
   584  	warn("")
   585  }
   586  */
   587  
   588  func compareKeys(left, right []byte) int {
   589  	return bytes.Compare(left, right)
   590  }
   591  
   592  type nodes []*node
   593  
   594  func (s nodes) Len() int      { return len(s) }
   595  func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
   596  func (s nodes) Less(i, j int) bool {
   597  	return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
   598  }
   599  
   600  // inode represents an internal node inside of a node.
   601  // It can be used to point to elements in a page or point
   602  // to an element which hasn't been added to a page yet.
   603  type inode struct {
   604  	flags uint32
   605  	pgid  pgid
   606  	key   []byte
   607  	value []byte
   608  }
   609  
   610  type inodes []inode
   611
View as plain text