...

Source file src/go.etcd.io/etcd/raft/v3/confchange/confchange.go

Documentation: go.etcd.io/etcd/raft/v3/confchange

     1  // Copyright 2019 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package confchange
    16  
    17  import (
    18  	"errors"
    19  	"fmt"
    20  	"strings"
    21  
    22  	"go.etcd.io/etcd/raft/v3/quorum"
    23  	pb "go.etcd.io/etcd/raft/v3/raftpb"
    24  	"go.etcd.io/etcd/raft/v3/tracker"
    25  )
    26  
    27  // Changer facilitates configuration changes. It exposes methods to handle
    28  // simple and joint consensus while performing the proper validation that allows
    29  // refusing invalid configuration changes before they affect the active
    30  // configuration.
    31  type Changer struct {
    32  	Tracker   tracker.ProgressTracker
    33  	LastIndex uint64
    34  }
    35  
    36  // EnterJoint verifies that the outgoing (=right) majority config of the joint
    37  // config is empty and initializes it with a copy of the incoming (=left)
    38  // majority config. That is, it transitions from
    39  //
    40  //	(1 2 3)&&()
    41  //
    42  // to
    43  //
    44  //	(1 2 3)&&(1 2 3).
    45  //
    46  // The supplied changes are then applied to the incoming majority config,
    47  // resulting in a joint configuration that in terms of the Raft thesis[1]
    48  // (Section 4.3) corresponds to `C_{new,old}`.
    49  //
    50  // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
    51  func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
    52  	cfg, prs, err := c.checkAndCopy()
    53  	if err != nil {
    54  		return c.err(err)
    55  	}
    56  	if joint(cfg) {
    57  		err := errors.New("config is already joint")
    58  		return c.err(err)
    59  	}
    60  	if len(incoming(cfg.Voters)) == 0 {
    61  		// We allow adding nodes to an empty config for convenience (testing and
    62  		// bootstrap), but you can't enter a joint state.
    63  		err := errors.New("can't make a zero-voter config joint")
    64  		return c.err(err)
    65  	}
    66  	// Clear the outgoing config.
    67  	*outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{}
    68  	// Copy incoming to outgoing.
    69  	for id := range incoming(cfg.Voters) {
    70  		outgoing(cfg.Voters)[id] = struct{}{}
    71  	}
    72  
    73  	if err := c.apply(&cfg, prs, ccs...); err != nil {
    74  		return c.err(err)
    75  	}
    76  	cfg.AutoLeave = autoLeave
    77  	return checkAndReturn(cfg, prs)
    78  }
    79  
    80  // LeaveJoint transitions out of a joint configuration. It is an error to call
    81  // this method if the configuration is not joint, i.e. if the outgoing majority
    82  // config Voters[1] is empty.
    83  //
    84  // The outgoing majority config of the joint configuration will be removed,
    85  // that is, the incoming config is promoted as the sole decision maker. In the
    86  // notation of the Raft thesis[1] (Section 4.3), this method transitions from
    87  // `C_{new,old}` into `C_new`.
    88  //
    89  // At the same time, any staged learners (LearnersNext) the addition of which
    90  // was held back by an overlapping voter in the former outgoing config will be
    91  // inserted into Learners.
    92  //
    93  // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
    94  func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) {
    95  	cfg, prs, err := c.checkAndCopy()
    96  	if err != nil {
    97  		return c.err(err)
    98  	}
    99  	if !joint(cfg) {
   100  		err := errors.New("can't leave a non-joint config")
   101  		return c.err(err)
   102  	}
   103  	if len(outgoing(cfg.Voters)) == 0 {
   104  		err := fmt.Errorf("configuration is not joint: %v", cfg)
   105  		return c.err(err)
   106  	}
   107  	for id := range cfg.LearnersNext {
   108  		nilAwareAdd(&cfg.Learners, id)
   109  		prs[id].IsLearner = true
   110  	}
   111  	cfg.LearnersNext = nil
   112  
   113  	for id := range outgoing(cfg.Voters) {
   114  		_, isVoter := incoming(cfg.Voters)[id]
   115  		_, isLearner := cfg.Learners[id]
   116  
   117  		if !isVoter && !isLearner {
   118  			delete(prs, id)
   119  		}
   120  	}
   121  	*outgoingPtr(&cfg.Voters) = nil
   122  	cfg.AutoLeave = false
   123  
   124  	return checkAndReturn(cfg, prs)
   125  }
   126  
   127  // Simple carries out a series of configuration changes that (in aggregate)
   128  // mutates the incoming majority config Voters[0] by at most one. This method
   129  // will return an error if that is not the case, if the resulting quorum is
   130  // zero, or if the configuration is in a joint state (i.e. if there is an
   131  // outgoing configuration).
   132  func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
   133  	cfg, prs, err := c.checkAndCopy()
   134  	if err != nil {
   135  		return c.err(err)
   136  	}
   137  	if joint(cfg) {
   138  		err := errors.New("can't apply simple config change in joint config")
   139  		return c.err(err)
   140  	}
   141  	if err := c.apply(&cfg, prs, ccs...); err != nil {
   142  		return c.err(err)
   143  	}
   144  	if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 {
   145  		return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config")
   146  	}
   147  
   148  	return checkAndReturn(cfg, prs)
   149  }
   150  
   151  // apply a change to the configuration. By convention, changes to voters are
   152  // always made to the incoming majority config Voters[0]. Voters[1] is either
   153  // empty or preserves the outgoing majority configuration while in a joint state.
   154  func (c Changer) apply(cfg *tracker.Config, prs tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error {
   155  	for _, cc := range ccs {
   156  		if cc.NodeID == 0 {
   157  			// etcd replaces the NodeID with zero if it decides (downstream of
   158  			// raft) to not apply a change, so we have to have explicit code
   159  			// here to ignore these.
   160  			continue
   161  		}
   162  		switch cc.Type {
   163  		case pb.ConfChangeAddNode:
   164  			c.makeVoter(cfg, prs, cc.NodeID)
   165  		case pb.ConfChangeAddLearnerNode:
   166  			c.makeLearner(cfg, prs, cc.NodeID)
   167  		case pb.ConfChangeRemoveNode:
   168  			c.remove(cfg, prs, cc.NodeID)
   169  		case pb.ConfChangeUpdateNode:
   170  		default:
   171  			return fmt.Errorf("unexpected conf type %d", cc.Type)
   172  		}
   173  	}
   174  	if len(incoming(cfg.Voters)) == 0 {
   175  		return errors.New("removed all voters")
   176  	}
   177  	return nil
   178  }
   179  
   180  // makeVoter adds or promotes the given ID to be a voter in the incoming
   181  // majority config.
   182  func (c Changer) makeVoter(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
   183  	pr := prs[id]
   184  	if pr == nil {
   185  		c.initProgress(cfg, prs, id, false /* isLearner */)
   186  		return
   187  	}
   188  
   189  	pr.IsLearner = false
   190  	nilAwareDelete(&cfg.Learners, id)
   191  	nilAwareDelete(&cfg.LearnersNext, id)
   192  	incoming(cfg.Voters)[id] = struct{}{}
   193  }
   194  
   195  // makeLearner makes the given ID a learner or stages it to be a learner once
   196  // an active joint configuration is exited.
   197  //
   198  // The former happens when the peer is not a part of the outgoing config, in
   199  // which case we either add a new learner or demote a voter in the incoming
   200  // config.
   201  //
   202  // The latter case occurs when the configuration is joint and the peer is a
   203  // voter in the outgoing config. In that case, we do not want to add the peer
   204  // as a learner because then we'd have to track a peer as a voter and learner
   205  // simultaneously. Instead, we add the learner to LearnersNext, so that it will
   206  // be added to Learners the moment the outgoing config is removed by
   207  // LeaveJoint().
   208  func (c Changer) makeLearner(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
   209  	pr := prs[id]
   210  	if pr == nil {
   211  		c.initProgress(cfg, prs, id, true /* isLearner */)
   212  		return
   213  	}
   214  	if pr.IsLearner {
   215  		return
   216  	}
   217  	// Remove any existing voter in the incoming config...
   218  	c.remove(cfg, prs, id)
   219  	// ... but save the Progress.
   220  	prs[id] = pr
   221  	// Use LearnersNext if we can't add the learner to Learners directly, i.e.
   222  	// if the peer is still tracked as a voter in the outgoing config. It will
   223  	// be turned into a learner in LeaveJoint().
   224  	//
   225  	// Otherwise, add a regular learner right away.
   226  	if _, onRight := outgoing(cfg.Voters)[id]; onRight {
   227  		nilAwareAdd(&cfg.LearnersNext, id)
   228  	} else {
   229  		pr.IsLearner = true
   230  		nilAwareAdd(&cfg.Learners, id)
   231  	}
   232  }
   233  
   234  // remove this peer as a voter or learner from the incoming config.
   235  func (c Changer) remove(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
   236  	if _, ok := prs[id]; !ok {
   237  		return
   238  	}
   239  
   240  	delete(incoming(cfg.Voters), id)
   241  	nilAwareDelete(&cfg.Learners, id)
   242  	nilAwareDelete(&cfg.LearnersNext, id)
   243  
   244  	// If the peer is still a voter in the outgoing config, keep the Progress.
   245  	if _, onRight := outgoing(cfg.Voters)[id]; !onRight {
   246  		delete(prs, id)
   247  	}
   248  }
   249  
   250  // initProgress initializes a new progress for the given node or learner.
   251  func (c Changer) initProgress(cfg *tracker.Config, prs tracker.ProgressMap, id uint64, isLearner bool) {
   252  	if !isLearner {
   253  		incoming(cfg.Voters)[id] = struct{}{}
   254  	} else {
   255  		nilAwareAdd(&cfg.Learners, id)
   256  	}
   257  	prs[id] = &tracker.Progress{
   258  		// Initializing the Progress with the last index means that the follower
   259  		// can be probed (with the last index).
   260  		//
   261  		// TODO(tbg): seems awfully optimistic. Using the first index would be
   262  		// better. The general expectation here is that the follower has no log
   263  		// at all (and will thus likely need a snapshot), though the app may
   264  		// have applied a snapshot out of band before adding the replica (thus
   265  		// making the first index the better choice).
   266  		Next:      c.LastIndex,
   267  		Match:     0,
   268  		Inflights: tracker.NewInflights(c.Tracker.MaxInflight),
   269  		IsLearner: isLearner,
   270  		// When a node is first added, we should mark it as recently active.
   271  		// Otherwise, CheckQuorum may cause us to step down if it is invoked
   272  		// before the added node has had a chance to communicate with us.
   273  		RecentActive: true,
   274  	}
   275  }
   276  
   277  // checkInvariants makes sure that the config and progress are compatible with
   278  // each other. This is used to check both what the Changer is initialized with,
   279  // as well as what it returns.
   280  func checkInvariants(cfg tracker.Config, prs tracker.ProgressMap) error {
   281  	// NB: intentionally allow the empty config. In production we'll never see a
   282  	// non-empty config (we prevent it from being created) but we will need to
   283  	// be able to *create* an initial config, for example during bootstrap (or
   284  	// during tests). Instead of having to hand-code this, we allow
   285  	// transitioning from an empty config into any other legal and non-empty
   286  	// config.
   287  	for _, ids := range []map[uint64]struct{}{
   288  		cfg.Voters.IDs(),
   289  		cfg.Learners,
   290  		cfg.LearnersNext,
   291  	} {
   292  		for id := range ids {
   293  			if _, ok := prs[id]; !ok {
   294  				return fmt.Errorf("no progress for %d", id)
   295  			}
   296  		}
   297  	}
   298  
   299  	// Any staged learner was staged because it could not be directly added due
   300  	// to a conflicting voter in the outgoing config.
   301  	for id := range cfg.LearnersNext {
   302  		if _, ok := outgoing(cfg.Voters)[id]; !ok {
   303  			return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id)
   304  		}
   305  		if prs[id].IsLearner {
   306  			return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id)
   307  		}
   308  	}
   309  	// Conversely Learners and Voters doesn't intersect at all.
   310  	for id := range cfg.Learners {
   311  		if _, ok := outgoing(cfg.Voters)[id]; ok {
   312  			return fmt.Errorf("%d is in Learners and Voters[1]", id)
   313  		}
   314  		if _, ok := incoming(cfg.Voters)[id]; ok {
   315  			return fmt.Errorf("%d is in Learners and Voters[0]", id)
   316  		}
   317  		if !prs[id].IsLearner {
   318  			return fmt.Errorf("%d is in Learners, but is not marked as learner", id)
   319  		}
   320  	}
   321  
   322  	if !joint(cfg) {
   323  		// We enforce that empty maps are nil instead of zero.
   324  		if outgoing(cfg.Voters) != nil {
   325  			return fmt.Errorf("cfg.Voters[1] must be nil when not joint")
   326  		}
   327  		if cfg.LearnersNext != nil {
   328  			return fmt.Errorf("cfg.LearnersNext must be nil when not joint")
   329  		}
   330  		if cfg.AutoLeave {
   331  			return fmt.Errorf("AutoLeave must be false when not joint")
   332  		}
   333  	}
   334  
   335  	return nil
   336  }
   337  
   338  // checkAndCopy copies the tracker's config and progress map (deeply enough for
   339  // the purposes of the Changer) and returns those copies. It returns an error
   340  // if checkInvariants does.
   341  func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) {
   342  	cfg := c.Tracker.Config.Clone()
   343  	prs := tracker.ProgressMap{}
   344  
   345  	for id, pr := range c.Tracker.Progress {
   346  		// A shallow copy is enough because we only mutate the Learner field.
   347  		ppr := *pr
   348  		prs[id] = &ppr
   349  	}
   350  	return checkAndReturn(cfg, prs)
   351  }
   352  
   353  // checkAndReturn calls checkInvariants on the input and returns either the
   354  // resulting error or the input.
   355  func checkAndReturn(cfg tracker.Config, prs tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) {
   356  	if err := checkInvariants(cfg, prs); err != nil {
   357  		return tracker.Config{}, tracker.ProgressMap{}, err
   358  	}
   359  	return cfg, prs, nil
   360  }
   361  
   362  // err returns zero values and an error.
   363  func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) {
   364  	return tracker.Config{}, nil, err
   365  }
   366  
   367  // nilAwareAdd populates a map entry, creating the map if necessary.
   368  func nilAwareAdd(m *map[uint64]struct{}, id uint64) {
   369  	if *m == nil {
   370  		*m = map[uint64]struct{}{}
   371  	}
   372  	(*m)[id] = struct{}{}
   373  }
   374  
   375  // nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
   376  func nilAwareDelete(m *map[uint64]struct{}, id uint64) {
   377  	if *m == nil {
   378  		return
   379  	}
   380  	delete(*m, id)
   381  	if len(*m) == 0 {
   382  		*m = nil
   383  	}
   384  }
   385  
   386  // symdiff returns the count of the symmetric difference between the sets of
   387  // uint64s, i.e. len( (l - r) \union (r - l)).
   388  func symdiff(l, r map[uint64]struct{}) int {
   389  	var n int
   390  	pairs := [][2]quorum.MajorityConfig{
   391  		{l, r}, // count elems in l but not in r
   392  		{r, l}, // count elems in r but not in l
   393  	}
   394  	for _, p := range pairs {
   395  		for id := range p[0] {
   396  			if _, ok := p[1][id]; !ok {
   397  				n++
   398  			}
   399  		}
   400  	}
   401  	return n
   402  }
   403  
   404  func joint(cfg tracker.Config) bool {
   405  	return len(outgoing(cfg.Voters)) > 0
   406  }
   407  
   408  func incoming(voters quorum.JointConfig) quorum.MajorityConfig      { return voters[0] }
   409  func outgoing(voters quorum.JointConfig) quorum.MajorityConfig      { return voters[1] }
   410  func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] }
   411  
   412  // Describe prints the type and NodeID of the configuration changes as a
   413  // space-delimited string.
   414  func Describe(ccs ...pb.ConfChangeSingle) string {
   415  	var buf strings.Builder
   416  	for _, cc := range ccs {
   417  		if buf.Len() > 0 {
   418  			buf.WriteByte(' ')
   419  		}
   420  		fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID)
   421  	}
   422  	return buf.String()
   423  }
   424  

View as plain text