...

Source file src/edge-infra.dev/pkg/sds/etcd/operator/internal/reconcilers/install/membership.go

Documentation: edge-infra.dev/pkg/sds/etcd/operator/internal/reconcilers/install

     1  package install
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"time"
     8  
     9  	"go.etcd.io/etcd/api/v3/etcdserverpb"
    10  	"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
    11  
    12  	"edge-infra.dev/pkg/lib/fog"
    13  	"edge-infra.dev/pkg/sds/lib/etcd/client"
    14  )
    15  
    16  // addMemberAsLearner adds the node to the etcd cluster as a learner member
    17  func (r *Reconciler) addMemberAsLearner(ctx context.Context, handlers *Handlers) (uint64, error) {
    18  	log := fog.FromContext(ctx)
    19  	failedAttempts := 0
    20  	for {
    21  		resp, err := r.EtcdRetryClient.SafeMemberAddAsLearner(ctx, []string{handlers.member.PeerURL()})
    22  		// if there are too many learners, wait 10 seconds and try again.
    23  		// This is to give the previous member time to be promoted or
    24  		// removed. This will happen frequently as it can take 20-30+
    25  		// seconds to promote a member
    26  		if errors.Is(err, rpctypes.ErrTooManyLearners) {
    27  			failedAttempts++
    28  			// every 2 minutes, log a warning that there are too many learners
    29  			if failedAttempts%12 == 0 {
    30  				log.V(0).Info("too many learners in the cluster. Waiting for the current learner to be promoted or removed...")
    31  			}
    32  			time.Sleep(10 * time.Second)
    33  			continue
    34  		}
    35  		// if etcd member already exists
    36  		if errors.Is(err, rpctypes.ErrPeerURLExist) {
    37  			return r.memberByPeerURL(ctx, handlers)
    38  		}
    39  		if err != nil {
    40  			return 0, fmt.Errorf("failed to add member as learner: %w", err)
    41  		}
    42  
    43  		return resp.Member.ID, nil
    44  	}
    45  }
    46  
    47  // memberByPeerURL returns the member ID for the given peerURL. If the member name does not
    48  // match the name of the EtcdMember, the member is removed from the etcd cluster
    49  func (r *Reconciler) memberByPeerURL(ctx context.Context, handlers *Handlers) (uint64, error) {
    50  	resp, err := r.EtcdRetryClient.SafeMemberList(ctx)
    51  	if err != nil {
    52  		return 0, fmt.Errorf("failed to retrieve etcd members: %w", err)
    53  	}
    54  	// find the member with the clashing peerURL
    55  	for _, member := range resp.Members {
    56  		if member.PeerURLs[0] == handlers.member.PeerURL() {
    57  			return r.deleteIfNameMismatch(ctx, member, handlers)
    58  		}
    59  	}
    60  	return 0, fmt.Errorf("failed to find member by peerURL (%s)", handlers.member.PeerURL())
    61  }
    62  
    63  // deleteIfNameMismatch deletes the member from the etcd cluster if the name does not match
    64  // the name of the EtcdMember.
    65  func (r *Reconciler) deleteIfNameMismatch(ctx context.Context, member *etcdserverpb.Member, handlers *Handlers) (uint64, error) {
    66  	// if the member name does not match the name of the EtcdMember, remove the member
    67  	if member.Name != handlers.member.Name {
    68  		if _, err := r.EtcdRetryClient.SafeMemberRemove(ctx, member.ID); err != nil {
    69  			return 0, fmt.Errorf("failed to remove mismatched member: %w", err)
    70  		}
    71  		return 0, fmt.Errorf("member name mismatch: expected %s, got %s", handlers.member.Name, member.Name)
    72  	}
    73  	return member.ID, nil
    74  }
    75  
    76  // promoteLearner promotes the node from a learner member to a full member
    77  func (r *Reconciler) promoteLearner(ctx context.Context, memberID uint64) error {
    78  	failedAttempts := 0
    79  	for {
    80  		_, err := r.EtcdRetryClient.SafeMemberPromote(ctx, memberID)
    81  		// if the learner is not ready to be promoted, wait 10 seconds and try again.
    82  		// This is to give the member time to catch up and get in sync with the leader.
    83  		// This process can regularly take 20-30+ seconds
    84  		if errors.Is(err, rpctypes.ErrMemberLearnerNotReady) {
    85  			failedAttempts++
    86  			// quit after 2 minutes as this is long enough to expect a successful promotion.
    87  			// The average promotion should take 20-30 seconds
    88  			if failedAttempts > 12 {
    89  				return fmt.Errorf("failed to promote member after 2 minutes")
    90  			}
    91  			time.Sleep(10 * time.Second)
    92  			continue
    93  		}
    94  		// if member has already been promoted, return success
    95  		if client.IgnoreMemberNotLearner(err) != nil {
    96  			return fmt.Errorf("failed to promote learner: %w", err)
    97  		}
    98  		return nil
    99  	}
   100  }
   101  

View as plain text