package install

import (
	"context"
	"errors"
	"fmt"
	"time"

	"go.etcd.io/etcd/api/v3/etcdserverpb"
	"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"

	"edge-infra.dev/pkg/lib/fog"
	"edge-infra.dev/pkg/sds/lib/etcd/client"
)

// addMemberAsLearner adds the node to the etcd cluster as a learner member and
// returns the ID of the resulting member.
//
// While the cluster reports that it already has too many learners, the call is
// retried every 10 seconds with no upper bound on attempts; the wait is aborted
// as soon as ctx is done. If the peer URL is already registered, the ID of the
// existing member is resolved through memberByPeerURL instead. Any other error
// is returned wrapped.
func (r *Reconciler) addMemberAsLearner(ctx context.Context, handlers *Handlers) (uint64, error) {
	const (
		retryInterval = 10 * time.Second
		// with a 10 second retry interval, 12 attempts is roughly 2 minutes
		attemptsPerWarning = 12
	)

	log := fog.FromContext(ctx)

	failedAttempts := 0
	for {
		resp, err := r.EtcdRetryClient.SafeMemberAddAsLearner(ctx, []string{handlers.member.PeerURL()})

		// if there are too many learners, wait 10 seconds and try again.
		// This is to give the previous member time to be promoted or
		// removed. This will happen frequently as it can take 20-30+
		// seconds to promote a member
		if errors.Is(err, rpctypes.ErrTooManyLearners) {
			failedAttempts++
			// every 2 minutes, log a warning that there are too many learners
			if failedAttempts%attemptsPerWarning == 0 {
				log.V(0).Info("too many learners in the cluster. Waiting for the current learner to be promoted or removed...")
			}
			// wait for the next attempt, but stop immediately if the context
			// is cancelled rather than sleeping through the cancellation
			select {
			case <-ctx.Done():
				return 0, fmt.Errorf("failed to add member as learner: %w", ctx.Err())
			case <-time.After(retryInterval):
			}
			continue
		}

		// if etcd member already exists
		if errors.Is(err, rpctypes.ErrPeerURLExist) {
			return r.memberByPeerURL(ctx, handlers)
		}

		if err != nil {
			return 0, fmt.Errorf("failed to add member as learner: %w", err)
		}
		return resp.Member.ID, nil
	}
}

// memberByPeerURL returns the member ID for the given peerURL. If the member
// name does not match the name of the EtcdMember, the member is removed from
// the etcd cluster and an error is returned.
//
// Every peer URL of every listed member is compared, so members that advertise
// several peer URLs (or none at all) are handled without indexing into the
// slice. An error is returned when no member advertises the peer URL.
func (r *Reconciler) memberByPeerURL(ctx context.Context, handlers *Handlers) (uint64, error) {
	resp, err := r.EtcdRetryClient.SafeMemberList(ctx)
	if err != nil {
		return 0, fmt.Errorf("failed to retrieve etcd members: %w", err)
	}

	peerURL := handlers.member.PeerURL()

	// find the member with the clashing peerURL
	for _, member := range resp.Members {
		for _, u := range member.PeerURLs {
			if u == peerURL {
				return r.deleteIfNameMismatch(ctx, member, handlers)
			}
		}
	}
	return 0, fmt.Errorf("failed to find member by peerURL (%s)", peerURL)
}

// deleteIfNameMismatch deletes the member from the etcd cluster if the name
// does not match the name of the EtcdMember.
//
// It returns the member ID when the names match. When they differ, the member
// is removed and a name mismatch error is returned; if the removal itself
// fails, that failure is returned instead.
func (r *Reconciler) deleteIfNameMismatch(ctx context.Context, member *etcdserverpb.Member, handlers *Handlers) (uint64, error) {
	if member.Name == handlers.member.Name {
		return member.ID, nil
	}

	// the member name does not match the name of the EtcdMember, remove the member
	if _, err := r.EtcdRetryClient.SafeMemberRemove(ctx, member.ID); err != nil {
		return 0, fmt.Errorf("failed to remove mismatched member: %w", err)
	}
	return 0, fmt.Errorf("member name mismatch: expected %s, got %s", handlers.member.Name, member.Name)
}

// promoteLearner promotes the node from a learner member to a full member.
//
// While etcd reports that the learner is not ready, the promotion is retried
// every 10 seconds, giving up after more than 12 failed attempts. The wait
// between attempts is aborted as soon as ctx is done. Errors filtered out by
// client.IgnoreMemberNotLearner are treated as success.
func (r *Reconciler) promoteLearner(ctx context.Context, memberID uint64) error {
	const (
		retryInterval = 10 * time.Second
		// with a 10 second retry interval, 12 attempts is roughly 2 minutes
		maxFailedAttempts = 12
	)

	failedAttempts := 0
	for {
		_, err := r.EtcdRetryClient.SafeMemberPromote(ctx, memberID)

		// if the learner is not ready to be promoted, wait 10 seconds and try again.
		// This is to give the member time to catch up and get in sync with the leader.
		// This process can regularly take 20-30+ seconds
		if errors.Is(err, rpctypes.ErrMemberLearnerNotReady) {
			failedAttempts++
			// quit after 2 minutes as this is long enough to expect a successful promotion.
			// The average promotion should take 20-30 seconds
			if failedAttempts > maxFailedAttempts {
				return fmt.Errorf("failed to promote member after 2 minutes")
			}
			// wait for the next attempt, but stop immediately if the context
			// is cancelled rather than sleeping through the cancellation
			select {
			case <-ctx.Done():
				return fmt.Errorf("failed to promote learner: %w", ctx.Err())
			case <-time.After(retryInterval):
			}
			continue
		}

		// if member has already been promoted, return success
		if client.IgnoreMemberNotLearner(err) != nil {
			return fmt.Errorf("failed to promote learner: %w", err)
		}
		return nil
	}
}