// Package cluster provides functionality for monitoring etcd cluster health // and recovering the cluster if quorum is lost. package cluster import ( "context" "time" "go.etcd.io/etcd/api/v3/etcdserverpb" "edge-infra.dev/pkg/sds/lib/etcd/client/retry" ) // Cluster represents an etcd cluster type Cluster struct { endpoint string maxUnhealthy time.Duration Status } // Status represents the health status of a resource type Status struct { lastUnhealthy time.Time lastHealthy time.Time } func New(endpoint string, maxUnhealthy time.Duration, status Status) Cluster { return Cluster{ endpoint, maxUnhealthy, status, } } // InitializeStatus initializes the status times to ensure there are no inaccurate // time comparrisons func (c *Cluster) InitializeStatus() { c.ResetTimer() } // UpdateStatus checks the status of the etcd cluster and updates the last healthy/unhealthy time // based on the result func (c *Cluster) UpdateStatus(ctx context.Context, client retry.Retrier) { resp, err := client.SafeStatus(ctx, c.endpoint) if err != nil || len(resp.Errors) != 0 { c.lastUnhealthy = time.Now() return } c.lastHealthy = time.Now() } // IsResetRequired checks if the stored state for the etcd cluster indicates that a reset is required func (c *Cluster) IsResetRequired() bool { if c.IsHealthy() { return false } return time.Since(c.lastHealthy) > c.maxUnhealthy } // IsHealthy checks if the stored state for the etcd cluster indicate that it is healthy func (s *Status) IsHealthy() bool { return s.lastHealthy.After(s.lastUnhealthy) } // ResetTimer will reset the status timers so that both lastHealthy and lastUnhealthy // are time.Now() func (s *Status) ResetTimer() { now := time.Now() s.lastHealthy = now s.lastUnhealthy = now } // GetAlarms will retrieve all etcd cluster alarms of types: CORRUPT, NOSPACE func GetAlarms(ctx context.Context, client retry.Retrier) []*etcdserverpb.AlarmMember { resp, err := client.SafeAlarmList(ctx) if err != nil { return []*etcdserverpb.AlarmMember{} } return resp.Alarms }