1 package detector
2
3 import (
4 "bufio"
5 "bytes"
6 "context"
7 "fmt"
8 "net"
9 "sync"
10 "time"
11
12 "github.com/spf13/afero"
13
14 "edge-infra.dev/pkg/lib/fog"
15 "edge-infra.dev/pkg/sds/lanoutage/detector/internal/config"
16 "edge-infra.dev/pkg/sds/lanoutage/detector/internal/constants"
17 "edge-infra.dev/pkg/sds/lanoutage/detector/internal/healthcheck"
18 )
19
20 type LOMReconciler struct {
21 fs afero.Fs
22 mutex *sync.Mutex
23 cfg config.Config
24 isLOM bool
25 canConnect bool
26 enterDeadline time.Time
27 leaveDeadline time.Time
28 }
29
// errPrecondition is a package-level error value with a generic message about
// a failure in the precondition checks.
var errPrecondition = fmt.Errorf("an error occurred in the precondition checks")
31
32 func NewLOMReconciler(cfg config.Config, isLOM bool) *LOMReconciler {
33 return &LOMReconciler{
34 cfg: cfg,
35 enterDeadline: time.Now().Add(enterTime),
36 leaveDeadline: time.Now().Add(leaveTime),
37 isLOM: isLOM,
38 fs: cfg.Fs,
39 mutex: &sync.Mutex{},
40 }
41 }
42
43 func (l *LOMReconciler) resetDeadline() {
44 l.enterDeadline = time.Now().Add(enterTime)
45 l.leaveDeadline = time.Now().Add(leaveTime)
46 }
47
48 func (l LOMReconciler) isPastLeaveDeadline() bool {
49 return time.Now().After(l.leaveDeadline)
50 }
51
52 func (l LOMReconciler) isPastEnterDeadline() bool {
53 return time.Now().After(l.enterDeadline)
54 }
55
56 func (l LOMReconciler) GetFs() afero.Fs {
57 return l.fs
58 }
59
60
61
62 func (l LOMReconciler) WithLock(fn func() error) (bool, error) {
63 acquired := l.mutex.TryLock()
64 if !acquired {
65 return acquired, nil
66 }
67 defer l.mutex.Unlock()
68
69 return acquired, fn()
70 }
71
72
73
74
75
76 func (l LOMReconciler) EnterPreconditionCheck(ctx context.Context) (bool, error) {
77 log := fog.FromContext(ctx)
78 log.Info("checking if LAN Outage Detector operations are enabled")
79 isEnabled, err := isLOMEnabled(l.fs)
80 if err != nil {
81 return false, fmt.Errorf("failed to check if LAN Outage Detector operations are enabled")
82 }
83 if !isEnabled {
84 log.Info("LAN Outage Detector operations are not enabled")
85 return false, nil
86 }
87
88 isControlPlane, err := isLocalNodeControlPlane(l.fs)
89 if err != nil {
90 return false, fmt.Errorf("failed to check if local node is acting as a control plane")
91 }
92 if isControlPlane {
93 log.Info("local node is acting as a full control plane")
94 return false, nil
95 }
96
97 log.Info("checking if etcd is provisioned")
98 isEtcdDataPresent, err := afero.Exists(l.fs, constants.EtcdMemberDir)
99 if err != nil {
100 return false, fmt.Errorf("failed to check the existence of etcd data")
101 }
102 if !isEtcdDataPresent {
103 log.Info("etcd data is not provisioned on this node")
104 return false, nil
105 }
106
107 log.Info("checking that there is enough available memory to support LAN Outage Mode")
108 availableMemKB, err := getAvailableMemoryKB(l.fs, constants.MeminfoFilepath)
109 if err != nil {
110 return false, err
111 }
112 if availableMemKB < 2000000 {
113 log.Info("not enough available memory to support LAN Outage Mode - minimum of 2GB available memory required", "availableMemoryKB", availableMemKB)
114 return false, nil
115 }
116 return true, nil
117 }
118
119
120 func getAvailableMemoryKB(fs afero.Fs, filepath string) (uint32, error) {
121 f, err := fs.Open(filepath)
122 if err != nil {
123 return 0, err
124 }
125 defer f.Close()
126 s := bufio.NewScanner(f)
127 var available uint32
128 loop:
129 for s.Scan() {
130 switch {
131 case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
132 _, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
133 if err != nil {
134 return available, err
135 }
136 break loop
137 default:
138 continue
139 }
140 }
141 return available, s.Err()
142 }
143
144
145
146
147
148 func (l LOMReconciler) LeavePreconditionCheck(ctx context.Context) (bool, error) {
149 log := fog.FromContext(ctx)
150 log.Info("checking if LAN Outage Detector operations are enabled")
151 isEnabled, err := isLOMEnabled(l.fs)
152 if err != nil {
153 return false, fmt.Errorf("failed to check if LAN Outage Detector operations are enabled")
154 }
155 if !isEnabled {
156 log.Info("LAN Outage Detector operations are not enabled")
157 return false, nil
158 }
159
160 isControlPlane, err := isLocalNodeControlPlane(l.fs)
161 if err != nil {
162 return false, fmt.Errorf("failed to check if local node is acting as a control plane")
163 }
164 if isControlPlane {
165 log.Info("local node is acting as a full control plane")
166 return false, nil
167 }
168
169 vipStr, err := getVipAddress(l.cfg.Kubeconfig)
170 if err != nil {
171 return false, err
172 }
173
174 checker := healthcheck.NewVIPCheck(healthcheck.VIPRequest{VIP: net.ParseIP(vipStr)})
175
176 isHealthy, err := checker.CheckHealth()
177 if err != nil {
178 return false, fmt.Errorf("failed to check cluster health: %w - exiting Leave", err)
179 }
180 if !isHealthy {
181 return false, constants.ErrUnhealthyCluster
182 }
183 return true, nil
184 }
185
// Step is a function that takes no arguments and returns an error.
type Step func() error
187
View as plain text