...

Source file src/edge-infra.dev/pkg/sds/lanoutage/detector/reconciler.go

Documentation: edge-infra.dev/pkg/sds/lanoutage/detector

     1  package detector
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"context"
     7  	"fmt"
     8  	"net"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/spf13/afero"
    13  
    14  	"edge-infra.dev/pkg/lib/fog"
    15  	"edge-infra.dev/pkg/sds/lanoutage/detector/internal/config"
    16  	"edge-infra.dev/pkg/sds/lanoutage/detector/internal/constants"
    17  	"edge-infra.dev/pkg/sds/lanoutage/detector/internal/healthcheck"
    18  )
    19  
// LOMReconciler holds the state used by the LAN Outage Mode (LOM)
// precondition checks and deadline tracking in this package.
type LOMReconciler struct {
	// fs is the filesystem used for all file checks; NewLOMReconciler
	// takes it from cfg.Fs.
	fs afero.Fs
	// mutex guards WithLock. It is a pointer so that the value-receiver
	// methods on LOMReconciler all share the same lock.
	mutex *sync.Mutex
	// cfg is the detector configuration supplied at construction time.
	cfg config.Config
	// isLOM is the initial LOM flag passed to NewLOMReconciler.
	// NOTE(review): presumably tracks whether the node is currently in
	// LAN Outage Mode — confirm against the code that updates it.
	isLOM bool
	// canConnect is not set by NewLOMReconciler (zero value false).
	// NOTE(review): not referenced in the visible part of the file —
	// confirm its meaning against the reconcile loop.
	canConnect bool
	// enterDeadline is the instant after which isPastEnterDeadline
	// reports true; it is set to now+enterTime.
	enterDeadline time.Time
	// leaveDeadline is the instant after which isPastLeaveDeadline
	// reports true; it is set to now+leaveTime.
	leaveDeadline time.Time
}
    29  
// errPrecondition is a package-level error value describing a failure in the
// precondition checks.
// NOTE(review): it is not referenced in the visible part of the file;
// presumably callers of the precondition checks return or wrap it — confirm.
var errPrecondition = fmt.Errorf("an error occurred in the precondition checks")
    31  
    32  func NewLOMReconciler(cfg config.Config, isLOM bool) *LOMReconciler {
    33  	return &LOMReconciler{
    34  		cfg:           cfg,
    35  		enterDeadline: time.Now().Add(enterTime),
    36  		leaveDeadline: time.Now().Add(leaveTime),
    37  		isLOM:         isLOM,
    38  		fs:            cfg.Fs,
    39  		mutex:         &sync.Mutex{},
    40  	}
    41  }
    42  
    43  func (l *LOMReconciler) resetDeadline() {
    44  	l.enterDeadline = time.Now().Add(enterTime)
    45  	l.leaveDeadline = time.Now().Add(leaveTime)
    46  }
    47  
    48  func (l LOMReconciler) isPastLeaveDeadline() bool {
    49  	return time.Now().After(l.leaveDeadline)
    50  }
    51  
    52  func (l LOMReconciler) isPastEnterDeadline() bool {
    53  	return time.Now().After(l.enterDeadline)
    54  }
    55  
    56  func (l LOMReconciler) GetFs() afero.Fs {
    57  	return l.fs
    58  }
    59  
    60  // WithLock returns the error of the input function and
    61  // reports whether the lock was acquired.
    62  func (l LOMReconciler) WithLock(fn func() error) (bool, error) {
    63  	acquired := l.mutex.TryLock()
    64  	if !acquired {
    65  		return acquired, nil
    66  	}
    67  	defer l.mutex.Unlock()
    68  
    69  	return acquired, fn()
    70  }
    71  
    72  // Checks if the following pre-conditions for Enter are true:
    73  // - LOM is enabled in the LOMConfig file
    74  // - local node is not already acting as a control plane
    75  // - etcd is provisioned on the worker node
    76  func (l LOMReconciler) EnterPreconditionCheck(ctx context.Context) (bool, error) {
    77  	log := fog.FromContext(ctx)
    78  	log.Info("checking if LAN Outage Detector operations are enabled")
    79  	isEnabled, err := isLOMEnabled(l.fs)
    80  	if err != nil {
    81  		return false, fmt.Errorf("failed to check if LAN Outage Detector operations are enabled")
    82  	}
    83  	if !isEnabled {
    84  		log.Info("LAN Outage Detector operations are not enabled")
    85  		return false, nil
    86  	}
    87  
    88  	isControlPlane, err := isLocalNodeControlPlane(l.fs)
    89  	if err != nil {
    90  		return false, fmt.Errorf("failed to check if local node is acting as a control plane")
    91  	}
    92  	if isControlPlane {
    93  		log.Info("local node is acting as a full control plane")
    94  		return false, nil
    95  	}
    96  
    97  	log.Info("checking if etcd is provisioned")
    98  	isEtcdDataPresent, err := afero.Exists(l.fs, constants.EtcdMemberDir)
    99  	if err != nil {
   100  		return false, fmt.Errorf("failed to check the existence of etcd data")
   101  	}
   102  	if !isEtcdDataPresent {
   103  		log.Info("etcd data is not provisioned on this node")
   104  		return false, nil
   105  	}
   106  
   107  	log.Info("checking that there is enough available memory to support LAN Outage Mode")
   108  	availableMemKB, err := getAvailableMemoryKB(l.fs, constants.MeminfoFilepath)
   109  	if err != nil {
   110  		return false, err
   111  	}
   112  	if availableMemKB < 2000000 {
   113  		log.Info("not enough available memory to support LAN Outage Mode - minimum of 2GB available memory required", "availableMemoryKB", availableMemKB)
   114  		return false, nil
   115  	}
   116  	return true, nil
   117  }
   118  
   119  // getAvailableMemory returns the available memory in kb read from '/proc/meminfo'.
   120  func getAvailableMemoryKB(fs afero.Fs, filepath string) (uint32, error) {
   121  	f, err := fs.Open(filepath)
   122  	if err != nil {
   123  		return 0, err
   124  	}
   125  	defer f.Close()
   126  	s := bufio.NewScanner(f)
   127  	var available uint32
   128  loop:
   129  	for s.Scan() {
   130  		switch {
   131  		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
   132  			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
   133  			if err != nil {
   134  				return available, err
   135  			}
   136  			break loop
   137  		default:
   138  			continue
   139  		}
   140  	}
   141  	return available, s.Err()
   142  }
   143  
   144  // Checks if the following pre-conditions for Leave are true:
   145  // - LOM is enabled in the LOMConfig file
   146  // - local node is not already acting as a control plane
   147  // - the main etcd cluster is healthy
   148  func (l LOMReconciler) LeavePreconditionCheck(ctx context.Context) (bool, error) {
   149  	log := fog.FromContext(ctx)
   150  	log.Info("checking if LAN Outage Detector operations are enabled")
   151  	isEnabled, err := isLOMEnabled(l.fs)
   152  	if err != nil {
   153  		return false, fmt.Errorf("failed to check if LAN Outage Detector operations are enabled")
   154  	}
   155  	if !isEnabled {
   156  		log.Info("LAN Outage Detector operations are not enabled")
   157  		return false, nil
   158  	}
   159  
   160  	isControlPlane, err := isLocalNodeControlPlane(l.fs)
   161  	if err != nil {
   162  		return false, fmt.Errorf("failed to check if local node is acting as a control plane")
   163  	}
   164  	if isControlPlane {
   165  		log.Info("local node is acting as a full control plane")
   166  		return false, nil
   167  	}
   168  
   169  	vipStr, err := getVipAddress(l.cfg.Kubeconfig)
   170  	if err != nil {
   171  		return false, err
   172  	}
   173  
   174  	checker := healthcheck.NewVIPCheck(healthcheck.VIPRequest{VIP: net.ParseIP(vipStr)})
   175  
   176  	isHealthy, err := checker.CheckHealth()
   177  	if err != nil {
   178  		return false, fmt.Errorf("failed to check cluster health: %w - exiting Leave", err)
   179  	}
   180  	if !isHealthy {
   181  		return false, constants.ErrUnhealthyCluster
   182  	}
   183  	return true, nil
   184  }
   185  
// Step is a function that takes no arguments and returns an error.
// NOTE(review): presumably one unit of work in an Enter/Leave sequence —
// confirm against the code that runs Steps.
type Step func() error
   187  

View as plain text