...

Source file src/github.com/emissary-ingress/emissary/v3/pkg/acp/diagd.go

Documentation: github.com/emissary-ingress/emissary/v3/pkg/acp

     1  // Copyright 2020 Datawire. All rights reserved.
     2  //
     3  // package acp contains stuff dealing with the Ambassador Control Plane as a whole.
     4  //
     5  // This is the DiagdWatcher, which is a class that can keep an eye on a running
     6  // diagd - and just diagd, all other Ambassador elements are ignored - and tell you
     7  // whether it's alive and ready, or not.
     8  //
     9  // THE GRACE PERIOD:
    10  // Much of DiagdWatcher is concerned with feeding a snapshot to diagd for processing,
    11  // and then noting that processing is done. This can take a while. Currently, we give
    12  // diagd _ten minutes_ to get its act together, with the ideas that:
    13  //
    14  // a. We really don't want to start summarily killing pods when, say, configuration
    15  //    times go from 30 seconds to 31 seconds, but
    16  // b. We also don't want a dead diagd to hork a pod for hours.
    17  //
    18  // Ten minutes might not be the right compromise, but the idea is that a reasonable
    19  // customer should be doing health checks on their ability to route to their services,
    20  // as well as using the blunt-instrument Kubernetes checks. So this code is more about
    21  // providing a conservative failsafe.
    22  //
    23  // TESTING HOOKS:
    24  // Since time plays a role, you can use DiagdWatcher.SetFetchTime to change the
    25  // function that the DiagdWatcher uses to fetch times. The default is time.Now.
    26  //
    27  // This hook is NOT meant for you to change the values on the fly in a running
    28  // DiagdWatcher. Set it at instantiation if need be, then leave it alone. See
    29  // diagd_test.go for more.
    30  
    31  package acp
    32  
    33  import (
    34  	"sync"
    35  	"time"
    36  )
    37  
    38  // DiagdWatcher encapsulates state and methods for keeping an eye on a running
    39  // diagd, and deciding if it's healthy.
    40  type DiagdWatcher struct {
    41  	// How shall we fetch the current time?
    42  	fetchTime timeFetcher
    43  
    44  	// This mutex protects access to LastSent and LastProcessed,
    45  	// mostly as a matter of rank paranoia.
    46  	mutex sync.Mutex
    47  
    48  	// When did we last send a snapshot to diagd?
    49  	LastSent time.Time
    50  
    51  	// When did we last hear that diagd had processed a snapshot?
    52  	LastProcessed time.Time
    53  
    54  	// When does our grace period end? The grace period is ten minutes after
    55  	// the most recent event (boot, or the last time a snapshot was sent).
    56  	GraceEnd time.Time
    57  }
    58  
    59  // NewDiagdWatcher creates a new DiagdWatcher.
    60  func NewDiagdWatcher() *DiagdWatcher {
    61  	w := &DiagdWatcher{fetchTime: time.Now}
    62  	w.setGraceEnd(w.fetchTime(), 10*time.Minute) // initial boot grace period
    63  
    64  	return w
    65  }
    66  
    67  // setGraceEnd will set the end of the grace period to some duration after
    68  // a given timestamp.
    69  func (w *DiagdWatcher) setGraceEnd(start time.Time, dur time.Duration) {
    70  	w.GraceEnd = start.Add(dur)
    71  }
    72  
    73  // withinGracePeriod will return true IFF we're within the current grace period.
    74  func (w *DiagdWatcher) withinGracePeriod() bool {
    75  	return w.fetchTime().Before(w.GraceEnd)
    76  }
    77  
    78  // SetFetchTime will change the function we use to get the current time _AND RESETS THE
    79  // BOOT GRACE PERIOD_. This is here for testing, _NOT_ to allow switching timers on the
    80  // fly for some crazy reason.
    81  func (w *DiagdWatcher) SetFetchTime(fetchTime timeFetcher) {
    82  	w.fetchTime = fetchTime
    83  
    84  	// See comment above for why it's OK to reset the boot grace period here.
    85  	w.setGraceEnd(w.fetchTime(), 10*time.Minute) // RESET boot grace period, see above.
    86  }
    87  
    88  // NoteSnapshotSent marks the time at which we have sent a snapshot.
    89  func (w *DiagdWatcher) NoteSnapshotSent() {
    90  	w.mutex.Lock()
    91  	defer w.mutex.Unlock()
    92  
    93  	// Remember that we've sent a snapshot...
    94  	w.LastSent = w.fetchTime()
    95  
    96  	// ...and reset the grace period IFF we've processed something.
    97  	//
    98  	// Why not do this unconditionally? Basically we don't want an Ambassador
    99  	// to somehow send snapshots over and over, never process any, and not
   100  	// give up. (This situation is currently "impossible", so this is kind of
   101  	// paranoia, but that's OK.)
   102  
   103  	if !w.LastProcessed.IsZero() {
   104  		w.setGraceEnd(w.LastSent, 10*time.Minute) // Update grace period
   105  	}
   106  }
   107  
   108  // NoteSnapshotProcessed marks the time at which we have processed a snapshot.
   109  func (w *DiagdWatcher) NoteSnapshotProcessed() {
   110  	w.mutex.Lock()
   111  	defer w.mutex.Unlock()
   112  	w.LastProcessed = w.fetchTime()
   113  }
   114  
   115  // IsAlive returns true IFF diagd should be considered alive.
   116  func (w *DiagdWatcher) IsAlive() bool {
   117  	w.mutex.Lock()
   118  	defer w.mutex.Unlock()
   119  
   120  	// Case 1: we've sent and processed at least one snapshot, and LastSent is before
   121  	// LastProcessed. This is the case where the last-sent snapshot has already been
   122  	// processed -- life is good.
   123  	if !w.LastSent.IsZero() && !w.LastProcessed.IsZero() && w.LastSent.Before(w.LastProcessed) {
   124  		// Yes -- both LastSent and LastProcessed are set, and LastSent is before LastProcessed.
   125  		// We're good to go.
   126  		return true
   127  	}
   128  
   129  	// Case 2: the above isn't true. Either we haven't tried to send a snapshot yet, or
   130  	// we've sent a snapshot and haven't finished processing it yet. In either case, we'll
   131  	// say we're alive only as long as we're within the grace period.
   132  	return w.withinGracePeriod()
   133  }
   134  
   135  // IsReady returns true IFF diagd should be considered ready.
   136  func (w *DiagdWatcher) IsReady() bool {
   137  	w.mutex.Lock()
   138  	defer w.mutex.Unlock()
   139  
   140  	// If we haven't sent and processed a snapshot, diagd isn't ready.
   141  	if w.LastSent.IsZero() || w.LastProcessed.IsZero() {
   142  		return false
   143  	}
   144  
   145  	// We've sent and processed snapshots. If the last snapshot was sent before
   146  	// the last snapshot was processed, we're good to go.
   147  	if w.LastSent.Before(w.LastProcessed) {
   148  		return true
   149  	}
   150  
   151  	// LastSent was after LastProcessed; we're still working on processing the
   152  	// most recent snapshot. We'll say we're ready only as long as we're still
   153  	// in the grace period.
   154  	return w.withinGracePeriod()
   155  }
   156  

View as plain text