1 // Copyright 2020 Datawire. All rights reserved. 2 // 3 // package acp contains stuff dealing with the Ambassador Control Plane as a whole. 4 // 5 // This is the DiagdWatcher, which is a class that can keep an eye on a running 6 // diagd - and just diagd, all other Ambassador elements are ignored - and tell you 7 // whether it's alive and ready, or not. 8 // 9 // THE GRACE PERIOD: 10 // Much of DiagdWatcher is concerned with feeding a snapshot to diagd for processing, 11 // and then noting that processing is done. This can take awhile. Currently, we give 12 // diagd _ten minutes_ to get its act together, with the ideas that: 13 // 14 // a. We really don't want to start summarily killing pods when, say, configuration 15 // times go from 30 seconds to 31 seconds, but 16 // b. We also don't want a dead diagd to hork a pod for hours. 17 // 18 // Ten minutes might not be the right compromise, but the idea is that a reasonable 19 // customer should be doing health checks on their ability to route to their services, 20 // as well as using the blunt-instrument Kubernetes checks. So this code is more about 21 // providing a conservative failsafe. 22 // 23 // TESTING HOOKS: 24 // Since time plays a role, you can use DiagdWatcher.SetFetchTime to change the 25 // function that the DiagdWatcher uses to fetch times. The default is time.Now. 26 // 27 // This hook is NOT meant for you to change the values on the fly in a running 28 // DiagdWatcher. Set it at instantiation if need be, then leave it alone. See 29 // diagd_test.go for more. 30 31 package acp 32 33 import ( 34 "sync" 35 "time" 36 ) 37 38 // DiagdWatcher encapsulates state and methods for keeping an eye on a running 39 // diagd, and deciding if it's healthy. 40 type DiagdWatcher struct { 41 // How shall we fetch the current time? 42 fetchTime timeFetcher 43 44 // This mutex protects access to LastSent and LastProcessed, 45 // mostly as a matter of rank paranoia. 46 mutex sync.Mutex 47 48 // When did we last send a snapshot to diagd? 49 LastSent time.Time 50 51 // When did we last hear that diagd had processed a snapshot? 52 LastProcessed time.Time 53 54 // When does our grace period end? The grace period is ten minutes after 55 // the most recent event (boot, or the last time a snapshot was sent). 56 GraceEnd time.Time 57 } 58 59 // NewDiagdWatcher creates a new DiagdWatcher. 60 func NewDiagdWatcher() *DiagdWatcher { 61 w := &DiagdWatcher{fetchTime: time.Now} 62 w.setGraceEnd(w.fetchTime(), 10*time.Minute) // initial boot grace period 63 64 return w 65 } 66 67 // setGraceEnd will set the end of the grace period to some duration after 68 // a given timestamp. 69 func (w *DiagdWatcher) setGraceEnd(start time.Time, dur time.Duration) { 70 w.GraceEnd = start.Add(dur) 71 } 72 73 // withinGracePeriod will return true IFF we're within the current grace period. 74 func (w *DiagdWatcher) withinGracePeriod() bool { 75 return w.fetchTime().Before(w.GraceEnd) 76 } 77 78 // SetFetchTime will change the function we use to get the current time _AND RESETS THE 79 // BOOT GRACE PERIOD_. This is here for testing, _NOT_ to allow switching timers on the 80 // fly for some crazy reason. 81 func (w *DiagdWatcher) SetFetchTime(fetchTime timeFetcher) { 82 w.fetchTime = fetchTime 83 84 // See comment above for why it's OK to reset the boot grace period here. 85 w.setGraceEnd(w.fetchTime(), 10*time.Minute) // RESET boot grace period, see above. 86 } 87 88 // NoteSnapshotSent marks the time at which we have sent a snapshot. 89 func (w *DiagdWatcher) NoteSnapshotSent() { 90 w.mutex.Lock() 91 defer w.mutex.Unlock() 92 93 // Remember that we've sent a snapshot... 94 w.LastSent = w.fetchTime() 95 96 // ...and reset the grace period IFF we've processed something. 97 // 98 // Why not do this unconditionally? Basically we don't want an Ambassador 99 // to somehow send snapshots over and over, never process any, and not 100 // give up. (This situation is currently "impossible", so this is kind of 101 // paranoia, but that's OK.) 102 103 if !w.LastProcessed.IsZero() { 104 w.setGraceEnd(w.LastSent, 10*time.Minute) // Update grace period 105 } 106 } 107 108 // NoteSnapshotProcessed marks the time at which we have processed a snapshot. 109 func (w *DiagdWatcher) NoteSnapshotProcessed() { 110 w.mutex.Lock() 111 defer w.mutex.Unlock() 112 w.LastProcessed = w.fetchTime() 113 } 114 115 // IsAlive returns true IFF diagd should be considered alive. 116 func (w *DiagdWatcher) IsAlive() bool { 117 w.mutex.Lock() 118 defer w.mutex.Unlock() 119 120 // Case 1: we've sent and processed at least one snapshot, and LastSent is before 121 // LastProcessed. This is the case where the last-sent snapshot has already been 122 // processed -- life is good. 123 if !w.LastSent.IsZero() && !w.LastProcessed.IsZero() && w.LastSent.Before(w.LastProcessed) { 124 // Yes -- both LastSent and LastProcessed are set, and LastSent is before LastProcessed. 125 // We're good to go. 126 return true 127 } 128 129 // Case 2: the above isn't true. Either we haven't tried to send a snapshot yet, or 130 // we've sent a snapshot and haven't finished processing it yet. In either case, we'll 131 // say we're alive only as long as we're within the grace period. 132 return w.withinGracePeriod() 133 } 134 135 // IsReady returns true IFF diagd should be considered ready. 136 func (w *DiagdWatcher) IsReady() bool { 137 w.mutex.Lock() 138 defer w.mutex.Unlock() 139 140 // If we haven't sent and processed a snapshot, diagd isn't ready. 141 if w.LastSent.IsZero() || w.LastProcessed.IsZero() { 142 return false 143 } 144 145 // We've sent and processed snapshots. If the last snapshot was sent before 146 // the last snapshot was processed, we're good to go. 147 if w.LastSent.Before(w.LastProcessed) { 148 return true 149 } 150 151 // LastSent was after LastProcessed; we're still working on processing the 152 // most recent snapshot. We'll say we're ready only as long as we're still 153 // in the grace period. 154 return w.withinGracePeriod() 155 } 156