// Copyright 2020 Datawire. All rights reserved.
//
// package acp contains stuff dealing with the Ambassador Control Plane as a whole.
//
// This is the AmbassadorWatcher, which is a class that can keep an eye on a running
// Ambassador as a whole, and tell you whether it's alive and ready, or not.
//
// THE STATE MACHINE AND THE GRACE PERIOD:
// When an Ambassador pod boots, Envoy is _not_ started at boot time: instead, we
// wait for the initial configuration to be generated and only then start Envoy (this
// is because we don't know what port to tell Envoy to listen on until after we've
// generated the initial configuration). However, Envoy can't come up instantly,
// either.
//
// To deal with this, there's a state machine in here. We start in envoyNotStarted
// state, move to envoyStarting when the first snapshot is processed, then move to
// envoyRunning when we successfully get stats back from Envoy. These states inform
// what, exactly, we demand to declare Ambassador alive, dead, ready, or not ready.
//
// If you like pictures instead of words:
//
//	envoyNotStarted
//	      |
//	      | (first snapshot is processed)
//	      V
//	envoyStarting
//	      |
//	      | (we got stats from Envoy)
//	      V
//	envoyRunning
//
// Envoy is currently given 30 seconds to come up after getting its initial
// configuration. This may be the wrong compromise: in practice, Envoy should come
// up _much_ faster than that, but the idea is that this code is more about
// providing a conservative failsafe than providing a finely-tuned hair trigger.
//
// TESTING HOOKS:
// Since time plays a role, you can use AmbassadorWatcher.SetFetchTime to change the
// function that the AmbassadorWatcher uses to fetch times. The default is time.Now.
//
// This hook is NOT meant for you to change the values on the fly in a running
// AmbassadorWatcher. Set it at instantiation if need be, then leave it alone.
See 41 // ambassador_test.go for more. 42 43 package acp 44 45 import ( 46 "context" 47 "fmt" 48 "sync" 49 "time" 50 ) 51 52 type awState int 53 54 const ( 55 envoyNotStarted awState = iota 56 envoyStarting 57 envoyRunning 58 ) 59 60 // AmbassadorWatcher encapsulates state and methods for keeping an eye on a running 61 // Ambassador, and deciding if it's healthy. 62 type AmbassadorWatcher struct { 63 // This mutex protects access our watchers and our state, mostly as 64 // a matter of rank paranoia. 65 mutex sync.Mutex 66 67 // How shall we fetch the current time? 68 fetchTime timeFetcher 69 70 // What's the current Envoy state? 71 state awState 72 73 // We encapsulate an EnvoyWatcher and a DiagdWatcher. 74 ew *EnvoyWatcher 75 dw *DiagdWatcher 76 77 // At the point that the DiagdWatcher finishes processing the very first 78 // snapshot, we have to hand the snapshot to Envoy and allow Envoy to start 79 // up. This takes finite time, so we have to allow for that. 80 GraceEnd time.Time 81 } 82 83 // NewAmbassadorWatcher creates a new AmbassadorWatcher, given a fetcher. 84 // 85 // Honestly, this is slightly pointless -- it's here for parallelism with the 86 // EnvoyWatcher and the DiagdWatcher. 87 func NewAmbassadorWatcher(ew *EnvoyWatcher, dw *DiagdWatcher) *AmbassadorWatcher { 88 return &AmbassadorWatcher{ 89 // Default to using time.Now for time. This can be reset later. 90 fetchTime: time.Now, 91 state: envoyNotStarted, 92 ew: ew, 93 dw: dw, 94 } 95 } 96 97 // SetFetchTime will change the function we use to get the current time. 98 func (w *AmbassadorWatcher) SetFetchTime(fetchTime timeFetcher) { 99 w.fetchTime = fetchTime 100 } 101 102 // FetchEnvoyReady will check whether Envoy's statistics are fetchable. 103 func (w *AmbassadorWatcher) FetchEnvoyReady(ctx context.Context) { 104 w.mutex.Lock() 105 defer w.mutex.Unlock() 106 107 w.ew.FetchEnvoyReady(ctx) 108 } 109 110 // NoteSnapshotSent will note that a snapshot has been sent. 
111 func (w *AmbassadorWatcher) NoteSnapshotSent() { 112 w.mutex.Lock() 113 defer w.mutex.Unlock() 114 115 w.dw.NoteSnapshotSent() 116 } 117 118 // NoteSnapshotProcessed will note that a snapshot has been processed. 119 func (w *AmbassadorWatcher) NoteSnapshotProcessed() { 120 w.mutex.Lock() 121 defer w.mutex.Unlock() 122 123 w.dw.NoteSnapshotProcessed() 124 125 // Is this is the very first time we've processed a snapshot? 126 if w.state == envoyNotStarted { 127 // Yes, it is. Note that we're now waiting for Envoy to start... 128 w.state = envoyStarting 129 130 // ...and give Envoy 30 seconds to come up. 131 w.GraceEnd = w.fetchTime().Add(30 * time.Second) 132 } 133 } 134 135 // IsAlive returns true IFF the Ambassador as a whole can be considered alive. 136 func (w *AmbassadorWatcher) IsAlive() bool { 137 w.mutex.Lock() 138 defer w.mutex.Unlock() 139 140 // First things first: if diagd isn't alive, Ambassador as a whole is 141 // clearly not alive. 142 143 if !w.dw.IsAlive() { 144 return false 145 } 146 147 // OK, diagd is alive. We need to look at our current state to figure 148 // out how we'll check Envoy. 149 150 switch w.state { 151 case envoyNotStarted: 152 // We haven't even tried to start Envoy yet, so we're good to go with 153 // just diagd being alive. 154 return true 155 156 case envoyStarting: 157 // We're waiting for Envoy to start. Has it? 158 if w.ew.IsAlive() { 159 // Yes. Remember that it's running... 160 w.state = envoyRunning 161 162 // ...and then we're good to go. 163 return true 164 } 165 166 // It's not yet running. Return true IFF we're still within the grace period. 167 return w.fetchTime().Before(w.GraceEnd) 168 169 case envoyRunning: 170 // Envoy is already running, so check to make sure that it's still alive. 171 return w.ew.IsAlive() 172 173 default: 174 // This is "impossible": w.state isn't exported, and it's deliberately 175 // typed as awState to catch someone trying to assign a random integer to 176 // it. 
However, I guess someone could conceivably assign something new to 177 // it without updating this code, so we test for it. 178 panic(fmt.Sprintf("AmbassadorWatcher.state enum has unknown value %d", w.state)) 179 } 180 } 181 182 // IsReady returns true IFF the Ambassador as a whole can be considered ready. 183 func (w *AmbassadorWatcher) IsReady() bool { 184 w.mutex.Lock() 185 defer w.mutex.Unlock() 186 187 // This is much simpler that IsAlive. Ambassador is ready IFF both diagd and 188 // Envoy are ready; that's all there is to it. 189 190 return w.dw.IsReady() && w.ew.IsReady() 191 } 192