...

Source file src/github.com/datawire/ambassador/v2/pkg/acp/ambassador.go

Documentation: github.com/datawire/ambassador/v2/pkg/acp

     1  // Copyright 2020 Datawire. All rights reserved.
     2  //
     3  // package acp contains stuff dealing with the Ambassador Control Plane as a whole.
     4  //
     5  // This is the AmbassadorWatcher, which is a class that can keep an eye on a running
     6  // Ambassador as a whole, and tell you whether it's alive and ready, or not.
     7  //
     8  // THE STATE MACHINE AND THE GRACE PERIOD:
     9  // When an Ambassador pod boots, Envoy is _not_ started at boot time: instead, we
    10  // wait for the initial configuration to be generated and only then start Envoy (this
    11  // is because we don't know what port to tell Envoy to listen on until after we've
    12  // generated the initial configuration). However, Envoy can't come up instantly,
    13  // either.
    14  //
    15  // To deal with this, there's a state machine in here. We start in envoyNotStarted
    16  // state, move to envoyStarting when the first snapshot is processed, then move to
    17  // envoyRunning when we successfully get stats back from Envoy. These states inform
    18  // what, exactly, we demand to declare Ambassador alive, dead, ready, or not ready.
    19  //
    20  // If you like pictures instead of words:   envoyNotStarted
    21  //                                                |
    22  //                                                | (first snapshot is processed)
    23  //                                                V
    24  //                                          envoyStarting
    25  //                                                |
    26  //                                                | (we got stats from Envoy)
    27  //                                                V
    28  //                                          envoyRunning
    29  //
    30  // Envoy is currently given 30 seconds to come up after getting its initial
    31  // configuration. This may be the wrong compromise: in practice, Envoy should come
    32  // up _much_ faster than that, but the idea is that this code is more about providing a
    33  // conservative failsafe than providing a finely-tuned hair trigger.
    34  //
    35  // TESTING HOOKS:
    36  // Since time plays a role, you can use AmbassadorWatcher.SetFetchTime to change the
    37  // function that the AmbassadorWatcher uses to fetch times. The default is time.Now.
    38  //
    39  // This hook is NOT meant for you to change the values on the fly in a running
    40  // AmbassadorWatcher. Set it at instantiation if need be, then leave it alone. See
    41  // ambassador_test.go for more.
    42  
    43  package acp
    44  
    45  import (
    46  	"context"
    47  	"fmt"
    48  	"sync"
    49  	"time"
    50  )
    51  
// awState enumerates where Envoy is in its startup sequence, as tracked by
// the AmbassadorWatcher. The zero value is envoyNotStarted.
type awState int

const (
	// envoyNotStarted is the initial state: no snapshot has been processed
	// yet, so nobody has tried to start Envoy.
	envoyNotStarted awState = iota

	// envoyStarting is entered by NoteSnapshotProcessed the first time a
	// snapshot is processed; the grace period for Envoy begins at that point.
	envoyStarting

	// envoyRunning is entered by IsAlive once the EnvoyWatcher reports that
	// Envoy is alive while we were in envoyStarting.
	envoyRunning
)
    59  
// AmbassadorWatcher encapsulates state and methods for keeping an eye on a running
// Ambassador, and deciding if it's healthy.
type AmbassadorWatcher struct {
	// This mutex protects access to our watchers and our state, mostly as
	// a matter of rank paranoia.
	mutex sync.Mutex

	// How shall we fetch the current time? NewAmbassadorWatcher sets this to
	// time.Now; SetFetchTime replaces it.
	fetchTime timeFetcher

	// What's the current Envoy state? Starts as envoyNotStarted and is
	// advanced by NoteSnapshotProcessed and IsAlive.
	state awState

	// We encapsulate an EnvoyWatcher and a DiagdWatcher.
	ew *EnvoyWatcher
	dw *DiagdWatcher

	// At the point that the DiagdWatcher finishes processing the very first
	// snapshot, we have to hand the snapshot to Envoy and allow Envoy to start
	// up. This takes finite time, so we have to allow for that.
	//
	// GraceEnd is the instant at which that allowance runs out. It is set by
	// NoteSnapshotProcessed on the first processed snapshot, and consulted by
	// IsAlive while the state is envoyStarting.
	GraceEnd time.Time
}
    82  
    83  // NewAmbassadorWatcher creates a new AmbassadorWatcher, given a fetcher.
    84  //
    85  // Honestly, this is slightly pointless -- it's here for parallelism with the
    86  // EnvoyWatcher and the DiagdWatcher.
    87  func NewAmbassadorWatcher(ew *EnvoyWatcher, dw *DiagdWatcher) *AmbassadorWatcher {
    88  	return &AmbassadorWatcher{
    89  		// Default to using time.Now for time. This can be reset later.
    90  		fetchTime: time.Now,
    91  		state:     envoyNotStarted,
    92  		ew:        ew,
    93  		dw:        dw,
    94  	}
    95  }
    96  
    97  // SetFetchTime will change the function we use to get the current time.
    98  func (w *AmbassadorWatcher) SetFetchTime(fetchTime timeFetcher) {
    99  	w.fetchTime = fetchTime
   100  }
   101  
   102  // FetchEnvoyReady will check whether Envoy's statistics are fetchable.
   103  func (w *AmbassadorWatcher) FetchEnvoyReady(ctx context.Context) {
   104  	w.mutex.Lock()
   105  	defer w.mutex.Unlock()
   106  
   107  	w.ew.FetchEnvoyReady(ctx)
   108  }
   109  
   110  // NoteSnapshotSent will note that a snapshot has been sent.
   111  func (w *AmbassadorWatcher) NoteSnapshotSent() {
   112  	w.mutex.Lock()
   113  	defer w.mutex.Unlock()
   114  
   115  	w.dw.NoteSnapshotSent()
   116  }
   117  
   118  // NoteSnapshotProcessed will note that a snapshot has been processed.
   119  func (w *AmbassadorWatcher) NoteSnapshotProcessed() {
   120  	w.mutex.Lock()
   121  	defer w.mutex.Unlock()
   122  
   123  	w.dw.NoteSnapshotProcessed()
   124  
   125  	// Is this is the very first time we've processed a snapshot?
   126  	if w.state == envoyNotStarted {
   127  		// Yes, it is. Note that we're now waiting for Envoy to start...
   128  		w.state = envoyStarting
   129  
   130  		// ...and give Envoy 30 seconds to come up.
   131  		w.GraceEnd = w.fetchTime().Add(30 * time.Second)
   132  	}
   133  }
   134  
   135  // IsAlive returns true IFF the Ambassador as a whole can be considered alive.
   136  func (w *AmbassadorWatcher) IsAlive() bool {
   137  	w.mutex.Lock()
   138  	defer w.mutex.Unlock()
   139  
   140  	// First things first: if diagd isn't alive, Ambassador as a whole is
   141  	// clearly not alive.
   142  
   143  	if !w.dw.IsAlive() {
   144  		return false
   145  	}
   146  
   147  	// OK, diagd is alive. We need to look at our current state to figure
   148  	// out how we'll check Envoy.
   149  
   150  	switch w.state {
   151  	case envoyNotStarted:
   152  		// We haven't even tried to start Envoy yet, so we're good to go with
   153  		// just diagd being alive.
   154  		return true
   155  
   156  	case envoyStarting:
   157  		// We're waiting for Envoy to start. Has it?
   158  		if w.ew.IsAlive() {
   159  			// Yes. Remember that it's running...
   160  			w.state = envoyRunning
   161  
   162  			// ...and then we're good to go.
   163  			return true
   164  		}
   165  
   166  		// It's not yet running. Return true IFF we're still within the grace period.
   167  		return w.fetchTime().Before(w.GraceEnd)
   168  
   169  	case envoyRunning:
   170  		// Envoy is already running, so check to make sure that it's still alive.
   171  		return w.ew.IsAlive()
   172  
   173  	default:
   174  		// This is "impossible": w.state isn't exported, and it's deliberately
   175  		// typed as awState to catch someone trying to assign a random integer to
   176  		// it. However, I guess someone could conceivably assign something new to
   177  		// it without updating this code, so we test for it.
   178  		panic(fmt.Sprintf("AmbassadorWatcher.state enum has unknown value %d", w.state))
   179  	}
   180  }
   181  
   182  // IsReady returns true IFF the Ambassador as a whole can be considered ready.
   183  func (w *AmbassadorWatcher) IsReady() bool {
   184  	w.mutex.Lock()
   185  	defer w.mutex.Unlock()
   186  
   187  	// This is much simpler that IsAlive. Ambassador is ready IFF both diagd and
   188  	// Envoy are ready; that's all there is to it.
   189  
   190  	return w.dw.IsReady() && w.ew.IsReady()
   191  }
   192  

View as plain text