...

Source file src/k8s.io/kubernetes/pkg/proxy/healthcheck/proxier_health.go

Documentation: k8s.io/kubernetes/pkg/proxy/healthcheck

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package healthcheck
    18  
    19  import (
    20  	"fmt"
    21  	"net/http"
    22  	"sync"
    23  	"time"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/klog/v2"
    27  	"k8s.io/kubernetes/pkg/proxy/metrics"
    28  	"k8s.io/utils/clock"
    29  )
    30  
    31  const (
    32  	// ToBeDeletedTaint is a taint used by the CLuster Autoscaler before marking a node for deletion. Defined in
    33  	// https://github.com/kubernetes/autoscaler/blob/e80ab518340f88f364fe3ef063f8303755125971/cluster-autoscaler/utils/deletetaint/delete.go#L36
    34  	ToBeDeletedTaint = "ToBeDeletedByClusterAutoscaler"
    35  )
    36  
// ProxierHealthServer allows callers to:
//  1. run a http server with /healthz and /livez endpoint handlers.
//  2. update healthz timestamps before and after synchronizing dataplane.
//  3. sync node status, for reporting unhealthy /healthz response
//     if the node is marked for deletion by autoscaler.
//  4. get proxy health, which is determined by checking whether an update
//     queued via QueuedUpdate() has been waiting for the matching Updated()
//     call for healthTimeout or longer.
type ProxierHealthServer struct {
	// listener opens the network listener that Run serves on.
	listener listener
	// httpFactory builds the HTTP server that Run starts.
	httpFactory httpServerFactory
	// clock is the time source for every timestamp recorded or reported.
	clock clock.Clock

	// addr is the address the health server listens on.
	addr string
	// healthTimeout is how long a queued update may stay pending before the
	// proxier of that IP family is reported as unhealthy.
	healthTimeout time.Duration

	// lock guards the fields below.
	lock sync.RWMutex
	// lastUpdatedMap holds, per IP family, the time of the most recent
	// Updated() call.
	lastUpdatedMap map[v1.IPFamily]time.Time
	// oldestPendingQueuedMap holds, per IP family, the time of the first
	// QueuedUpdate() call that has not yet been followed by Updated().
	oldestPendingQueuedMap map[v1.IPFamily]time.Time
	// nodeEligible is the node eligibility last computed by SyncNode.
	nodeEligible bool
}
    57  
    58  // NewProxierHealthServer returns a proxier health http server.
    59  func NewProxierHealthServer(addr string, healthTimeout time.Duration) *ProxierHealthServer {
    60  	return newProxierHealthServer(stdNetListener{}, stdHTTPServerFactory{}, clock.RealClock{}, addr, healthTimeout)
    61  }
    62  
    63  func newProxierHealthServer(listener listener, httpServerFactory httpServerFactory, c clock.Clock, addr string, healthTimeout time.Duration) *ProxierHealthServer {
    64  	return &ProxierHealthServer{
    65  		listener:      listener,
    66  		httpFactory:   httpServerFactory,
    67  		clock:         c,
    68  		addr:          addr,
    69  		healthTimeout: healthTimeout,
    70  
    71  		lastUpdatedMap:         make(map[v1.IPFamily]time.Time),
    72  		oldestPendingQueuedMap: make(map[v1.IPFamily]time.Time),
    73  		// The node is eligible (and thus the proxy healthy) while it's starting up
    74  		// and until we've processed the first node event that indicates the
    75  		// contrary.
    76  		nodeEligible: true,
    77  	}
    78  }
    79  
    80  // Updated should be called when the proxier of the given IP family has successfully updated
    81  // the service rules to reflect the current state and should be considered healthy now.
    82  func (hs *ProxierHealthServer) Updated(ipFamily v1.IPFamily) {
    83  	hs.lock.Lock()
    84  	defer hs.lock.Unlock()
    85  	delete(hs.oldestPendingQueuedMap, ipFamily)
    86  	hs.lastUpdatedMap[ipFamily] = hs.clock.Now()
    87  }
    88  
    89  // QueuedUpdate should be called when the proxier receives a Service or Endpoints event
    90  // from API Server containing information that requires updating service rules. It
    91  // indicates that the proxier for the given IP family has received changes but has not
    92  // yet pushed them to its backend. If the proxier does not call Updated within the
    93  // healthTimeout time then it will be considered unhealthy.
    94  func (hs *ProxierHealthServer) QueuedUpdate(ipFamily v1.IPFamily) {
    95  	hs.lock.Lock()
    96  	defer hs.lock.Unlock()
    97  	// Set oldestPendingQueuedMap[ipFamily] only if it's currently unset
    98  	if _, set := hs.oldestPendingQueuedMap[ipFamily]; !set {
    99  		hs.oldestPendingQueuedMap[ipFamily] = hs.clock.Now()
   100  	}
   101  }
   102  
   103  // IsHealthy returns only the proxier's health state, following the same
   104  // definition the HTTP server defines, but ignoring the state of the Node.
   105  func (hs *ProxierHealthServer) IsHealthy() bool {
   106  	isHealthy, _ := hs.isHealthy()
   107  	return isHealthy
   108  }
   109  
   110  func (hs *ProxierHealthServer) isHealthy() (bool, time.Time) {
   111  	hs.lock.RLock()
   112  	defer hs.lock.RUnlock()
   113  
   114  	var lastUpdated time.Time
   115  	currentTime := hs.clock.Now()
   116  
   117  	for ipFamily, proxierLastUpdated := range hs.lastUpdatedMap {
   118  
   119  		if proxierLastUpdated.After(lastUpdated) {
   120  			lastUpdated = proxierLastUpdated
   121  		}
   122  
   123  		if _, set := hs.oldestPendingQueuedMap[ipFamily]; !set {
   124  			// the proxier is healthy while it's starting up
   125  			// or the proxier is fully synced.
   126  			continue
   127  		}
   128  
   129  		if currentTime.Sub(hs.oldestPendingQueuedMap[ipFamily]) < hs.healthTimeout {
   130  			// there's an unprocessed update queued for this proxier, but it's not late yet.
   131  			continue
   132  		}
   133  		return false, proxierLastUpdated
   134  	}
   135  	return true, lastUpdated
   136  }
   137  
   138  // SyncNode syncs the node and determines if it is eligible or not. Eligible is
   139  // defined as being: not tainted by ToBeDeletedTaint and not deleted.
   140  func (hs *ProxierHealthServer) SyncNode(node *v1.Node) {
   141  	hs.lock.Lock()
   142  	defer hs.lock.Unlock()
   143  
   144  	if !node.DeletionTimestamp.IsZero() {
   145  		hs.nodeEligible = false
   146  		return
   147  	}
   148  	for _, taint := range node.Spec.Taints {
   149  		if taint.Key == ToBeDeletedTaint {
   150  			hs.nodeEligible = false
   151  			return
   152  		}
   153  	}
   154  	hs.nodeEligible = true
   155  }
   156  
   157  // NodeEligible returns nodeEligible field of ProxierHealthServer.
   158  func (hs *ProxierHealthServer) NodeEligible() bool {
   159  	hs.lock.RLock()
   160  	defer hs.lock.RUnlock()
   161  	return hs.nodeEligible
   162  }
   163  
   164  // Run starts the healthz HTTP server and blocks until it exits.
   165  func (hs *ProxierHealthServer) Run() error {
   166  	serveMux := http.NewServeMux()
   167  	serveMux.Handle("/healthz", healthzHandler{hs: hs})
   168  	serveMux.Handle("/livez", livezHandler{hs: hs})
   169  	server := hs.httpFactory.New(hs.addr, serveMux)
   170  
   171  	listener, err := hs.listener.Listen(hs.addr)
   172  	if err != nil {
   173  		return fmt.Errorf("failed to start proxier healthz on %s: %v", hs.addr, err)
   174  	}
   175  
   176  	klog.V(3).InfoS("Starting healthz HTTP server", "address", hs.addr)
   177  
   178  	if err := server.Serve(listener); err != nil {
   179  		return fmt.Errorf("proxier healthz closed with error: %v", err)
   180  	}
   181  	return nil
   182  }
   183  
   184  type healthzHandler struct {
   185  	hs *ProxierHealthServer
   186  }
   187  
   188  func (h healthzHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
   189  	nodeEligible := h.hs.NodeEligible()
   190  	healthy, lastUpdated := h.hs.isHealthy()
   191  	currentTime := h.hs.clock.Now()
   192  
   193  	healthy = healthy && nodeEligible
   194  	resp.Header().Set("Content-Type", "application/json")
   195  	resp.Header().Set("X-Content-Type-Options", "nosniff")
   196  	if !healthy {
   197  		metrics.ProxyHealthzTotal.WithLabelValues("503").Inc()
   198  		resp.WriteHeader(http.StatusServiceUnavailable)
   199  	} else {
   200  		metrics.ProxyHealthzTotal.WithLabelValues("200").Inc()
   201  		resp.WriteHeader(http.StatusOK)
   202  		// In older releases, the returned "lastUpdated" time indicated the last
   203  		// time the proxier sync loop ran, even if nothing had changed. To
   204  		// preserve compatibility, we use the same semantics: the returned
   205  		// lastUpdated value is "recent" if the server is healthy. The kube-proxy
   206  		// metrics provide more detailed information.
   207  		lastUpdated = currentTime
   208  	}
   209  	fmt.Fprintf(resp, `{"lastUpdated": %q,"currentTime": %q, "nodeEligible": %v}`, lastUpdated, currentTime, nodeEligible)
   210  }
   211  
   212  type livezHandler struct {
   213  	hs *ProxierHealthServer
   214  }
   215  
   216  func (h livezHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
   217  	healthy, lastUpdated := h.hs.isHealthy()
   218  	currentTime := h.hs.clock.Now()
   219  	resp.Header().Set("Content-Type", "application/json")
   220  	resp.Header().Set("X-Content-Type-Options", "nosniff")
   221  	if !healthy {
   222  		metrics.ProxyLivezTotal.WithLabelValues("503").Inc()
   223  		resp.WriteHeader(http.StatusServiceUnavailable)
   224  	} else {
   225  		metrics.ProxyLivezTotal.WithLabelValues("200").Inc()
   226  		resp.WriteHeader(http.StatusOK)
   227  		// In older releases, the returned "lastUpdated" time indicated the last
   228  		// time the proxier sync loop ran, even if nothing had changed. To
   229  		// preserve compatibility, we use the same semantics: the returned
   230  		// lastUpdated value is "recent" if the server is healthy. The kube-proxy
   231  		// metrics provide more detailed information.
   232  		lastUpdated = currentTime
   233  	}
   234  	fmt.Fprintf(resp, `{"lastUpdated": %q,"currentTime": %q}`, lastUpdated, currentTime)
   235  }
   236  

View as plain text