...

Source file src/k8s.io/kubernetes/pkg/kubelet/util/pod_startup_latency_tracker.go

Documentation: k8s.io/kubernetes/pkg/kubelet/util

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package util
    18  
    19  import (
    20  	"sync"
    21  	"time"
    22  
    23  	v1 "k8s.io/api/core/v1"
    24  	"k8s.io/apimachinery/pkg/types"
    25  	"k8s.io/klog/v2"
    26  	"k8s.io/kubernetes/pkg/kubelet/metrics"
    27  	"k8s.io/utils/clock"
    28  )
    29  
    30  // PodStartupLatencyTracker records key moments for startup latency calculation,
    31  // e.g. image pulling or pod observed running on watch.
    32  type PodStartupLatencyTracker interface {
    33  	ObservedPodOnWatch(pod *v1.Pod, when time.Time)
    34  	RecordImageStartedPulling(podUID types.UID)
    35  	RecordImageFinishedPulling(podUID types.UID)
    36  	RecordStatusUpdated(pod *v1.Pod)
    37  	DeletePodStartupState(podUID types.UID)
    38  }
    39  
    40  type basicPodStartupLatencyTracker struct {
    41  	// protect against concurrent read and write on pods map
    42  	lock sync.Mutex
    43  	pods map[types.UID]*perPodState
    44  	// metrics for the first network pod only
    45  	firstNetworkPodSeen bool
    46  	// For testability
    47  	clock clock.Clock
    48  }
    49  
    50  type perPodState struct {
    51  	firstStartedPulling time.Time
    52  	lastFinishedPulling time.Time
    53  	// first time, when pod status changed into Running
    54  	observedRunningTime time.Time
    55  	// log, if pod latency was already Observed
    56  	metricRecorded bool
    57  }
    58  
    59  // NewPodStartupLatencyTracker creates an instance of PodStartupLatencyTracker
    60  func NewPodStartupLatencyTracker() PodStartupLatencyTracker {
    61  	return &basicPodStartupLatencyTracker{
    62  		pods:  map[types.UID]*perPodState{},
    63  		clock: clock.RealClock{},
    64  	}
    65  }
    66  
    67  func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when time.Time) {
    68  	p.lock.Lock()
    69  	defer p.lock.Unlock()
    70  
    71  	// if the pod is terminal, we do not have to track it anymore for startup
    72  	if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
    73  		delete(p.pods, pod.UID)
    74  		return
    75  	}
    76  
    77  	state := p.pods[pod.UID]
    78  	if state == nil {
    79  		// create a new record for pod, only if it was not yet acknowledged by the Kubelet
    80  		// this is required, as we want to log metric only for those pods, that where scheduled
    81  		// after Kubelet started
    82  		if pod.Status.StartTime.IsZero() {
    83  			p.pods[pod.UID] = &perPodState{}
    84  		}
    85  
    86  		return
    87  	}
    88  
    89  	if state.observedRunningTime.IsZero() {
    90  		// skip, pod didn't start yet
    91  		return
    92  	}
    93  
    94  	if state.metricRecorded {
    95  		// skip, pod's latency already recorded
    96  		return
    97  	}
    98  
    99  	if hasPodStartedSLO(pod) {
   100  		podStartingDuration := when.Sub(pod.CreationTimestamp.Time)
   101  		imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling)
   102  		podStartSLOduration := (podStartingDuration - imagePullingDuration).Seconds()
   103  
   104  		klog.InfoS("Observed pod startup duration",
   105  			"pod", klog.KObj(pod),
   106  			"podStartSLOduration", podStartSLOduration,
   107  			"podStartE2EDuration", podStartingDuration,
   108  			"podCreationTimestamp", pod.CreationTimestamp.Time,
   109  			"firstStartedPulling", state.firstStartedPulling,
   110  			"lastFinishedPulling", state.lastFinishedPulling,
   111  			"observedRunningTime", state.observedRunningTime,
   112  			"watchObservedRunningTime", when)
   113  
   114  		metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
   115  		metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
   116  		state.metricRecorded = true
   117  		// if is the first Pod with network track the start values
   118  		// these metrics will help to identify problems with the CNI plugin
   119  		if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen {
   120  			metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration)
   121  			p.firstNetworkPodSeen = true
   122  		}
   123  	}
   124  }
   125  
   126  func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.UID) {
   127  	p.lock.Lock()
   128  	defer p.lock.Unlock()
   129  
   130  	state := p.pods[podUID]
   131  	if state == nil {
   132  		return
   133  	}
   134  
   135  	if state.firstStartedPulling.IsZero() {
   136  		state.firstStartedPulling = p.clock.Now()
   137  	}
   138  }
   139  
   140  func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types.UID) {
   141  	p.lock.Lock()
   142  	defer p.lock.Unlock()
   143  
   144  	state := p.pods[podUID]
   145  	if state == nil {
   146  		return
   147  	}
   148  
   149  	state.lastFinishedPulling = p.clock.Now() // Now is always grater than values from the past.
   150  }
   151  
   152  func (p *basicPodStartupLatencyTracker) RecordStatusUpdated(pod *v1.Pod) {
   153  	p.lock.Lock()
   154  	defer p.lock.Unlock()
   155  
   156  	state := p.pods[pod.UID]
   157  	if state == nil {
   158  		return
   159  	}
   160  
   161  	if state.metricRecorded {
   162  		// skip, pod latency already recorded
   163  		return
   164  	}
   165  
   166  	if !state.observedRunningTime.IsZero() {
   167  		// skip, pod already started
   168  		return
   169  	}
   170  
   171  	if hasPodStartedSLO(pod) {
   172  		klog.V(3).InfoS("Mark when the pod was running for the first time", "pod", klog.KObj(pod), "rv", pod.ResourceVersion)
   173  		state.observedRunningTime = p.clock.Now()
   174  	}
   175  }
   176  
   177  // hasPodStartedSLO, check if for given pod, each container has been started at least once
   178  //
   179  // This should reflect "Pod startup latency SLI" definition
   180  // ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md
   181  func hasPodStartedSLO(pod *v1.Pod) bool {
   182  	for _, cs := range pod.Status.ContainerStatuses {
   183  		if cs.State.Running == nil || cs.State.Running.StartedAt.IsZero() {
   184  			return false
   185  		}
   186  	}
   187  
   188  	return true
   189  }
   190  
   191  func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) {
   192  	p.lock.Lock()
   193  	defer p.lock.Unlock()
   194  
   195  	delete(p.pods, podUID)
   196  }
   197  

View as plain text