...
1
16
17 package util
18
19 import (
20 "sync"
21 "time"
22
23 v1 "k8s.io/api/core/v1"
24 "k8s.io/apimachinery/pkg/types"
25 "k8s.io/klog/v2"
26 "k8s.io/kubernetes/pkg/kubelet/metrics"
27 "k8s.io/utils/clock"
28 )
29
30
31
32 type PodStartupLatencyTracker interface {
33 ObservedPodOnWatch(pod *v1.Pod, when time.Time)
34 RecordImageStartedPulling(podUID types.UID)
35 RecordImageFinishedPulling(podUID types.UID)
36 RecordStatusUpdated(pod *v1.Pod)
37 DeletePodStartupState(podUID types.UID)
38 }
39
40 type basicPodStartupLatencyTracker struct {
41
42 lock sync.Mutex
43 pods map[types.UID]*perPodState
44
45 firstNetworkPodSeen bool
46
47 clock clock.Clock
48 }
49
50 type perPodState struct {
51 firstStartedPulling time.Time
52 lastFinishedPulling time.Time
53
54 observedRunningTime time.Time
55
56 metricRecorded bool
57 }
58
59
60 func NewPodStartupLatencyTracker() PodStartupLatencyTracker {
61 return &basicPodStartupLatencyTracker{
62 pods: map[types.UID]*perPodState{},
63 clock: clock.RealClock{},
64 }
65 }
66
67 func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when time.Time) {
68 p.lock.Lock()
69 defer p.lock.Unlock()
70
71
72 if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
73 delete(p.pods, pod.UID)
74 return
75 }
76
77 state := p.pods[pod.UID]
78 if state == nil {
79
80
81
82 if pod.Status.StartTime.IsZero() {
83 p.pods[pod.UID] = &perPodState{}
84 }
85
86 return
87 }
88
89 if state.observedRunningTime.IsZero() {
90
91 return
92 }
93
94 if state.metricRecorded {
95
96 return
97 }
98
99 if hasPodStartedSLO(pod) {
100 podStartingDuration := when.Sub(pod.CreationTimestamp.Time)
101 imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling)
102 podStartSLOduration := (podStartingDuration - imagePullingDuration).Seconds()
103
104 klog.InfoS("Observed pod startup duration",
105 "pod", klog.KObj(pod),
106 "podStartSLOduration", podStartSLOduration,
107 "podStartE2EDuration", podStartingDuration,
108 "podCreationTimestamp", pod.CreationTimestamp.Time,
109 "firstStartedPulling", state.firstStartedPulling,
110 "lastFinishedPulling", state.lastFinishedPulling,
111 "observedRunningTime", state.observedRunningTime,
112 "watchObservedRunningTime", when)
113
114 metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
115 metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
116 state.metricRecorded = true
117
118
119 if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen {
120 metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration)
121 p.firstNetworkPodSeen = true
122 }
123 }
124 }
125
126 func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.UID) {
127 p.lock.Lock()
128 defer p.lock.Unlock()
129
130 state := p.pods[podUID]
131 if state == nil {
132 return
133 }
134
135 if state.firstStartedPulling.IsZero() {
136 state.firstStartedPulling = p.clock.Now()
137 }
138 }
139
140 func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types.UID) {
141 p.lock.Lock()
142 defer p.lock.Unlock()
143
144 state := p.pods[podUID]
145 if state == nil {
146 return
147 }
148
149 state.lastFinishedPulling = p.clock.Now()
150 }
151
152 func (p *basicPodStartupLatencyTracker) RecordStatusUpdated(pod *v1.Pod) {
153 p.lock.Lock()
154 defer p.lock.Unlock()
155
156 state := p.pods[pod.UID]
157 if state == nil {
158 return
159 }
160
161 if state.metricRecorded {
162
163 return
164 }
165
166 if !state.observedRunningTime.IsZero() {
167
168 return
169 }
170
171 if hasPodStartedSLO(pod) {
172 klog.V(3).InfoS("Mark when the pod was running for the first time", "pod", klog.KObj(pod), "rv", pod.ResourceVersion)
173 state.observedRunningTime = p.clock.Now()
174 }
175 }
176
177
178
179
180
181 func hasPodStartedSLO(pod *v1.Pod) bool {
182 for _, cs := range pod.Status.ContainerStatuses {
183 if cs.State.Running == nil || cs.State.Running.StartedAt.IsZero() {
184 return false
185 }
186 }
187
188 return true
189 }
190
191 func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) {
192 p.lock.Lock()
193 defer p.lock.Unlock()
194
195 delete(p.pods, podUID)
196 }
197
View as plain text