1
16
17 package eviction
18
19 import (
20 "context"
21 "fmt"
22 "sort"
23 "sync"
24 "time"
25
26 "k8s.io/klog/v2"
27
28 v1 "k8s.io/api/core/v1"
29 "k8s.io/apimachinery/pkg/api/resource"
30 utilfeature "k8s.io/apiserver/pkg/util/feature"
31 "k8s.io/client-go/tools/record"
32 corev1helpers "k8s.io/component-helpers/scheduling/corev1"
33 statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
34 "k8s.io/utils/clock"
35
36 podutil "k8s.io/kubernetes/pkg/api/v1/pod"
37 resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
38 v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
39 "k8s.io/kubernetes/pkg/features"
40 evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
41 "k8s.io/kubernetes/pkg/kubelet/lifecycle"
42 "k8s.io/kubernetes/pkg/kubelet/metrics"
43 "k8s.io/kubernetes/pkg/kubelet/server/stats"
44 kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
45 )
46
const (
	// podCleanupTimeout is the duration of the timer that bounds how long
	// waitForPodsCleanup waits for evicted pods to be cleaned up.
	podCleanupTimeout = 30 * time.Second
	// podCleanupPollFreq is the ticker period at which waitForPodsCleanup
	// re-checks whether the evicted pods have been cleaned up.
	podCleanupPollFreq = time.Second
)
51
const (
	// signalEphemeralContainerFsLimit is the eviction metric label used when a
	// pod is evicted because a container exceeded its ephemeral-storage limit.
	signalEphemeralContainerFsLimit string = "ephemeralcontainerfs.limit"
	// signalEphemeralPodFsLimit is the eviction metric label used when a pod is
	// evicted because its total ephemeral-storage usage exceeded the pod limit.
	signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
	// signalEmptyDirFsLimit is the eviction metric label used when a pod is
	// evicted because an emptyDir volume exceeded its size limit.
	signalEmptyDirFsLimit string = "emptydirfs.limit"
)
60
61
// managerImpl implements Manager and also serves as the pod admit handler
// returned by NewManager.
type managerImpl struct {
	// clock is the time source used for timestamps, timers and tickers.
	clock clock.WithTicker
	// config is the eviction configuration (thresholds, grace periods, ...).
	config Config
	// killPodFunc is invoked by evictPod to kill a pod.
	killPodFunc KillPodFunc
	// imageGC is handed to buildSignalToNodeReclaimFuncs to build node-level reclaim functions.
	imageGC ImageGC
	// containerGC is handed to buildSignalToNodeReclaimFuncs and is queried for
	// whether the container filesystem is separate from the image filesystem.
	containerGC ContainerGC
	// RWMutex guards the state that synchronize publishes (nodeConditions,
	// threshold bookkeeping, lastObservations) and that Admit / IsUnder* read.
	sync.RWMutex
	// nodeConditions is the set of node conditions currently reported.
	nodeConditions []v1.NodeConditionType
	// nodeConditionsLastObservedAt records when each node condition was last observed.
	nodeConditionsLastObservedAt nodeConditionsObservedAt
	// nodeRef is the object that node-level eviction events are recorded against.
	nodeRef *v1.ObjectReference
	// recorder emits events for reclaim attempts and pod evictions.
	recorder record.EventRecorder
	// summaryProvider supplies the stats summary that observations are built from.
	summaryProvider stats.SummaryProvider
	// thresholdsFirstObservedAt records when each threshold was first observed.
	thresholdsFirstObservedAt thresholdsObservedAt
	// thresholdsMet is the set of thresholds stored at the end of a synchronize
	// pass; the next pass re-evaluates them to see whether they are resolved.
	thresholdsMet []evictionapi.Threshold
	// signalToRankFunc maps a signal to the function that ranks pods for eviction.
	signalToRankFunc map[evictionapi.Signal]rankFunc
	// signalToNodeReclaimFuncs maps a signal to the node-level reclaim functions to try before evicting pods.
	signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
	// lastObservations holds the observations from the previous synchronize pass.
	lastObservations signalObservations
	// dedicatedImageFs is nil until the first synchronize pass detects whether
	// the node has a dedicated image filesystem.
	dedicatedImageFs *bool
	// splitContainerImageFs is nil until the first synchronize pass detects whether
	// the container filesystem is separate from the image filesystem.
	splitContainerImageFs *bool
	// thresholdNotifiers are the memory threshold notifiers created in Start
	// and periodically refreshed by synchronize.
	thresholdNotifiers []ThresholdNotifier
	// thresholdsLastUpdated is the last time the threshold notifiers were refreshed.
	thresholdsLastUpdated time.Time
	// localStorageCapacityIsolation enables the per-pod local storage limit checks in synchronize.
	localStorageCapacityIsolation bool
}
106
107
// Compile-time assertion that *managerImpl satisfies the Manager interface.
var _ Manager = &managerImpl{}
109
110
111 func NewManager(
112 summaryProvider stats.SummaryProvider,
113 config Config,
114 killPodFunc KillPodFunc,
115 imageGC ImageGC,
116 containerGC ContainerGC,
117 recorder record.EventRecorder,
118 nodeRef *v1.ObjectReference,
119 clock clock.WithTicker,
120 localStorageCapacityIsolation bool,
121 ) (Manager, lifecycle.PodAdmitHandler) {
122 manager := &managerImpl{
123 clock: clock,
124 killPodFunc: killPodFunc,
125 imageGC: imageGC,
126 containerGC: containerGC,
127 config: config,
128 recorder: recorder,
129 summaryProvider: summaryProvider,
130 nodeRef: nodeRef,
131 nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
132 thresholdsFirstObservedAt: thresholdsObservedAt{},
133 dedicatedImageFs: nil,
134 splitContainerImageFs: nil,
135 thresholdNotifiers: []ThresholdNotifier{},
136 localStorageCapacityIsolation: localStorageCapacityIsolation,
137 }
138 return manager, manager
139 }
140
141
142 func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
143 m.RLock()
144 defer m.RUnlock()
145 if len(m.nodeConditions) == 0 {
146 return lifecycle.PodAdmitResult{Admit: true}
147 }
148
149
150 if kubelettypes.IsCriticalPod(attrs.Pod) {
151 return lifecycle.PodAdmitResult{Admit: true}
152 }
153
154
155 nodeOnlyHasMemoryPressureCondition := hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) && len(m.nodeConditions) == 1
156 if nodeOnlyHasMemoryPressureCondition {
157 notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)
158 if notBestEffort {
159 return lifecycle.PodAdmitResult{Admit: true}
160 }
161
162
163
164 if corev1helpers.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{
165 Key: v1.TaintNodeMemoryPressure,
166 Effect: v1.TaintEffectNoSchedule,
167 }) {
168 return lifecycle.PodAdmitResult{Admit: true}
169 }
170 }
171
172
173 klog.InfoS("Failed to admit pod to node", "pod", klog.KObj(attrs.Pod), "nodeCondition", m.nodeConditions)
174 return lifecycle.PodAdmitResult{
175 Admit: false,
176 Reason: Reason,
177 Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),
178 }
179 }
180
181
182 func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
183 thresholdHandler := func(message string) {
184 klog.InfoS(message)
185 m.synchronize(diskInfoProvider, podFunc)
186 }
187 if m.config.KernelMemcgNotification {
188 for _, threshold := range m.config.Thresholds {
189 if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
190 notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
191 if err != nil {
192 klog.InfoS("Eviction manager: failed to create memory threshold notifier", "err", err)
193 } else {
194 go notifier.Start()
195 m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
196 }
197 }
198 }
199 }
200
201 go func() {
202 for {
203 evictedPods, err := m.synchronize(diskInfoProvider, podFunc)
204 if evictedPods != nil && err == nil {
205 klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods))
206 m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
207 } else {
208 if err != nil {
209 klog.ErrorS(err, "Eviction manager: failed to synchronize")
210 }
211 time.Sleep(monitoringInterval)
212 }
213 }
214 }()
215 }
216
217
218 func (m *managerImpl) IsUnderMemoryPressure() bool {
219 m.RLock()
220 defer m.RUnlock()
221 return hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure)
222 }
223
224
225 func (m *managerImpl) IsUnderDiskPressure() bool {
226 m.RLock()
227 defer m.RUnlock()
228 return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure)
229 }
230
231
232 func (m *managerImpl) IsUnderPIDPressure() bool {
233 m.RLock()
234 defer m.RUnlock()
235 return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
236 }
237
238
239
240 func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) {
241 ctx := context.Background()
242
243 thresholds := m.config.Thresholds
244 if len(thresholds) == 0 && !m.localStorageCapacityIsolation {
245 return nil, nil
246 }
247
248 klog.V(3).InfoS("Eviction manager: synchronize housekeeping")
249
250
251 if m.dedicatedImageFs == nil {
252 hasImageFs, splitDiskError := diskInfoProvider.HasDedicatedImageFs(ctx)
253 if splitDiskError != nil {
254 klog.ErrorS(splitDiskError, "Eviction manager: failed to get HasDedicatedImageFs")
255 return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedImageFs: %v", splitDiskError)
256 }
257 m.dedicatedImageFs = &hasImageFs
258 splitContainerImageFs := m.containerGC.IsContainerFsSeparateFromImageFs(ctx)
259
260
261
262
263 if !utilfeature.DefaultFeatureGate.Enabled(features.KubeletSeparateDiskGC) && splitContainerImageFs {
264 splitDiskError := fmt.Errorf("KubeletSeparateDiskGC is turned off but we still have a split filesystem")
265 return nil, splitDiskError
266 }
267 thresholds, err := UpdateContainerFsThresholds(m.config.Thresholds, hasImageFs, splitContainerImageFs)
268 m.config.Thresholds = thresholds
269 if err != nil {
270 klog.ErrorS(err, "eviction manager: found conflicting containerfs eviction. Ignoring.")
271 }
272 m.splitContainerImageFs = &splitContainerImageFs
273 m.signalToRankFunc = buildSignalToRankFunc(hasImageFs, splitContainerImageFs)
274 m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs, splitContainerImageFs)
275 }
276
277 klog.V(3).InfoS("FileSystem detection", "DedicatedImageFs", m.dedicatedImageFs, "SplitImageFs", m.splitContainerImageFs)
278 activePods := podFunc()
279 updateStats := true
280 summary, err := m.summaryProvider.Get(ctx, updateStats)
281 if err != nil {
282 klog.ErrorS(err, "Eviction manager: failed to get summary stats")
283 return nil, nil
284 }
285
286 if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
287 m.thresholdsLastUpdated = m.clock.Now()
288 for _, notifier := range m.thresholdNotifiers {
289 if err := notifier.UpdateThreshold(summary); err != nil {
290 klog.InfoS("Eviction manager: failed to update notifier", "notifier", notifier.Description(), "err", err)
291 }
292 }
293 }
294
295
296 observations, statsFunc := makeSignalObservations(summary)
297 debugLogObservations("observations", observations)
298
299
300 thresholds = thresholdsMet(thresholds, observations, false)
301 debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)
302
303
304 if len(m.thresholdsMet) > 0 {
305 thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
306 thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
307 }
308 debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)
309
310
311 now := m.clock.Now()
312 thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
313
314
315 nodeConditions := nodeConditions(thresholds)
316 if len(nodeConditions) > 0 {
317 klog.V(3).InfoS("Eviction manager: node conditions - observed", "nodeCondition", nodeConditions)
318 }
319
320
321 nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)
322
323
324 nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
325 if len(nodeConditions) > 0 {
326 klog.V(3).InfoS("Eviction manager: node conditions - transition period not met", "nodeCondition", nodeConditions)
327 }
328
329
330 thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
331 debugLogThresholdsWithObservation("thresholds - grace periods satisfied", thresholds, observations)
332
333
334 m.Lock()
335 m.nodeConditions = nodeConditions
336 m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
337 m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
338 m.thresholdsMet = thresholds
339
340
341 thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
342 debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)
343
344 m.lastObservations = observations
345 m.Unlock()
346
347
348
349 if m.localStorageCapacityIsolation {
350 if evictedPods := m.localStorageEviction(activePods, statsFunc); len(evictedPods) > 0 {
351 return evictedPods, nil
352 }
353 }
354
355 if len(thresholds) == 0 {
356 klog.V(3).InfoS("Eviction manager: no resources are starved")
357 return nil, nil
358 }
359
360
361 sort.Sort(byEvictionPriority(thresholds))
362 thresholdToReclaim, resourceToReclaim, foundAny := getReclaimableThreshold(thresholds)
363 if !foundAny {
364 return nil, nil
365 }
366 klog.InfoS("Eviction manager: attempting to reclaim", "resourceName", resourceToReclaim)
367
368
369 m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
370
371
372 if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) {
373 klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim)
374 return nil, nil
375 }
376
377 klog.InfoS("Eviction manager: must evict pod(s) to reclaim", "resourceName", resourceToReclaim)
378
379
380 rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
381 if !ok {
382 klog.ErrorS(nil, "Eviction manager: no ranking function for signal", "threshold", thresholdToReclaim.Signal)
383 return nil, nil
384 }
385
386
387 if len(activePods) == 0 {
388 klog.ErrorS(nil, "Eviction manager: eviction thresholds have been met, but no pods are active to evict")
389 return nil, nil
390 }
391
392
393 rank(activePods, statsFunc)
394
395 klog.InfoS("Eviction manager: pods ranked for eviction", "pods", klog.KObjSlice(activePods))
396
397
398 for _, t := range thresholds {
399 timeObserved := observations[t.Signal].time
400 if !timeObserved.IsZero() {
401 metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time))
402 }
403 }
404
405
406 for i := range activePods {
407 pod := activePods[i]
408 gracePeriodOverride := int64(0)
409 if !isHardEvictionThreshold(thresholdToReclaim) {
410 gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
411 }
412 message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc, thresholds, observations)
413 var condition *v1.PodCondition
414 if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
415 condition = &v1.PodCondition{
416 Type: v1.DisruptionTarget,
417 Status: v1.ConditionTrue,
418 Reason: v1.PodReasonTerminationByKubelet,
419 Message: message,
420 }
421 }
422 if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) {
423 metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc()
424 return []*v1.Pod{pod}, nil
425 }
426 }
427 klog.InfoS("Eviction manager: unable to evict any pods from the node")
428 return nil, nil
429 }
430
431 func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
432 timeout := m.clock.NewTimer(podCleanupTimeout)
433 defer timeout.Stop()
434 ticker := m.clock.NewTicker(podCleanupPollFreq)
435 defer ticker.Stop()
436 for {
437 select {
438 case <-timeout.C():
439 klog.InfoS("Eviction manager: timed out waiting for pods to be cleaned up", "pods", klog.KObjSlice(pods))
440 return
441 case <-ticker.C():
442 for i, pod := range pods {
443 if !podCleanedUpFunc(pod) {
444 break
445 }
446 if i == len(pods)-1 {
447 klog.InfoS("Eviction manager: pods successfully cleaned up", "pods", klog.KObjSlice(pods))
448 return
449 }
450 }
451 }
452 }
453 }
454
455
456 func (m *managerImpl) reclaimNodeLevelResources(ctx context.Context, signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
457 nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
458 for _, nodeReclaimFunc := range nodeReclaimFuncs {
459
460 if err := nodeReclaimFunc(ctx); err != nil {
461 klog.InfoS("Eviction manager: unexpected error when attempting to reduce resource pressure", "resourceName", resourceToReclaim, "err", err)
462 }
463
464 }
465 if len(nodeReclaimFuncs) > 0 {
466 summary, err := m.summaryProvider.Get(ctx, true)
467 if err != nil {
468 klog.ErrorS(err, "Eviction manager: failed to get summary stats after resource reclaim")
469 return false
470 }
471
472
473 observations, _ := makeSignalObservations(summary)
474 debugLogObservations("observations after resource reclaim", observations)
475
476
477
478 thresholds := thresholdsMet(m.config.Thresholds, observations, true)
479 debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
480
481 if len(thresholds) == 0 {
482 return true
483 }
484 }
485 return false
486 }
487
488
489
490 func (m *managerImpl) localStorageEviction(pods []*v1.Pod, statsFunc statsFunc) []*v1.Pod {
491 evicted := []*v1.Pod{}
492 for _, pod := range pods {
493 podStats, ok := statsFunc(pod)
494 if !ok {
495 continue
496 }
497
498 if m.emptyDirLimitEviction(podStats, pod) {
499 evicted = append(evicted, pod)
500 continue
501 }
502
503 if m.podEphemeralStorageLimitEviction(podStats, pod) {
504 evicted = append(evicted, pod)
505 continue
506 }
507
508 if m.containerEphemeralStorageLimitEviction(podStats, pod) {
509 evicted = append(evicted, pod)
510 }
511 }
512
513 return evicted
514 }
515
516 func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
517 podVolumeUsed := make(map[string]*resource.Quantity)
518 for _, volume := range podStats.VolumeStats {
519 podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
520 }
521 for i := range pod.Spec.Volumes {
522 source := &pod.Spec.Volumes[i].VolumeSource
523 if source.EmptyDir != nil {
524 size := source.EmptyDir.SizeLimit
525 used := podVolumeUsed[pod.Spec.Volumes[i].Name]
526 if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
527
528 if m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
529 metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
530 return true
531 }
532 return false
533 }
534 }
535 }
536
537 return false
538 }
539
540 func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
541 podLimits := resourcehelper.PodLimits(pod, resourcehelper.PodResourcesOptions{})
542 _, found := podLimits[v1.ResourceEphemeralStorage]
543 if !found {
544 return false
545 }
546
547
548 podEphemeralStorageTotalUsage := &resource.Quantity{}
549 if podStats.EphemeralStorage != nil && podStats.EphemeralStorage.UsedBytes != nil {
550 podEphemeralStorageTotalUsage = resource.NewQuantity(int64(*podStats.EphemeralStorage.UsedBytes), resource.BinarySI)
551 }
552 podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
553 if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
554
555 message := fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String())
556 if m.evictPod(pod, 0, message, nil, nil) {
557 metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
558 return true
559 }
560 return false
561 }
562 return false
563 }
564
565 func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
566 thresholdsMap := make(map[string]*resource.Quantity)
567 for _, container := range pod.Spec.Containers {
568 ephemeralLimit := container.Resources.Limits.StorageEphemeral()
569 if ephemeralLimit != nil && ephemeralLimit.Value() != 0 {
570 thresholdsMap[container.Name] = ephemeralLimit
571 }
572 }
573
574 for _, containerStat := range podStats.Containers {
575 containerUsed := diskUsage(containerStat.Logs)
576 if !*m.dedicatedImageFs {
577 containerUsed.Add(*diskUsage(containerStat.Rootfs))
578 }
579
580 if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
581 if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
582 if m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
583 metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
584 return true
585 }
586 return false
587 }
588 }
589 }
590 return false
591 }
592
593 func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string, condition *v1.PodCondition) bool {
594
595
596
597 if kubelettypes.IsCriticalPod(pod) {
598 klog.ErrorS(nil, "Eviction manager: cannot evict a critical pod", "pod", klog.KObj(pod))
599 return false
600 }
601
602 m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg)
603
604 klog.V(3).InfoS("Evicting pod", "pod", klog.KObj(pod), "podUID", pod.UID, "message", evictMsg)
605 err := m.killPodFunc(pod, true, &gracePeriodOverride, func(status *v1.PodStatus) {
606 status.Phase = v1.PodFailed
607 status.Reason = Reason
608 status.Message = evictMsg
609 if condition != nil {
610 podutil.UpdatePodCondition(status, condition)
611 }
612 })
613 if err != nil {
614 klog.ErrorS(err, "Eviction manager: pod failed to evict", "pod", klog.KObj(pod))
615 } else {
616 klog.InfoS("Eviction manager: pod is evicted successfully", "pod", klog.KObj(pod))
617 }
618 return true
619 }
620
View as plain text